scraper.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

URL = 'https://pratham.org/'


def get_links(url):
    """Return the set of absolute links found on the page at `url`."""
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    all_links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/'):
            # Resolve site-relative paths against the base URL
            # (avoids the double slash that f"{url}{href}" would produce).
            section_url = urljoin(url, href)
        elif href.startswith('http'):
            section_url = href
        else:
            # Skip anchors, mailto:, javascript:, and other non-page links.
            continue
        all_links.append(section_url)
    return set(all_links)


def extract_single_link(url):
    """Fetch a single page and extract its header and paragraph text."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        content = {
            'headers': [header.get_text(strip=True) for header in soup.find_all(['h1', 'h2', 'h3'])],
            'paragraphs': [para.get_text(strip=True) for para in soup.find_all('p')],
        }
        return content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {'headers': [], 'paragraphs': []}


def extract_all_links(links):
    """Scrape every link and collect the extracted content dicts."""
    all_content = []
    for link in links:
        content = extract_single_link(link)
        all_content.append(content)
    return all_content


def clean_and_combine(all_content):
    """Drop paragraphs shorter than 40 characters and flatten the rest into one list."""
    all_paragraphs = []
    for data in all_content:
        data['paragraphs'] = [para for para in data['paragraphs'] if len(para) >= 40]
        all_paragraphs.extend(data['paragraphs'])
    return all_paragraphs


def run_scraper():
    """Crawl the site's top-level links and return the cleaned paragraph text."""
    links = get_links(URL)
    content = extract_all_links(links)
    all_paragraphs = clean_and_combine(content)
    return all_paragraphs
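

# A minimal usage sketch, assuming the module is run directly as a script.
# The output file name 'scraped_paragraphs.txt' is an illustrative choice,
# not part of the original module.
if __name__ == '__main__':
    paragraphs = run_scraper()
    print(f"Collected {len(paragraphs)} paragraphs")
    with open('scraped_paragraphs.txt', 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(paragraphs))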