# scraper_utils.py
import wikipediaapi
import pandas as pd
import concurrent.futures
from tqdm import tqdm


def wiki_scrape(start_page_name, verbose=True):
    """Scrape Wikipedia pages linked to a starting page.

    Parameters
    ----------
    start_page_name : str
        Name of the page to start scraping from
    verbose : bool, optional
        Flag for displaying a progress bar and verbose output

    Returns
    -------
    sources : pd.DataFrame
        DataFrame containing all scraped Wikipedia articles linked to start_page_name,
        with columns ('page', 'text', 'link', 'categories')

    References
    ----------
    Modified from https://towardsdatascience.com/auto-generated-knowledge-graphs-92ca99a81121
    """
    def follow_link(link):
        """Helper function to follow a link using the Wikipedia API."""
        try:
            page = wiki_api.page(link)
            if page.exists():
                return {'page': link, 'text': page.text, 'link': page.fullurl,
                        'categories': list(page.categories.keys())}
            return None
        except Exception:
            # Skip links that cannot be resolved (e.g. network or API errors)
            return None
    # Instantiate the Wikipedia API
    wiki_api = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

    # Scrape the starting page
    start_page = wiki_api.page(start_page_name)
    if not start_page.exists():
        print(f'Page "{start_page_name}" does not exist')
        return

    # Initialize list of dicts (to be converted to a DataFrame)
    sources = [{
        'page': start_page_name,
        'text': start_page.text,
        'link': start_page.fullurl,
        'categories': list(start_page.categories.keys())
    }]
    page_links = set(start_page.links.keys())
    # Use a thread pool to scrape multiple linked pages concurrently
    progress = tqdm(desc='Links Scraped', unit='', total=len(page_links)) if verbose else None
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Follow each link from the starting page
        future_link = {executor.submit(follow_link, link): link for link in page_links}
        for future in concurrent.futures.as_completed(future_link):
            data = future.result()
            if verbose:
                progress.update(1)
            if data:
                # Keep the page if the link could be resolved
                sources.append(data)
    if verbose:
        progress.close()

    # Convert the list of dicts to a DataFrame
    sources = pd.DataFrame(sources)

    # Filter out short articles and generic Wikipedia meta/namespace pages
    blacklist = ('Template', 'Help:', 'Category:', 'Portal:', 'Wikipedia:', 'Talk:')
    sources = sources[(sources['text'].str.len() > 20)
                      & ~(sources['page'].str.startswith(blacklist))]

    # Strip the 'Category:' prefix from category names
    sources['categories'] = sources['categories'].apply(lambda cats: [cat[9:] for cat in cats])
    return sources
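
# Illustrative use of wiki_scrape (a sketch only; the page name below is an
# assumed example, and scraping requires network access):
#
#     wiki_data = wiki_scrape('Natural language processing')
#     list(wiki_data.columns)    # ['page', 'text', 'link', 'categories']
#     wiki_data['categories']    # per-page category names, with the
#                                # 'Category:' prefix already stripped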


def build_category_whitelist(wiki_data, page_whitelist, cat_blacklist):
    """Helper function to build a whitelist of page categories.

    This function finds a set of page categories which we can use to
    reduce the number of pages used for building knowledge graphs.
    Typically, we want to build KGs about particular domains and
    specific pages.

    Parameters
    ----------
    wiki_data : pd.DataFrame
        DataFrame of scraped pages, as returned by wiki_scrape
    page_whitelist : list
        List of pages from whose categories we select a domain-specific subset
    cat_blacklist : list
        List of categories which we don't want to include in the whitelist

    Returns
    -------
    cat_whitelist : set
        Set of categories which we want to build KGs about
    """
    cat_whitelist = []
    for page_name in page_whitelist:
        # Iterate over the categories list of each page in the page whitelist
        categories = wiki_data[wiki_data.page == page_name].categories.iloc[0]
        for cat in categories:
            relevant_cat = True
            for unwanted in cat_blacklist:
                # If the category matches a blacklisted term,
                # do not add it to the whitelist
                if unwanted in cat:
                    relevant_cat = False
                    break
            # All non-blacklisted categories from the page whitelist
            # are added to the category whitelist
            if relevant_cat:
                cat_whitelist.append(cat)
    return set(cat_whitelist)
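

if __name__ == '__main__':
    # Minimal end-to-end sketch of how the two helpers fit together. The topic,
    # page whitelist, and category blacklist below are assumptions chosen for
    # illustration; they are not defined anywhere in this module.
    wiki_data = wiki_scrape('Natural language processing')

    page_whitelist = ['Natural language processing']           # assumed pages of interest
    cat_blacklist = ['Articles', 'Wikipedia', 'CS1', 'Pages']  # assumed noise categories
    cat_whitelist = build_category_whitelist(wiki_data, page_whitelist, cat_blacklist)

    # Keep only the scraped pages that share at least one category with the
    # whitelist, i.e. the domain-specific subset used for building the KG.
    relevant = wiki_data[wiki_data['categories'].apply(
        lambda cats: bool(set(cats) & cat_whitelist))]
    print(f'Kept {len(relevant)} of {len(wiki_data)} scraped pages after category filtering')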