forked from richardbatstone/Legal_copora
-
Notifications
You must be signed in to change notification settings - Fork 0
/
LIscrape.py
47 lines (37 loc) · 1.82 KB
/
LIscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import scrapy
import csv
import re
class LIsrape(scrapy.Spider):
    """Spider that scrapes clause snippets and writes them to a CSV file.

    Base URLs are read from ``TopDomains.txt`` (one per line). Each base URL
    is requested eleven times, with the numeric suffixes 1 to 11 appended.
    For every response, the text of up to ten ``div[data-clause-snippet]``
    elements is appended, together with the page title, to
    ``LIdata_out.csv`` as pipe-delimited rows.

    To run at command line:
        scrapy runspider LIscrape.py -s DOWNLOAD_DELAY=[x]
    """

    name = "LIscrape"

    # Matches the first word character; used to drop leading punctuation.
    _FIRST_WORD_CHAR = re.compile(r'\w')

    def start_requests(self):
        """Yield one request per (base URL, suffix) pair.

        Yields:
            scrapy.Request objects whose callback is ``self.parse``.

        Raises:
            OSError: if ``TopDomains.txt`` cannot be opened.
        """
        # Read the whole list up front so the file is closed before any
        # request is scheduled.
        with open('TopDomains.txt', 'r') as file:
            # strip() instead of line[:-1]: the last line may lack a trailing
            # newline, and Windows line endings leave a stray '\r'.
            base_urls = [line.strip() for line in file]

        for link in base_urls:
            if not link:
                # A blank line would otherwise produce the URLs "1".."11".
                continue
            for suffix in range(1, 12):
                yield scrapy.Request(url=link + str(suffix),
                                     callback=self.parse)

    def parse(self, response):
        """Extract clause snippets from ``response`` and append them to the CSV.

        Args:
            response: the Scrapy response for one of the requested URLs.

        Returns:
            None. Rows are written to ``LIdata_out.csv`` as a side effect;
            nothing is written when the page has no matching snippets.
        """
        title = response.xpath('//title/text()').extract_first()
        # Select the first ten div tags with a data-clause-snippet attribute.
        snippets = response.xpath('//div[@data-clause-snippet]')[0:10]

        rows = []
        for item in snippets:
            # Concatenate the tag's own text with the text of its
            # 'class = "hlt-add"' spans (the mark-up style text).
            text = "".join(
                item.xpath('./span[@class = "hlt-add"]/text()|./text()').extract()
            )
            # Remove \n breaks and strip leading/trailing white space.
            textout = text.replace('\n', '').strip()
            # If a word character is present, start the output from it
            # (i.e. remove punctuation symbols etc. from start of string).
            m = self._FIRST_WORD_CHAR.search(textout)
            if m:
                textout = textout[m.start():]
            rows.append([title, textout])

        if not rows:
            # Match the original behaviour: no snippets, no file access.
            return

        # Open the output once per response instead of once per snippet.
        # UTF-8 is set explicitly so scraped non-ASCII text cannot fail on
        # platforms whose default encoding is narrower.
        with open('LIdata_out.csv', 'a', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, delimiter='|', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(rows)