#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 14 12:16:51 2021
@author: Leila Zahedi
"""
import time
import requests
from bs4 import BeautifulSoup
import json
import string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
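
# Search NYT for articles matching “Sanctuary cities”, published 2018-01-01
# through 2021-01-19 (the query and date range are encoded in the search URL below).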
base = "https://www.nytimes.com"
browser = webdriver.Chrome('chromedriver')  # Selenium 3-style driver path; on Selenium 4+ use webdriver.Chrome()
wait = WebDriverWait(browser, 10)
browser.get('https://www.nytimes.com/search?dropmab=false&endDate=20210119&query=%E2%80%9CSanctuary%20cities%E2%80%9D&sort=best&startDate=20180101&types=article')
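
# Click "Show More" until the button no longer appears, so the page lists every result.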
while True:
    try:
        time.sleep(1)
        show_more = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//button[@type="button"][contains(.,"Show More")]')))
        show_more.click()
    except Exception as e:
        # Wait timed out: no "Show More" button left, every result is loaded.
        print(e)
        break
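
# Parse the fully expanded results list out of the rendered page source.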
soup = BeautifulSoup(browser.page_source, 'lxml')
search_results = soup.find('ol', {'data-testid': 'search-results'})
links = search_results.find_all('a')
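
# Save every result href to links.txt, one per line.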
with open('links.txt', 'w') as f:
    for link in links:
        f.write(link['href'] + '\n')
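
# Walk every result: pull metadata from the search card, then download the
# article page and extract its full text. Output rows are tab-separated,
# one article per line.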
with open("NYnews.csv", "w") as f:
    for link in links:
        link_url = link['href']
        title = link.find('h4')
        # The card's first <span> holds something like
        # "Aug. 14 | PRINT EDITION ..., Page A1"; keep only the date part.
        try:
            date = link.find_all("span")[0].get_text().split("|")[1].split("Page")[0][:-2]
        except Exception:
            if link.find_all("span"):
                date = link.find_all("span")[0].get_text().replace("PRINT EDITION", "")
                if ", Page" in date:
                    date = date[:date.index(", Page")]
            else:
                date = None
        title = title.text if title is not None else None
        # The article type is taken from the URL path (typically the section
        # name, e.g. "us" or "opinion"); the byline is the text after "By".
        try:
            typeN = link['href'].split("/")[4]
        except Exception:
            typeN = None
        try:
            author = str(link).split("By")[1].split("<")[0]
        except Exception:
            author = None
        print(str(date) + '\n' + str(title) + '\n' + str(typeN) +
              '\n' + str(author) +
              '\n' + base + link_url)
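
        # Fetch the article page itself; NYT embeds the article data as JSON
        # assigned to window.__preloadedData inside a <script> tag.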
        response = requests.get(base + link_url)
        soup_link = BeautifulSoup(response.text, 'html.parser')
        scripts = soup_link.find_all('script')
        for script in scripts:
            if script.text and 'window.__preloadedData = ' in script.text:
                jsonStr = script.text.split('window.__preloadedData = ')[-1]
                jsonStr = jsonStr.rsplit(';', 1)[0]  # drop the trailing semicolon
                try:
                    jsonData = json.loads(jsonStr)
                except Exception:
                    continue
                # Article text is stored as TextInline fragments scattered
                # through initialState; collect them in order.
                article = []
                for k, v in jsonData['initialState'].items():
                    try:
                        if v['__typename'] == 'TextInline':
                            article.append(v['text'])
                    except Exception:
                        continue
                # Rejoin the fragments, putting a space before each one
                # except bare punctuation.
                article = [each.strip() for each in article]
                article = ''.join([('' if c in string.punctuation else ' ') + c
                                   for c in article]).strip()
                print(article + '\n')
                f.write(str(date) + '\t' + str(title) + '\t' + str(typeN) +
                        '\t' + str(author) +
                        '\t' + base + link_url + '\t' + article + '\n')

print("Complete")
browser.quit()