-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_scraper_abad.py
83 lines (71 loc) · 3.75 KB
/
twitter_scraper_abad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from requests_html import HTMLSession, HTML
from datetime import datetime
# Module-level HTTP session; get_tweets() issues all of its requests through it.
session = HTMLSession()
def _parse_count(label):
    """Return the leading number of an action-count label as an int.

    Only the first space-separated token of *label* is used; ``,`` and ``.``
    separators are removed from it before conversion.
    """
    return int(label.split(" ")[0].replace(",", "").replace(".", ""))


def _parse_tweet(node):
    """Build the tweet record for a single ``.stream-item`` element.

    Returns ``None`` when the element has no ``.tweet-text`` child so the
    caller can skip it; any other missing element or attribute raises.
    """
    try:
        text = node.find('.tweet-text')[0].full_text
    except IndexError:
        # Stream item without tweet text: nothing to report for it.
        return None

    tweet_id = node.find('.js-permalink')[0].attrs['data-conversation-id']
    # 'data-time-ms' is in milliseconds; fromtimestamp() expects seconds.
    timestamp = datetime.fromtimestamp(
        int(node.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0)

    # The first three action counts are read as replies, retweets, likes.
    counts = [x.text for x in node.find('.ProfileTweet-actionCount')]
    replies = _parse_count(counts[0])
    retweets = _parse_count(counts[1])
    likes = _parse_count(counts[2])

    hashtags = [
        hashtag_node.full_text
        for hashtag_node in node.find('.twitter-hashtag')
    ]
    urls = [
        url_node.attrs['data-expanded-url']
        for url_node in node.find('a.twitter-timeline-link:not(.u-hidden)')
    ]
    photos = [
        photo_node.attrs['data-image-url']
        for photo_node in node.find('.AdaptiveMedia-photoContainer')
    ]

    videos = []
    for player in node.find(".PlayableMedia-player"):
        for style in player.attrs['style'].split():
            if style.startswith('background'):
                # The video id is the file name of the background image,
                # up to (not including) its '.jpg' extension.
                tmp = style.split('/')[-1]
                videos.append({'id': tmp[:tmp.index('.jpg')]})

    return {
        'tweetId': tweet_id,
        'time': timestamp,
        # Put a space in front of the first 'http' in the text.
        'text': re.sub('http', ' http', text, count=1),
        'replies': replies,
        'retweets': retweets,
        'likes': likes,
        'entries': {
            'hashtags': hashtags,
            'urls': urls,
            'photos': photos,
            'videos': videos,
        },
    }


def get_tweets(user, pages=25):
    """Gets tweets for a given user, via the Twitter frontend API.

    Args:
        user: Account name interpolated into the timeline URL and the
            ``Referer`` header.
        pages: Maximum number of timeline pages to request.

    Yields:
        ``{'tweet': <record>, 'status': 'ok'}`` for every parsed tweet, where
        the record has the keys ``tweetId``, ``time``, ``text``, ``replies``,
        ``retweets``, ``likes`` and ``entries`` (``hashtags``, ``urls``,
        ``photos``, ``videos``).

        If a response is not JSON or has no ``items_html`` key, a single
        ``{'tweet': None, 'status': 'page not found'}`` record is yielded and
        iteration stops. Iteration also stops when a page contains no stream
        items, because there is no cursor for a following page.

    Side effects:
        Prints a progress percentage after every processed page.
    """
    url = f'https://twitter.com/i/profiles/show/{user}/timeline/tweets?include_available_features=1&include_entities=1&include_new_items_bar=true'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': f'https://twitter.com/{user}',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'X-Twitter-Active-User': 'yes',
        'X-Requested-With': 'XMLHttpRequest'
    }
    total_pages = pages

    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                items_html = r.json()['items_html']
            except (KeyError, ValueError):
                # Non-JSON body or missing payload; let other errors raise.
                # There is nothing to parse and no cursor to continue from,
                # so report the failure once and stop.
                yield {'tweet': None, 'status': 'page not found'}
                return

            html = HTML(html=items_html, url='bunk', default_encoding='utf-8')
            items = html.find('.stream-item')
            if not items:
                # Empty page: no last item to paginate from.
                return

            # Parse the whole page before yielding anything from it.
            tweets = [
                tweet
                for tweet in (_parse_tweet(item) for item in items)
                if tweet is not None
            ]
            # The id of the last item on the page is the cursor for the next.
            last_tweet = items[-1].attrs['data-item-id']

            for tweet in tweets:
                yield {'tweet': tweet, 'status': 'ok'}

            r = session.get(
                url, params={'max_position': last_tweet}, headers=headers)
            pages -= 1
            print('progress:', (total_pages - pages) / total_pages * 100, '%')

    yield from gen_tweets(pages)