process_sentiment.py
import re
import multiprocessing as mp

import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from text2emotion import get_emotion
from tqdm.auto import tqdm

# NLTK resources used downstream: the VADER lexicon and the Open Multilingual Wordnet.
nltk.download('vader_lexicon')
nltk.download('omw-1.4')
ENG_TWEETS_FILE = '/Users/deryadurmush/Desktop/Jobs /balenciaga/balenciaga.pkl'
NON_ENG_TWEETS_FILE = '/Users/deryadurmush/Desktop/Jobs /balenciaga/translated_balenciaga.pkl'
OUTPUT_FILE = 'preprocessed_tweets.pkl'
def clean_tweet(tweet):
    """Cleans a tweet by removing URLs, mentions, hashtags, and non-alphanumeric characters."""
    tweet = re.sub(r'http\S+', '', tweet)        # URLs
    tweet = re.sub(r'@\S+', '', tweet)           # @mentions
    tweet = re.sub(r'#\S+', '', tweet)           # hashtags
    tweet = re.sub(r'[^a-zA-Z0-9 ]', '', tweet)  # anything that is not a letter, digit, or space
    return tweet.lower()
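# Illustrative example (hypothetical input, not part of the pipeline):
# clean_tweet("Love the new #Balenciaga bag! https://t.co/x @friend") drops the URL,
# the @mention, the hashtag token, and the punctuation, then lowercases, returning
# roughly "love the new  bag" (leftover runs of spaces are not collapsed).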
# Build the analyzer once per process; constructing it inside the function for
# every tweet would repeat the lexicon load needlessly.
VADER_ANALYZER = SentimentIntensityAnalyzer()


def analyze_sentiment_vader(tweet):
    """Analyzes sentiment using VADER; returns the compound score in [-1, 1]."""
    return VADER_ANALYZER.polarity_scores(tweet)['compound']
def analyze_emotions(tweet):
    """Extracts emotion scores (Happy, Angry, Surprise, Sad, Fear) using text2emotion."""
    return get_emotion(tweet)
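# Illustrative shape of the text2emotion output (a dict of five scores; the exact
# values vary with the input and the library version):
#     {'Happy': 0.0, 'Angry': 0.4, 'Surprise': 0.0, 'Sad': 0.6, 'Fear': 0.0}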
NUM_PROCESSES = mp.cpu_count()

if __name__ == '__main__':
    # Load the English tweets and the machine-translated non-English tweets into one
    # frame. Done inside the __main__ guard so spawned workers (the macOS default)
    # don't re-read the pickles when they import this module.
    tweets = pd.read_pickle(ENG_TWEETS_FILE)
    eng_tweets = tweets[tweets['lang'] == 'en']
    non_eng_tweets = pd.read_pickle(NON_ENG_TWEETS_FILE)
    tweets = pd.concat([eng_tweets, non_eng_tweets], axis=0).reset_index(drop=True)

    # At least one row per chunk, even for tiny inputs.
    CHUNKSIZE = max(1, len(tweets) // NUM_PROCESSES)

    # One pool for the three independent passes; tqdm around imap shows progress.
    with mp.Pool(processes=NUM_PROCESSES) as pool:
        tweets['clean_tweet'] = list(tqdm(pool.imap(
            clean_tweet, tweets['text'], chunksize=CHUNKSIZE), total=len(tweets)))
        tweets['sentiment_vader'] = list(tqdm(pool.imap(
            analyze_sentiment_vader, tweets['clean_tweet'], chunksize=CHUNKSIZE), total=len(tweets)))
        tweets['emotions'] = list(tqdm(pool.imap(
            analyze_emotions, tweets['clean_tweet'], chunksize=CHUNKSIZE), total=len(tweets)))

    # Expand each per-tweet emotion dict into one column per emotion.
    emotion_columns = ['Happy', 'Angry', 'Surprise', 'Sad', 'Fear']
    tweets[emotion_columns] = tweets['emotions'].apply(pd.Series)

    tweets.to_pickle(OUTPUT_FILE)
    print("Preprocessing complete!")