#!/usr/bin/env python
# coding: utf-8
# Analyze IMDB movie reviews and build a sentiment classification model
# (Naive Bayes with add-one smoothing).
#Author: Akshay Mattoo
import pandas as pd
import os
train_data_pos = []
abs_path = "/Users/akshaymattoo/Desktop/movie_rating_pred/aclImdb/train/pos/"
for file in os.listdir(abs_path):
    curr_path = abs_path + file
    with open(curr_path, 'r') as fd:
        data = fd.read()
        data = data.lower()
        train_data_pos.append(data)

train_data_neg = []
abs_path = "/Users/akshaymattoo/Desktop/movie_rating_pred/aclImdb/train/neg/"
for file in os.listdir(abs_path):
    curr_path = abs_path + file
    with open(curr_path, 'r') as fd:
        data = fd.read()
        data = data.lower()
        train_data_neg.append(data)
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
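# Assumption: the NLTK resources used below ('stopwords', 'wordnet') may need a
# one-time download, e.g. nltk.download('stopwords') and nltk.download('wordnet').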
tokenizer = RegexpTokenizer("[a-z1-9]+")  # note: this pattern matches the digits 1-9 but not 0
english_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Keep negation-style words that carry sentiment out of the stopword set.
wanted_stopwords = ["not", "wasn't", "won", "lose", "lost", "don't", "down", "any", "isn't", "yourself", "won't", "didn't", "should've", "should", "did", "doesn't", "our"]
for word in wanted_stopwords:
    if word in english_stopwords:
        english_stopwords.remove(word)
def cleanData(data):
    # Strip the HTML line breaks that the IMDB reviews contain.
    data = data.replace("<br /><br />", " ")
    # Tokenize the data, drop stopwords, then stem and lemmatize.
    tokens = tokenizer.tokenize(data)
    cleaned_tokens = [token for token in tokens if token not in english_stopwords]
    stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]
    lem_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]
    return lem_tokens
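# Illustrative check of the pipeline on a made-up, already lower-cased review
# (exact output depends on the installed NLTK data; contractions are split by
# the tokenizer, so "wasn't" becomes "wasn"/"t", both of which remain stopwords):
#   cleanData("this movie wasn't good.<br /><br />i don't recommend it")
#   -> ['movi', 'good', 'recommend']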
cleaned_pos_data = [cleanData(data) for data in train_data_pos]
cleaned_neg_data = [cleanData(data) for data in train_data_neg]
# Build per-class word frequencies (raw counts; smoothing is applied later at
# classification time) and an overall vocabulary size. The vocabulary counter is
# shared across both loops, so a word appearing in both classes contributes twice.
vocab_size = 0
pos_word_freq = {}
pos_word_count = 0
for review in cleaned_pos_data:
    for word in review:
        pos_word_count += 1
        if word not in pos_word_freq:
            pos_word_freq[word] = 0
            vocab_size += 1
        pos_word_freq[word] += 1

neg_word_freq = {}
neg_word_count = 0
for review in cleaned_neg_data:
    for word in review:
        neg_word_count += 1
        if word not in neg_word_freq:
            neg_word_freq[word] = 0
            vocab_size += 1
        neg_word_freq[word] += 1
# Class priors from the training split; the aclImdb train set is balanced
# (12,500 reviews per class), so each prior is 0.5.
pos_count = len(cleaned_pos_data)
neg_count = len(cleaned_neg_data)
total_count = pos_count + neg_count
prior_prob_pos = pos_count / total_count
prior_prob_neg = neg_count / total_count
test_data_pos = []
abs_path = "/Users/akshaymattoo/Desktop/movie_rating_pred/aclImdb/test/pos/"
for file in os.listdir(abs_path):
    curr_path = abs_path + file
    with open(curr_path, 'r') as fd:
        data = fd.read()
        data = data.lower()
        test_data_pos.append(data)

test_data_neg = []
abs_path = "/Users/akshaymattoo/Desktop/movie_rating_pred/aclImdb/test/neg/"
for file in os.listdir(abs_path):
    curr_path = abs_path + file
    with open(curr_path, 'r') as fd:
        data = fd.read()
        data = data.lower()
        test_data_neg.append(data)
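# The four directory-reading loops above are identical apart from the path; a
# small helper like this sketch (hypothetical, not used by the rest of the
# script) would remove the duplication:
def load_reviews(dir_path):
    """Return the lower-cased text of every review file in dir_path."""
    reviews = []
    for name in os.listdir(dir_path):
        with open(os.path.join(dir_path, name), 'r') as fd:
            reviews.append(fd.read().lower())
    return reviews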
cleaned_test_pos = [cleanData(data) for data in test_data_pos]
cleaned_test_neg = [cleanData(data) for data in test_data_neg]
# Classify the positive test reviews: count a prediction as correct when the
# (unnormalised) positive posterior beats the negative one. Each per-word
# likelihood uses add-one smoothing:
#   P(word | class) = (count(word, class) + 1) / (words in class + vocab_size)
correct_pred_count = 0
for review in cleaned_test_pos:
    pos_prob = 1
    neg_prob = 1
    for word in review:
        pos_prob *= (pos_word_freq.get(word, 0) + 1) / (pos_word_count + vocab_size)
        neg_prob *= (neg_word_freq.get(word, 0) + 1) / (neg_word_count + vocab_size)
    pos_prob *= prior_prob_pos
    neg_prob *= prior_prob_neg
    if pos_prob > neg_prob:
        correct_pred_count += 1
# Classify the negative test reviews with the same decision rule.
for review in cleaned_test_neg:
    pos_prob = 1
    neg_prob = 1
    for word in review:
        pos_prob *= (pos_word_freq.get(word, 0) + 1) / (pos_word_count + vocab_size)
        neg_prob *= (neg_word_freq.get(word, 0) + 1) / (neg_word_count + vocab_size)
    pos_prob *= prior_prob_pos
    neg_prob *= prior_prob_neg
    if neg_prob > pos_prob:
        correct_pred_count += 1
# Accuracy over the full test split (correct predictions / total test reviews).
accuracy = correct_pred_count / (len(cleaned_test_pos) + len(cleaned_test_neg))
print(accuracy)
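# Multiplying thousands of per-word likelihoods underflows 64-bit floats to 0.0
# for long reviews, which can make the comparison above uninformative. Below is a
# minimal sketch of the same decision rule in log space (an alternative, not part
# of the original approach; log_posterior is a hypothetical helper name):
import math

def log_posterior(review, word_freq, word_count, vocab, prior):
    # Sum of log add-one-smoothed likelihoods plus the log prior.
    score = math.log(prior)
    for word in review:
        score += math.log((word_freq.get(word, 0) + 1) / (word_count + vocab))
    return score

# Usage example: classify the first cleaned positive test review in log space.
# sample = cleaned_test_pos[0]
# predicted_positive = (
#     log_posterior(sample, pos_word_freq, pos_word_count, vocab_size, prior_prob_pos)
#     > log_posterior(sample, neg_word_freq, neg_word_count, vocab_size, prior_prob_neg)
# )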