-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathproject_main.py
432 lines (320 loc) · 13.1 KB
/
project_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
import nltk
from nltk.stem.porter import *
import re
import pickle
import pandas as pd
import random
import sys
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from instance import Instance
from weighting import voting
#from bayes import BayesModel
from proximity import ProximityModel
from attempt import LSTM
from voting_classifier import VotingModel
from confusion_matrix import ConfusionMatrix
# TODO Run tests on multiple voting systems at once
# TODO First run project_main.py, then tests.py
def main(tperc, seed, fpaths, weighting_type):
    """Run the full pipeline: parse, train, predict, weight and evaluate.

    Parses the input files, splits the instances into a training set and two
    test sets, trains the models, and stores every model's predictions and
    confusion matrix for the first test set. The predictions for the second
    test set are then combined by voting(), using the first-set confusion
    matrices, and the combined guesses are stored.

    tperc: fraction of the instances that goes into the training set
    seed: seed used by splitSets for shuffling
    fpaths: paths of the labeled csv files to read
    weighting_type: name of the weighting scheme passed on to voting()
    """
    files = openFiles(fpaths)
    instances = parseFiles(files)
    train_set, test_set1, test_set2 = splitSets(tperc, seed, instances)

    # Initialize all models (the Bayes model is currently disabled)
    p = ProximityModel()
    v = VotingModel()
    r = LSTM()
    print("Initialized all models!")

    # Train all models
    # NOTE(review): the LSTM's train() call is disabled here - presumably it
    # does not need training at this point; confirm before re-enabling.
    p.train(train_set)
    v.train(train_set)
    print("Trained all models!")

    # Run models and store first set of results
    p_pred = p.batchTest(test_set1)
    v_pred = v.batchTest(test_set1)
    r_pred = r.batchTest(test_set1)
    print("Predictions made for first test set!")

    # Store first set of predictions
    preds1 = [p_pred, v_pred, r_pred]
    test_set1_labels = [i.getLabel() for i in test_set1]
    store_preds(preds1, test_set1_labels, 1)
    print("Stored predictions for first test set!")

    # Get confusion matrices for first set of results
    p_cm = ConfusionMatrix(test_set1_labels, p_pred, "Proximity")
    v_cm = ConfusionMatrix(test_set1_labels, v_pred, "Voting")
    r_cm = ConfusionMatrix(test_set1_labels, r_pred, "LSTM")
    confusionMatrices = [p_cm, v_cm, r_cm]

    # Save individual confusion matrices to files
    for cm in confusionMatrices:
        cm.store_cm()
    print("Individual confusion matrices created and stored!")

    # Second set of predictions
    p_pred2 = p.batchTest(test_set2)
    v_pred2 = v.batchTest(test_set2)
    r_pred2 = r.batchTest(test_set2)
    print("Predictions made for second test set!")

    # Store second set of predictions
    preds2 = [p_pred2, v_pred2, r_pred2]
    test_set2_labels = [i.getLabel() for i in test_set2]
    store_preds(preds2, test_set2_labels, 2)
    print("Stored predictions for second test set!")

    # Weight second set of results, using confusion matrices from first set.
    # Every pair needs its own trailing comma: without it Python reads the
    # next bracket as a subscript of the previous pair and raises TypeError.
    weightingInput = [
        [confusionMatrices[0], p_pred2],
        [confusionMatrices[1], v_pred2],
        [confusionMatrices[2], r_pred2],
    ]

    # Get the weighting results
    guesses = voting(weightingInput, weighting_type)
    print("Voting done!")

    # Create confusion matrix for final model and store it in a file
    final_cm = ConfusionMatrix(test_set2_labels, guesses, "Final_Model_" + weighting_type)
    final_cm.store_cm()
    print("Stored confusion matrix!")

    # Store second set of tweets and guesses
    test_set2_tweets = [t.getFullTweet() for t in test_set2]
    store_new_labels(test_set2_tweets, guesses, test_set2_labels)
    print("Stored new predictions!")
def alternative_main(tperc, seed, weighting_type, fpaths):
    """Variant of main() that only stores the predictions for both test sets.

    Parses the files, trains the models and writes each model's predictions
    (plus the actual labels) for test set 1 and test set 2 via store_preds().
    No confusion matrices are built and no voting is done here;
    weighting_type is accepted but not used.
    """
    instances = parseFiles(openFiles(fpaths))
    train_set, test_set1, test_set2 = splitSets(tperc, seed, instances)

    # Build the classifiers (the Bayes model is currently disabled)
    proximity = ProximityModel()
    voter = VotingModel()
    lstm = LSTM()
    print("Initialized all models!")

    # Fit the classifiers; the LSTM's train() call is disabled here
    proximity.train(train_set)
    voter.train(train_set)
    print("Trained all models!")

    classifiers = (proximity, voter, lstm)

    # First test set: predict, then persist predictions and labels
    first_preds = [clf.batchTest(test_set1) for clf in classifiers]
    print("Predictions made for first test set!")
    first_labels = [inst.getLabel() for inst in test_set1]
    store_preds(first_preds, first_labels, 1)
    print("Stored predictions for first test set!")

    # Second test set: predict, then persist predictions and labels
    second_preds = [clf.batchTest(test_set2) for clf in classifiers]
    print("Predictions made for second test set!")
    second_labels = [inst.getLabel() for inst in test_set2]
    store_preds(second_preds, second_labels, 2)
    print("Stored predictions for second test set!")
def run_multiple_voting():
    """Compare several weighting schemes on previously stored predictions.

    Meant to be run after alternative_main(), which writes the prediction
    files that load_preds() reads. Confusion matrices are built from the
    test-set-1 predictions of the Proximity and Voting classifiers and used
    to weight their test-set-2 predictions under each scheme. A
    classification report is printed and a confusion matrix stored for every
    scheme.

    Returns the votes as a tuple ordered
    (Precision, CEN_Precision, CEN, Equal_Vote).
    """
    # Stored predictions and actual labels for both test sets
    first_preds, first_actual = load_preds(1)
    second_preds, second_actual = load_preds(2)

    # Per-classifier confusion matrices from test set 1
    # (only the first two prediction lists - Proximity and Voting - are used)
    base_matrices = [
        ConfusionMatrix(first_actual, first_preds[0], "Proximity"),
        ConfusionMatrix(first_actual, first_preds[1], "Voting"),
    ]
    for matrix in base_matrices:
        matrix.store_cm()
    print("Individual confusion matrices created and stored!")

    # Pair every confusion matrix with that classifier's test-set-2 predictions
    weighting_input = [
        [base_matrices[0], second_preds[0]],
        [base_matrices[1], second_preds[1]],
    ]

    # Vote once per weighting scheme
    schemes = ("Precision", "CEN_Precision", "CEN", "Equal_Vote")
    all_votes = [voting(weighting_input, scheme) for scheme in schemes]

    # Check metrics
    for votes in all_votes:
        print(classification_report(second_actual, votes))

    # Build the final confusion matrices first, then store them all
    matrix_names = ("Precision", "CEN_Precision", "CEN", "Equal")
    final_matrices = [
        ConfusionMatrix(second_actual, votes, name)
        for votes, name in zip(all_votes, matrix_names)
    ]
    for matrix in final_matrices:
        matrix.store_cm()

    return tuple(all_votes)
def store_new_labels(t2tweets, guesses, labels):
    """Write the tested tweets with their predicted and actual labels to a csv.

    The file "FinalModel_Predictions.csv" is (re)created in the current
    working directory with the header row tweet, class, label, followed by
    one row per tweet.

    t2tweets: the tweets that were tested
    guesses: predicted label for each tweet (written to the "class" column)
    labels: actual label for each tweet (written to the "label" column)
    """
    # newline="" lets the csv module control line endings; without it, extra
    # blank rows appear on platforms that translate "\n" on write.
    with open("FinalModel_Predictions.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["tweet", "class", "label"])
        writer.writerows(zip(t2tweets, guesses, labels))
def store_preds(preds, actual, num_test):
    """Pickle each classifier's predictions and the actual labels to files.

    The files are written to the current working directory and are read
    back by load_preds().

    preds: list of prediction lists, ordered Proximity, Voting, LSTM
        (the Bayes model is currently disabled)
    actual: the actual labels of the test set
    num_test: number of test_set, used as suffix of the file names

    Raises ValueError when more prediction lists are given than there are
    prediction files.
    """
    suffix = "_" + str(num_test) + ".txt"
    pred_prefixes = ["proximity_preds", "voting_preds", "lstm_preds"]

    # Previously an extra prediction list would have been written into the
    # labels file, silently corrupting it; reject that case up front.
    if len(preds) > len(pred_prefixes):
        raise ValueError(
            "Expected at most " + str(len(pred_prefixes))
            + " prediction lists, got " + str(len(preds))
        )

    # Every file is (re)created even when it has no prediction list, so stale
    # data from an earlier run is never left behind. The with-blocks close
    # (and flush) each file before the function returns.
    for index, prefix in enumerate(pred_prefixes):
        with open(prefix + suffix, "wb") as f:
            if index < len(preds):
                pickle.dump(preds[index], f)

    with open("actual_labels" + suffix, "wb") as f:
        pickle.dump(actual, f)
def load_preds(num_test):
    """Load the pickled predictions and actual labels written by store_preds.

    num_test: number of test_set, used as suffix of the file names

    Returns (preds, actual): preds is a list of prediction lists ordered
    Proximity, Voting, LSTM (the Bayes model is currently disabled) and
    actual is the stored list of actual labels.
    """
    suffix = "_" + str(num_test) + ".txt"

    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were produced by store_preds on a trusted machine.
    preds = []
    for prefix in ("proximity_preds", "voting_preds", "lstm_preds"):
        # with-blocks make sure each file handle is closed after reading
        with open(prefix + suffix, "rb") as f:
            preds.append(pickle.load(f))

    with open("actual_labels" + suffix, "rb") as f:
        actual = pickle.load(f)

    return preds, actual
def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) lots of whitespace with one instance
    2) urls with URLHERE
    3) mentions with MENTIONHERE
    (applied in that order)

    This allows us to get standardized counts of urls and mentions
    without caring about specific people mentioned.

    Returns the processed string.
    """
    # Raw strings keep the backslashes for the regex engine and avoid
    # invalid-escape-sequence warnings; the patterns themselves are unchanged.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    return parsed_text
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    stemmer = PorterStemmer()
    # Split on runs of one or more non-letters. The previous "*" quantifier
    # can match the empty string, which makes re.split cut between every
    # single character on Python 3.7+.
    words = " ".join(re.split("[^a-zA-Z]+", tweet.lower())).strip()
    return [stemmer.stem(word) for word in words.split()]
def basic_tokenize(tweet):
    """Same as tokenize but without the stemming.

    Lowercases the tweet and splits it on anything that is not a letter or
    one of the punctuation marks . , ! ? (which stay attached to the words).
    Returns a list of tokens.
    """
    # Split on runs of one or more separator characters. The previous "*"
    # quantifier can match the empty string, which makes re.split cut
    # between every single character on Python 3.7+.
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()
def main_parser(f):
    """
    @input file
    @output list of instance objects

    Reads a csv file into a pandas dataframe. The file needs a header row
    naming (at least) the columns `tweet` and `class`, as labeled_data.csv
    has, so both can be referenced by name.

    One Instance is created per row, holding the integer label, the full
    tweet, the preprocessed tweet and its list of (unstemmed) tokens.
    """
    frame = pd.read_csv(f)

    parsed = []
    for raw_text, tweet_class in zip(frame.tweet, frame['class'].astype(int)):
        inst = Instance()

        # Untouched input
        inst.label = tweet_class
        inst.fulltweet = raw_text

        # Normalized whitespace, urls and mentions
        cleaned = preprocess(raw_text)
        inst.clean_tweet = cleaned

        # Tokens without stemming
        inst.wordlist = basic_tokenize(cleaned)

        parsed.append(inst)
    return parsed
def splitSets(tperc, seed, instances):
    """Shuffle the instances and cut them into a training and two test sets.

    The list is shuffled in place after seeding `random` with `seed`. The
    first `tperc` share of it becomes the training set and the remainder is
    cut in half to form the two test sets.

    Returns (train_set, test_set1, test_set2).
    """
    random.seed(seed)
    random.shuffle(instances)

    total = len(instances)
    train_end = int(tperc * total)
    # Midpoint of whatever is left after the training slice
    first_test_end = int(train_end + (total - train_end) / 2)

    train_set = instances[:train_end]
    first_test = instances[train_end:first_test_end]
    second_test = instances[first_test_end:]
    return train_set, first_test, second_test
def parseFiles(files):
    """Parse every file with main_parser and return all instances in one list,
    in the order the files were given."""
    return [instance for f in files for instance in main_parser(f)]
def openFiles(filepaths):
    """Open every given path for reading and return the open file objects.

    A path that cannot be found is reported on stdout and skipped.
    Raises FileNotFoundError when none of the paths could be opened.
    """
    opened = []
    for path in filepaths:
        try:
            opened.append(open(path, 'r'))
        except FileNotFoundError:
            print("Readin: NonFatal: file " + str(path) + " not found.")

    # Nothing usable at all is fatal
    if not opened:
        raise FileNotFoundError("Readin: Fatal: No Files Found")
    return opened
if __name__ == "__main__":
    # Usage: project_main.py <tperc> <seed> <weighting_type> <file> [<file> ...]
    # Fewer than four arguments cannot fill tperc, seed, weighting type
    # and at least one file path.
    if len(sys.argv) < 5:
        raise IndexError("Readin: Fatal: Not enough given arguments.")

    tperc = float(sys.argv[1])
    seed = int(sys.argv[2])
    weighting_type = sys.argv[3]
    fpaths = sys.argv[4:]

    # Only run with a known weighting type
    if weighting_type in ["Precision", "CEN_Precision", "CEN", "Equal_Vote"]:
        # Alternative entry points: main(tperc, seed, fpaths, weighting_type)
        # runs the full pipeline, run_multiple_voting() compares the
        # weighting schemes on stored predictions.
        alternative_main(tperc, seed, weighting_type, fpaths)
    else:
        print("Please give a correct weighting option. Choose from the following: "
              "\nPrecision, CEN_Precision, CEN, Equal_Vote")