"""
COMP9417
Assignment: Fake news Challenge
Authors: Shashank Reddy Boosi(z5222766), Connor McLeod (z5058240), Leonard Lee(z5173917), Darren Zhang (z5113901)
main.py: Main file for program execution
"""
from src.data_import import FakeNewsData
from src.train_validation_split import DataSplit
from src.preprocess import Preprocess
from src.feature_extraction import Features
from src.models import Models
from src.score import LABELS
from src.utils import input_file, output_file
import scipy.sparse as sp
import os
import time
# Global Variables
trainStancePath = "data/train_stances.csv"
testStancePath = "data/competition_test_stances.csv"
trainBodyPath = "data/train_bodies.csv"
testBodyPath = "data/competition_test_bodies.csv"
# header attributes
primary_id = "Body ID"
stance = "Stance"
body = "articleBody"
headline = "Headline"
base_preprocess_path = "preprocessed_data"
base_feature_path = "final_features"
output = "output"
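
# LABELS (imported from src.score) is assumed to be the FNC-1 stance list,
# e.g. ["agree", "disagree", "discuss", "unrelated"]; LABELS.index(...) below
# turns each stance string into its numeric class label.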
def target_labels(stances):
    """Map each stance instance's "Stance" string to its numeric index in LABELS."""
    return [LABELS.index(instance[stance]) for instance in stances]
def headlines_bodies(temp_headline, temp_body):
    """Pair each headline with the article body its Body ID points to."""
    headlines = []
    bodies = []
    for instance in temp_headline:
        bodies.append(temp_body[int(instance[primary_id])])
        headlines.append(instance[headline])
    return headlines, bodies
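
# For example, a stance row like {"Headline": "...", "Body ID": "712", "Stance": "agree"}
# (values hypothetical) is paired with temp_body[712]; the int() cast suggests the
# body dict is keyed by integer Body ID even though the CSV stores IDs as text.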
'''
This file combines all of the data mining stages, from data import and
train/validation splitting through preprocessing, feature extraction,
modelling and visualization. See the README for a full walkthrough.
'''

if __name__ == "__main__":
    t0 = time.time()

    # Import the data
    train = FakeNewsData(trainStancePath, trainBodyPath)
    test = FakeNewsData(testStancePath, testBodyPath)

    # Extract the article IDs used for data splitting
    ids = list(train.articleBody.keys())

    # DataSplit generates the train and validation splits according to our split size
    print("Data Splitting")
    train_validation_split = DataSplit(ids=ids, headline=train.headlineInstances, split_size=0.8)
    train_stances, validation_stances = train_validation_split.split()

    # Preprocess the train split
    print("Start of pre-processing for train")
    if not (os.path.exists(base_preprocess_path + "/" + "training_headlines.p")
            and os.path.exists(base_preprocess_path + "/" + "training_bodies.p")):
        preprocessed_train_data = Preprocess(headline=train_stances, body=train.articleBody,
                                             preprocess_type="lemma")
        train_preprocessed_headlines, train_preprocessed_bodies = preprocessed_train_data.get_clean_headlines_and_bodies()
        output_file(train_preprocessed_headlines, base_preprocess_path + "/" + "training_headlines.p")
        output_file(train_preprocessed_bodies, base_preprocess_path + "/" + "training_bodies.p")
    else:
        train_preprocessed_headlines = input_file(base_preprocess_path + "/" + "training_headlines.p")
        train_preprocessed_bodies = input_file(base_preprocess_path + "/" + "training_bodies.p")
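
    # input_file/output_file (from src.utils) are assumed to pickle/unpickle Python
    # objects (the .p extension suggests pickle), so each split is preprocessed once
    # and cached on disk; the same cache-or-compute pattern repeats below.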

    # Preprocess the validation split
    print("Start of pre-processing for validation")
    if not (os.path.exists(base_preprocess_path + "/" + "validation_headlines.p")
            and os.path.exists(base_preprocess_path + "/" + "validation_bodies.p")):
        preprocessed_validation_data = Preprocess(headline=validation_stances, body=train.articleBody,
                                                  preprocess_type="lemma")
        validation_preprocessed_headlines, validation_preprocessed_bodies = preprocessed_validation_data.get_clean_headlines_and_bodies()
        output_file(validation_preprocessed_headlines, base_preprocess_path + "/" + "validation_headlines.p")
        output_file(validation_preprocessed_bodies, base_preprocess_path + "/" + "validation_bodies.p")
    else:
        validation_preprocessed_headlines = input_file(base_preprocess_path + "/" + "validation_headlines.p")
        validation_preprocessed_bodies = input_file(base_preprocess_path + "/" + "validation_bodies.p")

    # Preprocess the test split
    print("Start of pre-processing for test")
    if not (os.path.exists(base_preprocess_path + "/" + "test_headlines.p")
            and os.path.exists(base_preprocess_path + "/" + "test_bodies.p")):
        preprocessed_test_data = Preprocess(headline=test.headlineInstances, body=test.articleBody,
                                            preprocess_type="lemma")
        test_preprocessed_headlines, test_preprocessed_bodies = preprocessed_test_data.get_clean_headlines_and_bodies()
        output_file(test_preprocessed_headlines, base_preprocess_path + "/" + "test_headlines.p")
        output_file(test_preprocessed_bodies, base_preprocess_path + "/" + "test_bodies.p")
    else:
        test_preprocessed_headlines = input_file(base_preprocess_path + "/" + "test_headlines.p")
        test_preprocessed_bodies = input_file(base_preprocess_path + "/" + "test_bodies.p")

    # Split headlines and bodies for train, validation and test
    train_headlines, train_bodies = headlines_bodies(train_stances, train.articleBody)
    validation_headlines, validation_bodies = headlines_bodies(validation_stances, train.articleBody)
    test_headlines, test_bodies = headlines_bodies(test.headlineInstances, test.articleBody)

    if not (os.path.exists(base_feature_path + "/" + "train_features.p")
            and os.path.exists(base_feature_path + "/" + "validation_features.p")
            and os.path.exists(base_feature_path + "/" + "test_features.p")):
        # Extract the features and combine them for the models
        print("Feature extraction for train")
        train_features = Features(train_preprocessed_headlines, train_preprocessed_bodies,
                                  train_headlines, train_bodies)
        # TF-IDF weight extraction
        train_tfidf_weights, validation_tfidf_weights, test_tfidf_weights = train_features.tfidf_extraction(
            validation_headlines, validation_bodies, test_headlines, test_bodies)
        # Sentence weighting for train
        train_sentence_weights = train_features.sentence_weighting()

        print("Feature extraction for validation")
        validation_features = Features(validation_preprocessed_headlines, validation_preprocessed_bodies,
                                       validation_headlines, validation_bodies)
        # Sentence weighting for validation
        validation_sentence_weights = validation_features.sentence_weighting()

        print("Feature extraction for test")
        test_features = Features(test_preprocessed_headlines, test_preprocessed_bodies,
                                 test_headlines, test_bodies)
        # Sentence weighting for test
        test_sentence_weights = test_features.sentence_weighting()

        # Combine the features to prepare them as input for the models
        final_train_features = sp.bmat([[train_tfidf_weights, train_sentence_weights.T]]).A
        output_file(final_train_features, base_feature_path + "/" + "train_features.p")
        final_validation_features = sp.bmat([[validation_tfidf_weights, validation_sentence_weights.T]]).A
        output_file(final_validation_features, base_feature_path + "/" + "validation_features.p")
        final_test_features = sp.bmat([[test_tfidf_weights, test_sentence_weights.T]]).A
        output_file(final_test_features, base_feature_path + "/" + "test_features.p")
    else:
        print("Loading cached features")
        final_train_features = input_file(base_feature_path + "/" + "train_features.p")
        final_validation_features = input_file(base_feature_path + "/" + "validation_features.p")
        final_test_features = input_file(base_feature_path + "/" + "test_features.p")
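
    # At this point each final_*_features is dense: sp.bmat horizontally stacks the
    # sparse TF-IDF block with the transposed sentence weights, and .A converts the
    # sparse result into a dense ndarray for the models.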
    t1 = time.time()
    print("Time for feature extraction is:", t1 - t0)

    # Target variables
    train_target_labels = target_labels(train_stances)
    validation_target_labels = target_labels(validation_stances)
    test_target_labels = target_labels(test.headlineInstances)

    # Model the features
    print("Start of Modelling")
    models = Models(final_train_features, final_validation_features, final_test_features,
                    train_target_labels, validation_target_labels, test_target_labels)
    # Run the four models
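    # From their names, get_lr, get_dt, get_nb and get_rf are assumed to fit and
    # evaluate logistic regression, decision tree, naive Bayes and random forest
    # classifiers respectively (see src/models.py).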
    models.get_lr()
    models.get_dt()
    models.get_nb()
    models.get_rf()

    '''
    Use read_from_csv in utils to obtain the actual and predicted labels and to
    produce the correctness visualization graphs for the report.
    '''
    t2 = time.time()
    print("Total time is:", t2 - t0)
    print("\nEnd of run\n")