-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
60 lines (45 loc) · 1.82 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from pkg.data_processing import load_and_preprocess_data, balance_dataset, preprocess_text
from pkg.model import tokenize_and_pad, build_model
# Report the TensorFlow build in use and how many GPUs it can see.
print(f"TensorFlow version: {tf.__version__}")
_gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_gpu_devices))
def main():
    """Run the full pipeline: load reviews, train the model, report test accuracy.

    Steps, in order:
      1. Seed NumPy and TensorFlow.
      2. Load ``data/reviews_Automotive_5.json.gz`` from the directory that
         contains this script, then balance and text-preprocess it using the
         helpers from ``pkg.data_processing``.
      3. Tokenize/pad the ``reviewText`` column; the ``positive`` column is
         used as the target.
      4. Split into train / validation / test sets.
      5. Fit the model with early stopping on validation accuracy.
      6. Evaluate once on the held-out test set and print the accuracy.

    Returns:
        None. If the data file does not exist, an error message is printed
        and the function returns early without training.
    """
    # Set random seeds for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)

    # Load and preprocess data
    script_dir = os.path.dirname(os.path.abspath(__file__))
    dataframe_path = os.path.join(script_dir, 'data', 'reviews_Automotive_5.json.gz')
    if not os.path.exists(dataframe_path):
        print(f"Error: File not found at {dataframe_path}")
        return
    review_df = load_and_preprocess_data(dataframe_path)

    # Balance dataset
    sample_df = balance_dataset(review_df)
    sample_df = preprocess_text(sample_df)

    # Tokenize and pad sequences
    X, tokenizer = tokenize_and_pad(sample_df['reviewText'])
    y = sample_df['positive']

    # Split data: first hold out the test set (same 80/20 split as before) ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # ... then carve a validation set out of the training portion. Early
    # stopping selects the stopping epoch from validation metrics, so it must
    # not see the test set, otherwise the reported test accuracy is
    # optimistically biased.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42
    )

    # Build and train model
    max_features = 2000
    embed_dim = 64
    lstm_out = 16
    model = build_model(max_features, embed_dim, lstm_out, X.shape[1])
    # summary() prints the table itself and returns None; wrapping it in
    # print() would emit a stray "None" line.
    model.summary()

    batch_size = 32
    early_stopping = EarlyStopping(
        monitor='val_accuracy', min_delta=0.001, patience=2, verbose=2
    )
    model.fit(
        X_fit, y_fit,
        epochs=10,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
    )

    # Evaluate model on data that played no part in training or early stopping
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_accuracy:.4f}")
# Run the training pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()