From b5629d55f51ba9bbd4728fcec944baf61c837162 Mon Sep 17 00:00:00 2001
From: PedroSeber
Date: Mon, 27 Nov 2023 01:14:29 -0500
Subject: [PATCH] Removed the .ipynb file to streamline the code

---
 ANN_train.ipynb | 627 ------------------------------------------------
 1 file changed, 627 deletions(-)
 delete mode 100644 ANN_train.ipynb

diff --git a/ANN_train.ipynb b/ANN_train.ipynb
deleted file mode 100644
index 4ddbbc3..0000000
--- a/ANN_train.ipynb
+++ /dev/null
@@ -1,627 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "ca3663b3",
-   "metadata": {},
-   "source": [
-    "## Parameters that may be changed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0b890d1c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "activ_fun_list = ['tanhshrink'] # A list of strings representing the activation functions used during cross-validation.\n",
-    "                                # Check the class SequenceMLP to see what functions are available\n",
-    "lstm_size = 75 # Size of the LSTM layer. Set to 0 to use only an MLP\n",
-    "weight_hyperparam = [1, 30] # The weight used in the loss function for positive-class predictions. Should be [1, number > 1]\n",
-    "data_version = 'v5' # The version of the data to be used. Should be of the form v#. Should be left as v5\n",
-    "window_size = 20 # The number of AAs before and after the central S/T. Used only when data_version == 'v5'\n",
-    "batch_size = 32 # The batch size used during cross-validation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4ae8a6c0",
-   "metadata": {},
-   "source": [
-    "## Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d3d5c783",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# General & data manipulation imports\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from os import mkdir\n",
-    "from os.path import isdir, isfile\n",
-    "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
-    "from sklearn import preprocessing\n",
-    "# Torch & model creation imports\n",
-    "import torch\n",
-    "from torch.utils.data import Dataset, DataLoader\n",
-    "from collections import OrderedDict\n",
-    "# Training & validation imports\n",
-    "from itertools import product\n",
-    "# Results & visualization imports\n",
-    "import matplotlib.pyplot as plt\n",
-    "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
-    "# Convenience imports\n",
-    "from time import time"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cbd7959b",
-   "metadata": {},
-   "source": [
-    "## Data Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a1520f1f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if data_version in {'v1', 'v2'}:\n",
-    "    window_size_path = ''\n",
-    "    data = torch.Tensor(pd.read_csv(f'OH_data_{data_version}.csv').values) # X and y values\n",
-    "    myshape_X = data.shape[1] - 1 # For convenience when declaring ANNs; data must be loaded first\n",
-    "elif data_version in {'v3', 'v4'}:\n",
-    "    window_size_path = ''\n",
-    "    myshape_X = 76 # Manually declared; 76 because the v1 dataset had 76 features\n",
-    "    data = torch.Tensor(pd.read_csv(f'OH_data_{data_version}.csv').values) # y values\n",
-    "else:\n",
-    "    window_size_path = f'_{window_size}-window'\n",
-    "    myshape_X = 75 # Rounded the 76 to 75\n",
-    "    data = torch.Tensor(pd.read_csv(f'OH_data_{data_version}_5-window.csv').values) # y values\n",
-    "# Loading and transforming the data if using an LSTM\n",
-    "if lstm_size:\n",
-    "    lstm_data = torch.Tensor(np.load(f'OH_LSTM_data_{data_version}{window_size_path}.npy'))\n",
-    "# Pre-declaring paths for convenience (to save / load results)\n",
-    "if lstm_size:\n",
-    "    working_dir = f'RNN_{lstm_size}_results_{data_version}-data{window_size_path}'\n",
-    "else:\n",
-    "    working_dir = f'ANN_results_{data_version}-data'\n",
-    "if not isdir(working_dir):\n",
-    "    mkdir(working_dir)\n",
-    "\n",
-    "# Setting each activ_fun to lowercase for consistency\n",
-    "activ_fun_list = [activ_fun.casefold() for activ_fun in activ_fun_list]\n",
-    "\n",
-    "# Data splitting - 80% Cross Validation, 20% Test\n",
-    "if lstm_size:\n",
-    "    cv_data, test_data, cv_lstm_data, test_lstm_data = train_test_split(data, lstm_data, test_size = 0.2, random_state = 123)\n",
-    "else:\n",
-    "    cv_data, test_data = train_test_split(data, test_size = 0.2, random_state = 123)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e641d76f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class MyDataset(Dataset):\n",
-    "    def __init__(self, data, lstm_data = None):\n",
-    "        self.Xdata = data[:, :-1]\n",
-    "        self.ydata = data[:, -1].type(torch.LongTensor)\n",
-    "        self.lstm_data = lstm_data\n",
-    "    \n",
-    "    def __len__(self):\n",
-    "        return len(self.Xdata)\n",
-    "    \n",
-    "    def __getitem__(self, idx):\n",
-    "        if isinstance(self.lstm_data, torch.Tensor):\n",
-    "            return self.Xdata[idx], self.ydata[idx], self.lstm_data[idx]\n",
-    "        else:\n",
-    "            return self.Xdata[idx], self.ydata[idx]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a1f7142a",
-   "metadata": {},
-   "source": [
-    "## Model & Run Setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "763718b7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# MLP or LSTM+MLP model\n",
-    "class SequenceMLP(torch.nn.Module):\n",
-    "    def __init__(self, layers, activ_fun = 'relu', lstm_size = 0):\n",
-    "        super(SequenceMLP, self).__init__()\n",
-    "        # Setup to convert string (first notebook cell) to activation function\n",
-    "        if activ_fun == 'relu':\n",
-    "            torch_activ_fun = torch.nn.ReLU()\n",
-    "        elif activ_fun == 'tanh':\n",
-    "            torch_activ_fun = torch.nn.Tanh()\n",
-    "        elif activ_fun == 'sigmoid':\n",
-    "            torch_activ_fun = torch.nn.Sigmoid()\n",
-    "        elif activ_fun == 'tanhshrink':\n",
-    "            torch_activ_fun = torch.nn.Tanhshrink()\n",
-    "        elif activ_fun == 'selu':\n",
-    "            torch_activ_fun = torch.nn.SELU()\n",
-    "        #elif activ_fun == 'attention':\n",
-    "        #    torch_activ_fun = torch.nn.MultiheadAttention(myshape_X, 4)\n",
-    "        else:\n",
-    "            raise ValueError(f'Invalid activ_fun. You passed {activ_fun}')\n",
-    "\n",
-    "        # LSTM cell\n",
-    "        if lstm_size:\n",
-    "            self.lstm = torch.nn.LSTM(20, lstm_size, num_layers=1, batch_first=True, bidirectional=True)\n",
-    "\n",
-    "        # Transforming layers list into OrderedDict with layers + activation\n",
-    "        mylist = list()\n",
-    "        for idx, elem in enumerate(layers):\n",
-    "            mylist.append((f'Linear{idx}', torch.nn.Linear(layers[idx][0], layers[idx][1]) ))\n",
-    "            if idx < len(layers)-1:\n",
-    "                mylist.append((f'{activ_fun}{idx}', torch_activ_fun))\n",
-    "        # OrderedDict into NN\n",
-    "        self.model = torch.nn.Sequential(OrderedDict(mylist))\n",
-    "        self.sigmoid = torch.nn.Sigmoid()\n",
-    "    \n",
-    "    def forward(self, x, lstm_data = None):\n",
-    "        if hasattr(self, 'lstm'):\n",
-    "            _, (ht, _) = self.lstm(lstm_data) # Passing only the seq data through the LSTM\n",
-    "            to_MLP = (ht[0] + ht[1]) / 2 # Average between forward and backward\n",
-    "            out = self.model(to_MLP)\n",
-    "        else:\n",
-    "            out = self.model(x)\n",
-    "        probs = self.sigmoid(out)\n",
-    "        return probs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a346b2a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class CosineScheduler: # Code obtained from https://d2l.ai/chapter_optimization/lr-scheduler.html\n",
-    "    def __init__(self, max_update, base_lr=0.01, final_lr=0, warmup_steps=0, warmup_begin_lr=0):\n",
-    "        self.base_lr_orig = base_lr\n",
-    "        self.max_update = max_update\n",
-    "        self.final_lr = final_lr\n",
-    "        self.warmup_steps = warmup_steps\n",
-    "        self.warmup_begin_lr = warmup_begin_lr\n",
-    "        self.max_steps = self.max_update - self.warmup_steps\n",
-    "\n",
-    "    def get_warmup_lr(self, epoch):\n",
-    "        increase = (self.base_lr_orig - self.warmup_begin_lr) * float(epoch) / float(self.warmup_steps)\n",
-    "        return self.warmup_begin_lr + increase\n",
-    "\n",
-    "    def __call__(self, epoch):\n",
-    "        if epoch < self.warmup_steps:\n",
-    "            return self.get_warmup_lr(epoch)\n",
-    "        if epoch <= self.max_update:\n",
-    "            self.base_lr = self.final_lr + (\n",
-    "                self.base_lr_orig - self.final_lr) * (1 + np.cos(\n",
-    "                np.pi * (epoch - self.warmup_steps) / self.max_steps)) / 2\n",
-    "        return self.base_lr"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b914347",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# A helper function that is called every epoch of training or validation\n",
-    "def loop_model(model, optimizer, loader, loss_function, epoch, batch_size, lstm_size = None, evaluation = False):\n",
-    "    if evaluation:\n",
-    "        model.eval()\n",
-    "        val_pred = torch.empty((len(loader.dataset), 2))\n",
-    "        val_y = torch.empty((len(loader.dataset)), dtype = torch.long)\n",
-    "    else:\n",
-    "        model.train()\n",
-    "    batch_losses = []\n",
-    "    for idx, data in enumerate(loader):\n",
-    "        if lstm_size:\n",
-    "            X, y, lstm = data\n",
-    "            lstm = lstm.cuda()\n",
-    "        else:\n",
-    "            X, y = data\n",
-    "            lstm = None\n",
-    "        X = X.cuda()\n",
-    "        y = y.cuda()\n",
-    "        pred = model(X, lstm)\n",
-    "        loss = loss_function(pred, y)\n",
-    "        batch_losses.append(loss.item()) # Saving losses\n",
-    "        # Backpropagation\n",
-    "        if not evaluation:\n",
-    "            optimizer.zero_grad()\n",
-    "            loss.backward()\n",
-    "            optimizer.step()\n",
-    "        else:\n",
-    "            val_pred[idx*batch_size:(idx*batch_size)+len(pred), :] = pred.cpu().detach()\n",
-    "            val_y[idx*batch_size:(idx*batch_size)+len(y)] = y\n",
-    "    if evaluation: # Obtaining the validation F1 score\n",
-    "        val_pred_CM = val_pred.argmax(axis=1)\n",
-    "        CM = confusion_matrix(val_y, val_pred_CM) # Confusion matrix to make F1 calcs easier\n",
-    "        if CM[1,1]+CM[1,0] and CM[1,1]+CM[0,1]: # Avoids dividing by 0\n",
-    "            rec = CM[1,1]/(CM[1,1]+CM[1,0])\n",
-    "            pre = CM[1,1]/(CM[1,1]+CM[0,1])\n",
-    "        else:\n",
-    "            rec, pre = 0, 0\n",
-    "        if rec and pre: # Avoids dividing by 0 when calculating F1\n",
-    "            F1 = 2/(1/rec + 1/pre)\n",
-    "        else:\n",
-    "            F1 = 0\n",
-    "    return np.array(batch_losses).mean(), F1 if evaluation else None # F1 is only computed when evaluating"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8e71a1ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Setting up the hyperparameters\n",
-    "n_epochs = 70\n",
-    "layers = [\n",
-    "    # 1 hidden layer\n",
-    "    #[(myshape_X, myshape_X*12), (myshape_X*12, 2)],\n",
-    "    #[(myshape_X, myshape_X*11), (myshape_X*11, 2)],\n",
-    "    #[(myshape_X, myshape_X*10), (myshape_X*10, 2)],\n",
-    "    #[(myshape_X, myshape_X*9), (myshape_X*9, 2)],\n",
-    "    #[(myshape_X, myshape_X*8), (myshape_X*8, 2)],\n",
-    "    #[(myshape_X, myshape_X*7), (myshape_X*7, 2)],\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, 2)],\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, 2)],\n",
-    "    #[(myshape_X, myshape_X*4), (myshape_X*4, 2)],\n",
-    "    [(myshape_X, myshape_X*3), (myshape_X*3, 2)],\n",
-    "    [(myshape_X, myshape_X*2), (myshape_X*2, 2)],\n",
-    "    [(myshape_X, myshape_X), (myshape_X, 2)],\n",
-    "    [(myshape_X, myshape_X//2), (myshape_X//2, 2)],\n",
-    "    [(myshape_X, 20), (20, 2)],\n",
-    "    # 2 hidden layers\n",
-    "    #[(myshape_X, myshape_X*7), (myshape_X*7, myshape_X*7), (myshape_X*7, 2)], # 7-7\n",
-    "    #[(myshape_X, myshape_X*7), (myshape_X*7, myshape_X*6), (myshape_X*6, 2)], # 7-6\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*6), (myshape_X*6, 2)], # 6-6\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*5), (myshape_X*5, 2)], # 6-5\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*4), (myshape_X*4, 2)], # 6-4\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*3), (myshape_X*3, 2)], # 6-3\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*5), (myshape_X*5, 2)], # 5-5\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*4), (myshape_X*4, 2)], # 5-4\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*3), (myshape_X*3, 2)], # 5-3\n",
-    "    #[(myshape_X, myshape_X*4), (myshape_X*4, myshape_X*4), (myshape_X*4, 2)], # 4-4\n",
-    "    #[(myshape_X, myshape_X*4), (myshape_X*4, myshape_X*3), (myshape_X*3, 2)],\n",
-    "    #[(myshape_X, myshape_X*3), (myshape_X*3, myshape_X*3), (myshape_X*3, 2)],\n",
-    "    #[(myshape_X, myshape_X*3), (myshape_X*3, myshape_X*2), (myshape_X*2, 2)],\n",
-    "    #[(myshape_X, myshape_X*2), (myshape_X*2, myshape_X*2), (myshape_X*2, 2)],\n",
-    "    #[(myshape_X, myshape_X*2), (myshape_X*2, myshape_X), (myshape_X, 2)],\n",
-    "    #[(myshape_X, myshape_X), (myshape_X, myshape_X), (myshape_X, 2)],\n",
-    "    # 3 hidden layers\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*6), (myshape_X*6, myshape_X*5), (myshape_X*5, 2)], # 6-6-5\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*5), (myshape_X*5, myshape_X*5), (myshape_X*5, 2)], # 6-5-5\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*5), (myshape_X*5, myshape_X*4), (myshape_X*4, 2)], # 6-5-4\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*5), (myshape_X*5, myshape_X*3), (myshape_X*3, 2)], # 6-5-3\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*4), (myshape_X*4, myshape_X*4), (myshape_X*4, 2)], # 6-4-4\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*4), (myshape_X*4, myshape_X*3), (myshape_X*3, 2)], # 6-4-3\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*4), (myshape_X*4, myshape_X*2), (myshape_X*2, 2)], # 6-4-2\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*4), (myshape_X*4, myshape_X*1), (myshape_X*1, 2)], # 6-4-1\n",
-    "    #[(myshape_X, myshape_X*6), (myshape_X*6, myshape_X*3), (myshape_X*3, myshape_X*3), (myshape_X*3, 2)], # 6-3-3\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*4), (myshape_X*4, myshape_X*4), (myshape_X*4, 2)], # 5-4-4\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*4), (myshape_X*4, myshape_X*3), (myshape_X*3, 2)], # 5-4-3\n",
-    "    #[(myshape_X, myshape_X*5), (myshape_X*5, myshape_X*4), (myshape_X*4, myshape_X*2), (myshape_X*2, 2)], # 5-4-2\n",
-    "]\n",
-    "\n",
-    "#lr_vals = [1e-2, 5e-3, 1e-3, 5e-4]\n",
-    "lr_vals = [1e-3]\n",
-    "hyperparam_list = list(product(layers, lr_vals))\n",
-    "# v1: There are 42'981 total points / 570 positive (1.33%) -> \"natural\" my_weight[1] = (42981-570)/570 = 74.4\n",
-    "# v3: There are 41'600 total points / 535 positive (1.29%) -> (41600-535)/535 = 76.8\n",
-    "my_weight = torch.Tensor(weight_hyperparam)\n",
-    "my_loss = torch.nn.CrossEntropyLoss(weight = my_weight).cuda()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "340c2c96",
-   "metadata": {},
-   "source": [
-    "## Training and validating the model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "49180b4c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def CV_model(activ_fun, working_dir, F1_score_file):\n",
-    "    \"\"\"\n",
-    "    This function runs a cross-validation procedure for each combination of layers + learning rates\n",
-    "    Results are saved in a .csv file inside {working_dir}\n",
-    "    \"\"\"\n",
-    "    # LSTM changes the configuration of the first layer. Thus, need to increase the ...\n",
-    "    # size of the 1st MLP layer to lstm_size\n",
-    "    if lstm_size:\n",
-    "        for cur_hp in hyperparam_list:\n",
-    "            cur_hp[0][0] = (lstm_size, cur_hp[0][0][1])\n",
-    "    # Recording the validation F1 scores and losses\n",
-    "    try:\n",
-    "        final_val_F1 = pd.read_csv(f'{working_dir}/{F1_score_file}', index_col = 0)\n",
-    "    except FileNotFoundError:\n",
-    "        final_val_F1 = pd.DataFrame(np.nan, index = lr_vals, columns = [str(elem) for elem in layers])\n",
-    "\n",
-    "    # Train and validate\n",
-    "    print(f'Beginning CV on activation function {activ_fun} (weight = {weight_hyperparam[1]})')\n",
-    "    for cur_idx, cur_hp in enumerate(hyperparam_list):\n",
-    "        # We added a new layer configuration to the hyperparameters\n",
-    "        if not str(cur_hp[0]) in list(final_val_F1.columns):\n",
-    "            final_val_F1.insert(layers.index(cur_hp[0]), str(cur_hp[0]), np.nan) # layers.index to ensure consistent order\n",
-    "        # We added a new learning rate to the hyperparameters\n",
-    "        if not cur_hp[1] in final_val_F1.index.to_list():\n",
-    "            final_val_F1.loc[cur_hp[1], :] = np.nan\n",
-    "            final_val_F1 = final_val_F1.sort_index(ascending = False) # Sorting the indices\n",
-    "\n",
-    "        # Run CV only if we do not have validation losses for this set of parameters\n",
-    "        if np.isnan( final_val_F1.at[cur_hp[1], str(cur_hp[0])] ):\n",
-    "            print(f'Beginning hyperparameters {cur_idx+1:2}/{len(hyperparam_list)} for {activ_fun}; layers = {cur_hp[0]}, lr = {cur_hp[1]}')\n",
-    "            temp_val_F1 = 0\n",
-    "            #temp_val_loss = 0\n",
-    "            my_kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)\n",
-    "            for fold_idx, (train_idx, val_idx) in enumerate(my_kfold.split(cv_data[:, :-1], cv_data[:, -1])):\n",
-    "                print(f'Current fold: {fold_idx+1}/{my_kfold.n_splits}', end = '\\r')\n",
-    "                # Creating the Datasets\n",
-    "                if lstm_size:\n",
-    "                    train_dataset_fold = MyDataset(cv_data[train_idx], cv_lstm_data[train_idx])\n",
-    "                    val_dataset_fold = MyDataset(cv_data[val_idx], cv_lstm_data[val_idx])\n",
-    "                else:\n",
-    "                    train_dataset_fold = MyDataset(cv_data[train_idx])\n",
-    "                    val_dataset_fold = MyDataset(cv_data[val_idx])\n",
-    "\n",
-    "                # Creating the DataLoaders\n",
-    "                train_loader_fold = DataLoader(train_dataset_fold, batch_size, shuffle = True)\n",
-    "                val_loader_fold = DataLoader(val_dataset_fold, batch_size, shuffle = True)\n",
-    "                best_F1_fold = 0\n",
-    "                while best_F1_fold == 0: # Rare initializations have no improvement at all\n",
-    "                    # Declaring the model and optimizer\n",
-    "                    model = SequenceMLP(cur_hp[0], activ_fun, lstm_size).cuda()\n",
-    "                    optimizer = torch.optim.AdamW(model.parameters(), lr = cur_hp[1], weight_decay = 1e-2)\n",
-    "                    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor = 0.5, patience = 10, verbose = True, min_lr = 1e-5)\n",
-    "                    # First 10 epochs involve linearly increasing the LR, then it decreases in a cosine-like way to final_lr until epoch n_epochs-10\n",
-    "                    scheduler = CosineScheduler(n_epochs-10, base_lr = cur_hp[1], warmup_steps = 10, final_lr = cur_hp[1]/15)\n",
-    "                    # Train and validate\n",
-    "                    for epoch in range(n_epochs):\n",
-    "                        t1 = time()\n",
-    "                        if epoch != 0:\n",
-    "                            print(f'Current fold: {fold_idx+1}/{my_kfold.n_splits}; epoch: {epoch+1:2}/{n_epochs}; Best F1 = {best_F1_fold*100:5.2f}; Epoch time = {delta_t:.2f} ', end = '\\r')\n",
-    "                        else:\n",
-    "                            print(f'Current fold: {fold_idx+1}/{my_kfold.n_splits}; epoch: {epoch+1:2}/{n_epochs}')\n",
-    "                        loop_model(model, optimizer, train_loader_fold, my_loss, epoch, batch_size, lstm_size)\n",
-    "                        val_loss, F1 = loop_model(model, optimizer, val_loader_fold, my_loss, epoch, batch_size, lstm_size, evaluation = True)\n",
-    "                        if F1 > best_F1_fold:\n",
-    "                            best_F1_fold = F1\n",
-    "                        if scheduler.__module__ == 'torch.optim.lr_scheduler': # Pytorch built-in scheduler\n",
-    "                            scheduler.step(val_loss)\n",
-    "                        else: # Custom scheduler\n",
-    "                            for param_group in optimizer.param_groups:\n",
-    "                                param_group['lr'] = scheduler(epoch)\n",
-    "                        t2 = time()\n",
-    "                        delta_t = t2 - t1\n",
-    "                print(f'Fold {fold_idx+1}/{my_kfold.n_splits} done; Best F1 = {best_F1_fold*100:5.2f}; Epoch time = {delta_t:.2f}' + ' '*18)\n",
-    "                temp_val_F1 += best_F1_fold / my_kfold.n_splits\n",
-    "                #temp_val_loss += my_loss(val_pred.cuda(), val_y.cuda()) / my_kfold.n_splits\n",
-    "\n",
-    "            # Saving the average validation F1 after CV\n",
-    "            #temp_val_loss = temp_val_loss.cpu().detach().item()\n",
-    "            final_val_F1.at[cur_hp[1], str(cur_hp[0])] = temp_val_F1\n",
-    "            final_val_F1.to_csv(f'{working_dir}/{F1_score_file}')\n",
-    "    return final_val_F1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5a39504e",
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "final_val_F1_list = np.empty_like(activ_fun_list, dtype = object) # This will hold multiple DataFrames, one for each activation function\n",
-    "for idx, activ_fun in enumerate(activ_fun_list):\n",
-    "    F1_score_file = f'ANN_F1_{activ_fun}_{weight_hyperparam[1]}weight.csv' # Results file setup\n",
-    "    final_val_F1_list[idx] = CV_model(activ_fun, working_dir, F1_score_file) # Running the CV"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d27a8d0d",
-   "metadata": {},
-   "source": [
-    "## Final Evaluation - Testing the best model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ae1ebb1d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def run_final_evaluation(model, activ_fun, threshold = 0.5):\n",
-    "    model.eval()\n",
-    "    # Train loss\n",
-    "    train_pred = torch.empty((len(train_loader.dataset), 2))\n",
-    "    train_y = torch.empty((len(train_loader.dataset)), dtype = torch.long)\n",
-    "    for idx, data in enumerate(train_loader):\n",
-    "        if lstm_size:\n",
-    "            X, y, lstm = data\n",
-    "            lstm = lstm.cuda()\n",
-    "        else:\n",
-    "            X, y = data\n",
-    "            lstm = None\n",
-    "        X = X.cuda()\n",
-    "        pred = model(X, lstm).cpu().detach()\n",
-    "        train_pred[idx*batch_size:(idx*batch_size)+len(pred), :] = pred\n",
-    "        train_y[idx*batch_size:(idx*batch_size)+len(y)] = y\n",
-    "    # Renormalizing the train_pred\n",
-    "    train_pred = (train_pred.T / train_pred.sum(axis=1)).T\n",
-    "    # Train confusion matrix\n",
-    "    train_pred_CM = train_pred[:, 1] >= threshold\n",
-    "    CM = confusion_matrix(train_y, train_pred_CM)\n",
-    "    if CM[1,1]: # Any true positives make recall, precision, and F1 well-defined and nonzero\n",
-    "        rec = CM[1,1]/(CM[1,1]+CM[1,0])\n",
-    "        pre = CM[1,1]/(CM[1,1]+CM[0,1])\n",
-    "        f1 = 2/(1/rec + 1/pre)\n",
-    "    else:\n",
-    "        rec, pre, f1 = 0, 0, 0\n",
-    "    print(f'The train recall was {rec*100:.2f}%')\n",
-    "    print(f'The train precision was {pre*100:.2f}%')\n",
-    "    print(f'The train F1 score was {f1*100:.2f}%')\n",
-    "\n",
-    "    # Test loss\n",
-    "    test_pred = torch.empty((len(test_loader.dataset), 2))\n",
-    "    test_y = torch.empty((len(test_loader.dataset)), dtype = torch.long)\n",
-    "    for idx, data in enumerate(test_loader):\n",
-    "        if lstm_size:\n",
-    "            X, y, lstm = data\n",
-    "            lstm = lstm.cuda()\n",
-    "        else:\n",
-    "            X, y = data\n",
-    "            lstm = None\n",
-    "        X = X.cuda()\n",
-    "        pred = model(X, lstm).cpu().detach()\n",
-    "        test_pred[idx*batch_size:(idx*batch_size)+len(pred), :] = pred\n",
-    "        test_y[idx*batch_size:(idx*batch_size)+len(y)] = y\n",
-    "    test_loss = my_loss(test_pred.cuda(), test_y.cuda())\n",
-    "    print(f'The test loss was {test_loss:.3f}')\n",
-    "    # Renormalizing the test_pred\n",
-    "    test_pred = (test_pred.T / test_pred.sum(axis=1)).T\n",
-    "    # Test confusion matrix\n",
-    "    test_pred_CM = test_pred[:, 1] >= threshold\n",
-    "    CM = confusion_matrix(test_y, test_pred_CM)\n",
-    "    if CM[1,1]: # Any true positives make recall, precision, and F1 well-defined and nonzero\n",
-    "        rec = CM[1,1]/(CM[1,1]+CM[1,0])\n",
-    "        pre = CM[1,1]/(CM[1,1]+CM[0,1])\n",
-    "        f1 = 2/(1/rec + 1/pre)\n",
-    "    else:\n",
-    "        rec, pre, f1 = 0, 0, 0\n",
-    "    print(f'The test recall was {rec*100:.2f}%')\n",
-    "    print(f'The test precision was {pre*100:.2f}%')\n",
-    "    print(f'The test F1 score was {f1*100:.2f}%')\n",
-    "    fig, ax = plt.subplots(figsize = (8,8))\n",
-    "    ax.set_title(f'{activ_fun} - Test Confusion Matrix')\n",
-    "    _ = ConfusionMatrixDisplay(CM).plot(ax = ax)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9d768af3",
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "# Creating the full training Dataset / DataLoader\n",
-    "if lstm_size:\n",
-    "    train_dataset = MyDataset(cv_data, cv_lstm_data)\n",
-    "    test_dataset = MyDataset(test_data, test_lstm_data)\n",
-    "else:\n",
-    "    train_dataset = MyDataset(cv_data)\n",
-    "    test_dataset = MyDataset(test_data)\n",
-    "# Creating the DataLoaders\n",
-    "train_loader = DataLoader(train_dataset, batch_size, shuffle = True)\n",
-    "test_loader = DataLoader(test_dataset, batch_size, shuffle = True)\n",
-    "\n",
-    "for final_val_F1, activ_fun in zip(final_val_F1_list, activ_fun_list):\n",
-    "    best_model_file = f'ANN_{activ_fun}_{weight_hyperparam[1]}weight_dict.pt'\n",
-    "    # Finding the best hyperparameters\n",
-    "    best_idx = np.unravel_index(np.nanargmax(final_val_F1.values), final_val_F1.shape)\n",
-    "    best_LR = final_val_F1.index[best_idx[0]]\n",
-    "    best_neurons_str = final_val_F1.columns[best_idx[1]]\n",
-    "    # Converting the best number of neurons from str to list\n",
-    "    best_neurons = []\n",
-    "    temp_number = []\n",
-    "    temp_tuple = []\n",
-    "    for elem in best_neurons_str:\n",
-    "        if elem in '0123456789':\n",
-    "            temp_number.append(elem)\n",
-    "        elif elem in {',', ')'} and temp_number: # Finished a number. 2nd check because there is a comma right after )\n",
-    "            converted_number = ''.join(temp_number)\n",
-    "            temp_tuple.append( int(converted_number) )\n",
-    "            temp_number = []\n",
-    "            if elem in {')'}: # Also finished a tuple\n",
-    "                best_neurons.append(tuple(temp_tuple))\n",
-    "                temp_tuple = []\n",
-    "    # Re-declaring the model\n",
-    "    model = SequenceMLP(best_neurons, activ_fun, lstm_size).cuda()\n",
-    "\n",
-    "    # Checking if we already retrained this model\n",
-    "    try:\n",
-    "        mydict = torch.load(f'{working_dir}/{best_model_file}')\n",
-    "        model.load_state_dict(mydict)\n",
-    "    except FileNotFoundError: # Retraining the model with the full training set\n",
-    "        optimizer = torch.optim.Adam(model.parameters(), lr = best_LR)\n",
-    "        # First 10 epochs involve linearly increasing the LR, then it decreases in a cosine-like way to final_lr until epoch n_epochs-10\n",
-    "        scheduler = CosineScheduler(n_epochs-10, base_lr = best_LR, warmup_steps = 10, final_lr = best_LR/15)\n",
-    "        # Retrain\n",
-    "        for epoch in range(n_epochs):\n",
-    "            print(f'Final training for {activ_fun}: epoch {epoch+1:3}/{n_epochs}' + ' '*20, end = '\\r')\n",
-    "            train_loss, _ = loop_model(model, optimizer, train_loader, my_loss, epoch, batch_size, lstm_size)\n",
-    "            if scheduler.__module__ == 'torch.optim.lr_scheduler': # Pytorch built-in scheduler\n",
-    "                scheduler.step(train_loss) # No validation set at this stage, so the scheduler steps on the training loss\n",
-    "            else: # Custom scheduler\n",
-    "                for param_group in optimizer.param_groups:\n",
-    "                    param_group['lr'] = scheduler(epoch)\n",
-    "        # Save the retrained model\n",
-    "        torch.save(model.state_dict(), f'{working_dir}/{best_model_file}')\n",
-    "\n",
-    "    # CV Data\n",
-    "    print(f'Final results for {activ_fun} & weight {weight_hyperparam[1]}')\n",
-    "    print(f'Best hyperparameters: {best_neurons}, {best_LR}')\n",
-    "    print(f'CV F1 score: {final_val_F1.iat[best_idx]:.4f}')\n",
-    "    run_final_evaluation(model, activ_fun, 0.5)\n",
-    "    print()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}