From 03116068e5ef6158996a58c66723f68bf42bf54f Mon Sep 17 00:00:00 2001
From: George Tay <73705042+asdfghjkxd@users.noreply.github.com>
Date: Wed, 12 Jan 2022 11:35:00 +0800
Subject: [PATCH] Removed New Functionalities

In keeping with the requirements of the competition, and to resolve the
dependency conflict between numpy and gensim, the newly added advanced
functionalities have been removed from the main distribution of the app.
---
 .gitignore | 2 +
 README.md | 11 +-
 api/endpoints/__init__.py | 0
 api/endpoints/dtm/__init__.py | 0
 api/endpoints/dtm/dtm.py | 66 --
 api/endpoints/lca/__init__.py | 0
 api/endpoints/lca/clean.py | 279 --------
 api/endpoints/lca/modify.py | 65 --
 api/endpoints/lca/query.py | 59 --
 api/endpoints/mt/__init__.py | 0
 api/endpoints/mt/model_trainer.py | 176 -----
 api/endpoints/tk/__init__.py | 0
 api/endpoints/tk/toolkit_nlp.py | 886 -------------------------
 api/main.py | 29 -
 app.py | 4 +-
 config.py | 446 -------
 pyfiles/pages/document_term_matrix.py | 2 -
 pyfiles/pages/load_clean_visualise.py | 5 +-
 pyfiles/pages/model_trainer.py | 892 --------------------------
 pyfiles/pages/toolkit_nlp.py | 3 -
 requirements.txt | 11 +-
 utils/helper.py | 3 -
 22 files changed, 9 insertions(+), 2930 deletions(-)
 delete mode 100644 api/endpoints/__init__.py
 delete mode 100644 api/endpoints/dtm/__init__.py
 delete mode 100644 api/endpoints/dtm/dtm.py
 delete mode 100644 api/endpoints/lca/__init__.py
 delete mode 100644 api/endpoints/lca/clean.py
 delete mode 100644 api/endpoints/lca/modify.py
 delete mode 100644 api/endpoints/lca/query.py
 delete mode 100644 api/endpoints/mt/__init__.py
 delete mode 100644 api/endpoints/mt/model_trainer.py
 delete mode 100644 api/endpoints/tk/__init__.py
 delete mode 100644 api/endpoints/tk/toolkit_nlp.py
 delete mode 100644 api/main.py
 delete mode 100644 pyfiles/pages/model_trainer.py

diff --git a/.gitignore b/.gitignore
index 1f7ac75..78ddfc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.pyc
+*.DS_STORE
 
 # C extensions
 *.so
@@ -27,6 +28,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+futures/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
diff --git a/README.md b/README.md
index 3e00126..9620c37 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,9 @@ created pre-made Docker images hosted on Github Packages for you to use. To do s
 system and run the following commands on Terminal or Powershell:
 
 ```shell
-docker pull docker pull ghcr.io/asdfghjkxd/app:2.2
+docker pull ghcr.io/asdfghjkxd/app:main
 
-docker run -it -p 5000:8501 --name news ghcr.io/asdfghjkxd/app:2.2
+docker run -it -p 5000:8501 --name news ghcr.io/asdfghjkxd/app:main
 ```
 
 The created Docker Container can then be accessed through `localhost` on Port `5000`!
@@ -35,7 +35,7 @@ The created Docker Container can then be accessed through `localhost` on Port `5
 If Command Lines are not your thing, you can do the same using the Docker Desktop GUI! Just follow the steps below to
 set up the Container:
 
-- Open up Terminal or Powershell and key in the command `docker pull ghcr.io/asdfghjkxd/app:2.2` word for word (we
+- Open up Terminal or Powershell and key in the command `docker pull ghcr.io/asdfghjkxd/app:main` word for word (we
 promise this is the only Command Line step in the entire process!)
- Click on the _Images_ tab on the sidebar and find the image you have pulled in the above step - Click on the _Run_ button @@ -81,8 +81,3 @@ following tasks on your dataset: - Named Entity Recognition - Position of Speech Tagging - Summary - - -### NLP Model Trainer -This module will allow you to train NLP models you can use for your NLP tasks. This module requires you to have a -compatible GPU (NVIDIA GPUs) to run inference/classification tasks. diff --git a/api/endpoints/__init__.py b/api/endpoints/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/dtm/__init__.py b/api/endpoints/dtm/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/dtm/dtm.py b/api/endpoints/dtm/dtm.py deleted file mode 100644 index bb4768c..0000000 --- a/api/endpoints/dtm/dtm.py +++ /dev/null @@ -1,66 +0,0 @@ -import pandas as pd - -from typing import Union -from io import StringIO -from fastapi import APIRouter, HTTPException, File, UploadFile -from fastapi.encoders import jsonable_encoder -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import CountVectorizer - -router = APIRouter(prefix='/endpoints', - tags=['dtm'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/dtm') -async def dtm(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data') -> dict: - """ - This function takes in CSV data that is compatible with a pandas DataFrame, creates a Document-Term Matrix and - returns it to the user in JSON format - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - counter_object = CountVectorizer(stop_words=stopwords.words('english')) - word_string = ' '.join(raw_data[data_column]) - - dict_data = { - 'text': word_string - } - - series_data = pd.DataFrame(data=dict_data, index=[0]) - series_data = counter_object.fit_transform(series_data.text) - dtm_ = pd.DataFrame(series_data.toarray(), - columns=counter_object.get_feature_names(), - index=[0]) - - if not dtm_.empty: - dtm_copy = dtm_.copy().transpose() - dtm_copy.columns = ['Word Frequency'] - dtm_copy.sort_values(by=['Word Frequency'], ascending=False, inplace=True) - data = { - 'dtm': dtm_copy.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Document-Term Matrix was not properly prepared. 
Try ' - 'again.') diff --git a/api/endpoints/lca/__init__.py b/api/endpoints/lca/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/lca/clean.py b/api/endpoints/lca/clean.py deleted file mode 100644 index d88381f..0000000 --- a/api/endpoints/lca/clean.py +++ /dev/null @@ -1,279 +0,0 @@ -import pathlib -import nltk -import numpy as np -import pandas as pd -import texthero as hero -import fastapi - -from texthero import preprocessing -from io import StringIO -from typing import Union -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile -from texthero import stopwords -from nltk.stem import WordNetLemmatizer - -# define constants -lemmatizer = WordNetLemmatizer() -SIMPLE_PIPELINE = [ - preprocessing.remove_html_tags, - preprocessing.remove_diacritics, - preprocessing.remove_whitespace, - preprocessing.remove_urls, - preprocessing.drop_no_content - ] -PIPELINE = [ - preprocessing.fillna, - preprocessing.lowercase, - preprocessing.remove_punctuation, - preprocessing.remove_html_tags, - preprocessing.remove_diacritics, - preprocessing.remove_whitespace, - preprocessing.remove_urls, - preprocessing.drop_no_content - ] - - -def lemmatizeText(text): - """ - This function iterates through the pandas dataframe and lemmatizes the words - - Parameters - ---------- - :param text: Text to lemmatize (string) - ---------- - """ - return [lemmatizer.lemmatize(word) for word in text] - - -# API router -router = APIRouter(prefix='/endpoints/lca/clean', - tags=['clean'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/no-clean') -async def no_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data'): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes it back into ASCII - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if not raw_data.empty: - raw_data[data_column] = raw_data[data_column].str.encode('ascii', 'ignore') \ - .str.decode('ascii') - raw_data = pd.DataFrame(data=raw_data) - raw_data = raw_data.dropna() - data = { - 'original': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Data is not properly loaded. 
Try again.') - - -@router.post('/simple-clean') -async def simple_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', tokenize: bool = True): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes back into ASCII, and finally apply the 'Simple' Cleaning Pipeline and tokenizing the data (if the flag is - set to True) - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **tokenize**: Flag to determine whether to tokenize the data and to return it - """ - - cleaned_data_tokenized = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data[data_column] = raw_data[data_column].str.encode('ascii', 'ignore') \ - .str.decode('ascii') - raw_data = pd.DataFrame(data=raw_data) - raw_data = raw_data.dropna() - - try: - cleaned_data = raw_data[[data_column]] - cleaned_data['CLEANED CONTENT'] = hero.clean(cleaned_data[data_column], SIMPLE_PIPELINE) - cleaned_data['CLEANED CONTENT'].replace('', np.nan, inplace=True) - cleaned_data.dropna(inplace=True, subset=['CLEANED CONTENT']) - - cleaned_data = cleaned_data.astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if tokenize: - try: - cleaned_data_tokenized = hero.tokenize(cleaned_data['CLEANED CONTENT']).to_frame().astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - - if not cleaned_data.empty and not cleaned_data_tokenized.empty: - data = { - 'cleaned_untokenized': cleaned_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return data - elif not cleaned_data.empty and cleaned_data_tokenized.empty: - data = { - 'cleaned_untokenized': cleaned_data.to_json() - } - return data - elif cleaned_data.empty and not cleaned_data_tokenized.empty: - data = { - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Data is not properly loaded. Try again.') - - -@router.post('/complex-clean') -async def complex_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', - tokenize: bool = True, stopwords_list: Union[str, list] = None): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes back into ASCII, and finally apply the 'Complex' Cleaning Pipeline and tokenzing the data (if the flag is - set to True) - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **tokenize**: Flag to determine whether to tokenize the data and to return it - - **stopwords_list**: A string (delimited by commas) or a list containing words to extend onto the default stopwords - list. 
- """ - - finalised = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - # stopwords check - if stopwords_list is not None: - if type(stopwords_list) is str: - try: - if len(stopwords_list) != 0: - stopwords_list = stopwords.DEFAULT.union(set(word.strip().lower() for word in - stopwords_list.split(sep=','))) - finalised = True - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - elif type(stopwords_list) is list: - stopwords_list = stopwords.DEFAULT.union(stopwords_list) - finalised = True - else: - raise HTTPException(status_code=404, detail='Invalid type for stopwords_list ') - else: - stopwords_list = stopwords.DEFAULT - finalised = True - - if finalised: - try: - cleaned_data = raw_data[[data_column]] - cleaned_data['CLEANED CONTENT'] = hero.clean(cleaned_data[data_column], PIPELINE) - cleaned_data['CLEANED CONTENT'] = hero.remove_digits(cleaned_data['CLEANED CONTENT'], only_blocks=False) - cleaned_data['CLEANED CONTENT'] = hero.remove_stopwords(cleaned_data['CLEANED CONTENT'], stopwords_list) - cleaned_data_tokenized = hero.tokenize(cleaned_data['CLEANED CONTENT']) - cleaned_data_tokenized = cleaned_data_tokenized.apply(lemmatizeText) - - fin_list = [[word for word in text if word.lower() in set(nltk.corpus.words.words()) or not - word.isalpha()] for text in cleaned_data_tokenized] - - cleaned_data['CLEANED CONTENT'] = [' '.join(text) for text in fin_list] - cleaned_data_tokenized.update([str(text) for text in fin_list]) - cleaned_data_tokenized = cleaned_data_tokenized.to_frame().astype(str) - cleaned_data['CLEANED CONTENT'].replace('', np.nan, inplace=True) - cleaned_data.dropna(subset=['CLEANED CONTENT'], inplace=True) - cleaned_data = cleaned_data.astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if not cleaned_data.empty and not cleaned_data_tokenized.empty: - if tokenize: - data = { - 'original': raw_data.to_json(), - 'cleaned_untokenized': cleaned_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - data = { - 'original': raw_data.to_json(), - 'cleaned_tokenized': cleaned_data.to_json() - } - return jsonable_encoder(data) - elif not cleaned_data.empty and cleaned_data_tokenized.empty: - data = { - 'original': raw_data.to_json(), - 'cleaned_untokenized': cleaned_data.to_json() - } - return jsonable_encoder(data) - elif cleaned_data.empty and not cleaned_data_tokenized.empty: - if tokenize: - data = { - 'original': raw_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - data = { - 'original': raw_data.to_json() - } - return jsonable_encoder(data) - elif cleaned_data.empty and cleaned_data_tokenized.empty: - raise HTTPException(status_code=404, detail='Data is not properly loaded. Try again.') - else: - raise HTTPException(status_code=404, detail='Data is not properly processed. 
Try again.') \ No newline at end of file diff --git a/api/endpoints/lca/modify.py b/api/endpoints/lca/modify.py deleted file mode 100644 index ed2303f..0000000 --- a/api/endpoints/lca/modify.py +++ /dev/null @@ -1,65 +0,0 @@ -"""This file contains code used to modify the input DataFrame in the JSON format""" - -import pandas as pd -import fastapi -import pycountry - -from io import StringIO -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile -from collections import Counter - -router = APIRouter(prefix='/endpoints/lca/modify', - tags=['modify'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/country-extraction') -async def extract_country(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = ''): - """ - Searches for instances of country names being mentioned in the DataFrame passed to it and returns the DataFrame - modified with the country names extracted - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column where the data of interest is found in - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - raw_data = raw_data.astype(object) - raw_data['COUNTRIES'] = raw_data[data_column].astype(str).apply( - lambda x: [country.name for country in pycountry.countries if country.name.lower() in x.lower()]) - new_list = raw_data['COUNTRIES'].to_list() - temp = [] - for ls in new_list: - temp.extend(ls) - zipped = list(zip(Counter(temp).keys(), Counter(temp).values())) - - globe_data = pd.DataFrame(data=zipped, index=range(len(zipped)), columns=['country', 'count']) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - data = { - 'data': globe_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data is not processed properly. 
Try again.') diff --git a/api/endpoints/lca/query.py b/api/endpoints/lca/query.py deleted file mode 100644 index c54acfc..0000000 --- a/api/endpoints/lca/query.py +++ /dev/null @@ -1,59 +0,0 @@ -"""This file contains the code used for querying data from a given DataFrame that is passed to it in the JSON format""" - -import pandas as pd -import fastapi - -from io import StringIO -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile - -router = APIRouter(prefix='/endpoints/lca/modify', - tags=['modify'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/query') -async def query(file: UploadFile = File(...), ftype: str = 'csv', query_: str = None, data_column: str = '', - match: bool = True): - """ - Queries the input DataFrame in the form of JSON to find matching strings for query - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: The column name where the data to query is found - - **query_**: The string or list to query for in the data - - **match**: The strictness of query - True if query is case-sensitive - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - temp = raw_data.copy() - query_data = temp.loc[temp[data_column].str.contains(query_, case=match)] - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - data = { - 'data': query_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data is not processed properly. 
Try again.') diff --git a/api/endpoints/mt/__init__.py b/api/endpoints/mt/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/mt/model_trainer.py b/api/endpoints/mt/model_trainer.py deleted file mode 100644 index 15d56d0..0000000 --- a/api/endpoints/mt/model_trainer.py +++ /dev/null @@ -1,176 +0,0 @@ -import os -import subprocess -import zipfile - -from fastapi import HTTPException, APIRouter -from fastapi.responses import FileResponse - -router = APIRouter(prefix='/endpoints', - tags=['trainer'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/trainer') -async def trainer(model_name_or_path: str, dataset: str, attack: str, task_type: str = 'classification', - model_max_length: str = None, model_num_labels: int = None, - dataset_train_split: float = None, - dataset_eval_split: float = None, - filter_train_by_labels: str = None, - filter_eval_by_labels: str = None, num_epochs: int = 3, - num_clean_epochs: int = 1, attack_epoch_interval: int = 1, - early_stopping_epochs: int = None, learning_rate: float = 5e-5, - num_warmup_steps: int = 500, weight_decay: float = 0.01, - per_device_train_batch_size: int = 8, per_device_eval_batch_size: int = 32, - gradient_accumulation_steps: int = 1, random_seed: int = 786, - parallel: bool = False, load_best_model_at_end: bool = False, - alpha: float = 1.0, num_train_adv_examples: int = -1, - query_budget_train: float = None, - attack_num_workers_per_device: int = 1, output_dir: str = './output', - checkpoint_interval_steps: float = None, checkpoint_interval_epochs: int = None, - save_last: bool = True, log_to_tb: bool = False, - tb_log_dir: str = None, log_to_wandb: bool = False, - wandb_project: str = 'textattack', logging_interval_step: int = 1): - """ - This function is used to call the textattack CLI to run model training using the target system - - - **model_name_or_path**: Name of the model to use or path to the model on the system - - **dataset**: Name of the dataset to use or the Dataset object generated by the user - - **attack**: Attack string - - **task_type**: Action to take while training - - **model_max_length**: Model Max Length - - **model_num_labels**: Number of Labels - - **dataset_train_split**: Train split for dataset - - **dataset_eval_split**: Evaluation split for dataset - - **filter_train_by_labels**: Filter Train Data By Labels - - **filter_eval_by_labels**: Filter Evaluation Data By Labels - - **num_epochs**: Total number of epochs for training - - **num_clean_epochs**: Number of epochs to train on just the original training dataset before adversarial training - - **attack_epoch_interval**: Generate a new adversarial training set every N epochs - - **early_stopping_epochs**: Number of epochs validation must increase before stopping early - - **learning_rate**: Learning rate of the model - - **num_warmup_steps**: The number of steps for the warmup phase of linear scheduler - - **weight_decay**: Weight decay (L2 penalty) - - **per_device_train_batch_size**: The batch size per GPU/CPU for training - - **per_device_eval_batch_size**: The batch size per GPU/CPU for evaluation - - **gradient_accumulation_steps**: Number of updates steps to accumulate the gradients before performing a - backward/update pass - - **random_seed**: Random seed for reproducibility - - **parallel**: Use Multiple GPUs using torch.DataParallel class - - **load_best_model_at_end**: keep track of the best model across 
training and load it at the end - - **alpha**: The weight for adversarial loss - - **num_train_adv_examples**: The number of samples to successfully attack when generating adversarial training - set before start of every epoch - - **query_budget_train**: The max query budget to use when generating adversarial training set - - **attack_num_workers_per_device**: Number of worker processes to run per device for attack - - **output_dir**: Directory to output training logs and checkpoints - - **checkpoint_interval_steps**: Save after N updates - - **checkpoint_interval_epochs**: Save after N epochs - - **save_last**: Save the model at end of training - - **log_to_tb**: Log to Tensorboard - - **tb_log_dir**: Directory to output training logs and checkpoints - - **log_to_wandb**: Log to Wandb - - **wandb_project**: Name of Wandb project for logging - - **logging_interval_step**: Log to Tensorboard/Wandb every N training steps - """ - - var_list = ['textattack', 'train'] - maps = { - 'model_name_or_path': ['--model-name-or-path', model_name_or_path], - 'dataset': ['--dataset', dataset], - 'attack': ['--attack', attack], - 'task_type': ['--task-type', task_type], - 'model_max_length': ['--model-max-length', model_max_length], - 'model_num_labels': ['--model-num-labels', model_num_labels], - 'dataset_train_split': ['--dataset-train-split', dataset_train_split], - 'dataset_eval_split': ['--dataset-eval-split', dataset_eval_split], - 'filter_train_by_labels': ['--filter-train-by-labels', filter_train_by_labels], - 'filter_eval_by_labels': ['--filter-eval-by-labels', filter_eval_by_labels], - 'num_epochs': ['--num-epochs', num_epochs], - 'num_clean_epochs': ['--num-clean-epochs', num_clean_epochs], - 'attack_epoch_interval': ['--attack-epoch-interval', attack_epoch_interval], - 'early_stopping_epochs': ['--early-stopping-epochs', early_stopping_epochs], - 'learning_rate': ['--learning-rate', learning_rate], - 'num_warmup_steps': ['--num-warmup-steps', num_warmup_steps], - 'weight_decay': ['--weight-decay', weight_decay], - 'per_device_train_batch_size': ['--per-device-train-batch-size', per_device_train_batch_size], - 'per_device_eval_batch_size': ['--per-device-eval-batch-size', per_device_eval_batch_size], - 'gradient_accumulation_steps': ['--gradient-accumulation-steps', gradient_accumulation_steps], - 'random_seed': ['--random-seed', random_seed], - 'parallel': ['--parallel', parallel], - 'load_best_model_at_end': ['--load-best-model-at-end', load_best_model_at_end], - 'alpha': ['--alpha', alpha], - 'num_train_adv_examples': ['--num-train-adv-examples', num_train_adv_examples], - 'query_budget_train': ['--query-budget-train', query_budget_train], - 'attack_num_workers_per_device': ['--attack-num-workers-per-device', attack_num_workers_per_device], - 'output_dir': ['--output-dir', output_dir], - 'checkpoint_interval_steps': ['--checkpoint-interval-steps', checkpoint_interval_steps], - 'checkpoint_interval_epochs': ['--checkpoint-interval-epochs', checkpoint_interval_epochs], - 'save_last': ['--save-last', save_last], - 'log_to_tb': ['--log-to-tb', log_to_tb], - 'tb_log_dir': ['--tb-log-dir', tb_log_dir], - 'log_to_wandb': ['--log-to-wandb', log_to_wandb], - 'wandb_project': ['--wandb-project', wandb_project], - 'logging_interval_step': ['--logging-interval-step', logging_interval_step] - } - maps = {key: value for key, value in maps.items() if value[1] is not None} - for k, v in maps.items(): - var_list.extend(v) - - var_list = [str(iter_) for iter_ in var_list if type(iter_) is not bool] - - try: - 
subprocess.run(var_list) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if output_dir is not None: - try: - with zipfile.ZipFile('file.zip', 'w') as zipped: - for folder, subfolder, fnames in os.walk(output_dir): - for fname in fnames: - fpath = os.path.join(folder, fname) - zipped.write(fpath, os.path.basename(fpath)) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - return FileResponse('file.zip', media_type='application/zip', filename='file.zip') - else: - raise HTTPException(status_code=404, detail='Error: The model directory is not found. Try again.') diff --git a/api/endpoints/tk/__init__.py b/api/endpoints/tk/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/tk/toolkit_nlp.py b/api/endpoints/tk/toolkit_nlp.py deleted file mode 100644 index 2b6cc0b..0000000 --- a/api/endpoints/tk/toolkit_nlp.py +++ /dev/null @@ -1,886 +0,0 @@ -import logging -import os -from collections import Counter -from heapq import nlargest -from string import punctuation -import numpy as np -import pandas as pd -import spacy -import streamlit as st -import plotly.graph_objs as go -import plotly.figure_factory as ff -import plotly.express as px -import nltk -import pyLDAvis -import pyLDAvis.gensim_models -import pyLDAvis.sklearn -import textattack.models.wrappers -import torch -import tensorflow as tf -import matplotlib.pyplot as plt -import transformers - -from io import StringIO -from fastapi import APIRouter, HTTPException, File, UploadFile -from fastapi.encoders import jsonable_encoder -from operator import itemgetter -from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, AutoModelForSequenceClassification -from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from nltk.sentiment.vader import SentimentIntensityAnalyzer -from spacy.lang.en.stop_words import STOP_WORDS -from spacy.lang.en import English -from spacy import displacy -from wordcloud import WordCloud -from textblob import TextBlob - -# API router -router = APIRouter(prefix='/endpoints/toolkit', - tags=['toolkit'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - -# file counter -fc = 0 - - -def summarise(text, stopwords, pos_tag, nlp, sent_count): - """ - This function summarise the text dataframe - - Parameters - ---------- - text: DataFrame - nlp: NLP model - pos_tag: Text pos tag - stopwords: Stopwords - sent_count: Number of sentences to summarise to - ---------- - """ - - try: - # DEFINE LISTS AND DICTS - keyword = [] - sent_strength = {} - data = nlp(str(text)) - - # EXTRACT KEYWORDS FROM TEXT - for token in data: - if token.text in stopwords or token.text in punctuation: - continue - if token.pos_ in pos_tag: - keyword.append(token.text) - - # COUNT THE FREQUENCY OF WORDS - freq_word = Counter(keyword) - max_freq = Counter(keyword).most_common(1)[0][1] - for word in freq_word.keys(): - freq_word[word] = (freq_word[word] / max_freq) - - # CALCULATE SENTENCE SCORES - for sent in data.sents: - for word in sent: - if word.text in freq_word.keys(): - if sent in sent_strength.keys(): - sent_strength[sent] += freq_word[word.text] - else: - sent_strength[sent] = freq_word[word.text] - - # CONCATENATE THE STRINGS IN THE LIST TO A LARGER STRING - summarized_sentences = nlargest(sent_count, sent_strength, key=sent_strength.get) - 
final_sentences = [w.text for w in summarized_sentences] - summary = ' '.join(final_sentences) - except Exception: - return text - else: - return summary - - -def modelIterator(model, vectoriser, top_n, vb=True): - """ - This function prints out and returns the extracted topics for the NLP model passed on to it - - Parameters - ---------- - model: NLP Model - vectoriser: Vectorised text - top_n: Number of Topics to return - vb: Verbose tag (will print out the topics if set to True - --------- - """ - frame_list = [] - - for id_, topic in enumerate(model.components_): - lister = [(vectoriser.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]] - df = pd.DataFrame(data=lister, - index=range(len(lister)), - columns=['word', 'weight']) - - if vb: - st.markdown(f'### Topic {id_}') - st.dataframe(df) - - frame_list.append(df) - - return frame_list - - -def dominantTopic(vect, model, n_words): - """ - Returns the topic text - - Parameters - ---------- - vect: Vectorizer used - model: NLP Model - n_words: Number of Topics to return - ---------- - """ - kw = np.array(vect.get_feature_names()) - topic_kw = [] - for weights in model.components_: - top_kw = (-weights).argsort()[:n_words] - topic_kw.append(kw.take(top_kw)) - - return topic_kw - - -@router.post('/wordcloud') -async def wordcloud(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', max_word: int = 200, - contour: int = 3, width: int = 800, height: int = 400, colour: str = 'steelblue'): - """ - Wordcloud creation - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **max_word**: Max number of words to render in the wordcloud image - - **contour**: Contour width - - **width**: Width of the wordcloud image - - **height**: Height of the wordcloud image - - **colour**: Colour of the background - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data = raw_data[[data_column]] - wc = WordCloud(background_color='white', - max_words=max_word, - contour_width=contour, - width=width, - height=height, - contour_color=colour) - wc.generate(' '.join(raw_data[data_column])) - img = wc.to_image() - data = { - 'image': str(img.tobytes()) - } - - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Document-Term Matrix was not properly prepared. 
Try ' - 'again.') - - -@router.post('/ner') -async def ner(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', - model: str = 'en_core_web_sm', one_datapoint_analyser: int = None): - """ - Conduct NER analysis - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **one_datapoint_analyser**: The datapoint to render into HTML format - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - # init the required columns - raw_data = raw_data[[data_column]] - raw_data['NER'] = '' - raw_data['COMPILED_LABELS'] = '' - raw_data = raw_data.astype(str) - - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - for index in range(len(raw_data)): - temp_nlp = NLP(raw_data[data_column][index]) - raw_data.at[index, 'NER'] = str(list(zip([word.text for word in temp_nlp.ents], - [word.label_ for word in temp_nlp.ents]))) - raw_data.at[index, 'COMPILED_LABELS'] = str(list(set([word.label_ for word in temp_nlp.ents]))) - - if one_datapoint_analyser is not None: - cpy = raw_data.copy() - temp = cpy[data_column][one_datapoint_analyser] - render = displacy.render(list(NLP(str(temp)).sents), - style='ent', - page=True) - data = { - 'data': raw_data.to_json(), - 'render': render - } - return jsonable_encoder(data) - else: - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/pos') -async def pos(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = '', - model: str = 'en_core_web_sm', one_datapoint_analyser: int = None, compact: bool = True, - colour: str = 'steelblue', bg: str = 'white'): - """ - Conduct POS tagging - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **one_datapoint_analyser**: The datapoint to render into HTML format - - **compact**: Compact the renders - - **colour**: Colour of the words in the render - - **bg**: Colour of the background - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data = raw_data[[data_column]] - raw_data['POS'] = '' - raw_data = raw_data.astype(str) - - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - for index in range(len(raw_data)): - temp_nlp = NLP(raw_data[data_column][index]) - raw_data.at[index, 'POS'] = str(list(zip([str(word) for word in temp_nlp], - [word.pos_ for word in temp_nlp]))) - raw_data.at[index, 'COMPILED_LABELS'] = str(list(set([word.pos_ for word in temp_nlp]))) - - if one_datapoint_analyser is not None: - cpy = raw_data.copy() - temp = cpy[data_column][one_datapoint_analyser] - render = displacy.render(list(NLP(str(temp)).sents), - style='dep', - options={ - 'compact': compact, - 'color': colour, - 'bg': bg, - }) - data = { - 'data': raw_data.to_json(), - 'render': render - } - return jsonable_encoder(data) - else: - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/summarise') -def summarise(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', mode: str = 'basic', - model: str = 'en_core_web_sm', sentence_len: int = 3, min_words: int = 80, max_words: str = 150, - max_tensor: int = 512): - """ - Summarise texts - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **mode**: Define whether or not to conduct 'basic' or 'advanced' summarisation on input data - - **model**: spaCy model to load - - **sentence_len**: The maximum length of sentence to return - - **min_words**: The minimum number of words to include in the summary - - **max_words**: The maximum number of words to include in the summary - - **max_tensor**: The maximum number of input tensors for advanced summarisation process - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - # load up the data first - raw_data = raw_data[[data_column]] - raw_data['SUMMARY'] = np.nan - raw_data = raw_data.astype(str) - - if not raw_data.empty: - if mode == 'basic': - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - stopwords = list(STOP_WORDS) - pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB'] - raw_data['SUMMARY'] = raw_data[data_column]. \ - apply(lambda x: summarise(x, stopwords, pos_tag, NLP, sentence_len)) - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - - elif mode == 'advanced': - if torch.cuda.is_available(): - try: - torch.cuda.get_device_name(torch.cuda.current_device()) - except AssertionError: - raise HTTPException(status_code=415, detail='Error: CUDA Device is not enabled. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - tokenizer = AutoTokenizer.from_pretrained('t5-base') - model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) - raw_data = raw_data.astype(object) - raw_data['ENCODED'] = raw_data[data_column]. \ - apply(lambda x: tokenizer.encode('summarize: ' + x, - return_tensors='pt', - max_length=max_tensor, - truncation=True)) - raw_data['OUTPUTS'] = raw_data['ENCODED']. 
\ - apply(lambda x: model.generate(x, - max_length=max_words, - min_length=min_words, - length_penalty=5.0, - num_beams=2)) - raw_data['SUMMARISED'] = raw_data['OUTPUTS'].apply( - lambda x: tokenizer.decode(x[0])) - raw_data.drop(columns=['ENCODED', 'OUTPUTS'], inplace=True) - raw_data['SUMMARISED'] = raw_data['SUMMARISED']. \ - str.replace(' ', '').str.replace('', '') - raw_data = raw_data.astype(str) - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. Try again.') - - -@router.post('/sentiment') -def sentiment(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', model: str = 'vader', - colour: str = '#2ACAEA'): - """ - Conduct Sentiment Analysis - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **colour**: Colour of plots generated - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - if model == 'vader': - replacer = { - r"'": '', - r'[^\w\s]': ' ', - r' \d+': ' ', - r' +': ' ' - } - - raw_data['VADER SENTIMENT TEXT'] = raw_data[data_column]. \ - replace(to_replace=replacer, regex=True) - - vader_analyser = SentimentIntensityAnalyzer() - sent_score_list = [] - sent_label_list = [] - - # scoring - for i in raw_data['VADER SENTIMENT TEXT'].tolist(): - sent_score = vader_analyser.polarity_scores(i) - - if sent_score['compound'] > 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Positive') - elif sent_score['compound'] == 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Neutral') - elif sent_score['compound'] < 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Negative') - - raw_data['VADER OVERALL SENTIMENT'] = sent_label_list - raw_data['VADER OVERALL SCORE'] = sent_score_list - raw_data['VADER POSITIVE SCORING'] = [vader_analyser.polarity_scores(doc)['pos'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - raw_data['VADER NEUTRAL SCORING'] = [vader_analyser.polarity_scores(doc)['neu'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - raw_data['VADER NEGATIVE SCORING'] = [vader_analyser.polarity_scores(doc)['neg'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - - # create plots - hac_plot = ff.create_distplot([raw_data['VADER OVERALL SCORE'].tolist()], - ['VADER'], - colors=[colour], - bin_size=0.25, - curve_type='normal', - show_rug=False, - show_hist=False) - hac_plot.update_layout(title_text='Distribution Plot', - xaxis_title='VADER Score', - yaxis_title='Frequency Density', - legend_title='Frequency Density') - data = { - 'data': raw_data.to_json(), - 'dot_image': hac_plot.to_image(format="png") - - } - return jsonable_encoder(data) - - elif model == 'textblob': - pol_list = [] - sub_list = [] - - # scoring: polarity - raw_data['POLARITY SCORE'] = raw_data[data_column]. 
\ - apply(lambda x: TextBlob(x).sentiment.polarity) - for i in raw_data['POLARITY SCORE'].tolist(): - if float(i) > 0: - pol_list.append('Positive') - elif float(i) < 0: - pol_list.append('Negative') - elif float(i) == 0: - pol_list.append('Neutral') - raw_data['POLARITY SENTIMENT'] = pol_list - - # scoring: subjectivity - raw_data['SUBJECTIVITY SCORE'] = raw_data[data_column].apply( - lambda x: TextBlob(x).sentiment.subjectivity - ) - for i in raw_data['SUBJECTIVITY SCORE'].tolist(): - if float(i) < 0.5: - sub_list.append('Objective') - elif float(i) > 0.5: - sub_list.append('Subjective') - elif float(i) == 0.5: - sub_list.append('Neutral') - raw_data['SUBJECTIVITY SENTIMENT'] = sub_list - hac_plot = px.scatter(raw_data[['SUBJECTIVITY SCORE', 'POLARITY SCORE']], - x='SUBJECTIVITY SCORE', - y='POLARITY SCORE', - labels={ - 'SUBJECTIVITY SCORE': 'Subjectivity', - 'POLARITY SCORE': 'Polarity' - }) - hac_plot1 = ff.create_distplot([raw_data['SUBJECTIVITY SCORE'].tolist(), - raw_data['POLARITY SCORE'].tolist()], - ['Subjectivity', 'Polarity'], - curve_type='normal', - show_rug=False, - show_hist=False) - data = { - 'data': raw_data.to_json(), - 'dot_image': hac_plot.to_image(format="png"), - 'word_image': hac_plot1.to_image(format="png") - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. Try again.') - - -@router.post('/modelling') -def topic_modelling(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', model: str = 'lda', - num_topics: int = 10, max_features: int = 5000, max_iter: int = 10, min_df: int = 5, - max_df: float = 0.90, worker: int = 1, colour: str = 'steelblue', alpha: float = 0.10, - l1_ratio: float = 0.50): - """ - Topic Modelling - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **num_topics**: Number of topics to model - - **max_features**: Maximum number of features to consider - - **max_iter**: Maximum number of epochs to fit data - - **min_df**: Minimum length of words - - **max_df**: Maximum length of words - - **worker**: Number of workers - - **colour**: Colour of the plots - - **alpha**: Alpha value - - **l1_ratio**: L1 ratio value - """ - - global fc - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - cv = CountVectorizer(min_df=min_df, - max_df=max_df, - stop_words='english', - lowercase=True, - token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}', - max_features=max_features) - vectorised = cv.fit_transform(raw_data[data_column]) - except ValueError: - raise HTTPException(status_code=415, detail='Error: The column loaded is empty or has invalid data' - ' points. 
Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if model == 'lda': - LDA = LatentDirichletAllocation(n_components=num_topics, - max_iter=max_iter, - learning_method='online', - n_jobs=worker) - LDA_data = LDA.fit_transform(vectorised) - topic_text = modelIterator(LDA, cv, top_n=num_topics, - vb=False) - keywords = pd.DataFrame(dominantTopic(vect=cv, model=LDA, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - LDA_vis = pyLDAvis.sklearn.prepare(LDA, vectorised, cv, mds='tsne') - pyLDAvis.save_html(LDA_vis, - str(os.path.join(os.getcwd(), f'lda_id{fc}.html'))) - with open(os.path.join(os.getcwd(), f'lda_id{fc}.html')) as f: - render = f.read() - fc += 1 - - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json(), - 'render': render - } - - return jsonable_encoder(data) - - elif model == 'nmf': - TFIDF = TfidfVectorizer(max_df=max_df, - min_df=min_df, - max_features=max_features, - stop_words='english') - TFIDF_vectorised = TFIDF.fit_transform(raw_data - [data_column] - .values.astype(str)) - NMF_model = NMF(n_components=num_topics, - max_iter=max_iter, - random_state=1, - alpha=alpha, - l1_ratio=l1_ratio).fit(TFIDF_vectorised) - topic_text = modelIterator(model=NMF_model, - vectoriser=TFIDF, - top_n=num_topics, - vb=False) - keywords = pd.DataFrame(dominantTopic(model=NMF_model, - vect=TFIDF, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json() - } - return jsonable_encoder(data) - - elif model == 'lsi': - LSI = TruncatedSVD(n_components=num_topics, n_iter=max_iter) - LSI_data = LSI.fit_transform(vectorised) - topic_text = modelIterator(LSI, cv, - top_n=num_topics, vb=False) - keywords = pd.DataFrame(dominantTopic(model=LSI, vect=cv, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - - # SVD - svd_2d = TruncatedSVD(n_components=2) - data_2d = svd_2d.fit_transform(vectorised) - - mar_fig = go.Scattergl( - x=data_2d[:, 0], - y=data_2d[:, 1], - mode='markers', - marker=dict( - color=colour, - line=dict(width=1) - ), - text=cv.get_feature_names(), - hovertext=cv.get_feature_names(), - hoverinfo='text' - ) - mar_fig = [mar_fig] - mar_fig = go.Figure(data=mar_fig, layout=go.Layout(title='Scatter Plot')) - word_fig = go.Scattergl( - x=data_2d[:, 0], - y=data_2d[:, 1], - mode='text', - marker=dict( - color=colour, - line=dict(width=1) - ), - text=cv.get_feature_names(), - ) - word_fig = [word_fig] - word_fig = go.Figure(data=word_fig, layout=go.Layout(title='Scatter Word Plot')) - - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json(), - 'point_figure': mar_fig.to_image(format='png'), - 'word_figure': word_fig.to_image(format='png') - } - - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/classification') -def classification(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', topics: str = ''): - """ - Conduct Text Classification - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **topics**: A string (delimited by commas) or a list of topics to classify data into - """ - - if torch.cuda.is_available(): - try: - torch.cuda.get_device_name(torch.cuda.current_device()) - except AssertionError: - raise HTTPException(status_code=415, detail='Error: CUDA Device is not enabled. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(object) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(object) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(object) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if type(topics) == str: - topics = [word.strip().lower() for word in topics.split(sep=',')] - elif type(topics) == list: - topics = topics - else: - raise HTTPException(status_code=415, detail='Error: Invalid data type for topics.') - - classifier = pipeline('zero-shot-classification') - raw_data['TEST'] = raw_data[data_column].apply(lambda x: classifier(x, topics)) - raw_data['CLASSIFIED'] = raw_data['TEST']. \ - apply(lambda x: list(zip(x['labels'].tolist(), x['scores'].tolist()))) - raw_data['MOST PROBABLE TOPIC'] = raw_data['CLASSIFIED'].apply(lambda x: max(x, key=itemgetter[1])[0]) - raw_data = raw_data.astype(str) - - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: CUDA Device is not detected. 
Try again.') diff --git a/api/main.py b/api/main.py deleted file mode 100644 index c4b32da..0000000 --- a/api/main.py +++ /dev/null @@ -1,29 +0,0 @@ -"""This is the main file where the API server is started""" - -from fastapi import FastAPI -from endpoints.lca import clean, modify, query -from endpoints.dtm import dtm -from endpoints.mt import model_trainer -from endpoints.tk import toolkit_nlp - -# instantiate the app -app = FastAPI() - -# add the routers -app.include_router(clean.router) -app.include_router(modify.router) -app.include_router(query.router) -app.include_router(model_trainer.router) -app.include_router(dtm.router) -app.include_router(toolkit_nlp.router) - - -@app.get('/') -def root(): - """ - This function is called when the user navigates to the root path of the localhost path generated by uvicorn when - the API server is started - """ - - return {'description': 'Welcome to the Homepage of the ArticPy API!'} - diff --git a/app.py b/app.py index 6e2541d..d6d7648 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,6 @@ # INIT STREAMLIT CONFIG import streamlit as st + st.set_page_config(page_title='ArticPy', page_icon='❄', menu_items={ @@ -13,7 +14,7 @@ # CUSTOM PAGE IMPORTS from pyfiles.multipage import MultiPage -from pyfiles.pages import load_clean_visualise, document_term_matrix, toolkit_nlp, model_trainer +from pyfiles.pages import load_clean_visualise, document_term_matrix, toolkit_nlp # INSTANTIATE THE APP app = MultiPage() @@ -22,7 +23,6 @@ app.add_page('Load, Clean and Visualise Data', load_clean_visualise.app) app.add_page('DTM and Word Frequency Analysis', document_term_matrix.app) app.add_page('NLP Toolkit', toolkit_nlp.app) -app.add_page('NLP Model Trainer', model_trainer.app) # RUN THE APP try: diff --git a/config.py b/config.py index 895ac46..98e3dd7 100644 --- a/config.py +++ b/config.py @@ -152,449 +152,3 @@ 'MIN_WORDS': 80, 'SUM_MODE': 'Basic' } - -trainer = { - 'TRANSFORMERS_CHOICES': ('Pre Training', 'CausalLM', 'MaskedLM', 'Seq2SeqLM', 'SequenceClassification', - 'MultipleChoice', 'NextSentencePrediction', 'TokenClassificaition', 'QuestionAnswering', - 'TableQuestionAnswering'), - 'TRANSFORMERS_SELECTION': None, - 'MODEL_MODE': 'Training', - 'MODEL_FILE': None, - 'TRAINING_PARAMS': [], - 'API': True, - 'attack': None, - 'model_max_length': None, - 'model_num_labels': None, - 'dataset_train_split': None, - 'dataset_eval_split': None, - 'filter_train_by_labels': None, - 'filter_eval_by_labels': None, - 'num_epochs': 3, - 'num_clean_epochs': 1, - 'attack_epoch_interval': 1, - 'early_stopping_epochs': None, - 'learning_rate': 5e-5, - 'num_warmup_steps': 500, - 'weight_decay': 0.01, - 'per_device_train_batch_size': 8, - 'per_device_eval_batch_size': 32, - 'gradient_accumulation_steps': 1, - 'random_seed': 786, - 'parallel': False, - 'load_best_model_at_end': False, - 'alpha': 1.0, - 'num_train_adv_examples': -1, - 'query_budget_train': None, - 'attack_num_workers_per_device': 1, - 'output_dir': None, - # 'output_dir': f'{os.getcwd()}/outputs/{datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")}', - 'checkpoint_interval_steps': None, - 'checkpoint_interval_epochs': None, - 'save_last': True, - 'log_to_tb': False, - 'tb_log_dir': None, - # 'tb_log_dir': r'./runs', - 'log_to_wandb': False, - 'wandb_project': 'textattack', - 'logging_interval_step': 1, - 'ML_MODEL': '', - 'ML_POSSIBLE_PICKS': ('albert-base-v2-CoLA', 'bert-base-uncased-CoLA', 'distilbert-base-cased-CoLA', - 'distilbert-base-uncased-CoLA', 'roberta-base-CoLA', 'xlnet-base-cased-CoLA', - 'albert-base-v2-RTE', 
'albert-base-v2-snli', 'albert-base-v2-WNLI', 'bert-base-uncased-MNLI', - 'bert-base-uncased-QNLI', 'bert-base-uncased-RTE', 'bert-base-uncased-snli', - 'bert-base-uncased-WNLI', 'distilbert-base-cased-snli', 'distilbert-base-uncased-MNLI', - 'distilbert-base-uncased-RTE', 'distilbert-base-uncased-WNLI', 'roberta-base-QNLI', - 'roberta-base-RTE', 'roberta-base-WNLI', 'xlnet-base-cased-RTE', 'xlnet-base-cased-WNLI', - 'albert-base-v2-QQP', 'bert-base-uncased-QQP', 'distilbert-base-uncased-QNLI', - 'distilbert-base-cased-QQP', 'albert-base-v2-STS-B', 'bert-base-uncased-MRPC', - 'bert-base-uncased-STS-B', 'distilbert-base-cased-MRPC', 'distilbert-base-cased-STS-B', - 'distilbert-base-uncased-MRPC', 'roberta-base-MRPC', 'roberta-base-STS-B', - 'xlnet-base-cased-MRPC', 'xlnet-base-cased-STS-B', 'albert-base-v2-imdb', - 'albert-base-v2-rotten-tomatoes', 'albert-base-v2-SST-2', 'albert-base-v2-yelp-polarity', - 'bert-base-uncased-imdb', 'bert-base-uncased-rotten-tomatoes', 'bert-base-uncased-SST-2', - 'bert-base-uncased-yelp-polarity', 'cnn-imdb', 'cnn-mr', 'cnn-sst2', 'cnn-yelp', - 'distilbert-base-cased-SST-2', 'distilbert-base-uncased-imdb', - 'distilbert-base-uncased-rotten-tomatoes', 'lstm-imdb', 'lstm-mr', 'lstm-sst2', 'lstm-yelp', - 'roberta-base-imdb', 'roberta-base-rotten-tomatoes', 'roberta-base-SST-2', - 'xlnet-base-cased-imdb', 'xlnet-base-cased-rotten-tomatoes', 'albert-base-v2-ag-news', - 'bert-base-uncased-ag-news', 'cnn-ag-news', 'distilbert-base-uncased-ag-news', 'lstm-ag-news', - 'roberta-base-ag-news', 'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased', - 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', - 'bert-base-chinese', 'bert-base-german-cased', 'bert-large-uncased-whole-word-masking', - 'bert-large-cased-whole-word-masking', - 'bert-large-uncased-whole-word-masking-finetuned-squad', - 'bert-large-cased-whole-word-masking-finetuned-squad', - 'bert-base-cased-finetuned-mrpc', 'bert-base-german-dbmdz-cased', - 'bert-base-german-dbmdz-uncased', 'cl-tohoku/bert-base-japanese', - 'cl-tohoku/bert-base-japanese-whole-word-masking', 'cl-tohoku/bert-base-japanese-char', - 'cl-tohoku/bert-base-japanese-char-whole-word-masking', 'TurkuNLP/bert-base-finnish-cased-v1', - 'TurkuNLP/bert-base-finnish-uncased-v1', 'wietsedv/bert-base-dutch-cased', 'openai-gpt', - 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'transfo-xl-wt103', 'xlnet-base-cased', - 'xlnet-large-cased', 'xlm-mlm-en-2048', 'xlm-mlm-ende-1024', 'xlm-mlm-enfr-1024', - 'xlm-mlm-enro-1024', 'xlm-mlm-xnli15-1024', 'xlm-mlm-tlm-xnli15-1024', 'xlm-clm-enfr-1024', - 'xlm-clm-ende-1024', 'xlm-mlm-17-1280', 'xlm-mlm-100-1280', 'roberta-base', 'roberta-large', - 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', - 'roberta-large-openai-detector', 'distilbert-base-uncased', - 'distilbert-base-uncased-distilled-squad', 'distilbert-base-cased', - 'distilbert-base-cased-distilled-squad', 'distilgpt2', - 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased', 'ctrl', - 'camembert-base', 'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', - 'albert-xxlarge-v1', 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', - 'albert-xxlarge-v2', 't5-small', 't5-base', 't5-large', 't5-3B', 't5-11B', 'xlm-roberta-base', - 'xlm-roberta-large', 'flaubert/flaubert_small_cased', - 'flaubert/flaubert_base_uncased', 'flaubert/flaubert_base_cased', - 'flaubert/flaubert_large_cased', 'facebook/bart-large', - 'facebook/bart-base', 'facebook/bart-large-mnli', - 
'facebook/bart-large-cnn', 'DialoGPT-small', 'DialoGPT-medium', - 'DialoGPT-large', 'reformer-enwik8', 'reformer-crime-and-punishment', - 'Helsinki-NLP/opus-mt-{src}-{tgt}', 'google/pegasus-{dataset}', - 'allenai/longformer-base-4096', 'allenai/longformer-large-4096', - 'facebook/mbart-large-cc25', 'facebook/mbart-large-en-ro', - 'lxmert-base-uncased', 'funnel-transformer/small', - 'funnel-transformer/small-base', 'funnel-transformer/medium', - 'funnel-transformer/medium-base', 'funnel-transformer/intermediate', - 'funnel-transformer/intermediate-base', 'funnel-transformer/large', - 'funnel-transformer/large-base', 'funnel-transformer/xlarge', - 'funnel-transformer/xlarge-base', 'microsoft/layoutlm-base-uncased', - 'microsoft/layoutlm-large-uncased'), - 'DATASET_POSSIBLE_PICKS': ('super_glue', 'glue', 'anli', 'wino_bias', 'squad', 'imdb', 'wikitext', 'trec', 'race', - 'adversarial_qa', 'duorc', 'squad_v2', 'winogrande', 'cosmos_qa', 'quail', 'xsum', - 'cnn_dailymail', 'piqa', 'paws', 'hellaswag', 'ai2_arc', 'ropes', 'rotten_tomatoes', - 'amazon_polarity', 'quoref', 'wiki_qa', 'cos_e', 'hans', 'ag_news', 'common_gen', - 'mc_taco', 'gigaword', 'wiki_hop', 'wmt16', 'wiqa', 'qasc', 'winograd_wsc', - 'common_voice', 'quartz', 'yelp_review_full', 'samsum', 'crows_pairs', 'openbookqa', - 'qa_srl', 'multi_news', 'social_i_qa', 'nq_open', 'quac', 'web_questions', 'dream', - 'kilt_tasks', 'wiki_bio', 'drop', 'trivia_qa', 'sciq', 'quarel', 'lambada', 'coqa', - 'circa', 'mc4', 'conll2003', 'dbpedia_14', 'xnli', 'emotion', 'story_cloze', - 'app_reviews', 'snli', 'mlqa', 'code_search_net', 'c4', 'multi_nli', - 'amazon_reviews_multi', 'hate_speech18', 'tweet_eval', 'timit_asr', 'oscar', 'sst', - 'hate_speech_offensive', 'wikiann', 'wikipedia', 'opus_euconst', 'math_qa', 'swag', - 'pubmed_qa', 'scientific_papers', 'banking77', 'cbt', 'commonsense_qa', 'yelp_polarity', - 'stsb_multi_mt', 'bookcorpus', 'squad_adversarial', 'scan', 'financial_phrasebank', - 'craffel/openai_lambada', 'wiki_dpr', 'amazon_us_reviews', 'math_dataset', 'tab_fact', - 'subjqa', 'OTHERS'), - # mappings area split -> dataset: [(subsets), (dataset_columns), (split)] - 'SUBSET_MAPPINGS': {'super_glue': [('boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc', 'wsc.fixed', - 'axb', 'axg'), (), ('train', 'test', 'validation')], - 'glue': [('cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', - 'qnli', 'rte', 'wnli', 'ax'), (), ('train', 'test', 'validation')], - 'anli': [(), (), ('train_r1', 'dev_r1', 'test_r1', 'train_r2', 'dev_r2', 'test_r2', - 'train_r3', 'dev_r3', 'test_r3')], - 'wino_bias': [('type1_pro', 'type1_anti', 'type2_pro', 'type2_anti'), (), - ('test', 'validation')], - 'squad': [(), ('question', 'title', 'id', 'context', 'answers'), ('train', 'validation')], - 'imdb': [(), (), ('train', 'test', 'unsupervised')], - 'wikitext': [('wikitext-103-raw-v1', 'wikitext-2-raw-v1', 'wikitext-103-v1', 'wikitext-2-v1'), - (), ('train', 'test', 'validation')], - 'trec': [(), ('label-coarse', 'text', 'label-fine'), ('train', 'test')], - 'race': [('high', 'middle', 'all'), ('example_id', 'article', 'answer', 'question', 'options'), - ('train', 'test', 'validation')], - 'adversarial_qa': [('adversarialQA', 'dbidaf', 'dbert', 'droberta'), (), - ('train', 'test', 'validation')], - 'duorc': [('SelfRC', 'ParaphraseRC'), (), ('train', 'test', 'validation')], - 'squad_v2': [(), ('question', 'title', 'id', 'context', 'answers'), ('train', 'validation')], - 'winogrande': [('winogrande_xs', 'winogrande_s', 
'winogrande_m', 'winogrande_l', - 'winogrande_xl', 'winogrande_debiased'), (), ('train', 'test', 'validation')], - 'cosmos_qa': [(), ('question', 'answer2', 'answer0', 'answer1', 'label', 'id', 'answer3', - 'context'), ('train', 'test', 'validation')], - 'quail': [(), ('question', 'question_type', 'id', 'context_id', 'domain', 'context', - 'correct_answer_id', 'answers', 'question_id', 'metadata'), - ('train', 'challenge', 'validation')], - 'xsum': [(), (), ('train', 'test', 'validation')], - 'cnn_dailymail': [(), (), ('train', 'test', 'validation')], - 'piqa': [(), (), ('train', 'test', 'validation')], - 'paws': [('labelle_final', 'labelled_swap', 'unlabelled_final'), - (), ('train', 'test', 'validation')], - 'hellaswag': [(), (), ('train', 'test', 'validation')], - 'ai2_arc': [('ARC-Challenge', 'ARC-Easy'), (), ('train', 'test', 'validation')], - 'ropes': [(), (), ('train', 'test', 'validation')], - 'rotten_tomatoes': [(), (), ('train', 'test', 'validation')], - 'amazon_polarity': [(), (), ()], - 'quoref': [(), (), ('train', 'validation')], - 'wiki_qa': [(), (), ()], - 'cos_e': [('v1.0', 'v1.11'), (), ('train', 'validation')], - 'hans': [(), (), ('train', 'validation')], - 'ag_news': [(), (), ('train', 'test')], - 'common_gen': [(), (), ('train', 'test', 'validation')], - 'mc_taco': [(), (), ('test', 'validation')], - 'gigaword': [(), (), ('train', 'test', 'validation')], - 'wiki_hop': [('original', 'masked'), (), ('train', 'validation')], - 'wmt16': [('cs-en', 'de-en', 'fi-en', 'ro-en', 'ru-en', 'te-en'), - (), ('train', 'test', 'validation')], - 'wiqa': [(), (), ('train', 'test', 'validation')], - 'qasc': [(), (), ('train', 'test', 'validation')], - 'winograd_wsc': [(), (), ()], - 'common_voice': [(), (), ()], - 'quartz': [(), (), ('train', 'test', 'validation')], - 'yelp_review_full': [(), (), ()], - 'samsum': [(), (), ('train', 'test', 'val')], - 'crows_pairs': [(), (), ()], - 'openbookqa': [('main', 'additional'), (), ('train', 'test', 'validation')], - 'qa_srl': [(), (), ('train', 'test', 'validation')], - 'multi_news': [(), (), ('train', 'test', 'validation')], - 'social_i_qa': [(), (), ('train', 'validation')], - 'nq_open': [(), (), ('train', 'validation')], - 'quac': [(), (), ('train', 'validation')], - 'web_questions': [(), (), ('train', 'test')], - 'dream': [(), (), ('train', 'test', 'validation')], - 'kilt_tasks': [('triviaqa_support_only', 'fever', 'aidayago2', 'wned', 'cweb', 'trex', - 'structured_zeroshot', 'nq', 'hotpotqa', 'eli5', 'wow'), - (), ('train', 'test', 'validation')], - 'wiki_bio': [(), (), ('train', 'test', 'validation')], - 'drop': [(), (), ('train', 'validation')], - 'trivia_qa': [('rc', 'rc.nocontext', 'unfiltered', 'unfiltered.nocontext'), - (), ('train', 'test', 'validation')], - 'sciq': [(), (), ('train', 'test', 'validation')], - 'quarel': [(), (), ('train', 'test', 'validation')], - 'lambada': [(), (), ('train', 'test', 'dev')], - 'coqa': [(), (), ('train', 'validation')], - 'circa': [(), (), ('train')], - 'mc4': [('af', 'am', 'ar', 'az', 'be', 'bg', 'bg-Latn', 'bn', 'ca', 'ceb', 'co', 'cs', 'cy', - 'da', 'de', 'el', 'el-Latn', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fil', 'fr', - 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'haw', 'hi', 'hi-Latn', 'hmn', 'ht', 'hu', 'hy', - 'id', 'ig', 'is', 'it', 'iw', 'ja', 'ja-Latn', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', - 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', - 'mt', 'my', 'ne', 'nl', 'no', 'ny', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'ru-Latn', - 'sd', 'si', 'sk', 'sl', 'sm', 
'sn', 'so', 'sq', 'sr', 'st', 'su', 'sv', 'sw', 'ta', - 'te', 'tg', 'th', 'tr', 'uk', 'und', 'ur', 'uz', 'vi', 'xh', 'yi', 'yo', 'zh', - 'zh-Latn', 'zu'), (), ('train', 'test', 'validation')], - - 'conll2003': [(), (), ('train', 'test', 'validation')], - 'dbpedia_14': [(), (), ('train', 'test')], - 'xnli': [('all_languages', 'ar', 'bg', 'de', 'el'), (), ('train', 'test', 'validation')], - 'emotion': [(), (), ('train', 'test', 'validation')], - 'story_cloze': [('2016', '2018'), (), ('test', 'validation')], - 'app_reviews': [(), (), ()], - 'snli': [(), (), ('train', 'test', 'validation')], - 'mlqa': [('mlqa-translate-test.ar', 'mlqa-translate-test.de', 'mlqa-translate-test.es', - 'mlqa-translate-test.hi', 'mlqa-translate-test.vi'), (), - ('train', 'test', 'validation')], - 'code_search_net': [('all', 'java', 'go', 'python', 'javascript', 'ruby', 'php'), - (), ('train', 'test', 'validation')], - 'c4': [('en', 'realnewslike', 'en.noclean', 'realnewslike'), - (), ('train', 'validation')], - 'multi_nli': [(), (), ('train', 'validation_matched', 'validation_mismatched')], - 'amazon_reviews_multi': [('all_languages', 'de', 'en', 'es', 'fr', 'ja', 'zh'), - (), ('train', 'test', 'validation')], - 'hate_speech18': [(), (), ()], - 'tweet_eval': [('emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', - 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', - 'stance_hillary'), (), ('train', 'test', 'validation')], - 'timit_asr': [(), (), ('train', 'test')], - 'oscar': [('unshuffled_original_af', 'unshuffled_original_sq', 'unshuffled_original_als', - 'unshuffled_original_am', 'unshuffled_original_ar', 'unshuffled_original_an', - 'unshuffled_original_hy', 'unshuffled_original_as', 'unshuffled_original_ast', - 'unshuffled_original_av', 'unshuffled_original_az', 'unshuffled_original_ba', - 'unshuffled_original_eu', 'unshuffled_original_bar', 'unshuffled_original_be', - 'unshuffled_original_bn', 'unshuffled_original_bh', 'unshuffled_original_bpy', - 'unshuffled_original_bs', 'unshuffled_original_br', 'unshuffled_original_bg', - 'unshuffled_original_my', 'unshuffled_original_ca', 'unshuffled_original_ceb', - 'unshuffled_original_bcl', 'unshuffled_original_km', 'unshuffled_original_ckb', - 'unshuffled_original_cbk', 'unshuffled_original_ce', 'unshuffled_original_zh', - 'unshuffled_original_cv', 'unshuffled_original_kw', 'unshuffled_original_hr', - 'unshuffled_original_cs', 'unshuffled_original_da', 'unshuffled_original_dv', - 'unshuffled_original_diq', 'unshuffled_original_nl', 'unshuffled_original_mhr', - 'unshuffled_original_arz', 'unshuffled_original_eml', 'unshuffled_original_en', - 'unshuffled_original_myv', 'unshuffled_original_eo', 'unshuffled_original_et', - 'unshuffled_original_fi', 'unshuffled_original_fr', 'unshuffled_original_gl', - 'unshuffled_original_ka', 'unshuffled_original_de', 'unshuffled_original_gom', - 'unshuffled_original_gn', 'unshuffled_original_gu', 'unshuffled_original_ht', - 'unshuffled_original_he', 'unshuffled_original_hi', 'unshuffled_original_hu', - 'unshuffled_original_is', 'unshuffled_original_io', 'unshuffled_original_ilo', - 'unshuffled_original_id', 'unshuffled_original_ia', 'unshuffled_original_ie', - 'unshuffled_original_ga', 'unshuffled_original_it', 'unshuffled_original_ja', - 'unshuffled_original_jv', 'unshuffled_original_xal', 'unshuffled_original_kn', - 'unshuffled_original_krc', 'unshuffled_original_kk', 'unshuffled_original_ky', - 'unshuffled_original_kv', 'unshuffled_original_ko', 'unshuffled_original_ku', - 'unshuffled_original_lo', 
'unshuffled_original_la', 'unshuffled_original_lv', - 'unshuffled_original_lez', 'unshuffled_original_li', 'unshuffled_original_lt', - 'unshuffled_original_jbo', 'unshuffled_original_lmo', 'unshuffled_original_nds', - 'unshuffled_original_dsb', 'unshuffled_original_lb', 'unshuffled_original_mk', - 'unshuffled_original_mai', 'unshuffled_original_mg', 'unshuffled_original_ms', - 'unshuffled_original_ml', 'unshuffled_original_mt', 'unshuffled_original_mr', - 'unshuffled_original_mzn', 'unshuffled_original_min', 'unshuffled_original_xmf', - 'unshuffled_original_mwl', 'unshuffled_original_el', 'unshuffled_original_mn', - 'unshuffled_original_nah', 'unshuffled_original_nap', 'unshuffled_original_ne', - 'unshuffled_original_new', 'unshuffled_original_frr', 'unshuffled_original_lrc', - 'unshuffled_original_no', 'unshuffled_original_nn', 'unshuffled_original_oc', - 'unshuffled_original_or', 'unshuffled_original_os', 'unshuffled_original_pam', - 'unshuffled_original_pa', 'unshuffled_original_fa', 'unshuffled_original_pms', - 'unshuffled_original_pl', 'unshuffled_original_pt', 'unshuffled_original_ps', - 'unshuffled_original_qu', 'unshuffled_original_ro', 'unshuffled_original_rm', - 'unshuffled_original_bxr', 'unshuffled_original_ru', 'unshuffled_original_sa', - 'unshuffled_original_gd', 'unshuffled_original_sr', 'unshuffled_original_sh', - 'unshuffled_original_scn', 'unshuffled_original_sd', 'unshuffled_original_si', - 'unshuffled_original_sk', 'unshuffled_original_sl', 'unshuffled_original_so', - 'unshuffled_original_azb', 'unshuffled_original_es', 'unshuffled_original_su', - 'unshuffled_original_sw', 'unshuffled_original_sv', 'unshuffled_original_tl', - 'unshuffled_original_tg', 'unshuffled_original_ta', 'unshuffled_original_tt', - 'unshuffled_original_te', 'unshuffled_original_th', 'unshuffled_original_bo', - 'unshuffled_original_tr', 'unshuffled_original_tk', 'unshuffled_original_tyv', - 'unshuffled_original_ug', 'unshuffled_original_uk', 'unshuffled_original_hsb', - 'unshuffled_original_ur', 'unshuffled_original_uz', 'unshuffled_original_vec', - 'unshuffled_original_vi', 'unshuffled_original_vo', 'unshuffled_original_wa', - 'unshuffled_original_war', 'unshuffled_original_cy', 'unshuffled_original_fy', - 'unshuffled_original_mrj', 'unshuffled_original_pnb', 'unshuffled_original_wuu', - 'unshuffled_original_sah', 'unshuffled_original_yi', 'unshuffled_original_yo', - 'unshuffled_original_yue', 'unshuffled_deduplicated_af', - 'unshuffled_deduplicated_sq', 'unshuffled_deduplicated_als', - 'unshuffled_deduplicated_am', 'unshuffled_deduplicated_ar', - 'unshuffled_deduplicated_an', 'unshuffled_deduplicated_hy', - 'unshuffled_deduplicated_as', 'unshuffled_deduplicated_ast', - 'unshuffled_deduplicated_av', 'unshuffled_deduplicated_az', - 'unshuffled_deduplicated_ba', 'unshuffled_deduplicated_eu', - 'unshuffled_deduplicated_bar', 'unshuffled_deduplicated_be', - 'unshuffled_deduplicated_bn', 'unshuffled_deduplicated_bh', - 'unshuffled_deduplicated_bpy', 'unshuffled_deduplicated_bs', - 'unshuffled_deduplicated_br', 'unshuffled_deduplicated_bg', - 'unshuffled_deduplicated_my', 'unshuffled_deduplicated_ca', - 'unshuffled_deduplicated_ceb', 'unshuffled_deduplicated_bcl', - 'unshuffled_deduplicated_km', 'unshuffled_deduplicated_ckb', - 'unshuffled_deduplicated_cbk', 'unshuffled_deduplicated_ce', - 'unshuffled_deduplicated_zh', 'unshuffled_deduplicated_cv', - 'unshuffled_deduplicated_kw', 'unshuffled_deduplicated_hr', - 'unshuffled_deduplicated_cs', 'unshuffled_deduplicated_da', - 'unshuffled_deduplicated_dv', 
'unshuffled_deduplicated_diq', - 'unshuffled_deduplicated_nl', 'unshuffled_deduplicated_mhr', - 'unshuffled_deduplicated_arz', 'unshuffled_deduplicated_eml', - 'unshuffled_deduplicated_en', 'unshuffled_deduplicated_myv', - 'unshuffled_deduplicated_eo', 'unshuffled_deduplicated_et', - 'unshuffled_deduplicated_fi', 'unshuffled_deduplicated_fr', - 'unshuffled_deduplicated_gl', 'unshuffled_deduplicated_ka', - 'unshuffled_deduplicated_de', 'unshuffled_deduplicated_gom', - 'unshuffled_deduplicated_gn', 'unshuffled_deduplicated_gu', - 'unshuffled_deduplicated_ht', 'unshuffled_deduplicated_he', - 'unshuffled_deduplicated_hi', 'unshuffled_deduplicated_hu', - 'unshuffled_deduplicated_is', 'unshuffled_deduplicated_io', - 'unshuffled_deduplicated_ilo', 'unshuffled_deduplicated_id', - 'unshuffled_deduplicated_ia', 'unshuffled_deduplicated_ie', - 'unshuffled_deduplicated_ga', 'unshuffled_deduplicated_it', - 'unshuffled_deduplicated_ja', 'unshuffled_deduplicated_jv', - 'unshuffled_deduplicated_xal', 'unshuffled_deduplicated_kn', - 'unshuffled_deduplicated_krc', 'unshuffled_deduplicated_kk', - 'unshuffled_deduplicated_ky', 'unshuffled_deduplicated_kv', - 'unshuffled_deduplicated_ko', 'unshuffled_deduplicated_ku', - 'unshuffled_deduplicated_lo', 'unshuffled_deduplicated_la', - 'unshuffled_deduplicated_lv', 'unshuffled_deduplicated_lez', - 'unshuffled_deduplicated_li', 'unshuffled_deduplicated_lt', - 'unshuffled_deduplicated_jbo', 'unshuffled_deduplicated_lmo', - 'unshuffled_deduplicated_nds', 'unshuffled_deduplicated_dsb', - 'unshuffled_deduplicated_lb', 'unshuffled_deduplicated_mk', - 'unshuffled_deduplicated_mai', 'unshuffled_deduplicated_mg', - 'unshuffled_deduplicated_ms', 'unshuffled_deduplicated_ml', - 'unshuffled_deduplicated_mt', 'unshuffled_deduplicated_mr', - 'unshuffled_deduplicated_mzn', 'unshuffled_deduplicated_min', - 'unshuffled_deduplicated_xmf', 'unshuffled_deduplicated_mwl', - 'unshuffled_deduplicated_el', 'unshuffled_deduplicated_mn', - 'unshuffled_deduplicated_nah', 'unshuffled_deduplicated_nap', - 'unshuffled_deduplicated_ne', 'unshuffled_deduplicated_new', - 'unshuffled_deduplicated_frr', 'unshuffled_deduplicated_lrc', - 'unshuffled_deduplicated_no', 'unshuffled_deduplicated_nn', - 'unshuffled_deduplicated_oc', 'unshuffled_deduplicated_or', - 'unshuffled_deduplicated_os', 'unshuffled_deduplicated_pam', - 'unshuffled_deduplicated_pa', 'unshuffled_deduplicated_fa', - 'unshuffled_deduplicated_pms', 'unshuffled_deduplicated_pl', - 'unshuffled_deduplicated_pt', 'unshuffled_deduplicated_ps', - 'unshuffled_deduplicated_qu', 'unshuffled_deduplicated_ro', - 'unshuffled_deduplicated_rm', 'unshuffled_deduplicated_bxr', - 'unshuffled_deduplicated_ru', 'unshuffled_deduplicated_sa', - 'unshuffled_deduplicated_gd', 'unshuffled_deduplicated_sr', - 'unshuffled_deduplicated_sh', 'unshuffled_deduplicated_scn', - 'unshuffled_deduplicated_sd', 'unshuffled_deduplicated_si', - 'unshuffled_deduplicated_sk', 'unshuffled_deduplicated_sl', - 'unshuffled_deduplicated_so', 'unshuffled_deduplicated_azb', - 'unshuffled_deduplicated_es', 'unshuffled_deduplicated_su', - 'unshuffled_deduplicated_sw', 'unshuffled_deduplicated_sv', - 'unshuffled_deduplicated_tl', 'unshuffled_deduplicated_tg', - 'unshuffled_deduplicated_ta', 'unshuffled_deduplicated_tt', - 'unshuffled_deduplicated_te', 'unshuffled_deduplicated_th', - 'unshuffled_deduplicated_bo', 'unshuffled_deduplicated_tr', - 'unshuffled_deduplicated_tk', 'unshuffled_deduplicated_tyv', - 'unshuffled_deduplicated_ug', 'unshuffled_deduplicated_uk', - 
'unshuffled_deduplicated_hsb', 'unshuffled_deduplicated_ur', - 'unshuffled_deduplicated_uz', 'unshuffled_deduplicated_vec', - 'unshuffled_deduplicated_vi', 'unshuffled_deduplicated_vo', - 'unshuffled_deduplicated_wa', 'unshuffled_deduplicated_war', - 'unshuffled_deduplicated_cy', 'unshuffled_deduplicated_fy', - 'unshuffled_deduplicated_mrj', 'unshuffled_deduplicated_pnb', - 'unshuffled_deduplicated_wuu', 'unshuffled_deduplicated_sah', - 'unshuffled_deduplicated_yi', 'unshuffled_deduplicated_yo', - 'unshuffled_deduplicated_yue'), (), ('train', 'test', 'validation')], - 'sst': [('default', 'dictionary' 'ptb'), (), ('train', 'test', 'validation')], - 'hate_speech_offensive': [(), (), ('train')], - 'wikiann': [('ace', 'af', 'als', 'am', 'an', 'ang', 'ar', 'arc', 'arz', 'as', 'ast', 'ay', - 'az', 'ba', 'bar', 'bat-smg', 'be', 'be-x-old', 'bg', 'bh', 'bn', 'bo', 'br', - 'bs', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ckb', 'co', 'crh', 'cs', 'csb', 'cv', - 'cy', 'da', 'de', 'diq', 'dv', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', - 'fa', 'fi', 'fiu-vro', 'fo', 'fr', 'frr', 'fur', 'fy', 'ga', 'gan', 'gd', 'gl', - 'gn', 'gu', 'hak', 'he', 'hi', 'hr', 'hsb', 'hu', 'hy', 'ia', 'id', 'ig', 'ilo', - 'io', 'is', 'it', 'ja', 'jbo', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ksh', 'ku', - 'ky', 'la', 'lb', 'li', 'lij', 'lmo', 'ln', 'lt', 'lv', 'map-bms', 'mg', 'mhr', - 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'mwl', 'my', 'mzn', 'nap', 'nds', - 'ne', 'nl', 'nn', 'no', 'nov', 'oc', 'or', 'os', 'pa', 'pdc', 'pl', 'pms', 'pnb', - 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'rw', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', - 'si', 'simple', 'sk', 'sl', 'so', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', - 'tg', 'th', 'tk', 'tl', 'tr', 'tt', 'ug', 'uk', 'ur', 'uz', 'vec', 'vep', 'vi', - 'vls', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zea', 'zh', 'zh-classical', - 'zh-min-nan', 'zh-yue'), (), ('train', 'test', 'validation')], - 'wikipedia': [('20200501.de', '20200501.en', '20200501.fr', '20200501.frr', '20200501.it'), - (), ('train', 'test', 'validation')], - 'opus_euconst': [('cs-da', 'cs-de', 'cs-el'), (), ('train', 'test', 'validation')], - 'math_qa': [(), (), ('train', 'test', 'validation')], - 'swag': [(), (), ('train', 'test', 'validation')], - 'pubmed_qa': [(), (), ()], - 'scientific_papers': [('pubmed', 'arxiv'), (), ('train', 'test', 'validation')], - 'banking77': [(), (), ('train', 'test')], - 'cbt': [('raw', 'V', 'P', 'CN', 'NE'), (), ('train', 'test', 'validation')], - 'commonsense_qa': [(), (), ('train', 'test', 'validation')], - 'yelp_polarity': [(), (), ('train', 'test')], - 'stsb_multi_mt': [('en', 'de', 'es', 'ft', 'it', 'nl', 'pl', 'pt', 'ru', 'zh'), - (), ('train', 'test', 'validation')], - 'bookcorpus': [(), (), ('train')], - 'squad_adversarial': [('AddSent', 'AddOneSent'), (), ()], - 'scan': [('addprim_jump', 'addprim_turn_left', 'filler_num0', 'filler_num1', 'filler_num2'), - (), ('train', 'test')], - 'financial_phrasebank': [('sentences_50agree', 'sentences_66agree', 'sentences_75agree', - 'sentences_allagree'), (), ()], - 'craffel/openai_lambada': [(), (), ('test')], - 'wiki_dpr': [('psgs_w100.multiset.compressed', 'psgs_w100.multiset.exact', - 'psgs_w100.multiset.no_index', 'psgs_w100.nq.compressed', 'psgs_w100.nq.exact'), - (), ('train')], - 'amazon_us_reviews': [('Apparel_v1_00', 'Automotive_v1_00', 'Baby_v1_00', 'Beauty_v1_00', - 'Books_v1_00'), (), ('train')], - 'math_dataset': [('algebra__linear_1d', 'algebra__linear_1d_composed', 'algebra__linear_2d', - 
'algebra__linear_2d_composed', 'algebra__polynomial_roots'), - (), ('train', 'test')], - 'tab_fact': [(), (), ()], - 'subjqa': [('tripadvisor', 'restaurants', 'movies', 'books', 'electronics', 'grocery'), - (), ('train', 'test', 'dev')], - 'OTHERS': [(), (), ()]}, - 'SUBSET': None, - 'MODEL_COL': None, - 'SPLIT_TRAIN': None, - 'SPLIT_TEST': None, - 'DATASET': '', - 'TOKENZIER': None, - 'WRAPPED_MODEL': None, - 'TRAINING_DATA': None, - 'TASK_TYPE': None, - 'ATTACK': None, - 'ATTACK_RECIPES': ('None', 'A2T (A2T: Attack for Adversarial Training Recipe)', - 'BAE (BAE: BERT-Based Adversarial Examples)', 'BERT-Attack', 'CheckList', 'CLARE Recipe', - 'DeepWordBug', 'Faster Alzantot Genetic Algorithm', 'Alzantot Genetic Algorithm', 'HotFlip', - 'Improved Genetic Algorithm', 'Input Reduction', 'Kuleshov2017', 'MORPHEUS2020', - 'Pruthi2019: Combating with Robust Word Recognition', 'Particle Swarm Optimization', 'PWWS', - 'Seq2Sick', 'TextBugger', 'TextFooler (Is BERT Really Robust?)'), - 'ATTACK_MODEL': None, - 'PRED_FILE': None, - 'FILE': 'Small File(s)', - 'MODE': 'CSV', - 'CSP': None, - 'DATA_COLUMN': None, - 'MODEL_PATH': None, - 'PRED_FILEPATH': None, - 'PRED_DATA': [pd.DataFrame, list], - 'PATH_EXIST': False, - 'PRED_SEQ': None, - 'PREDS': [] -} diff --git a/pyfiles/pages/document_term_matrix.py b/pyfiles/pages/document_term_matrix.py index fef75df..0cacd46 100644 --- a/pyfiles/pages/document_term_matrix.py +++ b/pyfiles/pages/document_term_matrix.py @@ -7,11 +7,9 @@ # -------------------------------------------------------------------------------------------------------------------- # # | IMPORT RELEVANT LIBRARIES | # # -------------------------------------------------------------------------------------------------------------------- # -import io import os import pathlib import platform - import pandas as pd import streamlit as st diff --git a/pyfiles/pages/load_clean_visualise.py b/pyfiles/pages/load_clean_visualise.py index 96b9a2b..8f8c2eb 100644 --- a/pyfiles/pages/load_clean_visualise.py +++ b/pyfiles/pages/load_clean_visualise.py @@ -9,9 +9,7 @@ # -------------------------------------------------------------------------------------------------------------------- # # | IMPORT RELEVANT LIBRARIES | # # -------------------------------------------------------------------------------------------------------------------- # -import pathlib import re -import nltk import numpy as np import pandas as pd import pycountry @@ -22,10 +20,9 @@ from streamlit_tags import st_tags from texthero import stopwords from collections import Counter -from texthero import preprocessing import plotly.express as px from utils import csp_downloaders -from utils.helper import readFile, lemmatizeText, downloadCorpora, printDataFrame, prettyDownload +from utils.helper import readFile, lemmatizeText, printDataFrame, prettyDownload from st_aggrid import AgGrid, DataReturnMode, GridUpdateMode, GridOptionsBuilder diff --git a/pyfiles/pages/model_trainer.py b/pyfiles/pages/model_trainer.py deleted file mode 100644 index fd9f42d..0000000 --- a/pyfiles/pages/model_trainer.py +++ /dev/null @@ -1,892 +0,0 @@ -""" -This module allows the user to train models and to predict NLP data -""" - -# -------------------------------------------------------------------------------------------------------------------- # -# | IMPORT RELEVANT LIBRARIES | # -# -------------------------------------------------------------------------------------------------------------------- # -import os -import pandas as pd -import streamlit as st -import 
textattack.models.wrappers -import torch -import subprocess -import transformers - -from streamlit_tags import st_tags -from datetime import datetime -from config import trainer -from utils import csp_downloaders -from utils.helper import readFile - - -# -------------------------------------------------------------------------------------------------------------------- # -# | MAIN APP FUNCTIONALITY | # -# -------------------------------------------------------------------------------------------------------------------- # -def app(): - """ - Main function that will be called when the app is run - """ - - st.markdown('# NLP Model Trainer and Predictor') - st.markdown('This function allows you to train and create a ML Model to classify the topic of the News Article ' - 'passed on to the dataset. This function requires the use of the PyTorch Library to train and ' - 'evaluate your model. Ensure that you have downloaded and installed the correct PyTorch library ' - 'corresponding to your CUDA version.') - - st.markdown('---') - col1, col2 = st.columns(2) - with col1: - st.markdown('### PyTorch for CUDA 10.2') - if st.button('Install Relevant Packages', key='10.2'): - os.system('pip install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102' - ' -f https://download.pytorch.org/whl/cu102/torch_stable.html') - with col2: - st.markdown('### PyTorch for CUDA 11.3') - if st.button('Install Relevant Packages', key='11.3'): - os.system('pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113' - ' -f https://download.pytorch.org/whl/cu113/torch_stable.html') - st.markdown('\n\n') - - if st.button('Check if GPU is properly installed'): - st.info(f'GPU Installation Status: **{torch.cuda.is_available()}**') - if st.button('Check GPU used'): - try: - st.info(f'GPU Device **{torch.cuda.get_device_name(torch.cuda.current_device())}** in use.') - except AssertionError: - st.error('Your version of PyTorch is CPU-optimised. Download and install any of the above two ' - 'supported GPU-enabled PyTorch versions to use your GPU and silence this error.') - except Exception as ex: - st.error(ex) - - st.markdown('---') - st.markdown('## Mode Selector') - trainer['MODEL_MODE'] = st.selectbox('Select the actions you want to perform', ('Training', 'Evaluation')) - - if trainer['MODEL_MODE'] == 'Training': - # FLAGS - st.markdown('## Options\n\n' - '### Transformers Selection') - trainer['TRANSFORMERS_SELECTION'] = st.selectbox('Choose Transformers Auto Model Class to Use', - options=trainer['TRANSFORMERS_CHOICES'], - help='Note that this selection is important as failure to ' - 'use the correct class will result in errors when ' - 'running the Training step.', - key='transformers') - - st.markdown('### Training Parameters') - trainer['API'] = st.checkbox('Use Training API?', - help='Note that with this option selected, you must ensure that your GPU has ' - 'sufficient GPU memory to run the networks/models you selected. 
If you ' - 'are unsure, it is better to use the Command Line Argument API to fine ' - 'tune the model parameters before starting the training.', - value=True) - - if trainer['API']: - trainer['TRAINING_PARAMS'] = st.multiselect('Select Training Parameters', - ('num_epochs', 'num_clean_epochs', 'attack_epoch_interval', - 'early_stopping_epochs', 'learning_rate', - 'num_warmup_steps', - 'weight_decay', 'per_device_train_batch_size', - 'per_device_eval_batch_size', - 'gradient_accumulation_steps', 'random_seed', 'parallel', - 'load_best_model_at_end', 'alpha', - 'num_train_adv_examples', 'query_budget_train', - 'attack_num_workers_per_device', 'output_dir', - 'checkpoint_interval_steps', 'checkpoint_interval_epochs', - 'save_last', 'log_to_tb', 'tb_log_dir', 'log_to_wandb', - 'wandb_project', 'logging_interval_step'), - default=('num_epochs', 'per_device_train_batch_size')) - else: - trainer['TRAINING_PARAMS'] = st.multiselect('Select Training Parameters', - ('attack', 'model_max_length', - 'model_num_labels', 'dataset_train_split', - 'dataset_eval_split', 'filter_train_by_labels', - 'filter_eval_by_labels', 'num_epochs', 'num_clean_epochs', - 'attack_epoch_interval', 'early_stopping_epochs', - 'learning_rate', 'num_warmup_steps', - 'weight_decay', 'per_device_train_batch_size', - 'per_device_eval_batch_size', - 'gradient_accumulation_steps', 'random_seed', 'parallel', - 'load_best_model_at_end', 'alpha', - 'num_train_adv_examples', 'query_budget_train', - 'attack_num_workers_per_device', 'output_dir', - 'checkpoint_interval_steps', 'checkpoint_interval_epochs', - 'save_last', 'log_to_tb', 'tb_log_dir', 'log_to_wandb', - 'wandb_project', 'logging_interval_step'), - default=('model_max_length', 'num_epochs', - 'per_device_train_batch_size', 'model_num_labels')) - - # DEFINE PARAMETERS - if 'attack' in trainer['TRAINING_PARAMS']: - trainer['attack'] = st.text_input('Attack string', key='attack') - else: - trainer['attack'] = None - - if 'model_max_length' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define Model Max Length'): - trainer['model_max_length'] = st.number_input('Model Max Length', - min_value=1, - max_value=1000000, - value=64, - key='model_max_length') - else: - trainer['model_max_length'] = None - - if 'model_num_labels' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define Number of Labels'): - trainer['model_num_labels'] = st.number_input('Number of Labels', - min_value=1, - max_value=1000000, - value=1, - key='model_num_labels') - else: - trainer['model_num_labels'] = None - - if 'filter_train_by_labels' in trainer['TRAINING_PARAMS']: - trainer['filter_train_by_labels'] = st_tags(label='Filter Train Data By Labels', - key='filter_train', - text='Press Enter to add in more labels...', - maxtags=9999999) - else: - trainer['filter_train_by_labels'] = None - - if 'filter_eval_by_labels' in trainer['TRAINING_PARAMS']: - trainer['filter_eval_by_labels'] = st_tags(label='Filter Train Data By Labels', - key='filter_test', - text='Press Enter to add in more labels...', - maxtags=9999999) - else: - trainer['filter_eval_by_labels'] = None - - if 'num_epochs' in trainer['TRAINING_PARAMS']: - trainer['num_epochs'] = st.number_input('Total number of epochs for training', - min_value=1, - max_value=1000000, - value=3, - key='num_epochs') - else: - if trainer['API']: - trainer['num_epochs'] = 3 - else: - trainer['num_epochs'] = None - - if 'num_clean_epochs' in trainer['TRAINING_PARAMS']: - trainer['num_clean_epochs'] = st.number_input('Number of epochs to train on just the original ' - 
'training dataset before adversarial training', - min_value=1, - max_value=1000000, - value=1, - key='num_clean_epochs') - else: - if trainer['API']: - trainer['num_clean_epochs'] = 1 - else: - trainer['num_clean_epochs'] = None - - if 'attack_epoch_interval' in trainer['TRAINING_PARAMS']: - trainer['attack_epoch_interval'] = st.number_input('Generate a new adversarial training set every ' - 'N epochs', - min_value=1, - max_value=1000000, - value=1, - key='attack_epoch_interval') - else: - if trainer['API']: - trainer['attack_epoch_interval'] = 1 - else: - trainer['attack_epoch_interval'] = None - - if 'early_stopping_epochs' in trainer['TRAINING_PARAMS']: - trainer['early_stopping_epochs'] = st.number_input('Number of epochs validation must increase ' - 'before stopping early', - min_value=1, - max_value=1000000, - value=1, - key='early_stopping_epochs') - else: - trainer['early_stopping_epochs'] = None - - if 'learning_rate' in trainer['TRAINING_PARAMS']: - trainer['learning_rate'] = st.number_input('Number of epochs validation must increase before ' - 'stopping early', - min_value=0., - max_value=1., - value=5e-5, - step=0.000001, - format='%.6f', - key='learning_rate') - else: - if trainer['API']: - trainer['learning_rate'] = 5e-5 - else: - trainer['learning_rate'] = None - - if 'num_warmup_steps' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define in float?'): - trainer['num_warmup_steps'] = st.number_input('The number of steps for the warmup phase of ' - 'linear scheduler', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='num_warmup_steps') - else: - trainer['num_warmup_steps'] = st.number_input('The number of steps for the warmup phase of ' - 'linear scheduler', - min_value=1, - max_value=1000000, - value=500, - key='num_warmup_steps') - else: - if trainer['API']: - trainer['num_warmup_steps'] = 500 - else: - trainer['num_warmup_steps'] = None - - if 'weight_decay' in trainer['TRAINING_PARAMS']: - trainer['weight_decay'] = st.number_input('Weight decay (L2 penalty)', - min_value=0., - max_value=1., - value=0.01, - step=0.01, - format='%.2f', - key='weight_decay') - else: - if trainer['API']: - trainer['weight_decay'] = 0.01 - else: - trainer['weight_decay'] = None - - if 'per_device_train_batch_size' in trainer['TRAINING_PARAMS']: - trainer['per_device_train_batch_size'] = st.number_input('The batch size per GPU/CPU for training', - min_value=1, - max_value=1000000, - value=8, - key='per_device_train_batch_size') - else: - if trainer['API']: - trainer['per_device_train_batch_size'] = 8 - else: - trainer['per_device_train_batch_size'] = None - - if 'per_device_eval_batch_size' in trainer['TRAINING_PARAMS']: - trainer['per_device_eval_batch_size'] = st.number_input('The batch size per GPU/CPU for evaluation', - min_value=1, - max_value=1000000, - value=32, - key='per_device_eval_batch_size') - else: - if trainer['API']: - trainer['per_device_eval_batch_size'] = 32 - else: - trainer['per_device_eval_batch_size'] = None - - if 'gradient_accumulation_steps' in trainer['TRAINING_PARAMS']: - trainer['gradient_accumulation_steps'] = st.number_input('Number of updates steps to accumulate ' - 'the gradients before performing a ' - 'backward/update pass', - min_value=1, - max_value=1000000, - value=32, - key='gradient_accumulation_steps') - else: - if trainer['API']: - trainer['gradient_accumulation_steps'] = 1 - else: - trainer['gradient_accumulation_steps'] = None - - if 'random_seed' in trainer['TRAINING_PARAMS']: - trainer['random_seed'] = 
st.number_input('Random seed for reproducibility', - min_value=1, - max_value=1000000, - value=786, - key='random_seed') - else: - if trainer['API']: - trainer['random_seed'] = 786 - else: - trainer['random_seed'] = None - - if 'parallel' in trainer['TRAINING_PARAMS']: - trainer['parallel'] = st.checkbox('Use Multiple GPUs using torch.DataParallel class?', - value=False, - key='parallel') - else: - if trainer['API']: - trainer['parallel'] = False - else: - trainer['parallel'] = None - - if 'load_best_model_at_end' in trainer['TRAINING_PARAMS']: - trainer['load_best_model_at_end'] = st.checkbox('keep track of the best model across training and ' - 'load it at the end', - value=False, - key='parallel') - else: - trainer['load_best_model_at_end'] = False - - if 'alpha' in trainer['TRAINING_PARAMS']: - trainer['alpha'] = st.number_input('The weight for adversarial loss', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='alpha') - else: - if trainer['API']: - trainer['alpha'] = 1.0 - else: - trainer['alpha'] = None - - if 'num_train_adv_examples' in trainer['TRAINING_PARAMS']: - if st.checkbox('Use Float Parameters?'): - trainer['num_train_adv_examples'] = st.number_input('The number of samples to successfully ' - 'attack when generating adversarial ' - 'training set before start of every epoch', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='num_train_adv_examples') - else: - trainer['num_train_adv_examples'] = st.number_input('The number of samples to successfully ' - 'attack when generating adversarial ' - 'training set before start of every epoch', - min_value=1, - max_value=1000000, - value=8, - key='per_device_train_batch_size') - else: - if trainer['API']: - trainer['num_train_adv_examples'] = -1 - else: - trainer['num_train_adv_examples'] = None - - if 'query_budget_train' in trainer['TRAINING_PARAMS']: - if st.checkbox('Set Max Query Budget?', value=False): - trainer['query_budget_train'] = st.number_input('The max query budget to use when generating ' - 'adversarial training set', - min_value=1, - max_value=1000000, - value=1, - key='query_budget_train') - else: - trainer['query_budget_train'] = None - - if 'attack_num_workers_per_device' in trainer['TRAINING_PARAMS']: - if st.checkbox('Set Number of Worker Process to run attack?', value=False): - trainer['attack_num_workers_per_device'] = st.number_input('Number of worker processes to run ' - 'per device for attack', - min_value=1, - max_value=1000000, - value=1, - key='attack_num_workers_per_device') - else: - if trainer['API']: - trainer['attack_num_workers_per_device'] = 1 - else: - trainer['attack_num_workers_per_device'] = None - - if 'output_dir' in trainer['TRAINING_PARAMS']: - dt = datetime.now() - trainer['output_dir'] = st.text_input('Directory to output training logs and checkpoints', - value=f'/outputs/{dt.strftime("%Y-%m-%d-%H-%M-%S-%f")}', - key='output_dir') - else: - trainer['output_dir'] = f'/outputs/{datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")}' - - if 'checkpoint_interval_steps' in trainer['TRAINING_PARAMS']: - if st.checkbox('Save Model Checkpoint after every N updates?'): - trainer['checkpoint_interval_steps'] = st.number_input('Save after N updates', - min_value=1, - max_value=1000000, - value=1, - key='checkpoint_interval_steps') - else: - trainer['checkpoint_interval_steps'] = None - - if 'checkpoint_interval_epochs' in trainer['TRAINING_PARAMS']: - if st.checkbox('Save Model Checkpoint after every N epochs?'): - 
trainer['checkpoint_interval_epochs'] = st.number_input('Save after N epochs', - min_value=1, - max_value=1000000, - value=1, - key='checkpoint_interval_epochs') - else: - trainer['checkpoint_interval_epochs'] = None - - if 'save_last' in trainer['TRAINING_PARAMS']: - trainer['save_last'] = st.checkbox('Save the model at end of training', - value=True, - key='save_last') - else: - if trainer['API']: - trainer['save_last'] = True - else: - trainer['save_last'] = None - - if 'log_to_tb' in trainer['TRAINING_PARAMS']: - trainer['log_to_tb'] = st.checkbox('Log to Tensorboard', - value=False, - key='log_to_tb') - else: - if trainer['API']: - trainer['log_to_tb'] = False - else: - trainer['log_to_tb'] = None - - if 'tb_log_dir' in trainer['TRAINING_PARAMS']: - trainer['tb_log_dir'] = st.text_input('Directory to output training logs and checkpoints', - value=r'./runs', - key='tb_log_dir') - else: - trainer['tb_log_dir'] = r'./runs' - - if 'log_to_wandb' in trainer['TRAINING_PARAMS']: - trainer['log_to_wandb'] = st.checkbox('Log to Wandb', - value=False, - key='log_to_wandb') - else: - if trainer['API']: - trainer['log_to_wandb'] = False - else: - trainer['log_to_wandb'] = None - - if 'wandb_project' in trainer['TRAINING_PARAMS']: - trainer['wandb_project'] = st.text_input('Name of Wandb project for logging', - value=r'textattack', - key='wandb_project') - else: - if trainer['API']: - trainer['wandb_project'] = 'textattack' - else: - trainer['wandb_project'] = None - - if 'logging_interval_step' in trainer['TRAINING_PARAMS']: - trainer['logging_interval_step'] = st.number_input('Log to Tensorboard/Wandb every N training ' - 'steps', - min_value=1, - max_value=1000000, - value=1, - key='logging_interval_step') - else: - if trainer['API']: - trainer['logging_interval_step'] = 1 - else: - trainer['logging_interval_step'] = None - - if st.checkbox('Attack Model with confusion datasets?', value=False): - trainer['ATTACK'] = st.selectbox('Choose Attack recipes to execute on Model', - trainer['ATTACK_RECIPES']) - if trainer['ATTACK'] == 'None': - trainer['ATTACK_MODEL'] = None - - st.markdown('### Model and Data Selection') - col, col_ = st.columns(2) - trainer['MODEL'] = col.selectbox('Choose Model to Use', - trainer['ML_POSSIBLE_PICKS'], - key='mdl') - trainer['DATASET'] = col.selectbox('Choose Dataset to Use', - trainer['DATASET_POSSIBLE_PICKS'], - help='Due to the sheer number of datasets availble on HuggingFace, ' - 'we have only provided the top 100 datasets on the website.', - key='datasets') - - trainer['TASK_TYPE'] = col.selectbox('Choose Task for Model to Complete', ('classification', 'regression')) - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][0]) != 0: - trainer['SUBSET'] = col_.selectbox('Select Subset of Data to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][0]) - else: - trainer['SUBSET'] = None - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][1]) != 0: - trainer['MODEL_COL'] = col_.selectbox('Select Data Columns to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][1], - key='column_dat') - else: - trainer['MODEL_COL'] = None - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2]) > 0: - trainer['SPLIT_TRAIN'] = col_.selectbox('Select Training Split to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2], - key='train') - trainer['SPLIT_TEST'] = col_.selectbox('Select Testing Split to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2], - key='test') - if trainer['SPLIT_TRAIN'] == trainer['SPLIT_TEST']: - st.warning('**Warning**: Your 
Training and Testing Dataset should not be the same. Ensure that ' - 'you have selected the right dataset to use for your model.') - else: - st.warning('**Warning:** This dataset does not have data split properly. You may wish to use another ' - 'dataset or to edit the dataset before passing it into the model for training.') - trainer['SPLIT_TRAIN'] = None - trainer['SPLIT_TEST'] = None - - with st.expander('Dataset Explorer'): - st.markdown('### Dataset Explorer\n' - 'Use the above flags to define the Dataset to download and explore.') - st.info(f'**Current Dataset Chosen**: {trainer["DATASET"]}') - if st.button(f'Explore {trainer["DATASET"]}'): - train = textattack.datasets.HuggingFaceDataset(name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TRAIN']) - test = textattack.datasets.HuggingFaceDataset(name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TEST']) - st.markdown(f'### Training Data\n\n' - f'**First Entry**: {train[0]}\n\n' - f'**Last Entry**: {train[-1]}\n\n' - f'**Length of Dataset**: {len(train)}') - st.markdown(f'### Testing Data\n\n' - f'**First Entry**: {test[0]}\n\n' - f'**Last Entry**: {test[-1]}\n\n' - f'**Length of Dataset**: {len(test)}') - - st.markdown('## Begin Training\n\n' - 'Kindly ensure that the models you have chosen above is compatible with the dataset. Failure to ' - 'do so will result in errors.') - if st.button('Proceed'): - if trainer['API']: - # transformers model selector - st.info(f'Loading {trainer["TRANSFORMERS_SELECTION"]} Class...') - if trainer['TRANSFORMERS_SELECTION'] == 'Pre Training': - trainer['ML_MODEL'] = transformers.AutoModelForPreTraining.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'CausalLM': - trainer['ML_MODEL'] = transformers.AutoModelForCausalLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'MaskedLM': - trainer['ML_MODEL'] = transformers.AutoModelForMaskedLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'Seq2SeqLM': - trainer['ML_MODEL'] = transformers.AutoModelForSeq2SeqLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'SequenceClassification': - trainer['ML_MODEL'] = transformers.AutoModelForSequenceClassification.from_pretrained( - trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'MultipleChoice': - trainer['ML_MODEL'] = transformers.AutoModelForMultipleChoice.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'NextSentencePrediction': - trainer['ML_MODEL'] = transformers.AutoModelForNextSentencePrediction.from_pretrained( - trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'TokenClassificaition': - trainer['ML_MODEL'] = transformers.AutoModelForTokenClassification.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'QuestionAnswering': - trainer['ML_MODEL'] = transformers.AutoModelForQuestionAnswering.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'TableQuestionAnswering': - trainer['ML_MODEL'] = transformers.AutoModelForTableQuestionAnswering.from_pretrained( - trainer['MODEL']) - - trainer['TOKENIZER'] = transformers.AutoTokenizer.from_pretrained(trainer['MODEL']) - trainer['WRAPPED_MODEL'] = textattack.models.wrappers.HuggingFaceModelWrapper(trainer['ML_MODEL'], - trainer['TOKENIZER']) - trainer['TRAINING_DATA'] = 
textattack.datasets.HuggingFaceDataset( - name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TRAIN'] - ) - trainer['EVAL_DATA'] = textattack.datasets.HuggingFaceDataset( - name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TEST'] - ) - - if trainer['ATTACK'] != 'None': - if trainer['ATTACK'] == 'A2T (A2T: Attack for Adversarial Training Recipe)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.A2TYoo2021.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'BAE (BAE: BERT-Based Adversarial Examples)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.BAEGarg2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'BERT-Attack': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.BERTAttackLi2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'CheckList': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.CheckList2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'CLARE Recipe': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.CLARE2020.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'DeepWordBug': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.DeepWordBugGao2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Faster Alzantot Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.FasterGeneticAlgorithmJia2019. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Alzantot Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.GeneticAlgorithmAlzantot2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'HotFlip': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.HotFlipEbrahimi2017. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Improved Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.IGAWang2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Input Reduction': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.InputReductionFeng2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Kuleshov2017': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Kuleshov2017.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'MORPHEUS2020': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.MorpheusTan2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Pruthi2019: Combating with Robust Word Recognition': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Pruthi2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Particle Swarm Optimization': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.PSOZang2020.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'PWWS': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.PWWSRen2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Seq2Sick': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Seq2SickCheng2018BlackBox. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'TextBugger': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.TextBuggerLi2018.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'TextFooler (Is BERT Really Robust?)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.TextFoolerJin2019. 
\ - build(trainer['WRAPPED_MODEL']) - - trainer['TRAINING_ARGS'] = textattack.TrainingArgs( - num_epochs=trainer['num_epochs'], - num_clean_epochs=trainer['num_clean_epochs'], - attack_epoch_interval=trainer['attack_epoch_interval'], - early_stopping_epochs=trainer['early_stopping_epochs'], - learning_rate=trainer['learning_rate'], - num_warmup_steps=trainer['num_warmup_steps'], - weight_decay=trainer['weight_decay'], - per_device_train_batch_size=trainer['per_device_train_batch_size'], - per_device_eval_batch_size=trainer['per_device_eval_batch_size'], - gradient_accumulation_steps=trainer['gradient_accumulation_steps'], - random_seed=trainer['random_seed'], - parallel=trainer['parallel'], - load_best_model_at_end=trainer['load_best_model_at_end'], - alpha=trainer['alpha'], - num_train_adv_examples=trainer['num_train_adv_examples'], - query_budget_train=trainer['query_budget_train'], - attack_num_workers_per_device=trainer['attack_num_workers_per_device'], - output_dir=trainer['output_dir'], - checkpoint_interval_steps=trainer['checkpoint_interval_steps'], - checkpoint_interval_epochs=trainer['checkpoint_interval_epochs'], - save_last=trainer['save_last'], - log_to_tb=trainer['log_to_tb'], - tb_log_dir=trainer['tb_log_dir'], - log_to_wandb=trainer['log_to_wandb'], - wandb_project=trainer['wandb_project'], - logging_interval_step=trainer['logging_interval_step'] - ) - trainer['TRAINER'] = textattack.Trainer( - model_wrapper=trainer['WRAPPED_MODEL'], - task_type=trainer['TASK_TYPE'], - attack=trainer['ATTACK_MODEL'], - train_dataset=trainer['TRAINING_DATA'], - eval_dataset=trainer['EVAL_DATA'], - training_args=trainer['TRAINING_ARGS'] - ) - - with st.spinner('Training Model... Refer to your Terminal for more information...'): - try: - trainer['TRAINER'].train() - except Exception as ex: - st.error(ex) - else: - st.success(f'Successfully trained model! Model saved in {os.getcwd()}{trainer["output_dir"]}.') - - else: - with st.spinner('Training Model... 
Refer to your Terminal for more information...'): - var_list = ['textattack', 'train'] - maps = { - 'model_name_or_path': ['--model-name-or-path', trainer['MODEL']], - 'dataset': ['--dataset', trainer['DATASET']], - 'attack': ['--attack', trainer['attack']], - 'task_type': ['--task-type', trainer['TASK_TYPE']], - 'model_max_length': ['--model-max-length', trainer['model_max_length']], - 'model_num_labels': ['--model-num-labels', trainer['model_num_labels']], - 'dataset_train_split': ['--dataset-train-split', trainer['dataset_train_split']], - 'dataset_eval_split': ['--dataset-eval-split', trainer['dataset_eval_split']], - 'filter_train_by_labels': ['--filter-train-by-labels', trainer['filter_train_by_labels']], - 'filter_eval_by_labels': ['--filter-eval-by-labels', trainer['filter_eval_by_labels']], - 'num_epochs': ['--num-epochs', trainer['num_epochs']], - 'num_clean_epochs': ['--num-clean-epochs', trainer['num_clean_epochs']], - 'attack_epoch_interval': ['--attack-epoch-interval', trainer['attack_epoch_interval']], - 'early_stopping_epochs': ['--early-stopping-epochs', trainer['early_stopping_epochs']], - 'learning_rate': ['--learning-rate', trainer['learning_rate']], - 'num_warmup_steps': ['--num-warmup-steps', trainer['num_warmup_steps']], - 'weight_decay': ['--weight-decay', trainer['weight_decay']], - 'per_device_train_batch_size': ['--per-device-train-batch-size', - trainer['per_device_train_batch_size']], - 'per_device_eval_batch_size': ['--per-device-eval-batch-size', - trainer['per_device_eval_batch_size']], - 'gradient_accumulation_steps': ['--gradient-accumulation-steps', - trainer['gradient_accumulation_steps']], - 'random_seed': ['--random-seed', trainer['random_seed']], - 'parallel': ['--parallel', trainer['parallel']], - 'load_best_model_at_end': ['--load-best-model-at-end', trainer['load_best_model_at_end']], - 'alpha': ['--alpha', trainer['alpha']], - 'num_train_adv_examples': ['--num-train-adv-examples', trainer['num_train_adv_examples']], - 'query_budget_train': ['--query-budget-train', trainer['query_budget_train']], - 'attack_num_workers_per_device': ['--attack-num-workers-per-device', - trainer['attack_num_workers_per_device']], - 'output_dir': ['--output-dir', trainer['output_dir']], - 'checkpoint_interval_steps': ['--checkpoint-interval-steps', - trainer['checkpoint_interval_steps']], - 'checkpoint_interval_epochs': ['--checkpoint-interval-epochs', - trainer['checkpoint_interval_epochs']], - 'save_last': ['--save-last', trainer['save_last']], - 'log_to_tb': ['--log-to-tb', trainer['log_to_tb']], - 'tb_log_dir': ['--tb-log-dir', trainer['tb_log_dir']], - 'log_to_wandb': ['--log-to-wandb', trainer['log_to_wandb']], - 'wandb_project': ['--wandb-project', trainer['wandb_project']], - 'logging_interval_step': ['--logging-interval-step', - trainer['logging_interval_step']] - } - - # only include variables that are defined - maps = {key: value for key, value in maps.items() if value[1] is not None} - for k, v in maps.items(): - var_list.extend(v) - - var_list = [str(iter_) for iter_ in var_list if type(iter_) is not bool] - print(var_list) - - # run the command - st.markdown('### Outputs') - try: - results = subprocess.run(var_list, capture_output=True) - except Exception as ex: - st.error(ex) - else: - st.markdown('#### Outputs') - try: - results.check_returncode() - st.write(results.stdout) - except subprocess.CalledProcessError: - st.error('Error: Command cannot be executed. 
Try again.') - finally: - if os.path.exist(os.path.join(os.getcwd(), trainer['output_dir'])): - st.success(f'Successfully trained model! Model saved in {os.getcwd()}' - f'{trainer["output_dir"]}.') - else: - st.error('Error: Model is not saved.') - - elif trainer['MODEL_MODE'] == 'Evaluation': - st.markdown('## Options') - trainer['SAVE'] = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name ' - 'and content being downloaded again, a unique file ' - 'identifier is tacked onto the filename.') - trainer['VERBOSE'] = st.checkbox('Display Outputs?') - - if trainer['VERBOSE']: - trainer['VERBOSITY'] = st.slider('Data points', - key='Data points to display?', - min_value=0, - max_value=1000, - value=20, - help='Select 0 to display all Data Points') - trainer['ADVANCED_ANALYSIS'] = st.checkbox('Display Advanced DataFrame Statistics?', - help='This option will analyse your DataFrame and display ' - 'advanced statistics on it. Note that this will require ' - 'some time and processing power to complete. Deselect this ' - 'option if this if you do not require it.') - - st.markdown('## Upload Prediction Data and Model\n') - st.markdown('### Prediction Data') - col3, col3_ = st.columns(2) - - trainer['FILE'] = col3.selectbox('Select the Size of File to Load', ('Local', 'Online'), - help='Choose "Local" if you wish to upload a file from your machine or choose ' - '"Online" if you wish to pull a file from any one of the supported Cloud ' - 'Service Providers.') - trainer['MODE'] = col3_.selectbox('Define the Data Input Format', ('CSV', 'XLSX', 'PKL', 'JSON', 'HDF5')) - - if trainer['FILE'] == 'Local': - trainer['PRED_FILEPATH'] = st.file_uploader(f'Load {trainer["MODE"]} File', type=[trainer['MODE']]) - if trainer['PRED_FILEPATH'] is not None: - trainer['PRED_DATA'] = readFile(trainer['PRED_FILEPATH'], trainer['MODE']) - if not trainer['PRED_DATA'].empty: - trainer['PRED_DATA'] = trainer['PRED_DATA'].astype(str) - trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored', - list(trainer['PRED_DATA'].columns)) - st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!') - else: - trainer['PRED_DATA'] = pd.DataFrame() - - elif trainer['FILE'] == 'Online': - st.info(f'File Format Selected: **{trainer["MODE"]}**') - trainer['CSP'] = st.selectbox('CSP', ('Select a CSP', 'Azure', 'Amazon', 'Google')) - - if trainer['CSP'] == 'Azure': - azure = csp_downloaders.AzureDownloader() - if azure.SUCCESSFUL: - try: - azure.downloadBlob() - trainer['PRED_DATA'] = readFile(azure.AZURE_DOWNLOAD_PATH, trainer['MODE']) - except Exception as ex: - st.error(f'Error: {ex}. Try again.') - - if not trainer['PRED_DATA'].empty: - trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored', - list(trainer['PRED_DATA'].columns)) - st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!') - - elif trainer['CSP'] == 'Amazon': - aws = csp_downloaders.AWSDownloader() - if aws.SUCCESSFUL: - try: - aws.downloadFile() - trainer['PRED_DATA'] = readFile(aws.AWS_FILE_NAME, trainer['MODE']) - except Exception as ex: - st.error(f'Error: {ex}. 
Try again.') - - if not trainer['PRED_DATA'].empty: - trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored', - list(trainer['PRED_DATA'].columns)) - st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!') - - elif trainer['CSP'] == 'Google': - gcs = csp_downloaders.GoogleDownloader() - if gcs.SUCCESSFUL: - try: - gcs.downloadBlob() - trainer['PRED_DATA'] = readFile(gcs.GOOGLE_DESTINATION_FILE_NAME, trainer['MODE']) - except Exception as ex: - st.error(f'Error: {ex}. Try again.') - - if not trainer['PRED_DATA'].empty: - trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored', - list(trainer['PRED_DATA'].columns)) - st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!') - - st.markdown('### Model\n' - 'Due to the tendency for model files to be larger than the 200 MB limit of the File Uploader ' - 'Widget, you will need to provide a path to the model. The following text input widget will ' - 'display the current working directory where this app is launched from.') - trainer['MODEL_PATH'] = st.text_input('Key in the path to the model below', - value=os.getcwd(), - key='model_path') - if os.path.exists(trainer['MODEL_PATH']): - st.success(f'File Path {trainer["MODEL_PATH"]} exists!') - trainer['PATH_EXIST'] = True - else: - st.error(f'Error: {trainer["MODEL_PATH"]} is invalid!') - trainer['PATH_EXIST'] = False - - # PREDICTIONS - st.markdown('## Prediction') - st.markdown('Ensure that all your data is properly loaded before proceeding.') - - if st.button('Proceed?'): - if trainer['PATH_EXIST']: - trainer['PRED_DATA'] = trainer['PRED_DATA'][[trainer['DATA_COLUMN']]] - trainer['PRED_DATA'] = trainer['PRED_DATA'].to_list() - - try: - trainer['ML_MODEL'] = torch.load('MODEL_PATH') - predictions = trainer['ML_MODEL'](trainer['PRED_DATA']) - except Exception as ex: - st.error(ex) - else: - st.markdown('### Predicted Data') - st.write(predictions) - else: - st.error('Error: Model File Path is not valid. 
Try again.') diff --git a/pyfiles/pages/toolkit_nlp.py b/pyfiles/pages/toolkit_nlp.py index 519e571..cb2e387 100644 --- a/pyfiles/pages/toolkit_nlp.py +++ b/pyfiles/pages/toolkit_nlp.py @@ -20,14 +20,11 @@ import pyLDAvis.gensim_models import pyLDAvis.sklearn import streamlit.components.v1 -import textattack.models.wrappers import torch -import tensorflow as tf import matplotlib.pyplot as plt import transformers from streamlit_tags import st_tags -from datetime import datetime from config import toolkit from operator import itemgetter from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, AutoModelForSequenceClassification diff --git a/requirements.txt b/requirements.txt index 6fca680..59442fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,17 +27,8 @@ transformers~=4.10.2 pytorch-lightning==1.5.4 pathlib~=1.0.1 pyyaml~=5.4.1 -tensorflow~=2.7.0 -mlxtend~=0.19.0 matplotlib~=3.4.3 -textattack~=0.3.4 datetime~=4.3 scikit-learn~=0.24.2 -tensorflow-text~=2.7.3 -fastapi~=0.70.1 -uvicorn~=0.16.0 -aiofiles~=0.8.0 -python-multipart~=0.0.5 +pillow~=9.0.0 streamlit-tags~=1.2.8 -pillow~=8.3.2 -cython~=0.29.26 diff --git a/utils/helper.py b/utils/helper.py index 8926461..cd7d583 100644 --- a/utils/helper.py +++ b/utils/helper.py @@ -6,7 +6,6 @@ # | IMPORT RELEVANT LIBRARIES | # # -------------------------------------------------------------------------------------------------------------------- # import io -import logging import os import typing import nltk @@ -21,7 +20,6 @@ import pickle import uuid import re -import urllib.parse from collections import Counter from heapq import nlargest @@ -29,7 +27,6 @@ from PIL import Image from nltk.stem import WordNetLemmatizer from streamlit_pandas_profiling import st_profile_report -from config import toolkit # -------------------------------------------------------------------------------------------------------------------- # # | DOWNLOAD DEPENDENCIES | #
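For reference, the removed `pyfiles/pages/model_trainer.py` drove TextAttack in two ways: through its Python API (`textattack.TrainingArgs` and `textattack.Trainer`), or by assembling a `textattack train` command-line invocation and running it with `subprocess.run`. The sketch below shows roughly how the same fine-tuning could still be run outside the app after this removal. It is a minimal sketch only: it assumes `textattack` and `transformers` are installed in a separate environment (they are no longer in `requirements.txt`), and the model name, dataset name, and hyperparameter values are placeholders rather than values taken from this repository.

```python
# Minimal sketch of fine-tuning with the TextAttack API, mirroring the
# TrainingArgs/Trainer calls in the removed module. Model, dataset and
# hyperparameter values below are placeholders, not project defaults.
import textattack
import transformers
from textattack.datasets import HuggingFaceDataset
from textattack.models.wrappers import HuggingFaceModelWrapper

# Load a Hugging Face classifier and wrap it for TextAttack.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2)
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
wrapped_model = HuggingFaceModelWrapper(model, tokenizer)

# Any Hugging Face dataset with text/label columns works here.
train_dataset = HuggingFaceDataset("rotten_tomatoes", split="train")
eval_dataset = HuggingFaceDataset("rotten_tomatoes", split="test")

training_args = textattack.TrainingArgs(
    num_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    output_dir="./trained_model",  # the removed page reported this path after training
)

trainer = textattack.Trainer(
    model_wrapper=wrapped_model,
    task_type="classification",
    attack=None,                   # None = plain (non-adversarial) training
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    training_args=training_args,
)
trainer.train()
```

Passing `attack=None` follows the plain training path; the removed page additionally exposed the full set of `TrainingArgs` fields visible in the deleted code above (checkpointing, TensorBoard/W&B logging, adversarial example budgets, and so on), which can be added to the sketch as needed.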
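The removed Evaluation mode attempted inference with `torch.load('MODEL_PATH')`, i.e. the literal string rather than the user-supplied path, so prediction could not have succeeded as written. A minimal sketch of offline inference is shown below; it assumes the training run left a standard Hugging Face-format checkpoint (config, weights, and tokenizer) in its output directory, and the directory path and example text are placeholders.

```python
# Minimal sketch of offline inference on a fine-tuned checkpoint.
# Assumption: "./trained_model" is a Hugging Face-format directory saved by
# the training run above; replace it with your actual output path.
import transformers

model_dir = "./trained_model"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)

classifier = transformers.pipeline(
    "text-classification", model=model, tokenizer=tokenizer)
print(classifier(["An example sentence to classify."]))
```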