diff --git a/.gitignore b/.gitignore
index 1f7ac75..78ddfc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ __pycache__/
 *.py[cod]
 *$py.class
 *.pyc
+.DS_Store
 
 # C extensions
 *.so
@@ -27,6 +28,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+futures/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
diff --git a/README.md b/README.md
index 3e00126..9620c37 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,9 @@ created pre-made Docker images hosted on Github Packages for you to use. To do s
 system and run the following commands on Terminal or Powershell:
 
 ```shell
-docker pull docker pull ghcr.io/asdfghjkxd/app:2.2
+docker pull ghcr.io/asdfghjkxd/app:main
 
-docker run -it -p 5000:8501 --name news ghcr.io/asdfghjkxd/app:2.2
+docker run -it -p 5000:8501 --name news ghcr.io/asdfghjkxd/app:main
 ```
 
 The created Docker Container can then be accessed through `localhost` on Port `5000`!
@@ -35,7 +35,7 @@ The created Docker Container can then be accessed through `5
 If Command Lines are not your thing, you can do the same using the Docker Desktop GUI! Just follow the steps below to
 set up the Container:
 
-- Open up Terminal or Powershell and key in the command `docker pull ghcr.io/asdfghjkxd/app:2.2` word for word (we
+- Open up Terminal or Powershell and key in the command `docker pull ghcr.io/asdfghjkxd/app:main` word for word (we
   promise this is the only Command Line step in the entire process!)
 - Click on the _Images_ tab on the sidebar and find the image you have pulled in the above step
 - Click on the _Run_ button
@@ -81,8 +81,3 @@ following tasks on your dataset:
 - Named Entity Recognition
 - Part of Speech Tagging
 - Summary
-
-
-### NLP Model Trainer
-This module will allow you to train NLP models you can use for your NLP tasks. This module requires you to have a
-compatible GPU (NVIDIA GPUs) to run inference/classification tasks.
diff --git a/api/endpoints/__init__.py b/api/endpoints/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/dtm/__init__.py b/api/endpoints/dtm/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/dtm/dtm.py b/api/endpoints/dtm/dtm.py deleted file mode 100644 index bb4768c..0000000 --- a/api/endpoints/dtm/dtm.py +++ /dev/null @@ -1,66 +0,0 @@ -import pandas as pd - -from typing import Union -from io import StringIO -from fastapi import APIRouter, HTTPException, File, UploadFile -from fastapi.encoders import jsonable_encoder -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import CountVectorizer - -router = APIRouter(prefix='/endpoints', - tags=['dtm'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/dtm') -async def dtm(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data') -> dict: - """ - This function takes in CSV data that is compatible with a pandas DataFrame, creates a Document-Term Matrix and - returns it to the user in JSON format - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - counter_object = CountVectorizer(stop_words=stopwords.words('english')) - word_string = ' '.join(raw_data[data_column]) - - dict_data = { - 'text': word_string - } - - series_data = pd.DataFrame(data=dict_data, index=[0]) - series_data = counter_object.fit_transform(series_data.text) - dtm_ = pd.DataFrame(series_data.toarray(), - columns=counter_object.get_feature_names(), - index=[0]) - - if not dtm_.empty: - dtm_copy = dtm_.copy().transpose() - dtm_copy.columns = ['Word Frequency'] - dtm_copy.sort_values(by=['Word Frequency'], ascending=False, inplace=True) - data = { - 'dtm': dtm_copy.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Document-Term Matrix was not properly prepared. 
Try ' - 'again.') diff --git a/api/endpoints/lca/__init__.py b/api/endpoints/lca/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/lca/clean.py b/api/endpoints/lca/clean.py deleted file mode 100644 index d88381f..0000000 --- a/api/endpoints/lca/clean.py +++ /dev/null @@ -1,279 +0,0 @@ -import pathlib -import nltk -import numpy as np -import pandas as pd -import texthero as hero -import fastapi - -from texthero import preprocessing -from io import StringIO -from typing import Union -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile -from texthero import stopwords -from nltk.stem import WordNetLemmatizer - -# define constants -lemmatizer = WordNetLemmatizer() -SIMPLE_PIPELINE = [ - preprocessing.remove_html_tags, - preprocessing.remove_diacritics, - preprocessing.remove_whitespace, - preprocessing.remove_urls, - preprocessing.drop_no_content - ] -PIPELINE = [ - preprocessing.fillna, - preprocessing.lowercase, - preprocessing.remove_punctuation, - preprocessing.remove_html_tags, - preprocessing.remove_diacritics, - preprocessing.remove_whitespace, - preprocessing.remove_urls, - preprocessing.drop_no_content - ] - - -def lemmatizeText(text): - """ - This function iterates through the pandas dataframe and lemmatizes the words - - Parameters - ---------- - :param text: Text to lemmatize (string) - ---------- - """ - return [lemmatizer.lemmatize(word) for word in text] - - -# API router -router = APIRouter(prefix='/endpoints/lca/clean', - tags=['clean'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/no-clean') -async def no_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data'): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes it back into ASCII - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if not raw_data.empty: - raw_data[data_column] = raw_data[data_column].str.encode('ascii', 'ignore') \ - .str.decode('ascii') - raw_data = pd.DataFrame(data=raw_data) - raw_data = raw_data.dropna() - data = { - 'original': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Data is not properly loaded. 
Try again.') - - -@router.post('/simple-clean') -async def simple_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', tokenize: bool = True): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes back into ASCII, and finally apply the 'Simple' Cleaning Pipeline and tokenizing the data (if the flag is - set to True) - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **tokenize**: Flag to determine whether to tokenize the data and to return it - """ - - cleaned_data_tokenized = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data[data_column] = raw_data[data_column].str.encode('ascii', 'ignore') \ - .str.decode('ascii') - raw_data = pd.DataFrame(data=raw_data) - raw_data = raw_data.dropna() - - try: - cleaned_data = raw_data[[data_column]] - cleaned_data['CLEANED CONTENT'] = hero.clean(cleaned_data[data_column], SIMPLE_PIPELINE) - cleaned_data['CLEANED CONTENT'].replace('', np.nan, inplace=True) - cleaned_data.dropna(inplace=True, subset=['CLEANED CONTENT']) - - cleaned_data = cleaned_data.astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if tokenize: - try: - cleaned_data_tokenized = hero.tokenize(cleaned_data['CLEANED CONTENT']).to_frame().astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - - if not cleaned_data.empty and not cleaned_data_tokenized.empty: - data = { - 'cleaned_untokenized': cleaned_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return data - elif not cleaned_data.empty and cleaned_data_tokenized.empty: - data = { - 'cleaned_untokenized': cleaned_data.to_json() - } - return data - elif cleaned_data.empty and not cleaned_data_tokenized.empty: - data = { - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Data is not properly loaded. Try again.') - - -@router.post('/complex-clean') -async def complex_clean(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', - tokenize: bool = True, stopwords_list: Union[str, list] = None): - """ - This function takes in JSON data that is compatible with a pandas DataFrame, encodes it in the ASCII format and - decodes back into ASCII, and finally apply the 'Complex' Cleaning Pipeline and tokenzing the data (if the flag is - set to True) - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **tokenize**: Flag to determine whether to tokenize the data and to return it - - **stopwords_list**: A string (delimited by commas) or a list containing words to extend onto the default stopwords - list. 
- """ - - finalised = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - # stopwords check - if stopwords_list is not None: - if type(stopwords_list) is str: - try: - if len(stopwords_list) != 0: - stopwords_list = stopwords.DEFAULT.union(set(word.strip().lower() for word in - stopwords_list.split(sep=','))) - finalised = True - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - elif type(stopwords_list) is list: - stopwords_list = stopwords.DEFAULT.union(stopwords_list) - finalised = True - else: - raise HTTPException(status_code=404, detail='Invalid type for stopwords_list ') - else: - stopwords_list = stopwords.DEFAULT - finalised = True - - if finalised: - try: - cleaned_data = raw_data[[data_column]] - cleaned_data['CLEANED CONTENT'] = hero.clean(cleaned_data[data_column], PIPELINE) - cleaned_data['CLEANED CONTENT'] = hero.remove_digits(cleaned_data['CLEANED CONTENT'], only_blocks=False) - cleaned_data['CLEANED CONTENT'] = hero.remove_stopwords(cleaned_data['CLEANED CONTENT'], stopwords_list) - cleaned_data_tokenized = hero.tokenize(cleaned_data['CLEANED CONTENT']) - cleaned_data_tokenized = cleaned_data_tokenized.apply(lemmatizeText) - - fin_list = [[word for word in text if word.lower() in set(nltk.corpus.words.words()) or not - word.isalpha()] for text in cleaned_data_tokenized] - - cleaned_data['CLEANED CONTENT'] = [' '.join(text) for text in fin_list] - cleaned_data_tokenized.update([str(text) for text in fin_list]) - cleaned_data_tokenized = cleaned_data_tokenized.to_frame().astype(str) - cleaned_data['CLEANED CONTENT'].replace('', np.nan, inplace=True) - cleaned_data.dropna(subset=['CLEANED CONTENT'], inplace=True) - cleaned_data = cleaned_data.astype(str) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - if not cleaned_data.empty and not cleaned_data_tokenized.empty: - if tokenize: - data = { - 'original': raw_data.to_json(), - 'cleaned_untokenized': cleaned_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - data = { - 'original': raw_data.to_json(), - 'cleaned_tokenized': cleaned_data.to_json() - } - return jsonable_encoder(data) - elif not cleaned_data.empty and cleaned_data_tokenized.empty: - data = { - 'original': raw_data.to_json(), - 'cleaned_untokenized': cleaned_data.to_json() - } - return jsonable_encoder(data) - elif cleaned_data.empty and not cleaned_data_tokenized.empty: - if tokenize: - data = { - 'original': raw_data.to_json(), - 'cleaned_tokenized': cleaned_data_tokenized.to_json() - } - return jsonable_encoder(data) - else: - data = { - 'original': raw_data.to_json() - } - return jsonable_encoder(data) - elif cleaned_data.empty and cleaned_data_tokenized.empty: - raise HTTPException(status_code=404, detail='Data is not properly loaded. Try again.') - else: - raise HTTPException(status_code=404, detail='Data is not properly processed. 
Try again.') \ No newline at end of file diff --git a/api/endpoints/lca/modify.py b/api/endpoints/lca/modify.py deleted file mode 100644 index ed2303f..0000000 --- a/api/endpoints/lca/modify.py +++ /dev/null @@ -1,65 +0,0 @@ -"""This file contains code used to modify the input DataFrame in the JSON format""" - -import pandas as pd -import fastapi -import pycountry - -from io import StringIO -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile -from collections import Counter - -router = APIRouter(prefix='/endpoints/lca/modify', - tags=['modify'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/country-extraction') -async def extract_country(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = ''): - """ - Searches for instances of country names being mentioned in the DataFrame passed to it and returns the DataFrame - modified with the country names extracted - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column where the data of interest is found in - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - raw_data = raw_data.astype(object) - raw_data['COUNTRIES'] = raw_data[data_column].astype(str).apply( - lambda x: [country.name for country in pycountry.countries if country.name.lower() in x.lower()]) - new_list = raw_data['COUNTRIES'].to_list() - temp = [] - for ls in new_list: - temp.extend(ls) - zipped = list(zip(Counter(temp).keys(), Counter(temp).values())) - - globe_data = pd.DataFrame(data=zipped, index=range(len(zipped)), columns=['country', 'count']) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - data = { - 'data': globe_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data is not processed properly. 
Try again.') diff --git a/api/endpoints/lca/query.py b/api/endpoints/lca/query.py deleted file mode 100644 index c54acfc..0000000 --- a/api/endpoints/lca/query.py +++ /dev/null @@ -1,59 +0,0 @@ -"""This file contains the code used for querying data from a given DataFrame that is passed to it in the JSON format""" - -import pandas as pd -import fastapi - -from io import StringIO -from fastapi.encoders import jsonable_encoder -from fastapi import APIRouter, HTTPException, File, UploadFile - -router = APIRouter(prefix='/endpoints/lca/modify', - tags=['modify'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/query') -async def query(file: UploadFile = File(...), ftype: str = 'csv', query_: str = None, data_column: str = '', - match: bool = True): - """ - Queries the input DataFrame in the form of JSON to find matching strings for query - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: The column name where the data to query is found - - **query_**: The string or list to query for in the data - - **match**: The strictness of query - True if query is case-sensitive - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - temp = raw_data.copy() - query_data = temp.loc[temp[data_column].str.contains(query_, case=match)] - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - data = { - 'data': query_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data is not processed properly. 
Try again.') diff --git a/api/endpoints/mt/__init__.py b/api/endpoints/mt/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/mt/model_trainer.py b/api/endpoints/mt/model_trainer.py deleted file mode 100644 index 15d56d0..0000000 --- a/api/endpoints/mt/model_trainer.py +++ /dev/null @@ -1,176 +0,0 @@ -import os -import subprocess -import zipfile - -from fastapi import HTTPException, APIRouter -from fastapi.responses import FileResponse - -router = APIRouter(prefix='/endpoints', - tags=['trainer'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - - -@router.post('/trainer') -async def trainer(model_name_or_path: str, dataset: str, attack: str, task_type: str = 'classification', - model_max_length: str = None, model_num_labels: int = None, - dataset_train_split: float = None, - dataset_eval_split: float = None, - filter_train_by_labels: str = None, - filter_eval_by_labels: str = None, num_epochs: int = 3, - num_clean_epochs: int = 1, attack_epoch_interval: int = 1, - early_stopping_epochs: int = None, learning_rate: float = 5e-5, - num_warmup_steps: int = 500, weight_decay: float = 0.01, - per_device_train_batch_size: int = 8, per_device_eval_batch_size: int = 32, - gradient_accumulation_steps: int = 1, random_seed: int = 786, - parallel: bool = False, load_best_model_at_end: bool = False, - alpha: float = 1.0, num_train_adv_examples: int = -1, - query_budget_train: float = None, - attack_num_workers_per_device: int = 1, output_dir: str = './output', - checkpoint_interval_steps: float = None, checkpoint_interval_epochs: int = None, - save_last: bool = True, log_to_tb: bool = False, - tb_log_dir: str = None, log_to_wandb: bool = False, - wandb_project: str = 'textattack', logging_interval_step: int = 1): - """ - This function is used to call the textattack CLI to run model training using the target system - - - **model_name_or_path**: Name of the model to use or path to the model on the system - - **dataset**: Name of the dataset to use or the Dataset object generated by the user - - **attack**: Attack string - - **task_type**: Action to take while training - - **model_max_length**: Model Max Length - - **model_num_labels**: Number of Labels - - **dataset_train_split**: Train split for dataset - - **dataset_eval_split**: Evaluation split for dataset - - **filter_train_by_labels**: Filter Train Data By Labels - - **filter_eval_by_labels**: Filter Evaluation Data By Labels - - **num_epochs**: Total number of epochs for training - - **num_clean_epochs**: Number of epochs to train on just the original training dataset before adversarial training - - **attack_epoch_interval**: Generate a new adversarial training set every N epochs - - **early_stopping_epochs**: Number of epochs validation must increase before stopping early - - **learning_rate**: Learning rate of the model - - **num_warmup_steps**: The number of steps for the warmup phase of linear scheduler - - **weight_decay**: Weight decay (L2 penalty) - - **per_device_train_batch_size**: The batch size per GPU/CPU for training - - **per_device_eval_batch_size**: The batch size per GPU/CPU for evaluation - - **gradient_accumulation_steps**: Number of updates steps to accumulate the gradients before performing a - backward/update pass - - **random_seed**: Random seed for reproducibility - - **parallel**: Use Multiple GPUs using torch.DataParallel class - - **load_best_model_at_end**: keep track of the best model across 
training and load it at the end - - **alpha**: The weight for adversarial loss - - **num_train_adv_examples**: The number of samples to successfully attack when generating adversarial training - set before start of every epoch - - **query_budget_train**: The max query budget to use when generating adversarial training set - - **attack_num_workers_per_device**: Number of worker processes to run per device for attack - - **output_dir**: Directory to output training logs and checkpoints - - **checkpoint_interval_steps**: Save after N updates - - **checkpoint_interval_epochs**: Save after N epochs - - **save_last**: Save the model at end of training - - **log_to_tb**: Log to Tensorboard - - **tb_log_dir**: Directory to output training logs and checkpoints - - **log_to_wandb**: Log to Wandb - - **wandb_project**: Name of Wandb project for logging - - **logging_interval_step**: Log to Tensorboard/Wandb every N training steps - """ - - var_list = ['textattack', 'train'] - maps = { - 'model_name_or_path': ['--model-name-or-path', model_name_or_path], - 'dataset': ['--dataset', dataset], - 'attack': ['--attack', attack], - 'task_type': ['--task-type', task_type], - 'model_max_length': ['--model-max-length', model_max_length], - 'model_num_labels': ['--model-num-labels', model_num_labels], - 'dataset_train_split': ['--dataset-train-split', dataset_train_split], - 'dataset_eval_split': ['--dataset-eval-split', dataset_eval_split], - 'filter_train_by_labels': ['--filter-train-by-labels', filter_train_by_labels], - 'filter_eval_by_labels': ['--filter-eval-by-labels', filter_eval_by_labels], - 'num_epochs': ['--num-epochs', num_epochs], - 'num_clean_epochs': ['--num-clean-epochs', num_clean_epochs], - 'attack_epoch_interval': ['--attack-epoch-interval', attack_epoch_interval], - 'early_stopping_epochs': ['--early-stopping-epochs', early_stopping_epochs], - 'learning_rate': ['--learning-rate', learning_rate], - 'num_warmup_steps': ['--num-warmup-steps', num_warmup_steps], - 'weight_decay': ['--weight-decay', weight_decay], - 'per_device_train_batch_size': ['--per-device-train-batch-size', per_device_train_batch_size], - 'per_device_eval_batch_size': ['--per-device-eval-batch-size', per_device_eval_batch_size], - 'gradient_accumulation_steps': ['--gradient-accumulation-steps', gradient_accumulation_steps], - 'random_seed': ['--random-seed', random_seed], - 'parallel': ['--parallel', parallel], - 'load_best_model_at_end': ['--load-best-model-at-end', load_best_model_at_end], - 'alpha': ['--alpha', alpha], - 'num_train_adv_examples': ['--num-train-adv-examples', num_train_adv_examples], - 'query_budget_train': ['--query-budget-train', query_budget_train], - 'attack_num_workers_per_device': ['--attack-num-workers-per-device', attack_num_workers_per_device], - 'output_dir': ['--output-dir', output_dir], - 'checkpoint_interval_steps': ['--checkpoint-interval-steps', checkpoint_interval_steps], - 'checkpoint_interval_epochs': ['--checkpoint-interval-epochs', checkpoint_interval_epochs], - 'save_last': ['--save-last', save_last], - 'log_to_tb': ['--log-to-tb', log_to_tb], - 'tb_log_dir': ['--tb-log-dir', tb_log_dir], - 'log_to_wandb': ['--log-to-wandb', log_to_wandb], - 'wandb_project': ['--wandb-project', wandb_project], - 'logging_interval_step': ['--logging-interval-step', logging_interval_step] - } - maps = {key: value for key, value in maps.items() if value[1] is not None} - for k, v in maps.items(): - var_list.extend(v) - - var_list = [str(iter_) for iter_ in var_list if type(iter_) is not bool] - - try: - 
subprocess.run(var_list) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if output_dir is not None: - try: - with zipfile.ZipFile('file.zip', 'w') as zipped: - for folder, subfolder, fnames in os.walk(output_dir): - for fname in fnames: - fpath = os.path.join(folder, fname) - zipped.write(fpath, os.path.basename(fpath)) - except Exception as ex: - raise HTTPException(status_code=404, detail=ex) - else: - return FileResponse('file.zip', media_type='application/zip', filename='file.zip') - else: - raise HTTPException(status_code=404, detail='Error: The model directory is not found. Try again.') diff --git a/api/endpoints/tk/__init__.py b/api/endpoints/tk/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/api/endpoints/tk/toolkit_nlp.py b/api/endpoints/tk/toolkit_nlp.py deleted file mode 100644 index 2b6cc0b..0000000 --- a/api/endpoints/tk/toolkit_nlp.py +++ /dev/null @@ -1,886 +0,0 @@ -import logging -import os -from collections import Counter -from heapq import nlargest -from string import punctuation -import numpy as np -import pandas as pd -import spacy -import streamlit as st -import plotly.graph_objs as go -import plotly.figure_factory as ff -import plotly.express as px -import nltk -import pyLDAvis -import pyLDAvis.gensim_models -import pyLDAvis.sklearn -import textattack.models.wrappers -import torch -import tensorflow as tf -import matplotlib.pyplot as plt -import transformers - -from io import StringIO -from fastapi import APIRouter, HTTPException, File, UploadFile -from fastapi.encoders import jsonable_encoder -from operator import itemgetter -from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, AutoModelForSequenceClassification -from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from nltk.sentiment.vader import SentimentIntensityAnalyzer -from spacy.lang.en.stop_words import STOP_WORDS -from spacy.lang.en import English -from spacy import displacy -from wordcloud import WordCloud -from textblob import TextBlob - -# API router -router = APIRouter(prefix='/endpoints/toolkit', - tags=['toolkit'], - responses={200: {'description': 'OK'}, - 404: {'description': 'Resource Not Found'}, - 415: {'description': 'Unsupported Media Type'}}) - -# file counter -fc = 0 - - -def summarise(text, stopwords, pos_tag, nlp, sent_count): - """ - This function summarise the text dataframe - - Parameters - ---------- - text: DataFrame - nlp: NLP model - pos_tag: Text pos tag - stopwords: Stopwords - sent_count: Number of sentences to summarise to - ---------- - """ - - try: - # DEFINE LISTS AND DICTS - keyword = [] - sent_strength = {} - data = nlp(str(text)) - - # EXTRACT KEYWORDS FROM TEXT - for token in data: - if token.text in stopwords or token.text in punctuation: - continue - if token.pos_ in pos_tag: - keyword.append(token.text) - - # COUNT THE FREQUENCY OF WORDS - freq_word = Counter(keyword) - max_freq = Counter(keyword).most_common(1)[0][1] - for word in freq_word.keys(): - freq_word[word] = (freq_word[word] / max_freq) - - # CALCULATE SENTENCE SCORES - for sent in data.sents: - for word in sent: - if word.text in freq_word.keys(): - if sent in sent_strength.keys(): - sent_strength[sent] += freq_word[word.text] - else: - sent_strength[sent] = freq_word[word.text] - - # CONCATENATE THE STRINGS IN THE LIST TO A LARGER STRING - summarized_sentences = nlargest(sent_count, sent_strength, key=sent_strength.get) - 
final_sentences = [w.text for w in summarized_sentences] - summary = ' '.join(final_sentences) - except Exception: - return text - else: - return summary - - -def modelIterator(model, vectoriser, top_n, vb=True): - """ - This function prints out and returns the extracted topics for the NLP model passed on to it - - Parameters - ---------- - model: NLP Model - vectoriser: Vectorised text - top_n: Number of Topics to return - vb: Verbose tag (will print out the topics if set to True - --------- - """ - frame_list = [] - - for id_, topic in enumerate(model.components_): - lister = [(vectoriser.get_feature_names()[i], topic[i]) for i in topic.argsort()[:-top_n - 1:-1]] - df = pd.DataFrame(data=lister, - index=range(len(lister)), - columns=['word', 'weight']) - - if vb: - st.markdown(f'### Topic {id_}') - st.dataframe(df) - - frame_list.append(df) - - return frame_list - - -def dominantTopic(vect, model, n_words): - """ - Returns the topic text - - Parameters - ---------- - vect: Vectorizer used - model: NLP Model - n_words: Number of Topics to return - ---------- - """ - kw = np.array(vect.get_feature_names()) - topic_kw = [] - for weights in model.components_: - top_kw = (-weights).argsort()[:n_words] - topic_kw.append(kw.take(top_kw)) - - return topic_kw - - -@router.post('/wordcloud') -async def wordcloud(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', max_word: int = 200, - contour: int = 3, width: int = 800, height: int = 400, colour: str = 'steelblue'): - """ - Wordcloud creation - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **max_word**: Max number of words to render in the wordcloud image - - **contour**: Contour width - - **width**: Width of the wordcloud image - - **height**: Height of the wordcloud image - - **colour**: Colour of the background - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data = raw_data[[data_column]] - wc = WordCloud(background_color='white', - max_words=max_word, - contour_width=contour, - width=width, - height=height, - contour_color=colour) - wc.generate(' '.join(raw_data[data_column])) - img = wc.to_image() - data = { - 'image': str(img.tobytes()) - } - - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Document-Term Matrix was not properly prepared. 
Try ' - 'again.') - - -@router.post('/ner') -async def ner(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', - model: str = 'en_core_web_sm', one_datapoint_analyser: int = None): - """ - Conduct NER analysis - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **one_datapoint_analyser**: The datapoint to render into HTML format - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - # init the required columns - raw_data = raw_data[[data_column]] - raw_data['NER'] = '' - raw_data['COMPILED_LABELS'] = '' - raw_data = raw_data.astype(str) - - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - for index in range(len(raw_data)): - temp_nlp = NLP(raw_data[data_column][index]) - raw_data.at[index, 'NER'] = str(list(zip([word.text for word in temp_nlp.ents], - [word.label_ for word in temp_nlp.ents]))) - raw_data.at[index, 'COMPILED_LABELS'] = str(list(set([word.label_ for word in temp_nlp.ents]))) - - if one_datapoint_analyser is not None: - cpy = raw_data.copy() - temp = cpy[data_column][one_datapoint_analyser] - render = displacy.render(list(NLP(str(temp)).sents), - style='ent', - page=True) - data = { - 'data': raw_data.to_json(), - 'render': render - } - return jsonable_encoder(data) - else: - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/pos') -async def pos(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = '', - model: str = 'en_core_web_sm', one_datapoint_analyser: int = None, compact: bool = True, - colour: str = 'steelblue', bg: str = 'white'): - """ - Conduct POS tagging - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **one_datapoint_analyser**: The datapoint to render into HTML format - - **compact**: Compact the renders - - **colour**: Colour of the words in the render - - **bg**: Colour of the background - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - raw_data = raw_data[[data_column]] - raw_data['POS'] = '' - raw_data = raw_data.astype(str) - - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - for index in range(len(raw_data)): - temp_nlp = NLP(raw_data[data_column][index]) - raw_data.at[index, 'POS'] = str(list(zip([str(word) for word in temp_nlp], - [word.pos_ for word in temp_nlp]))) - raw_data.at[index, 'COMPILED_LABELS'] = str(list(set([word.pos_ for word in temp_nlp]))) - - if one_datapoint_analyser is not None: - cpy = raw_data.copy() - temp = cpy[data_column][one_datapoint_analyser] - render = displacy.render(list(NLP(str(temp)).sents), - style='dep', - options={ - 'compact': compact, - 'color': colour, - 'bg': bg, - }) - data = { - 'data': raw_data.to_json(), - 'render': render - } - return jsonable_encoder(data) - else: - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/summarise') -def summarise(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', mode: str = 'basic', - model: str = 'en_core_web_sm', sentence_len: int = 3, min_words: int = 80, max_words: str = 150, - max_tensor: int = 512): - """ - Summarise texts - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **mode**: Define whether or not to conduct 'basic' or 'advanced' summarisation on input data - - **model**: spaCy model to load - - **sentence_len**: The maximum length of sentence to return - - **min_words**: The minimum number of words to include in the summary - - **max_words**: The maximum number of words to include in the summary - - **max_tensor**: The maximum number of input tensors for advanced summarisation process - """ - - NLP = None - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - # load up the data first - raw_data = raw_data[[data_column]] - raw_data['SUMMARY'] = np.nan - raw_data = raw_data.astype(str) - - if not raw_data.empty: - if mode == 'basic': - if model == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - elif model == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - logging.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - logging.error(f'Unable to download Model. Error: {ex}') - raise HTTPException(status_code=415, detail=ex) - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - - stopwords = list(STOP_WORDS) - pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB'] - raw_data['SUMMARY'] = raw_data[data_column]. \ - apply(lambda x: summarise(x, stopwords, pos_tag, NLP, sentence_len)) - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - - elif mode == 'advanced': - if torch.cuda.is_available(): - try: - torch.cuda.get_device_name(torch.cuda.current_device()) - except AssertionError: - raise HTTPException(status_code=415, detail='Error: CUDA Device is not enabled. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - tokenizer = AutoTokenizer.from_pretrained('t5-base') - model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) - raw_data = raw_data.astype(object) - raw_data['ENCODED'] = raw_data[data_column]. \ - apply(lambda x: tokenizer.encode('summarize: ' + x, - return_tensors='pt', - max_length=max_tensor, - truncation=True)) - raw_data['OUTPUTS'] = raw_data['ENCODED']. 
\ - apply(lambda x: model.generate(x, - max_length=max_words, - min_length=min_words, - length_penalty=5.0, - num_beams=2)) - raw_data['SUMMARISED'] = raw_data['OUTPUTS'].apply( - lambda x: tokenizer.decode(x[0])) - raw_data.drop(columns=['ENCODED', 'OUTPUTS'], inplace=True) - raw_data['SUMMARISED'] = raw_data['SUMMARISED']. \ - str.replace(' ', '').str.replace('', '') - raw_data = raw_data.astype(str) - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. Try again.') - - -@router.post('/sentiment') -def sentiment(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', model: str = 'vader', - colour: str = '#2ACAEA'): - """ - Conduct Sentiment Analysis - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **colour**: Colour of plots generated - """ - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - if model == 'vader': - replacer = { - r"'": '', - r'[^\w\s]': ' ', - r' \d+': ' ', - r' +': ' ' - } - - raw_data['VADER SENTIMENT TEXT'] = raw_data[data_column]. \ - replace(to_replace=replacer, regex=True) - - vader_analyser = SentimentIntensityAnalyzer() - sent_score_list = [] - sent_label_list = [] - - # scoring - for i in raw_data['VADER SENTIMENT TEXT'].tolist(): - sent_score = vader_analyser.polarity_scores(i) - - if sent_score['compound'] > 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Positive') - elif sent_score['compound'] == 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Neutral') - elif sent_score['compound'] < 0: - sent_score_list.append(sent_score['compound']) - sent_label_list.append('Negative') - - raw_data['VADER OVERALL SENTIMENT'] = sent_label_list - raw_data['VADER OVERALL SCORE'] = sent_score_list - raw_data['VADER POSITIVE SCORING'] = [vader_analyser.polarity_scores(doc)['pos'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - raw_data['VADER NEUTRAL SCORING'] = [vader_analyser.polarity_scores(doc)['neu'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - raw_data['VADER NEGATIVE SCORING'] = [vader_analyser.polarity_scores(doc)['neg'] for doc in - raw_data['VADER SENTIMENT TEXT'].values.tolist()] - - # create plots - hac_plot = ff.create_distplot([raw_data['VADER OVERALL SCORE'].tolist()], - ['VADER'], - colors=[colour], - bin_size=0.25, - curve_type='normal', - show_rug=False, - show_hist=False) - hac_plot.update_layout(title_text='Distribution Plot', - xaxis_title='VADER Score', - yaxis_title='Frequency Density', - legend_title='Frequency Density') - data = { - 'data': raw_data.to_json(), - 'dot_image': hac_plot.to_image(format="png") - - } - return jsonable_encoder(data) - - elif model == 'textblob': - pol_list = [] - sub_list = [] - - # scoring: polarity - raw_data['POLARITY SCORE'] = raw_data[data_column]. 
\ - apply(lambda x: TextBlob(x).sentiment.polarity) - for i in raw_data['POLARITY SCORE'].tolist(): - if float(i) > 0: - pol_list.append('Positive') - elif float(i) < 0: - pol_list.append('Negative') - elif float(i) == 0: - pol_list.append('Neutral') - raw_data['POLARITY SENTIMENT'] = pol_list - - # scoring: subjectivity - raw_data['SUBJECTIVITY SCORE'] = raw_data[data_column].apply( - lambda x: TextBlob(x).sentiment.subjectivity - ) - for i in raw_data['SUBJECTIVITY SCORE'].tolist(): - if float(i) < 0.5: - sub_list.append('Objective') - elif float(i) > 0.5: - sub_list.append('Subjective') - elif float(i) == 0.5: - sub_list.append('Neutral') - raw_data['SUBJECTIVITY SENTIMENT'] = sub_list - hac_plot = px.scatter(raw_data[['SUBJECTIVITY SCORE', 'POLARITY SCORE']], - x='SUBJECTIVITY SCORE', - y='POLARITY SCORE', - labels={ - 'SUBJECTIVITY SCORE': 'Subjectivity', - 'POLARITY SCORE': 'Polarity' - }) - hac_plot1 = ff.create_distplot([raw_data['SUBJECTIVITY SCORE'].tolist(), - raw_data['POLARITY SCORE'].tolist()], - ['Subjectivity', 'Polarity'], - curve_type='normal', - show_rug=False, - show_hist=False) - data = { - 'data': raw_data.to_json(), - 'dot_image': hac_plot.to_image(format="png"), - 'word_image': hac_plot1.to_image(format="png") - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. Try again.') - - -@router.post('/modelling') -def topic_modelling(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', model: str = 'lda', - num_topics: int = 10, max_features: int = 5000, max_iter: int = 10, min_df: int = 5, - max_df: float = 0.90, worker: int = 1, colour: str = 'steelblue', alpha: float = 0.10, - l1_ratio: float = 0.50): - """ - Topic Modelling - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **model**: spaCy model to load - - **num_topics**: Number of topics to model - - **max_features**: Maximum number of features to consider - - **max_iter**: Maximum number of epochs to fit data - - **min_df**: Minimum length of words - - **max_df**: Maximum length of words - - **worker**: Number of workers - - **colour**: Colour of the plots - - **alpha**: Alpha value - - **l1_ratio**: L1 ratio value - """ - - global fc - - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(str) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(str) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(str) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if not raw_data.empty: - try: - cv = CountVectorizer(min_df=min_df, - max_df=max_df, - stop_words='english', - lowercase=True, - token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}', - max_features=max_features) - vectorised = cv.fit_transform(raw_data[data_column]) - except ValueError: - raise HTTPException(status_code=415, detail='Error: The column loaded is empty or has invalid data' - ' points. 
Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if model == 'lda': - LDA = LatentDirichletAllocation(n_components=num_topics, - max_iter=max_iter, - learning_method='online', - n_jobs=worker) - LDA_data = LDA.fit_transform(vectorised) - topic_text = modelIterator(LDA, cv, top_n=num_topics, - vb=False) - keywords = pd.DataFrame(dominantTopic(vect=cv, model=LDA, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - LDA_vis = pyLDAvis.sklearn.prepare(LDA, vectorised, cv, mds='tsne') - pyLDAvis.save_html(LDA_vis, - str(os.path.join(os.getcwd(), f'lda_id{fc}.html'))) - with open(os.path.join(os.getcwd(), f'lda_id{fc}.html')) as f: - render = f.read() - fc += 1 - - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json(), - 'render': render - } - - return jsonable_encoder(data) - - elif model == 'nmf': - TFIDF = TfidfVectorizer(max_df=max_df, - min_df=min_df, - max_features=max_features, - stop_words='english') - TFIDF_vectorised = TFIDF.fit_transform(raw_data - [data_column] - .values.astype(str)) - NMF_model = NMF(n_components=num_topics, - max_iter=max_iter, - random_state=1, - alpha=alpha, - l1_ratio=l1_ratio).fit(TFIDF_vectorised) - topic_text = modelIterator(model=NMF_model, - vectoriser=TFIDF, - top_n=num_topics, - vb=False) - keywords = pd.DataFrame(dominantTopic(model=NMF_model, - vect=TFIDF, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json() - } - return jsonable_encoder(data) - - elif model == 'lsi': - LSI = TruncatedSVD(n_components=num_topics, n_iter=max_iter) - LSI_data = LSI.fit_transform(vectorised) - topic_text = modelIterator(LSI, cv, - top_n=num_topics, vb=False) - keywords = pd.DataFrame(dominantTopic(model=LSI, vect=cv, - n_words=num_topics)) - keywords.columns = [f'word_{i}' for i in range(keywords.shape[1])] - keywords.index = [f'topic_{i}' for i in range(keywords.shape[0])] - - # SVD - svd_2d = TruncatedSVD(n_components=2) - data_2d = svd_2d.fit_transform(vectorised) - - mar_fig = go.Scattergl( - x=data_2d[:, 0], - y=data_2d[:, 1], - mode='markers', - marker=dict( - color=colour, - line=dict(width=1) - ), - text=cv.get_feature_names(), - hovertext=cv.get_feature_names(), - hoverinfo='text' - ) - mar_fig = [mar_fig] - mar_fig = go.Figure(data=mar_fig, layout=go.Layout(title='Scatter Plot')) - word_fig = go.Scattergl( - x=data_2d[:, 0], - y=data_2d[:, 1], - mode='text', - marker=dict( - color=colour, - line=dict(width=1) - ), - text=cv.get_feature_names(), - ) - word_fig = [word_fig] - word_fig = go.Figure(data=word_fig, layout=go.Layout(title='Scatter Word Plot')) - - data = { - 'topic_text': {i: (topic_text[i].to_json()) for i - in range(len(topic_text))}, - 'data': raw_data.to_json(), - 'keywords': keywords.to_json(), - 'point_figure': mar_fig.to_image(format='png'), - 'word_figure': word_fig.to_image(format='png') - } - - return jsonable_encoder(data) - else: - raise HTTPException(status_code=415, detail='Error: Data not loaded properly. 
Try again.') - - -@router.post('/classification') -def classification(file: UploadFile = File(...), ftype: str = 'csv', data_column: str = 'data', topics: str = ''): - """ - Conduct Text Classification - - - **file**: Data - - **ftype**: The file format to read the input data as - - **data_column**: Column in the pandas DataFrame to process - - **topics**: A string (delimited by commas) or a list of topics to classify data into - """ - - if torch.cuda.is_available(): - try: - torch.cuda.get_device_name(torch.cuda.current_device()) - except AssertionError: - raise HTTPException(status_code=415, detail='Error: CUDA Device is not enabled. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - try: - if ftype == 'csv': - raw_data = pd.read_csv(StringIO(str(file.file.read(), 'latin1')), encoding='latin1').astype(object) - elif ftype == 'xlsx': - raw_data = pd.read_excel(StringIO(str(file.file.read(), 'utf-8')), engine='openpyxl').astype(object) - elif ftype == 'json': - raw_data = pd.read_json(StringIO(str(file.file.read(), 'utf-8'))).astype(object) - else: - raise HTTPException(status_code=415, detail='Error: File format input is not supported. Try again.') - except Exception as ex: - raise HTTPException(status_code=415, detail=ex) - else: - if type(topics) == str: - topics = [word.strip().lower() for word in topics.split(sep=',')] - elif type(topics) == list: - topics = topics - else: - raise HTTPException(status_code=415, detail='Error: Invalid data type for topics.') - - classifier = pipeline('zero-shot-classification') - raw_data['TEST'] = raw_data[data_column].apply(lambda x: classifier(x, topics)) - raw_data['CLASSIFIED'] = raw_data['TEST']. \ - apply(lambda x: list(zip(x['labels'].tolist(), x['scores'].tolist()))) - raw_data['MOST PROBABLE TOPIC'] = raw_data['CLASSIFIED'].apply(lambda x: max(x, key=itemgetter[1])[0]) - raw_data = raw_data.astype(str) - - data = { - 'data': raw_data.to_json() - } - return jsonable_encoder(data) - else: - raise HTTPException(status_code=404, detail='Error: CUDA Device is not detected. 
Try again.') diff --git a/api/main.py b/api/main.py deleted file mode 100644 index c4b32da..0000000 --- a/api/main.py +++ /dev/null @@ -1,29 +0,0 @@ -"""This is the main file where the API server is started""" - -from fastapi import FastAPI -from endpoints.lca import clean, modify, query -from endpoints.dtm import dtm -from endpoints.mt import model_trainer -from endpoints.tk import toolkit_nlp - -# instantiate the app -app = FastAPI() - -# add the routers -app.include_router(clean.router) -app.include_router(modify.router) -app.include_router(query.router) -app.include_router(model_trainer.router) -app.include_router(dtm.router) -app.include_router(toolkit_nlp.router) - - -@app.get('/') -def root(): - """ - This function is called when the user navigates to the root path of the localhost path generated by uvicorn when - the API server is started - """ - - return {'description': 'Welcome to the Homepage of the ArticPy API!'} - diff --git a/app.py b/app.py index 6e2541d..d6d7648 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,6 @@ # INIT STREAMLIT CONFIG import streamlit as st + st.set_page_config(page_title='ArticPy', page_icon='❄', menu_items={ @@ -13,7 +14,7 @@ # CUSTOM PAGE IMPORTS from pyfiles.multipage import MultiPage -from pyfiles.pages import load_clean_visualise, document_term_matrix, toolkit_nlp, model_trainer +from pyfiles.pages import load_clean_visualise, document_term_matrix, toolkit_nlp # INSTANTIATE THE APP app = MultiPage() @@ -22,7 +23,6 @@ app.add_page('Load, Clean and Visualise Data', load_clean_visualise.app) app.add_page('DTM and Word Frequency Analysis', document_term_matrix.app) app.add_page('NLP Toolkit', toolkit_nlp.app) -app.add_page('NLP Model Trainer', model_trainer.app) # RUN THE APP try: diff --git a/config.py b/config.py index 895ac46..98e3dd7 100644 --- a/config.py +++ b/config.py @@ -152,449 +152,3 @@ 'MIN_WORDS': 80, 'SUM_MODE': 'Basic' } - -trainer = { - 'TRANSFORMERS_CHOICES': ('Pre Training', 'CausalLM', 'MaskedLM', 'Seq2SeqLM', 'SequenceClassification', - 'MultipleChoice', 'NextSentencePrediction', 'TokenClassificaition', 'QuestionAnswering', - 'TableQuestionAnswering'), - 'TRANSFORMERS_SELECTION': None, - 'MODEL_MODE': 'Training', - 'MODEL_FILE': None, - 'TRAINING_PARAMS': [], - 'API': True, - 'attack': None, - 'model_max_length': None, - 'model_num_labels': None, - 'dataset_train_split': None, - 'dataset_eval_split': None, - 'filter_train_by_labels': None, - 'filter_eval_by_labels': None, - 'num_epochs': 3, - 'num_clean_epochs': 1, - 'attack_epoch_interval': 1, - 'early_stopping_epochs': None, - 'learning_rate': 5e-5, - 'num_warmup_steps': 500, - 'weight_decay': 0.01, - 'per_device_train_batch_size': 8, - 'per_device_eval_batch_size': 32, - 'gradient_accumulation_steps': 1, - 'random_seed': 786, - 'parallel': False, - 'load_best_model_at_end': False, - 'alpha': 1.0, - 'num_train_adv_examples': -1, - 'query_budget_train': None, - 'attack_num_workers_per_device': 1, - 'output_dir': None, - # 'output_dir': f'{os.getcwd()}/outputs/{datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")}', - 'checkpoint_interval_steps': None, - 'checkpoint_interval_epochs': None, - 'save_last': True, - 'log_to_tb': False, - 'tb_log_dir': None, - # 'tb_log_dir': r'./runs', - 'log_to_wandb': False, - 'wandb_project': 'textattack', - 'logging_interval_step': 1, - 'ML_MODEL': '', - 'ML_POSSIBLE_PICKS': ('albert-base-v2-CoLA', 'bert-base-uncased-CoLA', 'distilbert-base-cased-CoLA', - 'distilbert-base-uncased-CoLA', 'roberta-base-CoLA', 'xlnet-base-cased-CoLA', - 'albert-base-v2-RTE', 
'albert-base-v2-snli', 'albert-base-v2-WNLI', 'bert-base-uncased-MNLI', - 'bert-base-uncased-QNLI', 'bert-base-uncased-RTE', 'bert-base-uncased-snli', - 'bert-base-uncased-WNLI', 'distilbert-base-cased-snli', 'distilbert-base-uncased-MNLI', - 'distilbert-base-uncased-RTE', 'distilbert-base-uncased-WNLI', 'roberta-base-QNLI', - 'roberta-base-RTE', 'roberta-base-WNLI', 'xlnet-base-cased-RTE', 'xlnet-base-cased-WNLI', - 'albert-base-v2-QQP', 'bert-base-uncased-QQP', 'distilbert-base-uncased-QNLI', - 'distilbert-base-cased-QQP', 'albert-base-v2-STS-B', 'bert-base-uncased-MRPC', - 'bert-base-uncased-STS-B', 'distilbert-base-cased-MRPC', 'distilbert-base-cased-STS-B', - 'distilbert-base-uncased-MRPC', 'roberta-base-MRPC', 'roberta-base-STS-B', - 'xlnet-base-cased-MRPC', 'xlnet-base-cased-STS-B', 'albert-base-v2-imdb', - 'albert-base-v2-rotten-tomatoes', 'albert-base-v2-SST-2', 'albert-base-v2-yelp-polarity', - 'bert-base-uncased-imdb', 'bert-base-uncased-rotten-tomatoes', 'bert-base-uncased-SST-2', - 'bert-base-uncased-yelp-polarity', 'cnn-imdb', 'cnn-mr', 'cnn-sst2', 'cnn-yelp', - 'distilbert-base-cased-SST-2', 'distilbert-base-uncased-imdb', - 'distilbert-base-uncased-rotten-tomatoes', 'lstm-imdb', 'lstm-mr', 'lstm-sst2', 'lstm-yelp', - 'roberta-base-imdb', 'roberta-base-rotten-tomatoes', 'roberta-base-SST-2', - 'xlnet-base-cased-imdb', 'xlnet-base-cased-rotten-tomatoes', 'albert-base-v2-ag-news', - 'bert-base-uncased-ag-news', 'cnn-ag-news', 'distilbert-base-uncased-ag-news', 'lstm-ag-news', - 'roberta-base-ag-news', 'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased', - 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', - 'bert-base-chinese', 'bert-base-german-cased', 'bert-large-uncased-whole-word-masking', - 'bert-large-cased-whole-word-masking', - 'bert-large-uncased-whole-word-masking-finetuned-squad', - 'bert-large-cased-whole-word-masking-finetuned-squad', - 'bert-base-cased-finetuned-mrpc', 'bert-base-german-dbmdz-cased', - 'bert-base-german-dbmdz-uncased', 'cl-tohoku/bert-base-japanese', - 'cl-tohoku/bert-base-japanese-whole-word-masking', 'cl-tohoku/bert-base-japanese-char', - 'cl-tohoku/bert-base-japanese-char-whole-word-masking', 'TurkuNLP/bert-base-finnish-cased-v1', - 'TurkuNLP/bert-base-finnish-uncased-v1', 'wietsedv/bert-base-dutch-cased', 'openai-gpt', - 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'transfo-xl-wt103', 'xlnet-base-cased', - 'xlnet-large-cased', 'xlm-mlm-en-2048', 'xlm-mlm-ende-1024', 'xlm-mlm-enfr-1024', - 'xlm-mlm-enro-1024', 'xlm-mlm-xnli15-1024', 'xlm-mlm-tlm-xnli15-1024', 'xlm-clm-enfr-1024', - 'xlm-clm-ende-1024', 'xlm-mlm-17-1280', 'xlm-mlm-100-1280', 'roberta-base', 'roberta-large', - 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', - 'roberta-large-openai-detector', 'distilbert-base-uncased', - 'distilbert-base-uncased-distilled-squad', 'distilbert-base-cased', - 'distilbert-base-cased-distilled-squad', 'distilgpt2', - 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased', 'ctrl', - 'camembert-base', 'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', - 'albert-xxlarge-v1', 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', - 'albert-xxlarge-v2', 't5-small', 't5-base', 't5-large', 't5-3B', 't5-11B', 'xlm-roberta-base', - 'xlm-roberta-large', 'flaubert/flaubert_small_cased', - 'flaubert/flaubert_base_uncased', 'flaubert/flaubert_base_cased', - 'flaubert/flaubert_large_cased', 'facebook/bart-large', - 'facebook/bart-base', 'facebook/bart-large-mnli', - 
'facebook/bart-large-cnn', 'DialoGPT-small', 'DialoGPT-medium', - 'DialoGPT-large', 'reformer-enwik8', 'reformer-crime-and-punishment', - 'Helsinki-NLP/opus-mt-{src}-{tgt}', 'google/pegasus-{dataset}', - 'allenai/longformer-base-4096', 'allenai/longformer-large-4096', - 'facebook/mbart-large-cc25', 'facebook/mbart-large-en-ro', - 'lxmert-base-uncased', 'funnel-transformer/small', - 'funnel-transformer/small-base', 'funnel-transformer/medium', - 'funnel-transformer/medium-base', 'funnel-transformer/intermediate', - 'funnel-transformer/intermediate-base', 'funnel-transformer/large', - 'funnel-transformer/large-base', 'funnel-transformer/xlarge', - 'funnel-transformer/xlarge-base', 'microsoft/layoutlm-base-uncased', - 'microsoft/layoutlm-large-uncased'), - 'DATASET_POSSIBLE_PICKS': ('super_glue', 'glue', 'anli', 'wino_bias', 'squad', 'imdb', 'wikitext', 'trec', 'race', - 'adversarial_qa', 'duorc', 'squad_v2', 'winogrande', 'cosmos_qa', 'quail', 'xsum', - 'cnn_dailymail', 'piqa', 'paws', 'hellaswag', 'ai2_arc', 'ropes', 'rotten_tomatoes', - 'amazon_polarity', 'quoref', 'wiki_qa', 'cos_e', 'hans', 'ag_news', 'common_gen', - 'mc_taco', 'gigaword', 'wiki_hop', 'wmt16', 'wiqa', 'qasc', 'winograd_wsc', - 'common_voice', 'quartz', 'yelp_review_full', 'samsum', 'crows_pairs', 'openbookqa', - 'qa_srl', 'multi_news', 'social_i_qa', 'nq_open', 'quac', 'web_questions', 'dream', - 'kilt_tasks', 'wiki_bio', 'drop', 'trivia_qa', 'sciq', 'quarel', 'lambada', 'coqa', - 'circa', 'mc4', 'conll2003', 'dbpedia_14', 'xnli', 'emotion', 'story_cloze', - 'app_reviews', 'snli', 'mlqa', 'code_search_net', 'c4', 'multi_nli', - 'amazon_reviews_multi', 'hate_speech18', 'tweet_eval', 'timit_asr', 'oscar', 'sst', - 'hate_speech_offensive', 'wikiann', 'wikipedia', 'opus_euconst', 'math_qa', 'swag', - 'pubmed_qa', 'scientific_papers', 'banking77', 'cbt', 'commonsense_qa', 'yelp_polarity', - 'stsb_multi_mt', 'bookcorpus', 'squad_adversarial', 'scan', 'financial_phrasebank', - 'craffel/openai_lambada', 'wiki_dpr', 'amazon_us_reviews', 'math_dataset', 'tab_fact', - 'subjqa', 'OTHERS'), - # mappings area split -> dataset: [(subsets), (dataset_columns), (split)] - 'SUBSET_MAPPINGS': {'super_glue': [('boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc', 'wsc.fixed', - 'axb', 'axg'), (), ('train', 'test', 'validation')], - 'glue': [('cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', 'mnli_matched', - 'qnli', 'rte', 'wnli', 'ax'), (), ('train', 'test', 'validation')], - 'anli': [(), (), ('train_r1', 'dev_r1', 'test_r1', 'train_r2', 'dev_r2', 'test_r2', - 'train_r3', 'dev_r3', 'test_r3')], - 'wino_bias': [('type1_pro', 'type1_anti', 'type2_pro', 'type2_anti'), (), - ('test', 'validation')], - 'squad': [(), ('question', 'title', 'id', 'context', 'answers'), ('train', 'validation')], - 'imdb': [(), (), ('train', 'test', 'unsupervised')], - 'wikitext': [('wikitext-103-raw-v1', 'wikitext-2-raw-v1', 'wikitext-103-v1', 'wikitext-2-v1'), - (), ('train', 'test', 'validation')], - 'trec': [(), ('label-coarse', 'text', 'label-fine'), ('train', 'test')], - 'race': [('high', 'middle', 'all'), ('example_id', 'article', 'answer', 'question', 'options'), - ('train', 'test', 'validation')], - 'adversarial_qa': [('adversarialQA', 'dbidaf', 'dbert', 'droberta'), (), - ('train', 'test', 'validation')], - 'duorc': [('SelfRC', 'ParaphraseRC'), (), ('train', 'test', 'validation')], - 'squad_v2': [(), ('question', 'title', 'id', 'context', 'answers'), ('train', 'validation')], - 'winogrande': [('winogrande_xs', 'winogrande_s', 
'winogrande_m', 'winogrande_l', - 'winogrande_xl', 'winogrande_debiased'), (), ('train', 'test', 'validation')], - 'cosmos_qa': [(), ('question', 'answer2', 'answer0', 'answer1', 'label', 'id', 'answer3', - 'context'), ('train', 'test', 'validation')], - 'quail': [(), ('question', 'question_type', 'id', 'context_id', 'domain', 'context', - 'correct_answer_id', 'answers', 'question_id', 'metadata'), - ('train', 'challenge', 'validation')], - 'xsum': [(), (), ('train', 'test', 'validation')], - 'cnn_dailymail': [(), (), ('train', 'test', 'validation')], - 'piqa': [(), (), ('train', 'test', 'validation')], - 'paws': [('labelle_final', 'labelled_swap', 'unlabelled_final'), - (), ('train', 'test', 'validation')], - 'hellaswag': [(), (), ('train', 'test', 'validation')], - 'ai2_arc': [('ARC-Challenge', 'ARC-Easy'), (), ('train', 'test', 'validation')], - 'ropes': [(), (), ('train', 'test', 'validation')], - 'rotten_tomatoes': [(), (), ('train', 'test', 'validation')], - 'amazon_polarity': [(), (), ()], - 'quoref': [(), (), ('train', 'validation')], - 'wiki_qa': [(), (), ()], - 'cos_e': [('v1.0', 'v1.11'), (), ('train', 'validation')], - 'hans': [(), (), ('train', 'validation')], - 'ag_news': [(), (), ('train', 'test')], - 'common_gen': [(), (), ('train', 'test', 'validation')], - 'mc_taco': [(), (), ('test', 'validation')], - 'gigaword': [(), (), ('train', 'test', 'validation')], - 'wiki_hop': [('original', 'masked'), (), ('train', 'validation')], - 'wmt16': [('cs-en', 'de-en', 'fi-en', 'ro-en', 'ru-en', 'te-en'), - (), ('train', 'test', 'validation')], - 'wiqa': [(), (), ('train', 'test', 'validation')], - 'qasc': [(), (), ('train', 'test', 'validation')], - 'winograd_wsc': [(), (), ()], - 'common_voice': [(), (), ()], - 'quartz': [(), (), ('train', 'test', 'validation')], - 'yelp_review_full': [(), (), ()], - 'samsum': [(), (), ('train', 'test', 'val')], - 'crows_pairs': [(), (), ()], - 'openbookqa': [('main', 'additional'), (), ('train', 'test', 'validation')], - 'qa_srl': [(), (), ('train', 'test', 'validation')], - 'multi_news': [(), (), ('train', 'test', 'validation')], - 'social_i_qa': [(), (), ('train', 'validation')], - 'nq_open': [(), (), ('train', 'validation')], - 'quac': [(), (), ('train', 'validation')], - 'web_questions': [(), (), ('train', 'test')], - 'dream': [(), (), ('train', 'test', 'validation')], - 'kilt_tasks': [('triviaqa_support_only', 'fever', 'aidayago2', 'wned', 'cweb', 'trex', - 'structured_zeroshot', 'nq', 'hotpotqa', 'eli5', 'wow'), - (), ('train', 'test', 'validation')], - 'wiki_bio': [(), (), ('train', 'test', 'validation')], - 'drop': [(), (), ('train', 'validation')], - 'trivia_qa': [('rc', 'rc.nocontext', 'unfiltered', 'unfiltered.nocontext'), - (), ('train', 'test', 'validation')], - 'sciq': [(), (), ('train', 'test', 'validation')], - 'quarel': [(), (), ('train', 'test', 'validation')], - 'lambada': [(), (), ('train', 'test', 'dev')], - 'coqa': [(), (), ('train', 'validation')], - 'circa': [(), (), ('train')], - 'mc4': [('af', 'am', 'ar', 'az', 'be', 'bg', 'bg-Latn', 'bn', 'ca', 'ceb', 'co', 'cs', 'cy', - 'da', 'de', 'el', 'el-Latn', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fil', 'fr', - 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'haw', 'hi', 'hi-Latn', 'hmn', 'ht', 'hu', 'hy', - 'id', 'ig', 'is', 'it', 'iw', 'ja', 'ja-Latn', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', - 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', - 'mt', 'my', 'ne', 'nl', 'no', 'ny', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'ru-Latn', - 'sd', 'si', 'sk', 'sl', 'sm', 
'sn', 'so', 'sq', 'sr', 'st', 'su', 'sv', 'sw', 'ta', - 'te', 'tg', 'th', 'tr', 'uk', 'und', 'ur', 'uz', 'vi', 'xh', 'yi', 'yo', 'zh', - 'zh-Latn', 'zu'), (), ('train', 'test', 'validation')], - - 'conll2003': [(), (), ('train', 'test', 'validation')], - 'dbpedia_14': [(), (), ('train', 'test')], - 'xnli': [('all_languages', 'ar', 'bg', 'de', 'el'), (), ('train', 'test', 'validation')], - 'emotion': [(), (), ('train', 'test', 'validation')], - 'story_cloze': [('2016', '2018'), (), ('test', 'validation')], - 'app_reviews': [(), (), ()], - 'snli': [(), (), ('train', 'test', 'validation')], - 'mlqa': [('mlqa-translate-test.ar', 'mlqa-translate-test.de', 'mlqa-translate-test.es', - 'mlqa-translate-test.hi', 'mlqa-translate-test.vi'), (), - ('train', 'test', 'validation')], - 'code_search_net': [('all', 'java', 'go', 'python', 'javascript', 'ruby', 'php'), - (), ('train', 'test', 'validation')], - 'c4': [('en', 'realnewslike', 'en.noclean', 'realnewslike'), - (), ('train', 'validation')], - 'multi_nli': [(), (), ('train', 'validation_matched', 'validation_mismatched')], - 'amazon_reviews_multi': [('all_languages', 'de', 'en', 'es', 'fr', 'ja', 'zh'), - (), ('train', 'test', 'validation')], - 'hate_speech18': [(), (), ()], - 'tweet_eval': [('emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment', - 'stance_abortion', 'stance_atheism', 'stance_climate', 'stance_feminist', - 'stance_hillary'), (), ('train', 'test', 'validation')], - 'timit_asr': [(), (), ('train', 'test')], - 'oscar': [('unshuffled_original_af', 'unshuffled_original_sq', 'unshuffled_original_als', - 'unshuffled_original_am', 'unshuffled_original_ar', 'unshuffled_original_an', - 'unshuffled_original_hy', 'unshuffled_original_as', 'unshuffled_original_ast', - 'unshuffled_original_av', 'unshuffled_original_az', 'unshuffled_original_ba', - 'unshuffled_original_eu', 'unshuffled_original_bar', 'unshuffled_original_be', - 'unshuffled_original_bn', 'unshuffled_original_bh', 'unshuffled_original_bpy', - 'unshuffled_original_bs', 'unshuffled_original_br', 'unshuffled_original_bg', - 'unshuffled_original_my', 'unshuffled_original_ca', 'unshuffled_original_ceb', - 'unshuffled_original_bcl', 'unshuffled_original_km', 'unshuffled_original_ckb', - 'unshuffled_original_cbk', 'unshuffled_original_ce', 'unshuffled_original_zh', - 'unshuffled_original_cv', 'unshuffled_original_kw', 'unshuffled_original_hr', - 'unshuffled_original_cs', 'unshuffled_original_da', 'unshuffled_original_dv', - 'unshuffled_original_diq', 'unshuffled_original_nl', 'unshuffled_original_mhr', - 'unshuffled_original_arz', 'unshuffled_original_eml', 'unshuffled_original_en', - 'unshuffled_original_myv', 'unshuffled_original_eo', 'unshuffled_original_et', - 'unshuffled_original_fi', 'unshuffled_original_fr', 'unshuffled_original_gl', - 'unshuffled_original_ka', 'unshuffled_original_de', 'unshuffled_original_gom', - 'unshuffled_original_gn', 'unshuffled_original_gu', 'unshuffled_original_ht', - 'unshuffled_original_he', 'unshuffled_original_hi', 'unshuffled_original_hu', - 'unshuffled_original_is', 'unshuffled_original_io', 'unshuffled_original_ilo', - 'unshuffled_original_id', 'unshuffled_original_ia', 'unshuffled_original_ie', - 'unshuffled_original_ga', 'unshuffled_original_it', 'unshuffled_original_ja', - 'unshuffled_original_jv', 'unshuffled_original_xal', 'unshuffled_original_kn', - 'unshuffled_original_krc', 'unshuffled_original_kk', 'unshuffled_original_ky', - 'unshuffled_original_kv', 'unshuffled_original_ko', 'unshuffled_original_ku', - 'unshuffled_original_lo', 
'unshuffled_original_la', 'unshuffled_original_lv', - 'unshuffled_original_lez', 'unshuffled_original_li', 'unshuffled_original_lt', - 'unshuffled_original_jbo', 'unshuffled_original_lmo', 'unshuffled_original_nds', - 'unshuffled_original_dsb', 'unshuffled_original_lb', 'unshuffled_original_mk', - 'unshuffled_original_mai', 'unshuffled_original_mg', 'unshuffled_original_ms', - 'unshuffled_original_ml', 'unshuffled_original_mt', 'unshuffled_original_mr', - 'unshuffled_original_mzn', 'unshuffled_original_min', 'unshuffled_original_xmf', - 'unshuffled_original_mwl', 'unshuffled_original_el', 'unshuffled_original_mn', - 'unshuffled_original_nah', 'unshuffled_original_nap', 'unshuffled_original_ne', - 'unshuffled_original_new', 'unshuffled_original_frr', 'unshuffled_original_lrc', - 'unshuffled_original_no', 'unshuffled_original_nn', 'unshuffled_original_oc', - 'unshuffled_original_or', 'unshuffled_original_os', 'unshuffled_original_pam', - 'unshuffled_original_pa', 'unshuffled_original_fa', 'unshuffled_original_pms', - 'unshuffled_original_pl', 'unshuffled_original_pt', 'unshuffled_original_ps', - 'unshuffled_original_qu', 'unshuffled_original_ro', 'unshuffled_original_rm', - 'unshuffled_original_bxr', 'unshuffled_original_ru', 'unshuffled_original_sa', - 'unshuffled_original_gd', 'unshuffled_original_sr', 'unshuffled_original_sh', - 'unshuffled_original_scn', 'unshuffled_original_sd', 'unshuffled_original_si', - 'unshuffled_original_sk', 'unshuffled_original_sl', 'unshuffled_original_so', - 'unshuffled_original_azb', 'unshuffled_original_es', 'unshuffled_original_su', - 'unshuffled_original_sw', 'unshuffled_original_sv', 'unshuffled_original_tl', - 'unshuffled_original_tg', 'unshuffled_original_ta', 'unshuffled_original_tt', - 'unshuffled_original_te', 'unshuffled_original_th', 'unshuffled_original_bo', - 'unshuffled_original_tr', 'unshuffled_original_tk', 'unshuffled_original_tyv', - 'unshuffled_original_ug', 'unshuffled_original_uk', 'unshuffled_original_hsb', - 'unshuffled_original_ur', 'unshuffled_original_uz', 'unshuffled_original_vec', - 'unshuffled_original_vi', 'unshuffled_original_vo', 'unshuffled_original_wa', - 'unshuffled_original_war', 'unshuffled_original_cy', 'unshuffled_original_fy', - 'unshuffled_original_mrj', 'unshuffled_original_pnb', 'unshuffled_original_wuu', - 'unshuffled_original_sah', 'unshuffled_original_yi', 'unshuffled_original_yo', - 'unshuffled_original_yue', 'unshuffled_deduplicated_af', - 'unshuffled_deduplicated_sq', 'unshuffled_deduplicated_als', - 'unshuffled_deduplicated_am', 'unshuffled_deduplicated_ar', - 'unshuffled_deduplicated_an', 'unshuffled_deduplicated_hy', - 'unshuffled_deduplicated_as', 'unshuffled_deduplicated_ast', - 'unshuffled_deduplicated_av', 'unshuffled_deduplicated_az', - 'unshuffled_deduplicated_ba', 'unshuffled_deduplicated_eu', - 'unshuffled_deduplicated_bar', 'unshuffled_deduplicated_be', - 'unshuffled_deduplicated_bn', 'unshuffled_deduplicated_bh', - 'unshuffled_deduplicated_bpy', 'unshuffled_deduplicated_bs', - 'unshuffled_deduplicated_br', 'unshuffled_deduplicated_bg', - 'unshuffled_deduplicated_my', 'unshuffled_deduplicated_ca', - 'unshuffled_deduplicated_ceb', 'unshuffled_deduplicated_bcl', - 'unshuffled_deduplicated_km', 'unshuffled_deduplicated_ckb', - 'unshuffled_deduplicated_cbk', 'unshuffled_deduplicated_ce', - 'unshuffled_deduplicated_zh', 'unshuffled_deduplicated_cv', - 'unshuffled_deduplicated_kw', 'unshuffled_deduplicated_hr', - 'unshuffled_deduplicated_cs', 'unshuffled_deduplicated_da', - 'unshuffled_deduplicated_dv', 
'unshuffled_deduplicated_diq', - 'unshuffled_deduplicated_nl', 'unshuffled_deduplicated_mhr', - 'unshuffled_deduplicated_arz', 'unshuffled_deduplicated_eml', - 'unshuffled_deduplicated_en', 'unshuffled_deduplicated_myv', - 'unshuffled_deduplicated_eo', 'unshuffled_deduplicated_et', - 'unshuffled_deduplicated_fi', 'unshuffled_deduplicated_fr', - 'unshuffled_deduplicated_gl', 'unshuffled_deduplicated_ka', - 'unshuffled_deduplicated_de', 'unshuffled_deduplicated_gom', - 'unshuffled_deduplicated_gn', 'unshuffled_deduplicated_gu', - 'unshuffled_deduplicated_ht', 'unshuffled_deduplicated_he', - 'unshuffled_deduplicated_hi', 'unshuffled_deduplicated_hu', - 'unshuffled_deduplicated_is', 'unshuffled_deduplicated_io', - 'unshuffled_deduplicated_ilo', 'unshuffled_deduplicated_id', - 'unshuffled_deduplicated_ia', 'unshuffled_deduplicated_ie', - 'unshuffled_deduplicated_ga', 'unshuffled_deduplicated_it', - 'unshuffled_deduplicated_ja', 'unshuffled_deduplicated_jv', - 'unshuffled_deduplicated_xal', 'unshuffled_deduplicated_kn', - 'unshuffled_deduplicated_krc', 'unshuffled_deduplicated_kk', - 'unshuffled_deduplicated_ky', 'unshuffled_deduplicated_kv', - 'unshuffled_deduplicated_ko', 'unshuffled_deduplicated_ku', - 'unshuffled_deduplicated_lo', 'unshuffled_deduplicated_la', - 'unshuffled_deduplicated_lv', 'unshuffled_deduplicated_lez', - 'unshuffled_deduplicated_li', 'unshuffled_deduplicated_lt', - 'unshuffled_deduplicated_jbo', 'unshuffled_deduplicated_lmo', - 'unshuffled_deduplicated_nds', 'unshuffled_deduplicated_dsb', - 'unshuffled_deduplicated_lb', 'unshuffled_deduplicated_mk', - 'unshuffled_deduplicated_mai', 'unshuffled_deduplicated_mg', - 'unshuffled_deduplicated_ms', 'unshuffled_deduplicated_ml', - 'unshuffled_deduplicated_mt', 'unshuffled_deduplicated_mr', - 'unshuffled_deduplicated_mzn', 'unshuffled_deduplicated_min', - 'unshuffled_deduplicated_xmf', 'unshuffled_deduplicated_mwl', - 'unshuffled_deduplicated_el', 'unshuffled_deduplicated_mn', - 'unshuffled_deduplicated_nah', 'unshuffled_deduplicated_nap', - 'unshuffled_deduplicated_ne', 'unshuffled_deduplicated_new', - 'unshuffled_deduplicated_frr', 'unshuffled_deduplicated_lrc', - 'unshuffled_deduplicated_no', 'unshuffled_deduplicated_nn', - 'unshuffled_deduplicated_oc', 'unshuffled_deduplicated_or', - 'unshuffled_deduplicated_os', 'unshuffled_deduplicated_pam', - 'unshuffled_deduplicated_pa', 'unshuffled_deduplicated_fa', - 'unshuffled_deduplicated_pms', 'unshuffled_deduplicated_pl', - 'unshuffled_deduplicated_pt', 'unshuffled_deduplicated_ps', - 'unshuffled_deduplicated_qu', 'unshuffled_deduplicated_ro', - 'unshuffled_deduplicated_rm', 'unshuffled_deduplicated_bxr', - 'unshuffled_deduplicated_ru', 'unshuffled_deduplicated_sa', - 'unshuffled_deduplicated_gd', 'unshuffled_deduplicated_sr', - 'unshuffled_deduplicated_sh', 'unshuffled_deduplicated_scn', - 'unshuffled_deduplicated_sd', 'unshuffled_deduplicated_si', - 'unshuffled_deduplicated_sk', 'unshuffled_deduplicated_sl', - 'unshuffled_deduplicated_so', 'unshuffled_deduplicated_azb', - 'unshuffled_deduplicated_es', 'unshuffled_deduplicated_su', - 'unshuffled_deduplicated_sw', 'unshuffled_deduplicated_sv', - 'unshuffled_deduplicated_tl', 'unshuffled_deduplicated_tg', - 'unshuffled_deduplicated_ta', 'unshuffled_deduplicated_tt', - 'unshuffled_deduplicated_te', 'unshuffled_deduplicated_th', - 'unshuffled_deduplicated_bo', 'unshuffled_deduplicated_tr', - 'unshuffled_deduplicated_tk', 'unshuffled_deduplicated_tyv', - 'unshuffled_deduplicated_ug', 'unshuffled_deduplicated_uk', - 
'unshuffled_deduplicated_hsb', 'unshuffled_deduplicated_ur', - 'unshuffled_deduplicated_uz', 'unshuffled_deduplicated_vec', - 'unshuffled_deduplicated_vi', 'unshuffled_deduplicated_vo', - 'unshuffled_deduplicated_wa', 'unshuffled_deduplicated_war', - 'unshuffled_deduplicated_cy', 'unshuffled_deduplicated_fy', - 'unshuffled_deduplicated_mrj', 'unshuffled_deduplicated_pnb', - 'unshuffled_deduplicated_wuu', 'unshuffled_deduplicated_sah', - 'unshuffled_deduplicated_yi', 'unshuffled_deduplicated_yo', - 'unshuffled_deduplicated_yue'), (), ('train', 'test', 'validation')], - 'sst': [('default', 'dictionary' 'ptb'), (), ('train', 'test', 'validation')], - 'hate_speech_offensive': [(), (), ('train')], - 'wikiann': [('ace', 'af', 'als', 'am', 'an', 'ang', 'ar', 'arc', 'arz', 'as', 'ast', 'ay', - 'az', 'ba', 'bar', 'bat-smg', 'be', 'be-x-old', 'bg', 'bh', 'bn', 'bo', 'br', - 'bs', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ckb', 'co', 'crh', 'cs', 'csb', 'cv', - 'cy', 'da', 'de', 'diq', 'dv', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', - 'fa', 'fi', 'fiu-vro', 'fo', 'fr', 'frr', 'fur', 'fy', 'ga', 'gan', 'gd', 'gl', - 'gn', 'gu', 'hak', 'he', 'hi', 'hr', 'hsb', 'hu', 'hy', 'ia', 'id', 'ig', 'ilo', - 'io', 'is', 'it', 'ja', 'jbo', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ksh', 'ku', - 'ky', 'la', 'lb', 'li', 'lij', 'lmo', 'ln', 'lt', 'lv', 'map-bms', 'mg', 'mhr', - 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'mwl', 'my', 'mzn', 'nap', 'nds', - 'ne', 'nl', 'nn', 'no', 'nov', 'oc', 'or', 'os', 'pa', 'pdc', 'pl', 'pms', 'pnb', - 'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'rw', 'sa', 'sah', 'scn', 'sco', 'sd', 'sh', - 'si', 'simple', 'sk', 'sl', 'so', 'sq', 'sr', 'su', 'sv', 'sw', 'szl', 'ta', 'te', - 'tg', 'th', 'tk', 'tl', 'tr', 'tt', 'ug', 'uk', 'ur', 'uz', 'vec', 'vep', 'vi', - 'vls', 'vo', 'wa', 'war', 'wuu', 'xmf', 'yi', 'yo', 'zea', 'zh', 'zh-classical', - 'zh-min-nan', 'zh-yue'), (), ('train', 'test', 'validation')], - 'wikipedia': [('20200501.de', '20200501.en', '20200501.fr', '20200501.frr', '20200501.it'), - (), ('train', 'test', 'validation')], - 'opus_euconst': [('cs-da', 'cs-de', 'cs-el'), (), ('train', 'test', 'validation')], - 'math_qa': [(), (), ('train', 'test', 'validation')], - 'swag': [(), (), ('train', 'test', 'validation')], - 'pubmed_qa': [(), (), ()], - 'scientific_papers': [('pubmed', 'arxiv'), (), ('train', 'test', 'validation')], - 'banking77': [(), (), ('train', 'test')], - 'cbt': [('raw', 'V', 'P', 'CN', 'NE'), (), ('train', 'test', 'validation')], - 'commonsense_qa': [(), (), ('train', 'test', 'validation')], - 'yelp_polarity': [(), (), ('train', 'test')], - 'stsb_multi_mt': [('en', 'de', 'es', 'ft', 'it', 'nl', 'pl', 'pt', 'ru', 'zh'), - (), ('train', 'test', 'validation')], - 'bookcorpus': [(), (), ('train')], - 'squad_adversarial': [('AddSent', 'AddOneSent'), (), ()], - 'scan': [('addprim_jump', 'addprim_turn_left', 'filler_num0', 'filler_num1', 'filler_num2'), - (), ('train', 'test')], - 'financial_phrasebank': [('sentences_50agree', 'sentences_66agree', 'sentences_75agree', - 'sentences_allagree'), (), ()], - 'craffel/openai_lambada': [(), (), ('test')], - 'wiki_dpr': [('psgs_w100.multiset.compressed', 'psgs_w100.multiset.exact', - 'psgs_w100.multiset.no_index', 'psgs_w100.nq.compressed', 'psgs_w100.nq.exact'), - (), ('train')], - 'amazon_us_reviews': [('Apparel_v1_00', 'Automotive_v1_00', 'Baby_v1_00', 'Beauty_v1_00', - 'Books_v1_00'), (), ('train')], - 'math_dataset': [('algebra__linear_1d', 'algebra__linear_1d_composed', 'algebra__linear_2d', - 
'algebra__linear_2d_composed', 'algebra__polynomial_roots'), - (), ('train', 'test')], - 'tab_fact': [(), (), ()], - 'subjqa': [('tripadvisor', 'restaurants', 'movies', 'books', 'electronics', 'grocery'), - (), ('train', 'test', 'dev')], - 'OTHERS': [(), (), ()]}, - 'SUBSET': None, - 'MODEL_COL': None, - 'SPLIT_TRAIN': None, - 'SPLIT_TEST': None, - 'DATASET': '', - 'TOKENZIER': None, - 'WRAPPED_MODEL': None, - 'TRAINING_DATA': None, - 'TASK_TYPE': None, - 'ATTACK': None, - 'ATTACK_RECIPES': ('None', 'A2T (A2T: Attack for Adversarial Training Recipe)', - 'BAE (BAE: BERT-Based Adversarial Examples)', 'BERT-Attack', 'CheckList', 'CLARE Recipe', - 'DeepWordBug', 'Faster Alzantot Genetic Algorithm', 'Alzantot Genetic Algorithm', 'HotFlip', - 'Improved Genetic Algorithm', 'Input Reduction', 'Kuleshov2017', 'MORPHEUS2020', - 'Pruthi2019: Combating with Robust Word Recognition', 'Particle Swarm Optimization', 'PWWS', - 'Seq2Sick', 'TextBugger', 'TextFooler (Is BERT Really Robust?)'), - 'ATTACK_MODEL': None, - 'PRED_FILE': None, - 'FILE': 'Small File(s)', - 'MODE': 'CSV', - 'CSP': None, - 'DATA_COLUMN': None, - 'MODEL_PATH': None, - 'PRED_FILEPATH': None, - 'PRED_DATA': [pd.DataFrame, list], - 'PATH_EXIST': False, - 'PRED_SEQ': None, - 'PREDS': [] -} diff --git a/pyfiles/pages/document_term_matrix.py b/pyfiles/pages/document_term_matrix.py index fef75df..0cacd46 100644 --- a/pyfiles/pages/document_term_matrix.py +++ b/pyfiles/pages/document_term_matrix.py @@ -7,11 +7,9 @@ # -------------------------------------------------------------------------------------------------------------------- # # | IMPORT RELEVANT LIBRARIES | # # -------------------------------------------------------------------------------------------------------------------- # -import io import os import pathlib import platform - import pandas as pd import streamlit as st diff --git a/pyfiles/pages/load_clean_visualise.py b/pyfiles/pages/load_clean_visualise.py index 96b9a2b..8f8c2eb 100644 --- a/pyfiles/pages/load_clean_visualise.py +++ b/pyfiles/pages/load_clean_visualise.py @@ -9,9 +9,7 @@ # -------------------------------------------------------------------------------------------------------------------- # # | IMPORT RELEVANT LIBRARIES | # # -------------------------------------------------------------------------------------------------------------------- # -import pathlib import re -import nltk import numpy as np import pandas as pd import pycountry @@ -22,10 +20,9 @@ from streamlit_tags import st_tags from texthero import stopwords from collections import Counter -from texthero import preprocessing import plotly.express as px from utils import csp_downloaders -from utils.helper import readFile, lemmatizeText, downloadCorpora, printDataFrame, prettyDownload +from utils.helper import readFile, lemmatizeText, printDataFrame, prettyDownload from st_aggrid import AgGrid, DataReturnMode, GridUpdateMode, GridOptionsBuilder diff --git a/pyfiles/pages/model_trainer.py b/pyfiles/pages/model_trainer.py deleted file mode 100644 index fd9f42d..0000000 --- a/pyfiles/pages/model_trainer.py +++ /dev/null @@ -1,892 +0,0 @@ -""" -This module allows the user to train models and to predict NLP data -""" - -# -------------------------------------------------------------------------------------------------------------------- # -# | IMPORT RELEVANT LIBRARIES | # -# -------------------------------------------------------------------------------------------------------------------- # -import os -import pandas as pd -import streamlit as st -import 
textattack.models.wrappers -import torch -import subprocess -import transformers - -from streamlit_tags import st_tags -from datetime import datetime -from config import trainer -from utils import csp_downloaders -from utils.helper import readFile - - -# -------------------------------------------------------------------------------------------------------------------- # -# | MAIN APP FUNCTIONALITY | # -# -------------------------------------------------------------------------------------------------------------------- # -def app(): - """ - Main function that will be called when the app is run - """ - - st.markdown('# NLP Model Trainer and Predictor') - st.markdown('This function allows you to train and create a ML Model to classify the topic of the News Article ' - 'passed on to the dataset. This function requires the use of the PyTorch Library to train and ' - 'evaluate your model. Ensure that you have downloaded and installed the correct PyTorch library ' - 'corresponding to your CUDA version.') - - st.markdown('---') - col1, col2 = st.columns(2) - with col1: - st.markdown('### PyTorch for CUDA 10.2') - if st.button('Install Relevant Packages', key='10.2'): - os.system('pip install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102' - ' -f https://download.pytorch.org/whl/cu102/torch_stable.html') - with col2: - st.markdown('### PyTorch for CUDA 11.3') - if st.button('Install Relevant Packages', key='11.3'): - os.system('pip install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113' - ' -f https://download.pytorch.org/whl/cu113/torch_stable.html') - st.markdown('\n\n') - - if st.button('Check if GPU is properly installed'): - st.info(f'GPU Installation Status: **{torch.cuda.is_available()}**') - if st.button('Check GPU used'): - try: - st.info(f'GPU Device **{torch.cuda.get_device_name(torch.cuda.current_device())}** in use.') - except AssertionError: - st.error('Your version of PyTorch is CPU-optimised. Download and install any of the above two ' - 'supported GPU-enabled PyTorch versions to use your GPU and silence this error.') - except Exception as ex: - st.error(ex) - - st.markdown('---') - st.markdown('## Mode Selector') - trainer['MODEL_MODE'] = st.selectbox('Select the actions you want to perform', ('Training', 'Evaluation')) - - if trainer['MODEL_MODE'] == 'Training': - # FLAGS - st.markdown('## Options\n\n' - '### Transformers Selection') - trainer['TRANSFORMERS_SELECTION'] = st.selectbox('Choose Transformers Auto Model Class to Use', - options=trainer['TRANSFORMERS_CHOICES'], - help='Note that this selection is important as failure to ' - 'use the correct class will result in errors when ' - 'running the Training step.', - key='transformers') - - st.markdown('### Training Parameters') - trainer['API'] = st.checkbox('Use Training API?', - help='Note that with this option selected, you must ensure that your GPU has ' - 'sufficient GPU memory to run the networks/models you selected. 
If you ' - 'are unsure, it is better to use the Command Line Argument API to fine ' - 'tune the model parameters before starting the training.', - value=True) - - if trainer['API']: - trainer['TRAINING_PARAMS'] = st.multiselect('Select Training Parameters', - ('num_epochs', 'num_clean_epochs', 'attack_epoch_interval', - 'early_stopping_epochs', 'learning_rate', - 'num_warmup_steps', - 'weight_decay', 'per_device_train_batch_size', - 'per_device_eval_batch_size', - 'gradient_accumulation_steps', 'random_seed', 'parallel', - 'load_best_model_at_end', 'alpha', - 'num_train_adv_examples', 'query_budget_train', - 'attack_num_workers_per_device', 'output_dir', - 'checkpoint_interval_steps', 'checkpoint_interval_epochs', - 'save_last', 'log_to_tb', 'tb_log_dir', 'log_to_wandb', - 'wandb_project', 'logging_interval_step'), - default=('num_epochs', 'per_device_train_batch_size')) - else: - trainer['TRAINING_PARAMS'] = st.multiselect('Select Training Parameters', - ('attack', 'model_max_length', - 'model_num_labels', 'dataset_train_split', - 'dataset_eval_split', 'filter_train_by_labels', - 'filter_eval_by_labels', 'num_epochs', 'num_clean_epochs', - 'attack_epoch_interval', 'early_stopping_epochs', - 'learning_rate', 'num_warmup_steps', - 'weight_decay', 'per_device_train_batch_size', - 'per_device_eval_batch_size', - 'gradient_accumulation_steps', 'random_seed', 'parallel', - 'load_best_model_at_end', 'alpha', - 'num_train_adv_examples', 'query_budget_train', - 'attack_num_workers_per_device', 'output_dir', - 'checkpoint_interval_steps', 'checkpoint_interval_epochs', - 'save_last', 'log_to_tb', 'tb_log_dir', 'log_to_wandb', - 'wandb_project', 'logging_interval_step'), - default=('model_max_length', 'num_epochs', - 'per_device_train_batch_size', 'model_num_labels')) - - # DEFINE PARAMETERS - if 'attack' in trainer['TRAINING_PARAMS']: - trainer['attack'] = st.text_input('Attack string', key='attack') - else: - trainer['attack'] = None - - if 'model_max_length' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define Model Max Length'): - trainer['model_max_length'] = st.number_input('Model Max Length', - min_value=1, - max_value=1000000, - value=64, - key='model_max_length') - else: - trainer['model_max_length'] = None - - if 'model_num_labels' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define Number of Labels'): - trainer['model_num_labels'] = st.number_input('Number of Labels', - min_value=1, - max_value=1000000, - value=1, - key='model_num_labels') - else: - trainer['model_num_labels'] = None - - if 'filter_train_by_labels' in trainer['TRAINING_PARAMS']: - trainer['filter_train_by_labels'] = st_tags(label='Filter Train Data By Labels', - key='filter_train', - text='Press Enter to add in more labels...', - maxtags=9999999) - else: - trainer['filter_train_by_labels'] = None - - if 'filter_eval_by_labels' in trainer['TRAINING_PARAMS']: - trainer['filter_eval_by_labels'] = st_tags(label='Filter Train Data By Labels', - key='filter_test', - text='Press Enter to add in more labels...', - maxtags=9999999) - else: - trainer['filter_eval_by_labels'] = None - - if 'num_epochs' in trainer['TRAINING_PARAMS']: - trainer['num_epochs'] = st.number_input('Total number of epochs for training', - min_value=1, - max_value=1000000, - value=3, - key='num_epochs') - else: - if trainer['API']: - trainer['num_epochs'] = 3 - else: - trainer['num_epochs'] = None - - if 'num_clean_epochs' in trainer['TRAINING_PARAMS']: - trainer['num_clean_epochs'] = st.number_input('Number of epochs to train on just the original ' - 
'training dataset before adversarial training', - min_value=1, - max_value=1000000, - value=1, - key='num_clean_epochs') - else: - if trainer['API']: - trainer['num_clean_epochs'] = 1 - else: - trainer['num_clean_epochs'] = None - - if 'attack_epoch_interval' in trainer['TRAINING_PARAMS']: - trainer['attack_epoch_interval'] = st.number_input('Generate a new adversarial training set every ' - 'N epochs', - min_value=1, - max_value=1000000, - value=1, - key='attack_epoch_interval') - else: - if trainer['API']: - trainer['attack_epoch_interval'] = 1 - else: - trainer['attack_epoch_interval'] = None - - if 'early_stopping_epochs' in trainer['TRAINING_PARAMS']: - trainer['early_stopping_epochs'] = st.number_input('Number of epochs validation must increase ' - 'before stopping early', - min_value=1, - max_value=1000000, - value=1, - key='early_stopping_epochs') - else: - trainer['early_stopping_epochs'] = None - - if 'learning_rate' in trainer['TRAINING_PARAMS']: - trainer['learning_rate'] = st.number_input('Number of epochs validation must increase before ' - 'stopping early', - min_value=0., - max_value=1., - value=5e-5, - step=0.000001, - format='%.6f', - key='learning_rate') - else: - if trainer['API']: - trainer['learning_rate'] = 5e-5 - else: - trainer['learning_rate'] = None - - if 'num_warmup_steps' in trainer['TRAINING_PARAMS']: - if st.checkbox('Define in float?'): - trainer['num_warmup_steps'] = st.number_input('The number of steps for the warmup phase of ' - 'linear scheduler', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='num_warmup_steps') - else: - trainer['num_warmup_steps'] = st.number_input('The number of steps for the warmup phase of ' - 'linear scheduler', - min_value=1, - max_value=1000000, - value=500, - key='num_warmup_steps') - else: - if trainer['API']: - trainer['num_warmup_steps'] = 500 - else: - trainer['num_warmup_steps'] = None - - if 'weight_decay' in trainer['TRAINING_PARAMS']: - trainer['weight_decay'] = st.number_input('Weight decay (L2 penalty)', - min_value=0., - max_value=1., - value=0.01, - step=0.01, - format='%.2f', - key='weight_decay') - else: - if trainer['API']: - trainer['weight_decay'] = 0.01 - else: - trainer['weight_decay'] = None - - if 'per_device_train_batch_size' in trainer['TRAINING_PARAMS']: - trainer['per_device_train_batch_size'] = st.number_input('The batch size per GPU/CPU for training', - min_value=1, - max_value=1000000, - value=8, - key='per_device_train_batch_size') - else: - if trainer['API']: - trainer['per_device_train_batch_size'] = 8 - else: - trainer['per_device_train_batch_size'] = None - - if 'per_device_eval_batch_size' in trainer['TRAINING_PARAMS']: - trainer['per_device_eval_batch_size'] = st.number_input('The batch size per GPU/CPU for evaluation', - min_value=1, - max_value=1000000, - value=32, - key='per_device_eval_batch_size') - else: - if trainer['API']: - trainer['per_device_eval_batch_size'] = 32 - else: - trainer['per_device_eval_batch_size'] = None - - if 'gradient_accumulation_steps' in trainer['TRAINING_PARAMS']: - trainer['gradient_accumulation_steps'] = st.number_input('Number of updates steps to accumulate ' - 'the gradients before performing a ' - 'backward/update pass', - min_value=1, - max_value=1000000, - value=32, - key='gradient_accumulation_steps') - else: - if trainer['API']: - trainer['gradient_accumulation_steps'] = 1 - else: - trainer['gradient_accumulation_steps'] = None - - if 'random_seed' in trainer['TRAINING_PARAMS']: - trainer['random_seed'] = 
st.number_input('Random seed for reproducibility', - min_value=1, - max_value=1000000, - value=786, - key='random_seed') - else: - if trainer['API']: - trainer['random_seed'] = 786 - else: - trainer['random_seed'] = None - - if 'parallel' in trainer['TRAINING_PARAMS']: - trainer['parallel'] = st.checkbox('Use Multiple GPUs using torch.DataParallel class?', - value=False, - key='parallel') - else: - if trainer['API']: - trainer['parallel'] = False - else: - trainer['parallel'] = None - - if 'load_best_model_at_end' in trainer['TRAINING_PARAMS']: - trainer['load_best_model_at_end'] = st.checkbox('keep track of the best model across training and ' - 'load it at the end', - value=False, - key='parallel') - else: - trainer['load_best_model_at_end'] = False - - if 'alpha' in trainer['TRAINING_PARAMS']: - trainer['alpha'] = st.number_input('The weight for adversarial loss', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='alpha') - else: - if trainer['API']: - trainer['alpha'] = 1.0 - else: - trainer['alpha'] = None - - if 'num_train_adv_examples' in trainer['TRAINING_PARAMS']: - if st.checkbox('Use Float Parameters?'): - trainer['num_train_adv_examples'] = st.number_input('The number of samples to successfully ' - 'attack when generating adversarial ' - 'training set before start of every epoch', - min_value=0., - max_value=1., - value=0.50, - step=0.001, - format='%.3f', - key='num_train_adv_examples') - else: - trainer['num_train_adv_examples'] = st.number_input('The number of samples to successfully ' - 'attack when generating adversarial ' - 'training set before start of every epoch', - min_value=1, - max_value=1000000, - value=8, - key='per_device_train_batch_size') - else: - if trainer['API']: - trainer['num_train_adv_examples'] = -1 - else: - trainer['num_train_adv_examples'] = None - - if 'query_budget_train' in trainer['TRAINING_PARAMS']: - if st.checkbox('Set Max Query Budget?', value=False): - trainer['query_budget_train'] = st.number_input('The max query budget to use when generating ' - 'adversarial training set', - min_value=1, - max_value=1000000, - value=1, - key='query_budget_train') - else: - trainer['query_budget_train'] = None - - if 'attack_num_workers_per_device' in trainer['TRAINING_PARAMS']: - if st.checkbox('Set Number of Worker Process to run attack?', value=False): - trainer['attack_num_workers_per_device'] = st.number_input('Number of worker processes to run ' - 'per device for attack', - min_value=1, - max_value=1000000, - value=1, - key='attack_num_workers_per_device') - else: - if trainer['API']: - trainer['attack_num_workers_per_device'] = 1 - else: - trainer['attack_num_workers_per_device'] = None - - if 'output_dir' in trainer['TRAINING_PARAMS']: - dt = datetime.now() - trainer['output_dir'] = st.text_input('Directory to output training logs and checkpoints', - value=f'/outputs/{dt.strftime("%Y-%m-%d-%H-%M-%S-%f")}', - key='output_dir') - else: - trainer['output_dir'] = f'/outputs/{datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")}' - - if 'checkpoint_interval_steps' in trainer['TRAINING_PARAMS']: - if st.checkbox('Save Model Checkpoint after every N updates?'): - trainer['checkpoint_interval_steps'] = st.number_input('Save after N updates', - min_value=1, - max_value=1000000, - value=1, - key='checkpoint_interval_steps') - else: - trainer['checkpoint_interval_steps'] = None - - if 'checkpoint_interval_epochs' in trainer['TRAINING_PARAMS']: - if st.checkbox('Save Model Checkpoint after every N epochs?'): - 
trainer['checkpoint_interval_epochs'] = st.number_input('Save after N epochs', - min_value=1, - max_value=1000000, - value=1, - key='checkpoint_interval_epochs') - else: - trainer['checkpoint_interval_epochs'] = None - - if 'save_last' in trainer['TRAINING_PARAMS']: - trainer['save_last'] = st.checkbox('Save the model at end of training', - value=True, - key='save_last') - else: - if trainer['API']: - trainer['save_last'] = True - else: - trainer['save_last'] = None - - if 'log_to_tb' in trainer['TRAINING_PARAMS']: - trainer['log_to_tb'] = st.checkbox('Log to Tensorboard', - value=False, - key='log_to_tb') - else: - if trainer['API']: - trainer['log_to_tb'] = False - else: - trainer['log_to_tb'] = None - - if 'tb_log_dir' in trainer['TRAINING_PARAMS']: - trainer['tb_log_dir'] = st.text_input('Directory to output training logs and checkpoints', - value=r'./runs', - key='tb_log_dir') - else: - trainer['tb_log_dir'] = r'./runs' - - if 'log_to_wandb' in trainer['TRAINING_PARAMS']: - trainer['log_to_wandb'] = st.checkbox('Log to Wandb', - value=False, - key='log_to_wandb') - else: - if trainer['API']: - trainer['log_to_wandb'] = False - else: - trainer['log_to_wandb'] = None - - if 'wandb_project' in trainer['TRAINING_PARAMS']: - trainer['wandb_project'] = st.text_input('Name of Wandb project for logging', - value=r'textattack', - key='wandb_project') - else: - if trainer['API']: - trainer['wandb_project'] = 'textattack' - else: - trainer['wandb_project'] = None - - if 'logging_interval_step' in trainer['TRAINING_PARAMS']: - trainer['logging_interval_step'] = st.number_input('Log to Tensorboard/Wandb every N training ' - 'steps', - min_value=1, - max_value=1000000, - value=1, - key='logging_interval_step') - else: - if trainer['API']: - trainer['logging_interval_step'] = 1 - else: - trainer['logging_interval_step'] = None - - if st.checkbox('Attack Model with confusion datasets?', value=False): - trainer['ATTACK'] = st.selectbox('Choose Attack recipes to execute on Model', - trainer['ATTACK_RECIPES']) - if trainer['ATTACK'] == 'None': - trainer['ATTACK_MODEL'] = None - - st.markdown('### Model and Data Selection') - col, col_ = st.columns(2) - trainer['MODEL'] = col.selectbox('Choose Model to Use', - trainer['ML_POSSIBLE_PICKS'], - key='mdl') - trainer['DATASET'] = col.selectbox('Choose Dataset to Use', - trainer['DATASET_POSSIBLE_PICKS'], - help='Due to the sheer number of datasets availble on HuggingFace, ' - 'we have only provided the top 100 datasets on the website.', - key='datasets') - - trainer['TASK_TYPE'] = col.selectbox('Choose Task for Model to Complete', ('classification', 'regression')) - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][0]) != 0: - trainer['SUBSET'] = col_.selectbox('Select Subset of Data to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][0]) - else: - trainer['SUBSET'] = None - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][1]) != 0: - trainer['MODEL_COL'] = col_.selectbox('Select Data Columns to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][1], - key='column_dat') - else: - trainer['MODEL_COL'] = None - - if len(trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2]) > 0: - trainer['SPLIT_TRAIN'] = col_.selectbox('Select Training Split to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2], - key='train') - trainer['SPLIT_TEST'] = col_.selectbox('Select Testing Split to Use', - trainer['SUBSET_MAPPINGS'][trainer['DATASET']][2], - key='test') - if trainer['SPLIT_TRAIN'] == trainer['SPLIT_TEST']: - st.warning('**Warning**: Your 
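The subset, column and split selectboxes above are driven by the `SUBSET_MAPPINGS` table removed from `config.py`, whose entries follow the `dataset -> [(subsets), (dataset_columns), (splits)]` layout noted in its comment. A small sketch of how such a lookup can feed the UI, using two entries copied from that table:

```python
# Two entries taken from the removed SUBSET_MAPPINGS table; the full table covers many more datasets.
SUBSET_MAPPINGS = {
    'glue': [('cola', 'sst2', 'mrpc'), (), ('train', 'test', 'validation')],
    'squad': [(), ('question', 'title', 'id', 'context', 'answers'), ('train', 'validation')],
}

def choices_for(dataset: str) -> dict:
    """Return the selectbox options for a dataset; None means the widget is skipped."""
    subsets, columns, splits = SUBSET_MAPPINGS[dataset]
    return {
        'subset': subsets or None,    # empty tuple -> no subset selectbox is shown
        'columns': columns or None,
        'splits': splits or None,
    }

print(choices_for('glue'))
# {'subset': ('cola', 'sst2', 'mrpc'), 'columns': None, 'splits': ('train', 'test', 'validation')}
```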
Training and Testing Dataset should not be the same. Ensure that ' - 'you have selected the right dataset to use for your model.') - else: - st.warning('**Warning:** This dataset does not have data split properly. You may wish to use another ' - 'dataset or to edit the dataset before passing it into the model for training.') - trainer['SPLIT_TRAIN'] = None - trainer['SPLIT_TEST'] = None - - with st.expander('Dataset Explorer'): - st.markdown('### Dataset Explorer\n' - 'Use the above flags to define the Dataset to download and explore.') - st.info(f'**Current Dataset Chosen**: {trainer["DATASET"]}') - if st.button(f'Explore {trainer["DATASET"]}'): - train = textattack.datasets.HuggingFaceDataset(name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TRAIN']) - test = textattack.datasets.HuggingFaceDataset(name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TEST']) - st.markdown(f'### Training Data\n\n' - f'**First Entry**: {train[0]}\n\n' - f'**Last Entry**: {train[-1]}\n\n' - f'**Length of Dataset**: {len(train)}') - st.markdown(f'### Testing Data\n\n' - f'**First Entry**: {test[0]}\n\n' - f'**Last Entry**: {test[-1]}\n\n' - f'**Length of Dataset**: {len(test)}') - - st.markdown('## Begin Training\n\n' - 'Kindly ensure that the models you have chosen above is compatible with the dataset. Failure to ' - 'do so will result in errors.') - if st.button('Proceed'): - if trainer['API']: - # transformers model selector - st.info(f'Loading {trainer["TRANSFORMERS_SELECTION"]} Class...') - if trainer['TRANSFORMERS_SELECTION'] == 'Pre Training': - trainer['ML_MODEL'] = transformers.AutoModelForPreTraining.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'CausalLM': - trainer['ML_MODEL'] = transformers.AutoModelForCausalLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'MaskedLM': - trainer['ML_MODEL'] = transformers.AutoModelForMaskedLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'Seq2SeqLM': - trainer['ML_MODEL'] = transformers.AutoModelForSeq2SeqLM.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'SequenceClassification': - trainer['ML_MODEL'] = transformers.AutoModelForSequenceClassification.from_pretrained( - trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'MultipleChoice': - trainer['ML_MODEL'] = transformers.AutoModelForMultipleChoice.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'NextSentencePrediction': - trainer['ML_MODEL'] = transformers.AutoModelForNextSentencePrediction.from_pretrained( - trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'TokenClassificaition': - trainer['ML_MODEL'] = transformers.AutoModelForTokenClassification.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'QuestionAnswering': - trainer['ML_MODEL'] = transformers.AutoModelForQuestionAnswering.from_pretrained(trainer['MODEL']) - elif trainer['TRANSFORMERS_SELECTION'] == 'TableQuestionAnswering': - trainer['ML_MODEL'] = transformers.AutoModelForTableQuestionAnswering.from_pretrained( - trainer['MODEL']) - - trainer['TOKENIZER'] = transformers.AutoTokenizer.from_pretrained(trainer['MODEL']) - trainer['WRAPPED_MODEL'] = textattack.models.wrappers.HuggingFaceModelWrapper(trainer['ML_MODEL'], - trainer['TOKENIZER']) - trainer['TRAINING_DATA'] = 
textattack.datasets.HuggingFaceDataset( - name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TRAIN'] - ) - trainer['EVAL_DATA'] = textattack.datasets.HuggingFaceDataset( - name_or_dataset=trainer['DATASET'], - subset=trainer['SUBSET'], - dataset_columns=trainer['MODEL_COL'], - split=trainer['SPLIT_TEST'] - ) - - if trainer['ATTACK'] != 'None': - if trainer['ATTACK'] == 'A2T (A2T: Attack for Adversarial Training Recipe)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.A2TYoo2021.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'BAE (BAE: BERT-Based Adversarial Examples)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.BAEGarg2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'BERT-Attack': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.BERTAttackLi2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'CheckList': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.CheckList2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'CLARE Recipe': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.CLARE2020.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'DeepWordBug': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.DeepWordBugGao2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Faster Alzantot Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.FasterGeneticAlgorithmJia2019. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Alzantot Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.GeneticAlgorithmAlzantot2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'HotFlip': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.HotFlipEbrahimi2017. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Improved Genetic Algorithm': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.IGAWang2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Input Reduction': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.InputReductionFeng2018. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Kuleshov2017': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Kuleshov2017.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'MORPHEUS2020': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.MorpheusTan2020.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Pruthi2019: Combating with Robust Word Recognition': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Pruthi2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Particle Swarm Optimization': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.PSOZang2020.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'PWWS': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.PWWSRen2019.build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'Seq2Sick': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.Seq2SickCheng2018BlackBox. \ - build(trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'TextBugger': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.TextBuggerLi2018.build( - trainer['WRAPPED_MODEL']) - elif trainer['ATTACK'] == 'TextFooler (Is BERT Really Robust?)': - trainer['ATTACK_MODEL'] = textattack.attack_recipes.TextFoolerJin2019. 
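The removed trainer resolves the chosen Transformers Auto class and TextAttack attack recipe through long `if`/`elif` chains. A dictionary keyed by the UI label is a more compact way to express the same dispatch; a partial sketch assuming `transformers` and `textattack` are installed (only a few of the many choices are shown):

```python
import textattack
import textattack.models.wrappers
import transformers

# Partial label-to-constructor mappings; the removed module covers many more choices.
AUTO_CLASSES = {
    'SequenceClassification': transformers.AutoModelForSequenceClassification,
    'CausalLM': transformers.AutoModelForCausalLM,
    'QuestionAnswering': transformers.AutoModelForQuestionAnswering,
}
ATTACK_RECIPES = {
    'TextFooler (Is BERT Really Robust?)': textattack.attack_recipes.TextFoolerJin2019,
    'BAE (BAE: BERT-Based Adversarial Examples)': textattack.attack_recipes.BAEGarg2019,
}

def build_wrapped_model(model_name: str, auto_class_label: str):
    """Load the chosen Auto class and wrap it for TextAttack."""
    model = AUTO_CLASSES[auto_class_label].from_pretrained(model_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    return textattack.models.wrappers.HuggingFaceModelWrapper(model, tokenizer)

def build_attack(recipe_label: str, wrapped_model):
    """Instantiate the selected attack recipe, or return None when no attack is wanted."""
    recipe = ATTACK_RECIPES.get(recipe_label)
    return recipe.build(wrapped_model) if recipe is not None else None
```

With this layout an unknown label surfaces as a `KeyError` (or a `None` attack) instead of silently falling through an unhandled branch.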
\ - build(trainer['WRAPPED_MODEL']) - - trainer['TRAINING_ARGS'] = textattack.TrainingArgs( - num_epochs=trainer['num_epochs'], - num_clean_epochs=trainer['num_clean_epochs'], - attack_epoch_interval=trainer['attack_epoch_interval'], - early_stopping_epochs=trainer['early_stopping_epochs'], - learning_rate=trainer['learning_rate'], - num_warmup_steps=trainer['num_warmup_steps'], - weight_decay=trainer['weight_decay'], - per_device_train_batch_size=trainer['per_device_train_batch_size'], - per_device_eval_batch_size=trainer['per_device_eval_batch_size'], - gradient_accumulation_steps=trainer['gradient_accumulation_steps'], - random_seed=trainer['random_seed'], - parallel=trainer['parallel'], - load_best_model_at_end=trainer['load_best_model_at_end'], - alpha=trainer['alpha'], - num_train_adv_examples=trainer['num_train_adv_examples'], - query_budget_train=trainer['query_budget_train'], - attack_num_workers_per_device=trainer['attack_num_workers_per_device'], - output_dir=trainer['output_dir'], - checkpoint_interval_steps=trainer['checkpoint_interval_steps'], - checkpoint_interval_epochs=trainer['checkpoint_interval_epochs'], - save_last=trainer['save_last'], - log_to_tb=trainer['log_to_tb'], - tb_log_dir=trainer['tb_log_dir'], - log_to_wandb=trainer['log_to_wandb'], - wandb_project=trainer['wandb_project'], - logging_interval_step=trainer['logging_interval_step'] - ) - trainer['TRAINER'] = textattack.Trainer( - model_wrapper=trainer['WRAPPED_MODEL'], - task_type=trainer['TASK_TYPE'], - attack=trainer['ATTACK_MODEL'], - train_dataset=trainer['TRAINING_DATA'], - eval_dataset=trainer['EVAL_DATA'], - training_args=trainer['TRAINING_ARGS'] - ) - - with st.spinner('Training Model... Refer to your Terminal for more information...'): - try: - trainer['TRAINER'].train() - except Exception as ex: - st.error(ex) - else: - st.success(f'Successfully trained model! Model saved in {os.getcwd()}{trainer["output_dir"]}.') - - else: - with st.spinner('Training Model... 
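Stripped of the Streamlit widgets, the Training-API path above reduces to a handful of TextAttack calls. A minimal runnable sketch, using `bert-base-uncased` and the `rotten_tomatoes` dataset purely as placeholder choices:

```python
import textattack
import textattack.models.wrappers
import transformers

model_name = 'bert-base-uncased'   # placeholder checkpoint
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
wrapped = textattack.models.wrappers.HuggingFaceModelWrapper(model, tokenizer)

train_ds = textattack.datasets.HuggingFaceDataset('rotten_tomatoes', split='train')
eval_ds = textattack.datasets.HuggingFaceDataset('rotten_tomatoes', split='test')

training_args = textattack.TrainingArgs(
    num_epochs=3,
    per_device_train_batch_size=8,
    output_dir='./outputs/example-run',   # hypothetical output directory
)

trainer = textattack.Trainer(
    model_wrapper=wrapped,
    task_type='classification',
    attack=None,                          # or an attack recipe built as sketched above
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    training_args=training_args,
)
trainer.train()
```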
Refer to your Terminal for more information...'): - var_list = ['textattack', 'train'] - maps = { - 'model_name_or_path': ['--model-name-or-path', trainer['MODEL']], - 'dataset': ['--dataset', trainer['DATASET']], - 'attack': ['--attack', trainer['attack']], - 'task_type': ['--task-type', trainer['TASK_TYPE']], - 'model_max_length': ['--model-max-length', trainer['model_max_length']], - 'model_num_labels': ['--model-num-labels', trainer['model_num_labels']], - 'dataset_train_split': ['--dataset-train-split', trainer['dataset_train_split']], - 'dataset_eval_split': ['--dataset-eval-split', trainer['dataset_eval_split']], - 'filter_train_by_labels': ['--filter-train-by-labels', trainer['filter_train_by_labels']], - 'filter_eval_by_labels': ['--filter-eval-by-labels', trainer['filter_eval_by_labels']], - 'num_epochs': ['--num-epochs', trainer['num_epochs']], - 'num_clean_epochs': ['--num-clean-epochs', trainer['num_clean_epochs']], - 'attack_epoch_interval': ['--attack-epoch-interval', trainer['attack_epoch_interval']], - 'early_stopping_epochs': ['--early-stopping-epochs', trainer['early_stopping_epochs']], - 'learning_rate': ['--learning-rate', trainer['learning_rate']], - 'num_warmup_steps': ['--num-warmup-steps', trainer['num_warmup_steps']], - 'weight_decay': ['--weight-decay', trainer['weight_decay']], - 'per_device_train_batch_size': ['--per-device-train-batch-size', - trainer['per_device_train_batch_size']], - 'per_device_eval_batch_size': ['--per-device-eval-batch-size', - trainer['per_device_eval_batch_size']], - 'gradient_accumulation_steps': ['--gradient-accumulation-steps', - trainer['gradient_accumulation_steps']], - 'random_seed': ['--random-seed', trainer['random_seed']], - 'parallel': ['--parallel', trainer['parallel']], - 'load_best_model_at_end': ['--load-best-model-at-end', trainer['load_best_model_at_end']], - 'alpha': ['--alpha', trainer['alpha']], - 'num_train_adv_examples': ['--num-train-adv-examples', trainer['num_train_adv_examples']], - 'query_budget_train': ['--query-budget-train', trainer['query_budget_train']], - 'attack_num_workers_per_device': ['--attack-num-workers-per-device', - trainer['attack_num_workers_per_device']], - 'output_dir': ['--output-dir', trainer['output_dir']], - 'checkpoint_interval_steps': ['--checkpoint-interval-steps', - trainer['checkpoint_interval_steps']], - 'checkpoint_interval_epochs': ['--checkpoint-interval-epochs', - trainer['checkpoint_interval_epochs']], - 'save_last': ['--save-last', trainer['save_last']], - 'log_to_tb': ['--log-to-tb', trainer['log_to_tb']], - 'tb_log_dir': ['--tb-log-dir', trainer['tb_log_dir']], - 'log_to_wandb': ['--log-to-wandb', trainer['log_to_wandb']], - 'wandb_project': ['--wandb-project', trainer['wandb_project']], - 'logging_interval_step': ['--logging-interval-step', - trainer['logging_interval_step']] - } - - # only include variables that are defined - maps = {key: value for key, value in maps.items() if value[1] is not None} - for k, v in maps.items(): - var_list.extend(v) - - var_list = [str(iter_) for iter_ in var_list if type(iter_) is not bool] - print(var_list) - - # run the command - st.markdown('### Outputs') - try: - results = subprocess.run(var_list, capture_output=True) - except Exception as ex: - st.error(ex) - else: - st.markdown('#### Outputs') - try: - results.check_returncode() - st.write(results.stdout) - except subprocess.CalledProcessError: - st.error('Error: Command cannot be executed. 
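When the Training API is not used, the removed module instead shells out to the `textattack train` command. A short sketch of that pattern, dropping flags whose values are unset and converting the rest to strings before calling `subprocess.run` (the flag names follow the removed code; the parameter values here are placeholders):

```python
import subprocess

# Hypothetical parameter values; None means "flag not set".
params = {
    '--model-name-or-path': 'bert-base-uncased',
    '--dataset': 'rotten_tomatoes',
    '--num-epochs': 3,
    '--per-device-train-batch-size': 8,
    '--learning-rate': 5e-5,
    '--early-stopping-epochs': None,
}

cmd = ['textattack', 'train']
for flag, value in params.items():
    if value is None:
        continue                       # skip unset flags instead of passing the string "None"
    cmd.extend([flag, str(value)])     # subprocess expects strings, not ints/floats

result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
    print(result.stdout)
else:
    print(result.stderr)
```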
-    elif trainer['MODEL_MODE'] == 'Evaluation':
-        st.markdown('## Options')
-        trainer['SAVE'] = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name '
-                                                            'and content being downloaded again, a unique file '
-                                                            'identifier is tacked onto the filename.')
-        trainer['VERBOSE'] = st.checkbox('Display Outputs?')
-
-        if trainer['VERBOSE']:
-            trainer['VERBOSITY'] = st.slider('Data points',
-                                             key='Data points to display?',
-                                             min_value=0,
-                                             max_value=1000,
-                                             value=20,
-                                             help='Select 0 to display all Data Points')
-            trainer['ADVANCED_ANALYSIS'] = st.checkbox('Display Advanced DataFrame Statistics?',
-                                                       help='This option will analyse your DataFrame and display '
-                                                            'advanced statistics on it. Note that this will require '
-                                                            'some time and processing power to complete. Deselect '
-                                                            'this option if you do not require it.')
-
-        st.markdown('## Upload Prediction Data and Model\n')
-        st.markdown('### Prediction Data')
-        col3, col3_ = st.columns(2)
-
-        trainer['FILE'] = col3.selectbox('Select the Source of the File to Load', ('Local', 'Online'),
-                                         help='Choose "Local" if you wish to upload a file from your machine or '
-                                              'choose "Online" if you wish to pull a file from any one of the '
-                                              'supported Cloud Service Providers.')
-        trainer['MODE'] = col3_.selectbox('Define the Data Input Format', ('CSV', 'XLSX', 'PKL', 'JSON', 'HDF5'))
-
-        if trainer['FILE'] == 'Local':
-            trainer['PRED_FILEPATH'] = st.file_uploader(f'Load {trainer["MODE"]} File', type=[trainer['MODE']])
-            if trainer['PRED_FILEPATH'] is not None:
-                trainer['PRED_DATA'] = readFile(trainer['PRED_FILEPATH'], trainer['MODE'])
-                if not trainer['PRED_DATA'].empty:
-                    trainer['PRED_DATA'] = trainer['PRED_DATA'].astype(str)
-                    trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored',
-                                                          list(trainer['PRED_DATA'].columns))
-                    st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!')
-            else:
-                trainer['PRED_DATA'] = pd.DataFrame()
-
-        elif trainer['FILE'] == 'Online':
-            st.info(f'File Format Selected: **{trainer["MODE"]}**')
-            trainer['CSP'] = st.selectbox('CSP', ('Select a CSP', 'Azure', 'Amazon', 'Google'))
-
-            if trainer['CSP'] == 'Azure':
-                azure = csp_downloaders.AzureDownloader()
-                if azure.SUCCESSFUL:
-                    try:
-                        azure.downloadBlob()
-                        trainer['PRED_DATA'] = readFile(azure.AZURE_DOWNLOAD_PATH, trainer['MODE'])
-                    except Exception as ex:
-                        st.error(f'Error: {ex}. Try again.')
-
-                if not trainer['PRED_DATA'].empty:
-                    trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored',
-                                                          list(trainer['PRED_DATA'].columns))
-                    st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!')
-
-            elif trainer['CSP'] == 'Amazon':
-                aws = csp_downloaders.AWSDownloader()
-                if aws.SUCCESSFUL:
-                    try:
-                        aws.downloadFile()
-                        trainer['PRED_DATA'] = readFile(aws.AWS_FILE_NAME, trainer['MODE'])
-                    except Exception as ex:
-                        st.error(f'Error: {ex}. Try again.')
-
-                if not trainer['PRED_DATA'].empty:
-                    trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored',
-                                                          list(trainer['PRED_DATA'].columns))
-                    st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!')
-
-            elif trainer['CSP'] == 'Google':
-                gcs = csp_downloaders.GoogleDownloader()
-                if gcs.SUCCESSFUL:
-                    try:
-                        gcs.downloadBlob()
-                        trainer['PRED_DATA'] = readFile(gcs.GOOGLE_DESTINATION_FILE_NAME, trainer['MODE'])
-                    except Exception as ex:
-                        st.error(f'Error: {ex}. Try again.')
-
-                if not trainer['PRED_DATA'].empty:
-                    trainer['DATA_COLUMN'] = st.selectbox('Choose Column where Data is Stored',
-                                                          list(trainer['PRED_DATA'].columns))
-                    st.success(f'Data Loaded from {trainer["DATA_COLUMN"]}!')
-
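The evaluation page above leans on a `readFile` helper from `utils/helper.py` that is not shown in this diff. A hypothetical stand-in illustrating what such a helper needs to do for the formats offered in the selectbox (the function name and behaviour here are assumptions, not the project's actual implementation):

```python
# Hypothetical stand-in for the readFile helper used by the removed evaluation page:
# load an uploaded file (or a downloaded path) into a pandas DataFrame by format.
import pandas as pd

def read_file(path_or_buffer, fmt: str) -> pd.DataFrame:
    fmt = fmt.upper()
    if fmt == 'CSV':
        return pd.read_csv(path_or_buffer)
    if fmt == 'XLSX':
        return pd.read_excel(path_or_buffer, engine='openpyxl')
    if fmt == 'PKL':
        return pd.read_pickle(path_or_buffer)
    if fmt == 'JSON':
        return pd.read_json(path_or_buffer)
    if fmt == 'HDF5':
        # pd.read_hdf expects a filesystem path, not an in-memory buffer
        return pd.read_hdf(path_or_buffer)
    raise ValueError(f'Unsupported file format: {fmt}')
```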
-        st.markdown('### Model\n'
-                    'Due to the tendency for model files to be larger than the 200 MB limit of the File Uploader '
-                    'Widget, you will need to provide a path to the model. The following text input widget will '
-                    'display the current working directory where this app is launched from.')
-        trainer['MODEL_PATH'] = st.text_input('Key in the path to the model below',
-                                              value=os.getcwd(),
-                                              key='model_path')
-        if os.path.exists(trainer['MODEL_PATH']):
-            st.success(f'File Path {trainer["MODEL_PATH"]} exists!')
-            trainer['PATH_EXIST'] = True
-        else:
-            st.error(f'Error: {trainer["MODEL_PATH"]} is invalid!')
-            trainer['PATH_EXIST'] = False
-
-        # PREDICTIONS
-        st.markdown('## Prediction')
-        st.markdown('Ensure that all your data is properly loaded before proceeding.')
-
-        if st.button('Proceed?'):
-            if trainer['PATH_EXIST']:
-                trainer['PRED_DATA'] = trainer['PRED_DATA'][trainer['DATA_COLUMN']]
-                trainer['PRED_DATA'] = trainer['PRED_DATA'].to_list()
-
-                try:
-                    trainer['ML_MODEL'] = torch.load(trainer['MODEL_PATH'])
-                    predictions = trainer['ML_MODEL'](trainer['PRED_DATA'])
-                except Exception as ex:
-                    st.error(ex)
-                else:
-                    st.markdown('### Predicted Data')
-                    st.write(predictions)
-            else:
-                st.error('Error: Model File Path is not valid. Try again.')
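The prediction step above loads the model with `torch.load` and calls it directly on raw strings, which only works if the saved object bundles its own tokenizer. A hedged sketch of an alternative, assuming the training run left behind a Hugging Face-style model directory; the paths, file name and column name are placeholders:

```python
# Hedged sketch of the prediction step, assuming the trained model was saved as a
# Hugging Face model directory. "predictions.csv", "text" and "./trained_model"
# are placeholders, not names used by the app.
import pandas as pd
from transformers import pipeline

pred_data = pd.read_csv('predictions.csv')          # placeholder prediction dataset
texts = pred_data['text'].astype(str).to_list()     # pull the selected column out as a plain list

classifier = pipeline('text-classification', model='./trained_model', tokenizer='./trained_model')
predictions = classifier(texts)

for text, pred in zip(texts, predictions):
    print(f"{pred['label']} ({pred['score']:.3f}): {text[:60]}")
```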
diff --git a/pyfiles/pages/toolkit_nlp.py b/pyfiles/pages/toolkit_nlp.py
index 519e571..cb2e387 100644
--- a/pyfiles/pages/toolkit_nlp.py
+++ b/pyfiles/pages/toolkit_nlp.py
@@ -20,14 +20,11 @@
 import pyLDAvis.gensim_models
 import pyLDAvis.sklearn
 import streamlit.components.v1
-import textattack.models.wrappers
 import torch
-import tensorflow as tf
 import matplotlib.pyplot as plt
 import transformers
 
 from streamlit_tags import st_tags
-from datetime import datetime
 from config import toolkit
 from operator import itemgetter
 from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, AutoModelForSequenceClassification
diff --git a/requirements.txt b/requirements.txt
index 6fca680..59442fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,17 +27,8 @@
 transformers~=4.10.2
 pytorch-lightning==1.5.4
 pathlib~=1.0.1
 pyyaml~=5.4.1
-tensorflow~=2.7.0
-mlxtend~=0.19.0
 matplotlib~=3.4.3
-textattack~=0.3.4
 datetime~=4.3
 scikit-learn~=0.24.2
-tensorflow-text~=2.7.3
-fastapi~=0.70.1
-uvicorn~=0.16.0
-aiofiles~=0.8.0
-python-multipart~=0.0.5
+pillow~=9.0.0
 streamlit-tags~=1.2.8
-pillow~=8.3.2
-cython~=0.29.26
diff --git a/utils/helper.py b/utils/helper.py
index 8926461..cd7d583 100644
--- a/utils/helper.py
+++ b/utils/helper.py
@@ -6,7 +6,6 @@
 # | IMPORT RELEVANT LIBRARIES | #
 # -------------------------------------------------------------------------------------------------------------------- #
 import io
-import logging
 import os
 import typing
 import nltk
@@ -21,7 +20,6 @@
 import pickle
 import uuid
 import re
-import urllib.parse
 
 from collections import Counter
 from heapq import nlargest
@@ -29,7 +27,6 @@
 from PIL import Image
 from nltk.stem import WordNetLemmatizer
 from streamlit_pandas_profiling import st_profile_report
-from config import toolkit
 
 # -------------------------------------------------------------------------------------------------------------------- #
 # | DOWNLOAD DEPENDENCIES | #
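With tensorflow, textattack, fastapi and the other API-only dependencies dropped from `requirements.txt`, a quick way to confirm the remaining pins match the active environment is to compare them against the installed distributions. A small illustrative check using only the standard library; it reports versions rather than enforcing the `~=` specifiers:

```python
# Print the installed version (or MISSING) next to each requirement in requirements.txt.
# Illustrative only: it does not evaluate the ~= / == version specifiers.
from importlib import metadata

with open('requirements.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        name = line.split('~=')[0].split('==')[0].strip()
        try:
            installed = metadata.version(name)
        except metadata.PackageNotFoundError:
            installed = 'MISSING'
        print(f'{line:40s} -> installed: {installed}')
```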