From 9169955bfa9b50c8cdfd1b5a7d7aba98e7a70c8b Mon Sep 17 00:00:00 2001 From: asdfghjkxd <73705042+asdfghjkxd@users.noreply.github.com> Date: Sat, 4 Dec 2021 22:44:14 +0800 Subject: [PATCH] Multiple Fixes and Feature Additions Fixed bugs with file downloads and added a data modification feature into the app --- pyfiles/pages/document_term_matrix.py | 11 +- pyfiles/pages/load_clean_visualise.py | 289 ++++++++++++++++--- pyfiles/pages/toolkit_nlp.py | 296 +++++++++++++++++++------- 3 files changed, 413 insertions(+), 183 deletions(-) diff --git a/pyfiles/pages/document_term_matrix.py b/pyfiles/pages/document_term_matrix.py index b6161f5..2b1fe02 100644 --- a/pyfiles/pages/document_term_matrix.py +++ b/pyfiles/pages/document_term_matrix.py @@ -164,7 +164,9 @@ def app(): 'DataFrame visualising Python packages. There is no definitive way to increase the size of the ' 'DataFrame that can be printed out due to the inherent limitation on the size of the packets sent ' 'over to and from the Streamlit server.') - SAVE = st.checkbox('Save Outputs?') + SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' 'content being downloaded again, a unique file identifier is ' 'tacked onto the filename.') VERBOSE_DTM = st.checkbox('Display DataFrame of Document-Term Matrix?') if VERBOSE_DTM: VERBOSITY_DTM = st.slider('Data Points to Display for Document-Term Matrix?', @@ -185,9 +187,10 @@ def app(): 'This parameter is not the same as that above which controls the number of data points ' 'printed out for the raw DTM DataFrame; Select 0 to display all Data Points') ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', - help='This option will analyse your DataFrame and display advanced statistics ' - 'on it. Note that this will require some time and processing power to ' - 'complete. Deselect this option if this functionality is not required.') + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') # -------------------------------------------------------------------------------------------------------------------- # # | DOCUMENT-TERM MATRIX CREATION | #
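The new help text above documents the download fix this patch makes: saved outputs all land in a shared downloads directory, so a re-run that saves an identically named file would collide with the previous download. A minimal sketch of the naming scheme, assuming a module-level counter FC as these pages use (the helper function itself is illustrative, not part of the patch):

from pathlib import Path
import pandas as pd

DOWNLOAD_PATH = Path('downloads')  # directory the app serves saved outputs from
DOWNLOAD_PATH.mkdir(exist_ok=True)
FC = 0  # running file counter shared by every save on the page

def save_unique(df: pd.DataFrame, stem: str) -> Path:
    # Tack the counter onto the filename so two saves of the same output
    # never produce the same path, then bump the counter for the next save.
    global FC
    target = DOWNLOAD_PATH / f'{stem}_id{FC}.csv'
    df.to_csv(str(target), index=False)
    FC += 1
    return target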
diff --git a/pyfiles/pages/load_clean_visualise.py b/pyfiles/pages/load_clean_visualise.py index 5f4d919..5027467 100644 --- a/pyfiles/pages/load_clean_visualise.py +++ b/pyfiles/pages/load_clean_visualise.py @@ -27,6 +27,7 @@ import plotly.express as px from utils import csp_downloaders from utils.helper import readFile, lemmatizeText, downloadCorpora, printDataFrame +from st_aggrid import AgGrid, DataReturnMode, GridUpdateMode, GridOptionsBuilder # -------------------------------------------------------------------------------------------------------------------- # # | INITIAL SETUP | # @@ -88,6 +89,9 @@ QUERY_MODE = None QUERY_DATA = pd.DataFrame() FC = 0 +MOD_MODE = 'Country Extraction' +FIXED_KEY = True +HEIGHT = 400 # -------------------------------------------------------------------------------------------------------------------- # @@ -104,7 +108,7 @@ def app(): global FILE, MODE, DATA_PATH, DATA, CSP, CLEAN, CLEAN_MODE, SAVE, VERBOSE, VERBOSITY, CLEANED_DATA, \ CLEANED_DATA_TOKENIZED, SIMPLE_PIPELINE, ADVANCED_ANALYSIS, FINALISED_DATA_LIST, DATA_COLUMN, QUERY, \ TOKENIZE, EXTEND_STOPWORD, STOPWORD_LIST, ENGLISH_WORDS, FINALISE, ANALYSIS_MODE, WORLD_MAP, GLOBE_DATA, \ - GLOBE_FIG, QUERY_MODE, QUERY_DATA, MATCH, FC, QUERY_SUCCESS + GLOBE_FIG, QUERY_MODE, QUERY_DATA, MATCH, FC, QUERY_SUCCESS, MOD_MODE, FIXED_KEY, HEIGHT # -------------------------------------------------------------------------------------------------------------------- # # | INIT | # @@ -158,7 +162,7 @@ def app(): DATA_PATH = st.file_uploader(f'Load {MODE} File', type=[MODE]) if DATA_PATH is not None: DATA = readFile(DATA_PATH, MODE) - if not DATA.empty: + if not DATA.empty and MOD_MODE != 'Inplace Data Modification': DATA_COLUMN = st.selectbox('Choose Column where Data is Stored', list(DATA.columns)) st.success(f'Data Loaded from {DATA_COLUMN}!') else: @@ -179,7 +183,7 @@ def app(): DATA = pd.DataFrame() st.error(f'Error: {ex}. Try again.') - if not DATA.empty: + if not DATA.empty and MOD_MODE != 'Inplace Data Modification': DATA_COLUMN = st.selectbox('Choose Column where Data is Stored', list(DATA.columns)) st.success(f'Data Loaded from {DATA_COLUMN}!') @@ -193,7 +197,7 @@ def app(): DATA = pd.DataFrame() st.error(f'Error: {ex}. Try again.') - if not DATA.empty: + if not DATA.empty and MOD_MODE != 'Inplace Data Modification': DATA_COLUMN = st.selectbox('Choose Column where Data is Stored', list(DATA.columns)) st.success(f'Data Loaded from {DATA_COLUMN}!') @@ -207,7 +211,7 @@ def app(): DATA = pd.DataFrame() st.error(f'Error: {ex}. Try again.') - if not DATA.empty: + if not DATA.empty and MOD_MODE != 'Inplace Data Modification': DATA_COLUMN = st.selectbox('Choose Column where Data is Stored', list(DATA.columns)) st.success(f'Data Loaded from {DATA_COLUMN}!')
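A note on the mode guards added above: they compare with != because the is/is not operators test object identity, not value. Comparing against a string literal with is only appears to work when CPython happens to intern both strings, and Python 3.8+ flags it with a SyntaxWarning. A two-line demonstration of the difference:

# Built at runtime, so not interned to the same object as the literal below.
mode = ''.join(['Inplace ', 'Data ', 'Modification'])
print(mode == 'Inplace Data Modification')   # True  -- value equality
print(mode is 'Inplace Data Modification')   # False -- different objects (SyntaxWarning on 3.8+)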
# -------------------------------------------------------------------------------------------------------------------- # # | PROCESSING FLAGS | # # -------------------------------------------------------------------------------------------------------------------- # - st.markdown('## Flags\n' - 'Note that there is an size limit **(50 MB)** for the DataFrames that are printed to screen. If ' - 'you get an error telling you that the DataFrame size is too large to proceed, kindly lower the number ' - 'of data points you wish to visualise or download the file and visualise it through Excel or any other ' - 'DataFrame visualising Python packages. There is no definitive way to increase the size of the ' - 'DataFrame that can be printed out due to the inherent limitation on the size of the packets sent ' - 'over to and from the Streamlit server.') - SAVE = st.checkbox('Save Outputs?', help='Note: Only Simple and Complex Cleaning modes will produce any saved ' - 'outputs. If None mode is chosen, you will not be able to download ' - 'the outputs as it is assumed that you already possess that.') - VERBOSE = st.checkbox('Display Outputs?') - if VERBOSE: - VERBOSITY = st.slider('Data Points To Print', - key='Data points to display?', - min_value=0, - max_value=1000, - value=20, - help='Select 0 to display all Data Points') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', - help='This option will analyse your DataFrame and display advanced ' - 'statistics on it. Note that this will require some time and ' - 'processing power to complete. Deselect this option if this if ' - 'you do not require ') + if MOD_MODE != 'Inplace Data Modification': + st.markdown('## Flags\n' + 'Note that there is a size limit **(50 MB)** for the DataFrames that are printed to screen. If ' + 'you get an error telling you that the DataFrame size is too large to proceed, kindly lower the ' + 'number of data points you wish to visualise or download the file and visualise it through Excel ' + 'or any other DataFrame visualising Python packages. There is no definitive way to increase the ' + 'size of the DataFrame that can be printed out due to the inherent limitation on the size of the ' + 'packets sent over to and from the Streamlit server.') + SAVE = st.checkbox('Save Outputs?', help='Note: Only Simple and Complex Cleaning modes will produce any saved ' + 'outputs. If None mode is chosen, you will not be able to download ' + 'the outputs as it is assumed that you already possess them.\n\n' + 'Additionally, due to the possibility of files with the same file ' + 'name and content being downloaded again, a unique file identifier is ' + 'tacked onto the filename.') + VERBOSE = st.checkbox('Display Outputs?') + if VERBOSE: + VERBOSITY = st.slider('Data Points To Print', + key='Data points to display?', + min_value=0, + max_value=1000, + value=20, + help='Select 0 to display all Data Points') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') if ANALYSIS_MODE == 'Data Cleaning': CLEAN_MODE = st.selectbox('Select Preprocessing Pipelines', ('None', 'Simple', 'Complex'), @@ -272,9 +280,16 @@ def app(): elif ANALYSIS_MODE == 'Data Modification': st.markdown('This module will allow you to modify the data passed in by performing certain elementary ' 'analysis on the data.
So far, we have implemented the ability to extract the countries ' - 'mentioned in your data and to plot out the Data Points on a World Map.') - if VERBOSE: - WORLD_MAP = st.checkbox('Generate a World Map Representation of the Countries Mentioned?', value=True) + 'mentioned in your data and to plot out the Data Points on a World Map, as well as the ' + 'ability to modify values of the input DataFrame in place.') + st.markdown('## Data Modification Mode') + MOD_MODE = st.selectbox('Choose Mode', ('Country Extraction', 'Inplace Data Modification')) + if MOD_MODE == 'Country Extraction': + if VERBOSE: + WORLD_MAP = st.checkbox('Generate a World Map Representation of the Countries Mentioned?', value=True) + elif MOD_MODE == 'Inplace Data Modification': + FIXED_KEY = st.checkbox('Use Fixed Key for Editing Table?') + HEIGHT = st.number_input('Height of Table', min_value=100, max_value=800, value=400) elif ANALYSIS_MODE == 'Data Query': MATCH = st.checkbox('Query Must Match Exactly?', help='Select this option if you want your query string/' @@ -553,82 +568,156 @@ def app(): # | COUNTRY EXTRACTION | # # -------------------------------------------------------------------------------------------------------------------- # elif ANALYSIS_MODE == 'Data Modification': - st.markdown('---') - st.markdown('## Country Extraction\n' - 'This module will take in a DataFrame containing all documents meant for NLP Analysis and return ' - 'a new DataFrame whereby countries mentioned in the documents will be extracted. Users can then ' - 'choose to either output a set of countries mentioned in the document, or generate a graphical ' - 'representation of the frequency of country name occurrence within the set of documents passed ' - 'to it.') + if MOD_MODE == 'Country Extraction': + st.markdown('---') + st.markdown('## Country Extraction\n' + 'This module will take in a DataFrame containing all documents meant for NLP Analysis and ' + 'return a new DataFrame whereby countries mentioned in the documents will be extracted. Users ' + 'can then choose to either output a set of countries mentioned in the document, or generate a ' + 'graphical representation of the frequency of country name occurrence within the set of ' + 'documents passed to it.')
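The extraction this module description promises is a plain substring scan over pycountry's country register, tallied with collections.Counter, as the hunk below shows. A condensed standalone sketch of the same idea (the sample documents are made up):

from collections import Counter
import pycountry

docs = ['Trade between Singapore and Japan grew last quarter.',
        'Japan revised its growth forecast.']

# Keep every registered country whose name appears in each document.
found = [[c.name for c in pycountry.countries if c.name.lower() in d.lower()]
         for d in docs]

# Flatten the per-document lists and count the mentions.
counts = Counter(name for names in found for name in names)
print(counts)  # Counter({'Japan': 2, 'Singapore': 1})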
+ if FILE == 'Small File(s)': + if DATA_PATH: + st.info('File loaded.') + else: + st.warning('File has not been loaded.') + elif FILE == 'Large File(s)': + if not DATA.empty: + st.info('File loaded.') + else: + st.warning('File has not been loaded.') - if FILE == 'Small File(s)': - if DATA_PATH: - st.info('File loaded.') - else: - st.warning('File has not been loaded.') - elif FILE == 'Large File(s)': - if not DATA.empty: - st.info('File loaded.') - else: - st.warning('File has not been loaded.') + if st.button('Begin Country Extraction', key='country'): + GLOBE_DATA = pd.DataFrame() + GLOBE_FIG = None - if st.button('Begin Country Extraction', key='country'): - GLOBE_DATA = pd.DataFrame() - GLOBE_FIG = None + if not DATA.empty: + DATA = DATA.astype(object) + DATA['COUNTRIES'] = DATA[DATA_COLUMN].astype(str).apply(lambda x: [country.name for country in + pycountry.countries if + country.name.lower() in x.lower()]) + new_list = DATA['COUNTRIES'].to_list() + temp = [] + for ls in new_list: + temp.extend(ls) + zipped = list(zip(Counter(temp).keys(), Counter(temp).values())) + + GLOBE_DATA = pd.DataFrame(data=zipped, index=range(len(zipped)), columns=['country', 'count']) + GLOBE_FIG = px.scatter_geo(data_frame=GLOBE_DATA, projection='natural earth', color='country', + locations='country', size='count', hover_name='country', + locationmode='country names', title='Country Name Mention Frequency')
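The frequency table built above is handed straight to Plotly Express; a standalone version of the same call, fed a hand-made frame with the arguments the patch uses:

import pandas as pd
import plotly.express as px

globe_data = pd.DataFrame({'country': ['Japan', 'Singapore'], 'count': [2, 1]})
fig = px.scatter_geo(data_frame=globe_data, projection='natural earth',
                     color='country', locations='country', size='count',
                     hover_name='country', locationmode='country names',
                     title='Country Name Mention Frequency')
fig.show()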
- if not DATA.empty: - DATA = DATA.astype(object) - DATA['COUNTRIES'] = DATA[DATA_COLUMN].astype(str).apply(lambda x: [country.name for country in - pycountry.countries if - country.name.lower() in x.lower()]) - new_list = DATA['COUNTRIES'].to_list() - temp = [] - for ls in new_list: - temp.extend(ls) - zipped = list(zip(Counter(temp).keys(), Counter(temp).values())) - - GLOBE_DATA = pd.DataFrame(data=zipped, index=range(len(zipped)), columns=['country', 'count']) - GLOBE_FIG = px.scatter_geo(data_frame=GLOBE_DATA, projection='natural earth', color='country', - locations='country', size='count', hover_name='country', - locationmode='country names', title='Country Name Mention Frequency') - - if VERBOSE: - st.markdown('## Country Name Mention Frequency') - printDataFrame(data=GLOBE_DATA, verbose_level=VERBOSITY, - advanced=ADVANCED_ANALYSIS) - if WORLD_MAP: - st.markdown('## World Map Representation') - st.plotly_chart(GLOBE_FIG) - - if SAVE: - try: - st.markdown('---') - st.markdown('## Download Data') - st.markdown('### Country Data') - st.markdown(f'Download data from [downloads/globe_data.csv]' - f'(downloads/globe_data_id{FC}.csv)') - DATA.to_csv(str(DOWNLOAD_PATH / f'globe_data_id{FC}.csv'), index=False) - FC += 1 + if VERBOSE: + st.markdown('## Country Name Mention Frequency') + printDataFrame(data=GLOBE_DATA, verbose_level=VERBOSITY, + advanced=ADVANCED_ANALYSIS) + if WORLD_MAP: + st.markdown('## World Map Representation') + st.plotly_chart(GLOBE_FIG) + + if SAVE: + try: + st.markdown('---') + st.markdown('## Download Data') + st.markdown('### Country Data') + st.markdown(f'Download data from [downloads/globe_data.csv]' + f'(downloads/globe_data_id{FC}.csv)') + DATA.to_csv(str(DOWNLOAD_PATH / f'globe_data_id{FC}.csv'), index=False) + FC += 1 + + st.markdown('### Country Data (Concatenated)') + st.markdown(f'Download data from [downloads/globe_data_concat.csv]' + f'(downloads/globe_data_concat_id{FC}.csv)') + GLOBE_DATA.to_csv(str(DOWNLOAD_PATH / f'globe_data_concat_id{FC}.csv'), index=False) + FC += 1 - st.markdown('### Country Data (Concatenated)') - st.markdown(f'Download data from [downloads/globe_data_concat.csv]' - f'(downloads/globe_data_concat_id{FC}.csv)') - GLOBE_DATA.to_csv(str(DOWNLOAD_PATH / f'globe_data_concat_id{FC}.csv'), index=False) FC += 1 + if WORLD_MAP: + st.markdown('### World Map Representation') + st.markdown(f'Download data from [downloads/map.png]' + f'(downloads/map_id{FC}.png)') + GLOBE_FIG.write_image(str(DOWNLOAD_PATH / f'map_id{FC}.png')) + FC += 1 + except ValueError: + st.warning('Error: Not connected to the Internet. Plot may not be generated properly. ' + 'Connect to the Internet and try again.') + except Exception as ex: + st.error(f'Error: Unknown Fatal Error -> {ex}') + else: + st.error('Error: No files loaded.') + + elif MOD_MODE == 'Inplace Data Modification': + st.markdown('---') + st.markdown('## Inplace Data Modification\n' + 'This function uses the AgGrid module to create editable tables for you to edit your ' + 'DataFrame as you would with an Excel sheet.') + + if FILE == 'Small File(s)': + if DATA_PATH: + st.info('File loaded.') + gb = GridOptionsBuilder.from_dataframe(DATA) + gb.configure_columns(DATA.columns, editable=True) + go = gb.build() + + if FIXED_KEY: + ag = AgGrid( + DATA, + gridOptions=go, + height=HEIGHT, + fit_columns_on_grid_load=True, + key='data', + reload_data=False + ) + else: + ag = AgGrid( + DATA, + gridOptions=go, + height=HEIGHT, + fit_columns_on_grid_load=True + ) + + if st.button('Generate Modified Data'): + st.markdown('### Modified Data') + st.markdown(f'Download data from [downloads/modified_data.csv]' + f'(downloads/modified_data_id{FC}.csv)') + ag['data'].to_csv(str(DOWNLOAD_PATH / f'modified_data_id{FC}.csv'), index=False) + FC += 1 + else: + st.warning('File has not been loaded.') + elif FILE == 'Large File(s)': + if not DATA.empty: + st.info('File loaded.') + gb = GridOptionsBuilder.from_dataframe(DATA) + gb.configure_columns(DATA.columns, editable=True) + go = gb.build() + + if FIXED_KEY: + ag = AgGrid( + DATA, + gridOptions=go, + height=HEIGHT, + fit_columns_on_grid_load=True, + key='data', + reload_data=False + ) + else: + ag = AgGrid( + DATA, + gridOptions=go, + height=HEIGHT, + fit_columns_on_grid_load=True + ) + + if st.button('Generate Modified Data'): + if SAVE: + st.markdown('### Modified Data') + st.markdown(f'Download data from [downloads/modified_data.csv]' + f'(downloads/modified_data_id{FC}.csv)') + ag['data'].to_csv(str(DOWNLOAD_PATH / f'modified_data_id{FC}.csv'), index=False) + FC += 1 - except ValueError: - st.warning('Error: Not connected to the Internet. Plot may not be generated properly. ' - 'Connect to the Internet and try again.') - except Exception as ex: - st.error(f'Error: Unknown Fatal Error -> {ex}') - else: - st.error('Error: No files loaded.') + else: + st.warning('File has not been loaded.') # -------------------------------------------------------------------------------------------------------------------- #
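For reference, the streamlit-aggrid pattern used in both new branches above reduces to three steps: build grid options from the DataFrame, mark the columns editable, and read the edited frame back out of AgGrid's return value. A trimmed sketch (the key/reload_data pair is what the Fixed Key checkbox toggles, pinning the grid's state across Streamlit reruns):

import pandas as pd
import streamlit as st
from st_aggrid import AgGrid, GridOptionsBuilder

df = pd.DataFrame({'text': ['alpha', 'beta'], 'label': [0, 1]})

gb = GridOptionsBuilder.from_dataframe(df)
gb.configure_columns(df.columns, editable=True)  # make every column editable
grid = AgGrid(df, gridOptions=gb.build(), height=400,
              fit_columns_on_grid_load=True, key='data', reload_data=False)

edited = grid['data']  # DataFrame reflecting the user's in-grid edits
st.write(edited)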
diff --git a/pyfiles/pages/toolkit_nlp.py b/pyfiles/pages/toolkit_nlp.py index 9aebd32..207189d 100644 --- a/pyfiles/pages/toolkit_nlp.py +++ b/pyfiles/pages/toolkit_nlp.py @@ -22,6 +22,9 @@ import pyLDAvis.gensim_models import pyLDAvis.sklearn import streamlit.components.v1 +import tokenizers +import torch +from transformers import AutoTokenizer, AutoModelWithLMHead from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer @@ -96,6 +99,9 @@ PLOT = False W_PLOT = False FC = 0 +MIN_WORDS = 80 +MAX_TENSOR = 512 +SUM_MODE = 'Basic' # -------------------------------------------------------------------------------------------------------------------- # @@ -114,7 +120,8 @@ def app(): NLP_MODEL, DATA_COLUMN, NLP, ONE_DATAPOINT, DATAPOINT_SELECTOR, NLP_TOPIC_MODEL, MIN_DF, MAX_DF, MAX_ITER, \ NMF_MODEL, LSI_MODEL, TFIDF_MODEL, TFIDF_VECTORISED, MAR_FIG, WORD_FIG, CV, VECTORISED, \ COLOUR, COLOUR_BCKGD, COLOUR_TXT, TOPIC_TEXT, LDA_VIS_STR, WIDTH, HEIGHT, SVG, HAC_PLOT, WORKER, MAX_FEATURES, \ - KW, TOPIC_FRAME, ALPHA, L1_RATIO, PLOT, W_PLOT, HAC_PLOT1, LDA_DATA, LSI_DATA, FC + KW, TOPIC_FRAME, ALPHA, L1_RATIO, PLOT, W_PLOT, HAC_PLOT1, LDA_DATA, LSI_DATA, FC, MIN_WORDS, MAX_TENSOR, \ + SUM_MODE # -------------------------------------------------------------------------------------------------------------------- # # | INIT | # @@ -339,7 +346,11 @@ def app(): else: st.info('You are conducting NER on the entire dataset. Only DataFrame is printed. NER output will be ' 'automatically saved.') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' 'content being downloaded again, a unique file identifier is ' 'tacked onto the filename.') @@ -446,7 +457,11 @@ def app(): value=20, help='Select 0 to display all Data Points') ONE_DATAPOINT = st.checkbox('Visualise One Data Point?') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') if ONE_DATAPOINT: DATAPOINT_SELECTOR = st.selectbox('Choose Data Point From Data', range(len(DATA))) COLOUR_BCKGD = st.color_picker('Choose Colour of Render Background', value='#000000') @@ -515,95 +530,209 @@ def app(): elif APP_MODE == 'Summarise': st.markdown('---') st.markdown('# Summarization of Text') - st.markdown('Note that this module takes a long time to process a long piece of text.
If you intend to process ' - 'large chunks of text, prepare to wait for hours for the summarization process to finish. We are ' - 'looking to implement multiprocessing into the app to optimise it.\n\n' + st.markdown('For this function, you are able to upload a single document or multiple documents ' + 'in a CSV file to create a summary for the documents of interest.\n\n' + 'However, do note that this module takes a long time to process a long piece of text. ' + 'If you intend to process large chunks of text, prepare to wait for hours for the summarization ' + 'process to finish. We are looking to implement multiprocessing into the app to optimise it.\n\n' 'In the meantime, it may be better to process your data in smaller batches to speed up your ' - 'workflow.') + 'workflow.\n\n' + 'In an effort to enhance this module to provide users with meaningful summaries of their document, ' + 'we have implemented two modes of summarization in this function, namely Basic and Advanced ' + 'Mode.\n' + '**Basic Mode** uses the spaCy package to distill your documents into the specified number of ' + 'sentences. No machine learning model is used to produce a unique summary of the text.\n' + '**Advanced Mode** uses the PyTorch and Hugging Face Transformers libraries to produce summaries ' + 'using Google\'s T5 Model.') + + st.markdown('## Summary Complexity') + SUM_MODE = st.selectbox('Choose Mode', ('Basic', 'Advanced')) + + if SUM_MODE == 'Basic': - # FLAGS - st.markdown('## Flags') - st.markdown('Select one model to use for your NLP Processing. Choose en_core_web_sm for a model that is ' - 'optimised for efficiency or en_core_web_lg for a model that is optimised for accuracy.') - NLP_MODEL = st.radio('Select spaCy model', ('en_core_web_sm', 'en_core_web_lg'), - help='Be careful when using the Accuracy Model. Due to caching issues in the app, ' - 'changes to the column where data is extracted from in the file uploaded will ' - 'reset the portions of the app that follows.') - if NLP_MODEL == 'en_core_web_sm': - try: - NLP = spacy.load('en_core_web_sm') - except OSError: - st.warning('Model not found, downloading...') + # FLAGS + st.markdown('## Flags') + st.markdown('Select one model to use for your NLP Processing. Choose en_core_web_sm for a model that is ' + 'optimised for efficiency or en_core_web_lg for a model that is optimised for accuracy.') + NLP_MODEL = st.radio('Select spaCy model', ('en_core_web_sm', 'en_core_web_lg'), + help='Be careful when using the Accuracy Model. Due to caching issues in the app, ' + 'changes to the column where data is extracted from in the file uploaded will ' + 'reset the portions of the app that follows.') + if NLP_MODEL == 'en_core_web_sm': + try: + NLP = spacy.load('en_core_web_sm') + except OSError: + st.warning('Model not found, downloading...') + try: + os.system('python -m spacy download en_core_web_sm') + except Exception as ex: + st.error(f'Unable to download Model. Error: {ex}') + except Exception as ex: + st.error(f'Unknown Error: {ex}.
Try again.') - try: - os.system('python -m spacy download en_core_web_sm') - except Exception as ex: - st.error(f'Unable to download Model. Error: {ex}') - except Exception as ex: - st.error(f'Unknown Error: {ex}. Try again.') - else: - st.info('Efficiency Model Loaded!') - elif NLP_MODEL == 'en_core_web_lg': - try: - NLP = spacy.load('en_core_web_lg') - except OSError: - st.warning('Model not found, downloading...') - try: - os.system('python -m spacy download en_core_web_lg') - except Exception as ex: - st.error(f'Unable to download Model. Error: {ex}') - except Exception as ex: - st.error(f'Unknown Error: {ex}. Try again.') - else: - st.info('Accuracy Model Loaded!') - SENT_LEN = st.number_input('Enter the total number of sentences to summarise text to', - min_value=1, - max_value=100, - value=3) - SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' - 'content being downloaded again, a unique file identifier is ' - 'tacked onto the filename.') - VERBOSE = st.checkbox('Display Outputs?') - if VERBOSE: - VERBOSITY = st.slider('Data points', - key='Data points to display?', - min_value=0, - max_value=1000, - value=20, - help='Select 0 to display all Data Points') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?') + else: + st.info('Efficiency Model Loaded!') + elif NLP_MODEL == 'en_core_web_lg': + try: + NLP = spacy.load('en_core_web_lg') + except OSError: + st.warning('Model not found, downloading...') + try: + os.system('python -m spacy download en_core_web_lg') + except Exception as ex: + st.error(f'Unable to download Model. Error: {ex}') + except Exception as ex: + st.error(f'Unknown Error: {ex}. Try again.') + else: + st.info('Accuracy Model Loaded!') + SENT_LEN = st.number_input('Enter the total number of sentences to summarise text to', + min_value=1, + max_value=100, + value=3) + SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' + 'content being downloaded again, a unique file identifier is ' + 'tacked onto the filename.') + VERBOSE = st.checkbox('Display Outputs?') + if VERBOSE: + VERBOSITY = st.slider('Data points', + key='Data points to display?', + min_value=0, + max_value=1000, + value=20, + help='Select 0 to display all Data Points') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') + + # MAIN PROCESSING + if st.button('Summarise Text', key='runner'): + if not DATA.empty: + try: + # CLEAN UP AND STANDARDISE DATAFRAMES + DATA = DATA[[DATA_COLUMN]] + DATA['SUMMARY'] = np.nan + DATA = DATA.astype(str) + except KeyError: + st.error('Warning: CLEANED CONTENT is not found in the file uploaded. Try again.') + except Exception as ex: + st.error(ex) + else: + stopwords = list(STOP_WORDS) + pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB'] + DATA['SUMMARY'] = DATA[DATA_COLUMN].apply(lambda x: summarise(x, stopwords, pos_tag, NLP, SENT_LEN)) + + # SHOW DATASETS + if VERBOSE: + st.markdown('## Summary DataFrame') + printDataFrame(data=DATA, verbose_level=VERBOSITY, advanced=ADVANCED_ANALYSIS) + + # SAVE DATA + if SAVE: + st.markdown('---') + st.markdown('## Download Summarised Data') + st.markdown('Download summarised data from [downloads/summarised.csv]' + f'(downloads/summarised_id{FC}.csv)') + DATA.to_csv(str(DOWNLOAD_PATH / f'summarised_id{FC}.csv'), index=False) + FC += 1 + else: + st.error('Error: Data not loaded properly. Try again.')
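The Basic pipeline above defers the actual work to helper.summarise, whose body is outside this diff; judging from the arguments wired into it (stopword list, POS whitelist, spaCy model, sentence budget), it is a frequency-based extractive summariser. An illustrative sketch of that technique, not the helper's actual code:

from heapq import nlargest

def summarise(text, stopwords, pos_tag, nlp, sent_count):
    doc = nlp(text)
    # Count content words: keep whitelisted POS tags, drop stopwords.
    freq = {}
    for tok in doc:
        word = tok.text.lower()
        if word not in stopwords and tok.pos_ in pos_tag:
            freq[word] = freq.get(word, 0) + 1
    if not freq:
        return ''
    top = max(freq.values())
    freq = {w: f / top for w, f in freq.items()}  # normalise scores to [0, 1]
    # Score each sentence by its words, keep the best few in document order.
    scores = {s: sum(freq.get(t.text.lower(), 0) for t in s) for s in doc.sents}
    best = nlargest(sent_count, scores, key=scores.get)
    return ' '.join(s.text for s in sorted(best, key=lambda s: s.start))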
+ elif SUM_MODE == 'Advanced': + # FLAGS + st.markdown('## Flags') + st.markdown('Choose the minimum and maximum number of words to summarise to below. If you are an ' + 'advanced user, you may choose to modify the number of input tensors for the model. If ' + 'you do not wish to modify the setting, a default value of 512 will be used for your ' + 'summary.\n\n' + 'If your system has a GPU enabled, you may wish to install the GPU (CUDA) enabled version ' + 'of PyTorch. If so, click on the expander below to install the correct version of PyTorch ' + 'and to check if your GPU is enabled.') + with st.expander('GPU-enabled Features'): + col1, col2 = st.columns(2) + with col1: + st.markdown('### PyTorch for CUDA 10.2') + if st.button('Install Relevant Packages', key='10.2'): + os.system('pip3 install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102' + ' -f https://download.pytorch.org/whl/cu102/torch_stable.html') + with col2: + st.markdown('### PyTorch for CUDA 11.3') + if st.button('Install Relevant Packages', key='11.3'): + os.system('pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113' + ' -f https://download.pytorch.org/whl/cu113/torch_stable.html') + st.markdown('---') + if st.button('Check if GPU is properly installed'): + st.info(f'GPU Installation Status: **{torch.cuda.is_available()}**') + if st.button('Check GPU used'): + try: + st.info(f'GPU Device **{torch.cuda.get_device_name(torch.cuda.current_device())}** in use.') + except AssertionError: + st.error('Your version of PyTorch is CPU-optimised. Download and install any of the above two ' + 'supported GPU-enabled PyTorch versions to use your GPU and silence this error.') + except Exception as ex: + st.error(ex) - # MAIN PROCESSING - if st.button('Summarise Text', key='runner'): - if not DATA.empty: - try: - # CLEAN UP AND STANDARDISE DATAFRAMES - DATA = DATA[[DATA_COLUMN]] - DATA['SUMMARY'] = np.nan + SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' + 'content being downloaded again, a unique file identifier is ' + 'tacked onto the filename.') + VERBOSE = st.checkbox('Display Outputs?') + + if VERBOSE: + VERBOSITY = st.slider('Data points', + key='Data points to display?', + min_value=0, + max_value=1000, + value=20, + help='Select 0 to display all Data Points') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete.
Deselect this option if ' + 'you do not require it.') + + MIN_WORDS = st.number_input('Key in the minimum number of words to summarise to', + min_value=1, + max_value=1000, + value=80) + MAX_WORDS = st.number_input('Key in the maximum number of words to summarise to', + min_value=80, + max_value=1000, + value=150) + MAX_TENSOR = st.number_input('Key in the maximum number of vectors to consider', + min_value=1, + max_value=10000, + value=512) + + if st.button('Summarise', key='summary_t5'): + tokenizer = AutoTokenizer.from_pretrained('t5-base') + model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) + + if not DATA.empty: + # to work with tensors, we need to convert the dataframe to a complex datatype + DATA = DATA.astype(object) + DATA['ENCODED'] = DATA[DATA_COLUMN].apply(lambda x: tokenizer.encode('summarize: ' + x, + return_tensors='pt', + max_length=MAX_TENSOR, + truncation=True)) + DATA['OUTPUTS'] = DATA['ENCODED'].apply(lambda x: model.generate(x, + max_length=MAX_WORDS, + min_length=MIN_WORDS, + length_penalty=5.0, + num_beams=2)) + DATA['SUMMARISED'] = DATA['OUTPUTS'].apply(lambda x: tokenizer.decode(x[0])) + DATA.drop(columns=['ENCODED', 'OUTPUTS'], inplace=True) DATA = DATA.astype(str) - except KeyError: - st.error('Warning: CLEANED CONTENT is not found in the file uploaded. Try again.') - except Exception as ex: - st.error(ex) - else: - stopwords = list(STOP_WORDS) - pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB'] - DATA['SUMMARY'] = DATA[DATA_COLUMN].apply(lambda x: summarise(x, stopwords, pos_tag, NLP, SENT_LEN)) - + # SHOW DATASETS if VERBOSE: - st.markdown('## Summary DataFrame') - printDataFrame(data=DATA, verbose_level=VERBOSITY, advanced=ADVANCED_ANALYSIS) + st.markdown('## Summarised Text') + printDataFrame(DATA, VERBOSITY, ADVANCED_ANALYSIS) - # SAVE DATA if SAVE: st.markdown('---') st.markdown('## Download Summarised Data') st.markdown('Download summarised data from [downloads/summarised.csv]' f'(downloads/summarised_id{FC}.csv)') DATA.to_csv(str(DOWNLOAD_PATH / f'summarised_id{FC}.csv'), index=False) - else: - st.error('Error: Data not loaded properly. Try again.') - + FC += 1
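The Advanced branch above is the stock encode, generate, decode loop for T5. A single-document version with the same generation parameters follows; note, as an upstream detail rather than anything this patch requires, that newer transformers releases deprecate AutoModelWithLMHead and name AutoModelForSeq2SeqLM as the replacement for T5-style models:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM  # deprecation-safe alias

tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base', return_dict=True)

text = 'Your long document goes here...'  # placeholder input
inputs = tokenizer.encode('summarize: ' + text, return_tensors='pt',
                          max_length=512, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=80,
                         length_penalty=5.0, num_beams=2)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))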
# -------------------------------------------------------------------------------------------------------------------- # # | SENTIMENT ANALYSIS | # @@ -622,7 +751,8 @@ def app(): help='VADER is more optimised for texts extracted from Social Media platforms ' '(where slangs and emoticons are used) while TextBlob performs better ' 'for more formal pieces of text. If you are not sure which to choose, ' - 'TextBlob is recommended.') + 'VADER is recommended due to its higher accuracy of analysis compared to ' + 'TextBlob.') SAVE = st.checkbox('Save Outputs?', help='Due to the possibility of files with the same file name and ' 'content being downloaded again, a unique file identifier is ' 'tacked onto the filename.') @@ -636,7 +766,11 @@ def app(): help='Select 0 to display all Data Points') if BACKEND_ANALYSER == 'VADER': COLOUR = st.color_picker('Choose Colour of Marker to Display', value='#2ACAEA') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') # MAIN PROCESSING if st.button('Start Analysis', key='analysis'): @@ -829,7 +963,11 @@ def app(): max_value=1000, value=20, help='Select 0 to display all Data Points') - ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?') + ADVANCED_ANALYSIS = st.checkbox('Display Advanced DataFrame Statistics?', + help='This option will analyse your DataFrame and display advanced ' + 'statistics on it. Note that this will require some time and ' + 'processing power to complete. Deselect this option if ' + 'you do not require it.') NUM_TOPICS = st.number_input('Choose Number of Topics to Generate Per Text', min_value=1, max_value=100,