diff --git a/.travis.yml b/.travis.yml index b8db4f8..bbbf933 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,13 @@ +#Travis CI build configuration language: python python: "3.8" +#help: https://stackoverflow.com/questions/35972415/python-import-fails-on-travisci-but-not-locally before_install: - "pip install -U pip" - "export PYTHONPATH=$PYTHONPATH:$(pwd)" -script: +script: +#The command "sonar-scanner" exited with 1: https://travis-ci.org/ZNClub-PA-ML-AI/OctoPy-Predictor/builds/635588777#L594 # - sonar-scanner - pytest addons: diff --git a/README.md b/README.md index c61546b..0bb5e77 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ Generic platform for Prediction using basic machine learning models ```bash conda env list # show current environments conda create --name OctoPy # if OctoPy is NOT listed +conda activate OctoPy conda list # show all libraries conda install pip # if pip is NOT listed conda list > versions.txt # store all versions post install diff --git a/octopy_predictor/src/analyser.py b/octopy_predictor/src/analyser.py index b064c9b..a620866 100644 --- a/octopy_predictor/src/analyser.py +++ b/octopy_predictor/src/analyser.py @@ -4,28 +4,73 @@ """ from sklearn.metrics import explained_variance_score -class Analyser(object): - """docstring for Analyser""" - def __init__(self, arg = None): - super(Analyser, self).__init__() - self.arg = arg - self.regression_metrics = { - 'Explained Variance' : explained_variance_score - } - self.metrics = { - 'regression' : self.regression_metrics - } +REGRESSION_MODEL = 'REGRESSION' +CLASSIFICATION_MODEL = 'CLASSIFICATION' + + +class AnalyserMetric(object): + """"AnalyserMetrics: state to represent metrics for Analyser""" + + def __init__(self, + name = 'Analyser Metric', + model_type = REGRESSION_MODEL, + function = lambda x:x): + """ + fields to store different metric configuration + """ + self.name = name + self.model_type = model_type + self.function = function - def get_columns(self, df): - return df.columns.values + +class AnalyserMetricsRegistry(object): + """"AnalyserMetricsRegistry: create metrics for Analyser""" + + metrics = [ + AnalyserMetric('Explained Variance', REGRESSION_MODEL, explained_variance_score) + ] + + @staticmethod + def apply_metrics(model_type = REGRESSION_MODEL, data = {}): + expected, actual = data + + filtered_metrics = filter( + lambda member: member.model_type == model_type, + AnalyserMetricsRegistry.metrics + ) + + return { metric.name : metric.function(expected, actual) + for metric in filtered_metrics + } - def get_column_data_types(self, df): + +class Analyser(object): + """Analyser: logic to analyse data + """ + + def columns(self, df): + """ + input: DataFrame + output: list of names of columns of DataFrame + """ + return df.columns.values + + def types(self, df): + """ + input: DataFrame + output: str containing data types of columns of DataFrame + """ return str(df.dtypes).split('\n')[:-1] - def get_summary(self, df): - return df.describe() + def describe(self, df): + """ + input: DataFrame + output: DataFrame containing descriptive statistics of DataFrame + """ + return df.describe(how ='all') - def get_model_metrics(self, y_, mode): + #TODO + def model_metrics(self, y_, mode): metric_values = {} y_true, y_pred = y_[0], y_[1] @@ -33,11 +78,13 @@ def get_model_metrics(self, y_, mode): metric_values[metric_name] = metric_method(y_true, y_pred) return metric_values + #TODO def _is_categorical(self, label): """ input: """ return len(set(label)) < 10 and all(map(label, lambda x: isinstance(x, str))) + #TODO def get_model_type_by_label(self, label = []): return 'Classification' if self._is_categorical(label) else 'Regression' diff --git a/octopy_predictor/src/datagatherer.py b/octopy_predictor/src/datagatherer.py index fc73c58..851197f 100644 --- a/octopy_predictor/src/datagatherer.py +++ b/octopy_predictor/src/datagatherer.py @@ -5,37 +5,116 @@ import pandas as pd from io import StringIO from collections import namedtuple +import sqlite3 + + +# TODO +# from util import logit +# import util + + +# CONSTANTS + +FILE = 'FILE' +SQL = 'SQL' +FILE_PATH = 'FILEPATH' +CONNECTION = 'CONN' +QUERY = 'SQL' +QUERY_PARAMERTERS = 'SQLPARAMS' + + +class DataGathererInput(object): + + """ + DataGathererInput + + Usage + ---------- + DataGatherer + + """ + FILE_CONSTRAINTS = [FILE_PATH] + SQL_CONSTRAINTS = [CONNECTION, QUERY, QUERY_PARAMERTERS] + + CONSTRAINTS = { + FILE: FILE_CONSTRAINTS, + SQL: SQL_CONSTRAINTS + } + + def __init__(self, type: str): + """ + + + Parameters + ---------- + type : str + TYPE of DataGatherer. + + Returns + ------- + None. + + """ + if type not in DataGathererInput.CONSTRAINTS.keys(): + pass + # TODO Throw error + self.type = type + self.values = {} + + def add(self, key: str, value): + """ + + + Parameters + ---------- + key : str + valid keys present in CONSTRAINTS _values. + value : any + value corresponding to key. + + Returns + ------- + None. + + """ + + if key in DataGathererInput.CONSTRAINTS[self.type]: + self.values[key] = value -#from util import logit -#import util class DataGatherer(object): - """docstring for DataGatherer""" - def __init__(self, arg = None): + """docstring for DataGatherer + DataGatherer is responsible to fetch data from multiple sources + and convert it to a specific type using provided Adapters + + The defaul Adapter is DataFrame + """ + + def __init__(self, arg=None): super(DataGatherer, self).__init__() self.arg = arg - #@logit + # @logit @staticmethod def _read_from_file(file): _file_content = None try: _file_content = file.read() - #util.debug_store['file_content at datagatherer'] = _file_content + # util.debug_store['file_content'] = _file_content except IOError as io_error: - #util.debug_store['io_error at datagatherer'] = io_error.__traceback__ + # util.debug_store['io_error'] = io_error.__traceback__ raise io_error else: return _file_content - - #@logit + + # @logit @staticmethod - def _determine_resource(path): + def determine_resource(path): resource_type, file_type = None, None - + # resource type resource_type = 'web' if path.startswith('http') else 'local' - + # file type try: file_extension_index = path.rindex('.') @@ -43,45 +122,46 @@ def _determine_resource(path): # TODO: message = invalid path raise val_error else: - file_type = path[file_extension_index + 1 :] + file_type = path[file_extension_index + 1:] finally: - FileResource = namedtuple('FileResource', 'resource_type file_type') - return FileResource(resource_type = resource_type, file_type = file_type) - - - #@logit + FileResource = namedtuple('FileResource', + 'resource_type file_type') + return FileResource(resource_type=resource_type, + file_type=file_type) + + # @logit @staticmethod def _read_from_path(path): ''' read data from a file available at given path ''' df = pd.DataFrame() - metadata = _determine_resource(path) - + metadata = DataGatherer.determine_resource(path) + if metadata.resource_type == 'local': - + if metadata.file_type == 'csv': df = pd.read_csv(path) - + elif metadata.resource_type == 'web': - + if metadata.file_type == 'csv': df = pd.read_csv(path) - + return df - - #@logit - def read(self, path = None, file = None): + + # @logit + def read(self, path=None, file=None, sql=None): ''' - read receives either path or file. If received both, file is given priority + read receives either path or file. + If received both, file is given priority ''' - try: + try: df = None if path is None: file_content = self._read_from_file(file) - #util.debug_store['StringIO(file_content) at datagatherer'] = StringIO(file_content) + # util.debug_store['S'] = StringIO(file_content) df = pd.read_csv(StringIO(file_content)) - elif file is None: df = pd.read_csv(path) else: @@ -93,5 +173,27 @@ def read(self, path = None, file = None): print('Exception occured while loading data') raise exception finally: - #util.debug_store['df at datagatherer'] = df.to_json(orient='columns') + # util.debug_store['df'] = df.to_json(orient='columns') return df + + def read_sql(self, gatherer_input: DataGathererInput): + """ + + + Parameters + ---------- + input : DataGathererInput + + Contains _values required to execute SQL QUERY. + + Returns + ------- + df : DataFrame + Result of SQL QUERY. + + """ + df = pd.DataFrame() + # TODO Move all connections to application start-up + conn = sqlite3.connect(gatherer_input.values[CONNECTION], uri=True) + df = pd.read_sql_query(gatherer_input.values[QUERY], con=conn) + return df diff --git a/octopy_predictor/src/util.py b/octopy_predictor/src/util.py index 7e96123..dfd46bd 100644 --- a/octopy_predictor/src/util.py +++ b/octopy_predictor/src/util.py @@ -63,3 +63,4 @@ def is_allowed_file(filename): return True #TODO #return '.' in filename and filename.rsplit('.', 1)[1] in context.ALLOWED_EXTENSIONS + diff --git a/octopy_predictor/tests/test_analyser.py b/octopy_predictor/tests/test_analyser.py index ef6e565..2130033 100644 --- a/octopy_predictor/tests/test_analyser.py +++ b/octopy_predictor/tests/test_analyser.py @@ -1,11 +1,22 @@ -# test for visualizer.py +"""Tests for analyser.py""" -#import sys -#sys.path.insert(0, '../main/') - -from octopy_predictor.src.analyser import Analyser +from octopy_predictor.src.analyser import Analyser, AnalyserMetricsRegistry, REGRESSION_MODEL import unittest +class AnalyserMetricsRegistryTest(unittest.TestCase): + """Test cases for AnalyserMetricsRegistry""" + + def setUp(self): + self.registry = AnalyserMetricsRegistry() + + def test_regression_model(self): + + result = self.registry.apply_metrics(REGRESSION_MODEL, + ([.0, 1.5, 3.0], [-3.0, -1.5, .0]) + ) + + self.assertIsNotNone(result, "apply_metrics returned null") + class AnalyserTest(unittest.TestCase): """Test cases for Analyser""" @@ -18,9 +29,9 @@ def test_is_regression_model_type(self): actual_result = self.analyser.get_model_type_by_label(array) - self.assertEquals(actual_result, expected_result) + self.assertEqual(actual_result, expected_result, "expected does not match actual") if __name__ == '__main__': # unittest.main() - suite = unittest.defaultTestLoader.loadTestsFromTestCase(AnalyserTest) + suite = unittest.defaultTestLoader.loadTestsFromTestCase(AnalyserMetricsRegistryTest) unittest.TextTestRunner().run(suite) diff --git a/octopy_predictor/tests/test_datagatherer.py b/octopy_predictor/tests/test_datagatherer.py index 0ba6a4c..fcb728a 100644 --- a/octopy_predictor/tests/test_datagatherer.py +++ b/octopy_predictor/tests/test_datagatherer.py @@ -1,24 +1,97 @@ # -*- coding: utf-8 -*- import unittest +import pandas as pd +import numpy as np +from octopy_predictor.src.datagatherer import * + +conn = 'file::memory:?cache=shared' -from octopy_predictor.src.datagatherer import DataGatherer class DataGathererTest(unittest.TestCase): - """Test cases for Analyser""" + """Test cases for DataGatherer""" def setUp(self): - self.datagatherer = DataGatherer() - - def test_determine_resource(self): + self.gatherer = DataGatherer() + + def test_read_sql_from_empty_table(self): """ - TBD + Test read_sql() + + given a gatherer_input with SQL gatherer values + and empty in-memory database + + when read_sql is called + + then dataframe should be returned + """ - self.assertEquals(1,1) + c = sqlite3.connect(conn, uri=True) + c.execute('drop table if exists test') + gatherer_input = DataGathererInput(SQL) + gatherer_input.add(QUERY, "SELECT * FROM sqlite_master") + gatherer_input.add(CONNECTION, conn) + gatherer = DataGatherer() + + df = gatherer.read_sql(gatherer_input) - + self.assertIsNotNone(df) + self.assertTrue(df.empty) + + def test_read_sql_from_populated_table(self): + """ + Test read_sql + + given a gatherer_input with SQL gatherer values + and empty in-memory database + + when read_sql is called + + then dataframe should be returned + + """ + #%% + expected_df = pd.DataFrame(np.reshape(np.arange(10), (2,5))) + c = sqlite3.connect(conn, uri=True) + expected_df.to_sql('test', con=c, if_exists='replace', index=False) + #%% + + #%% + gatherer_input = DataGathererInput(SQL) + gatherer_input.add(QUERY, "SELECT * FROM test") + gatherer_input.add(CONNECTION, conn) + gatherer = DataGatherer() + + df = gatherer.read_sql(gatherer_input) + #%% + self.assertIsNotNone(df) + self.assertFalse(df.empty, "df is empty") + self.assertEqual(expected_df.shape, df.shape) + + +class DataGathererInputTest(unittest.TestCase): + """Test cases for DataGathererInput""" + + def test_SQL_inputs(self): + """ + given: input is SQL + when: DataGathererInput is created + then: all parameters required for SQL datagatherer should be available + """ + + expected = { + 'type': SQL, + CONNECTION: conn + } + + input = DataGathererInput(SQL) + input.add(CONNECTION, conn) + + self.assertIsNotNone(input.values) + self.assertEqual(input.values[CONNECTION], expected[CONNECTION], "expected does not match actual") + -if __name__ == '__main__': +if __name__ == '__main__': # unittest.main() suite = unittest.defaultTestLoader.loadTestsFromTestCase(DataGathererTest) unittest.TextTestRunner().run(suite) diff --git a/requirements.txt b/requirements.txt index 5194bc6..4e6dea6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ scikit-learn>=0.22.0 scipy>=1.3.1 pandas>=0.24.2 spyder>=4.0.1 +db-sqlite3==0.0.1 \ No newline at end of file diff --git a/versions.txt b/versions.txt index 492026d..ace8799 100644 --- a/versions.txt +++ b/versions.txt @@ -2,29 +2,159 @@ # # Name Version Build Channel _libgcc_mutex 0.1 main -ca-certificates 2019.11.27 0 +alabaster 0.7.12 py_0 +antiorm 1.2.1 pypi_0 pypi +argh 0.26.2 py38_0 +asn1crypto 1.3.0 py38_0 +astroid 2.3.3 py38_0 +atomicwrites 1.3.0 py_0 +attrs 19.3.0 py_0 +autopep8 1.4.4 py_0 +babel 2.8.0 py_0 +backcall 0.1.0 py38_0 +bleach 3.1.0 py_0 +ca-certificates 2020.1.1 0 certifi 2019.11.28 py38_0 +cffi 1.13.2 py38h2e261b9_0 +chardet 3.0.4 py38_1003 +cloudpickle 1.2.2 py_0 +cryptography 2.8 py38h1ba5d50_0 +db 0.1.1 pypi_0 pypi +db-sqlite3 0.0.1 pypi_0 pypi +dbus 1.13.12 h746ee38_0 +decorator 4.4.1 py_0 +defusedxml 0.6.0 py_0 +diff-match-patch 20181111 py_0 +docutils 0.16 py38_0 +entrypoints 0.3 py38_0 +expat 2.2.6 he6710b0_0 +flake8 3.7.9 py38_0 +fontconfig 2.13.0 h9420a91_0 +freetype 2.9.1 h8a8886c_1 +future 0.18.2 py38_0 +glib 2.63.1 h5a9c865_0 +gmp 6.1.2 h6c8ec71_1 +gst-plugins-base 1.14.0 hbbd80ab_1 +gstreamer 1.14.0 hb453b48_1 +helpdev 0.6.10 pypi_0 pypi +icu 58.2 h9c2bf20_1 +idna 2.8 py38_1000 +imagesize 1.2.0 py_0 +importlib_metadata 1.4.0 py38_0 +intervaltree 3.0.2 py_0 +ipykernel 5.1.4 py38h39e3cac_0 +ipython 7.11.1 py38h39e3cac_0 +ipython_genutils 0.2.0 py38_0 +isort 4.3.21 py38_0 +jedi 0.14.1 py38_0 +jeepney 0.4.2 py_0 +jinja2 2.10.3 py_0 joblib 0.14.1 pypi_0 pypi +jpeg 9b h024ee3a_2 +jsonschema 3.2.0 py38_0 +jupyter_client 5.3.4 py38_0 +jupyter_core 4.6.1 py38_0 +keyring 21.1.0 py38_0 +lazy-object-proxy 1.4.3 py38h7b6447c_0 ld_impl_linux-64 2.33.1 h53a641e_7 libedit 3.1.20181209 hc058e9b_0 libffi 3.2.1 hd88cf55_4 libgcc-ng 9.1.0 hdf63c60_0 +libpng 1.6.37 hbc83047_0 +libsodium 1.0.16 h1bed415_0 +libspatialindex 1.9.3 he6710b0_0 libstdcxx-ng 9.1.0 hdf63c60_0 +libuuid 1.0.3 h1bed415_2 +libxcb 1.13 h1bed415_1 +libxml2 2.9.9 hea5a465_1 +markupsafe 1.1.1 py38h7b6447c_0 +mccabe 0.6.1 py38_1 +mistune 0.8.4 py38h7b6447c_1000 +more-itertools 8.0.2 py_0 +nbconvert 5.6.1 py38_0 +nbformat 5.0.4 py_0 ncurses 6.1 he6710b0_1 numpy 1.18.1 pypi_0 pypi +numpydoc 0.9.2 py_0 openssl 1.1.1d h7b6447c_3 +packaging 20.1 py_0 pandas 0.25.3 pypi_0 pypi +pandoc 2.2.3.2 0 +pandocfilters 1.4.2 py38_1 +parso 0.6.0 py_0 +pathtools 0.1.2 py_1 +pcre 8.43 he6710b0_0 +pexpect 4.8.0 py38_0 +pickleshare 0.7.5 py38_1000 pip 19.3.1 py38_0 +pluggy 0.13.1 py38_0 +prompt_toolkit 3.0.3 py_0 +psutil 5.6.7 py38h7b6447c_0 +ptyprocess 0.6.0 py38_0 +pycodestyle 2.5.0 py38_0 +pycparser 2.19 py_0 +pydocstyle 4.0.1 py_0 +pyflakes 2.1.1 py38_0 +pygments 2.5.2 py_0 +pylint 2.4.4 py38_0 +pyopenssl 19.1.0 py38_0 +pyparsing 2.4.6 py_0 +pyqt 5.9.2 py38h05f1152_4 +pyqt5 5.12.3 pypi_0 pypi +pyqt5-sip 12.7.1 pypi_0 pypi +pyqtwebengine 5.12.1 pypi_0 pypi +pyrsistent 0.15.7 py38h7b6447c_0 +pysocks 1.7.1 py38_0 python 3.8.1 h0371630_1 -python-dateutil 2.8.1 pypi_0 pypi -pytz 2019.3 pypi_0 pypi +python-dateutil 2.8.1 py_0 +python-jsonrpc-server 0.3.4 py_0 +python-language-server 0.31.7 py38_0 +pytz 2019.3 py_0 +pyxdg 0.26 py_0 +pyyaml 5.2 py38h7b6447c_0 +pyzmq 18.1.0 py38he6710b0_0 +qdarkstyle 2.8 py_0 +qt 5.9.7 h5867ecd_1 +qtawesome 0.6.1 py_0 +qtconsole 4.6.0 py_1 +qtpy 1.9.0 py_0 readline 7.0 h7b6447c_5 +requests 2.22.0 py38_1 +rope 0.16.0 py_0 +rtree 0.9.3 py38_0 scikit-learn 0.22.1 pypi_0 pypi scipy 1.4.1 pypi_0 pypi +secretstorage 3.1.2 py38_0 setuptools 44.0.0 py38_0 +sip 4.19.13 py38he6710b0_0 six 1.13.0 pypi_0 pypi +snowballstemmer 2.0.0 py_0 +sortedcontainers 2.1.0 py38_0 +sphinx 2.3.1 py_0 +sphinxcontrib-applehelp 1.0.1 py_0 +sphinxcontrib-devhelp 1.0.1 py_0 +sphinxcontrib-htmlhelp 1.0.2 py_0 +sphinxcontrib-jsmath 1.0.1 py_0 +sphinxcontrib-qthelp 1.0.2 py_0 +sphinxcontrib-serializinghtml 1.1.3 py_0 +spyder 4.0.1 py38_0 +spyder-kernels 1.8.1 py38_0 sqlite 3.30.1 h7b6447c_0 +testpath 0.4.4 py_0 tk 8.6.8 hbc83047_0 +tornado 6.0.3 py38h7b6447c_0 +traitlets 4.3.3 py38_0 +ujson 1.35 py38h7b6447c_0 +urllib3 1.25.8 py38_0 +watchdog 0.9.0 py38_1 +wcwidth 0.1.7 py38_0 +webencodings 0.5.1 py38_1 wheel 0.33.6 py38_0 +wrapt 1.11.2 py38h7b6447c_0 +wurlitzer 2.0.0 py38_0 xz 5.2.4 h14c3975_4 +yaml 0.1.7 had09818_2 +yapf 0.28.0 py_0 +zeromq 4.3.1 he6710b0_3 +zipp 0.6.0 py_0 zlib 1.2.11 h7b6447c_3