diff --git a/.github/workflows/releases.yaml b/.github/workflows/releases.yaml new file mode 100644 index 0000000..68c7a8c --- /dev/null +++ b/.github/workflows/releases.yaml @@ -0,0 +1,45 @@ +name: Create Release + +on: + push: + tags: + - 'v*' + +jobs: + release: + runs-on: ubuntu-latest + permissions: write-all + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 # This ensures all commits and tags are fetched + + - name: Generate release notes + id: release_notes + run: | + TAG_NAME=${{ github.ref_name }} + LATEST_SHA=$(gh release view latest --json publishedAt -q ".publishedAt") || true + if [ -z "$LATEST_SHA" ]; then + NOTES=$(git log --pretty=format:"%s" $(git rev-list --max-parents=0 HEAD)..HEAD) + else + NOTES=$(git log --pretty=format:"%s" $LATEST_SHA..HEAD) + fi + INTRO="Version $TAG_NAME contains the following changes and improvements:" + EOF=$(dd if=/dev/urandom bs=15 count=1 status=none | base64) + echo "text<> $GITHUB_OUTPUT + + - name: Create Release + run: | + TAG_NAME=${{ github.ref_name }} + gh release create $TAG_NAME -t $TAG_NAME -n "${{ steps.release_notes.outputs.notes }}" --target main + + - name: Move 'latest' tag + run: | + git tag -d latest || true # Delete local 'latest' tag if exists + git push origin :refs/tags/latest || true # Delete remote 'latest' tag if exists + git tag latest + git push origin refs/tags/latest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4f5a96e --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +.vscode/ +.venv/ +venv/ +__pycache__/ +.mypy_cache/ +.ipynb_checkpoints/ +*.egg-info/ +testing/ + +*.csv +*.xls +*.xlsx +*.pkl +*.log +*.old diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f09e8c..e14a990 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,33 @@ -# See https://pre-commit.com for more information -# See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: trailing-whitespace + name: Trim Trailing Whitespace - id: end-of-file-fixer + name: Fix End of Files - id: check-yaml + name: Check YAML - id: check-added-large-files + name: Check for added large files + - id: detect-private-key + name: Detect Private Key + - id: check-merge-conflict + name: Check for merge conflicts + - id: requirements-txt-fixer + name: Fix `requirements.txt` +- repo: https://github.com/asottile/reorder-python-imports + rev: v3.10.0 + hooks: + - id: reorder-python-imports + name: Reorder Python Imports +- repo: https://github.com/pyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + name: Flake8 + args: [--max-line-length=80, --ignore=E226, --extend-ignore=E501] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.5.1 + hooks: + - id: mypy diff --git a/README.md b/README.md index ab24500..9d07037 100644 --- a/README.md +++ b/README.md @@ -1 +1,38 @@ -# tcc-stats \ No newline at end of file +# Statistics for TCC + +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) [![Create Release](https://github.com/dimboump/tcc-stats/actions/workflows/releases.yaml/badge.svg)](https://github.com/dimboump/tcc-stats/actions/workflows/releases.yaml) + +## Installation and Usage + +1. Clone the repository: + + ```bash + git clone git@github.com:dimboump/tcc-stats + ``` + +2. Install the dependencies and run the script: + + - Windows: + + ```bash + pip install -r requirements.txt + python tcc_stats.py + ``` + + - Linux/macOS: + + ```bash + pip3 install -r requirements.txt + python3 tcc_stats.py + ``` + +Options: + +| Option | Data Type | Description | Default | +| ------ | --------- | ----------- | ------- | +| `-p`, `--path` | `str` or `pathlib.Path` | Path to or name of the TCC data directory. | `data/` | +| `-y`, `--years` | `int`, `Sequence[int]` | Years to include in the statistics. You can specify a single year or multiple years, by separating them with a space. | 2018-today | +| `-m`, `--months` | `int`, `Sequence[int]` | Months to include in the statistics. To specify multiple months, separate them with a comma. | 12 months if the year is in the past, current month since January of this year otherwise | +| `-o`, `--output` | `str` or `pathlib.Path` | Path to or name of the output file (with extension). Can also be an Excel file (`.xlsx`, `.xls`). If the path/file doens't exist, it will be created or, if it does, will be overwritten. | `export.csv` | +| `-v`, `--verbose` | `bool` | Print more information about the data, along with a small sample (max 10 rows) before and after preprocessing. | False | +| `-h`, `--help` | – | Show help message and exit. | – | diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..c99dcbd --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +mypy +pytest +pytest-env diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0993c31 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +matplotlib +openpyxl +pandas diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..d548146 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,37 @@ +[metadata] +name = tcc-stats +version = 1.0.0 +description = A Python package for generating statistics from TCC data +long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/dimboump/tcc-stats +author = Dimitris Boumparis +author_email = dimitris@dimboump.dev +classifiers = + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: Implementation :: CPython + +[options] +packages = find: +install_requires = + pandas>=1.0.0 + matplotlib>=3.0.0 + numpy>=1.0.0 +python_requires = >=3.8 + +[options.packages.find] +exclude = + tests* + testing* + +[options.entry_points] +console_scripts = + tcc-stats = tcc_stats.main:main + +[tool:pytest] +testpaths = tests + +[flake8] +max-line-length = 80 +ignore = E266, E501 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a03590f --- /dev/null +++ b/setup.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from setuptools import setup + +setup() diff --git a/tcc_stats/__init__.py b/tcc_stats/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tcc_stats/__main__.py b/tcc_stats/__main__.py new file mode 100644 index 0000000..9667ad1 --- /dev/null +++ b/tcc_stats/__main__.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from tcc_stats.main import main + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/tcc_stats/main.py b/tcc_stats/main.py new file mode 100644 index 0000000..dd57c09 --- /dev/null +++ b/tcc_stats/main.py @@ -0,0 +1,384 @@ +from __future__ import annotations + +import argparse +import calendar +import importlib +import os +import pathlib +import sys +from datetime import datetime +from typing import Sequence + +import pandas as pd +from pandas import to_datetime as pd_to_dt + +if sys.version_info <= (3, 9): + from typing import Union + DataFrameSheets = dict[Union[int, str], pd.DataFrame] +else: + from typing import TypeAlias + DataFrameSheets: TypeAlias = dict[int | str, pd.DataFrame] + +VERSION = importlib.metadata.version('tcc_stats') +CLI_WIDTH = os.get_terminal_size().columns + +CWD = pathlib.Path(os.getcwd()) +YEARS = sorted(list(range(2018, datetime.now().year + 1))) +MONTHS = [calendar.month_name[i] for i in range(1, 13)] +ALLOWED_EXCEL = ('.xlsx', '.xls', '.xlsm', '.xlsb') +COLS = ('operator', 'requester_code', 'doc_type', 'fdr_no', 'result', 'pdf', + 'minutes', 'source_lang', 'target_lang') + +DESCRIPTION = """\ +Plot TCC statistics for: + - the current year + - previous years (by providing the directory) + - given month(s) of the current or previous years + +The user can provide: + +(a) no directory, so 'data/' will be used. The directory structure should be: + +|-- data (or any specified name) + |-- + |-- .csv + |-- .csv + |-- .csv + |-- + |-- .xlsx (max. one sheet per month) + +(b) a path to a directory +(c) a list of years +(d) a list of months, `year` is the current one by default +(e) a list of years and months + +It is not possible to provide both a path and any of the other arguments +at the same time. +""" + +RED = '\033[41m' +GREEN = '\033[42m' +YELLOW = '\033[43;30m' +TURQUOISE = '\033[46;30m' +SUBTLE = '\033[2m' +NORMAL = '\033[m' + + +def colored_message(msg: str, color: str, end: str = NORMAL) -> str: + """Color the background and text of a message.""" + return f"{color}{msg}{end}" + + +def step( + message: str, + *, + start: str = '', + color: str = NORMAL, + cols: int = 80, +) -> None: + """Print a message with a colored background and text.""" + message = colored_message(message, color) + dots = "." * (cols - len(start) - len(message) + len(color)) + print(f"{start}{dots}{message}", end=None) + + +def get_args() -> argparse.Namespace: + """Parse the command line arguments.""" + + parser = argparse.ArgumentParser( + description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter + ) + + parser.add_argument( + 'path', type=str, default=pathlib.Path(CWD, 'data'), + help=( + "Path to the directory containing the statistics files inside " + "individual directories for each year. (default: `./data/`)" + ) + ) + + parser.add_argument( + '-y', '--years', type=int, nargs='+', default=YEARS, + help=( + "Year(s) to plot the statistics for. Should match the name of the " + "directory inside of which the statistics files are located." + ) + ) + + parser.add_argument( + '-m', '--months', type=parse_months, nargs='+', + default=",".join([str(i) for i in range(1, 13)]), + help=( + "Month(s) to plot the statistics for. If not provided, " + "all 12 months will be used." + ) + ) + + parser.add_argument( + '-o', '--output', type=str, default='export.csv', + help=( + "Output file name (or relative or absolute path). If it does not " + "exist, the (sub)directories will be created. Allowed extentions: " + f"{ALLOWED_EXCEL + ('.csv',)}. (default: `%(default)s`)" + ) + ) + + parser.add_argument( + '-v', '--verbose', action='store_true', + help="Print more information about the data along with a small sample." + ) + + # https://stackoverflow.com/a/8521644/812183 + parser.add_argument( + '-V', '--version', action='version', + version=f'{SUBTLE}%(prog)s {VERSION}{NORMAL}', + ) + + return parser.parse_args() + + +def parse_months(months: str | None) -> int | Sequence[int]: + """Parse the `--months` argument. + `--months` can be: + - None -> all 12 months + - 1 -> January, 2 -> February, ..., 12 -> December + - any number of months between 1 and 12 separated by a comma + """ + parsed_month = None + parsed_months = [] + + if months is None: + parsed_months = list(range(1, 13)) + elif ',' in months: + months_list = [int(month) for month in months.split(',')] + if any(month == 0 for month in months_list) and len(months_list) > 1: + print("Will raise an error.") + raise argparse.ArgumentTypeError( + "Months cannot be 0 and any other number at the same time." + ) + parsed_months = [int(month) for month in months_list] + parsed_months = sorted(list(set(parsed_months))) + elif isinstance(months, str): + parsed_month = int(months) + else: + raise argparse.ArgumentTypeError( + f"{months} is not a valid type for the months argument." + ) + return parsed_month or parsed_months + + +def preprocess_path(path: str | pathlib.Path | None) -> pathlib.Path: + """Preprocess the path to the directory containing the statistics files.""" + if path is None: + path = pathlib.Path(CWD, 'data') + path = pathlib.Path(path) + if not path.is_dir(): + raise ValueError(f"{path} is not a valid directory.") + return pathlib.Path(path) + + +def get_data( + path: str | pathlib.Path, # default: 'data/' +) -> pd.DataFrame | DataFrameSheets | None: + """Read the data from the given file and return a DataFrame.""" + + path = pathlib.Path(path) + filename = path.name + + df, df_ = None, None + + try: + if path.suffix in ALLOWED_EXCEL: + df = pd.read_excel(path, usecols='A:I', index_col=None, names=COLS, + sheet_name=None) # get all sheets, filter later + step("[Opening]", start=filename, color=YELLOW) + elif path.suffix == '.csv': + df_ = pd.read_csv(path, index_col=None, names=COLS, + encoding='utf-8', sep=';') + step("Done", start=filename, color=GREEN) + except FileNotFoundError: + step("(none)", start=filename, color=TURQUOISE) + + return df or df_ + + +def get_current_data( + path: str | pathlib.Path, # default: 'data/' + *, + year: int = datetime.now().year, + months: int | Sequence[int] = 0, + verbose: bool = False +) -> pd.DataFrame: + """Get a Pandas DataFrame of the data for the current year.""" + + df = pd.DataFrame() + + for i, month_name in enumerate(MONTHS, start=1): + if not isinstance(months, int) and i not in months: + continue + filename = f"{i}. {month_name.title()} stats all.csv" + filepath = pathlib.Path(path, str(year), filename) + df_temp = get_data(filepath) + if df_temp is not None and isinstance(df_temp, pd.DataFrame): + if isinstance(months, list): + month = pd_to_dt(months[i-1], format='%m').month + elif isinstance(months, int): + month = pd_to_dt(i, format='%m').month + df_temp = df_temp.assign(year=year, month=month) + date = pd_to_dt(df_temp[['year', 'month']].assign(day=1), + format='%Y-%m') + df_temp = df_temp.assign(date=date) + df = pd.concat([df, df_temp], ignore_index=True) + continue + + if verbose: + df.info() + print() + sample_size = min(10, len(df)) + print(df.sample(sample_size)) + + return df + + +def get_history_data( + path: str | pathlib.Path, # default: 'data/' + *, + years: Sequence[int] | set[int] | None = None, + months: int | Sequence[int], + verbose: bool = False +) -> pd.DataFrame: + """Get a Pandas DataFrame of the data for the previous year(s).""" + + years = sorted(years) if years is not None else YEARS + if months is None or months == 0: + months_list = {i+1: month_name for i, month_name in enumerate(MONTHS)} + elif isinstance(months, Sequence): + # check if months is a list of lists and flatten it + if any(isinstance(month, list) for month in months): + months = [month for subl in months + for month in (subl if isinstance(subl, list) else [subl])] + # months_list should be a list of tuples (month, month_name) + # with only the specified months + months_list = {i+1: month_name for i, month_name in enumerate(MONTHS) + if i+1 in months} + elif isinstance(months, int): + months_list = {months: MONTHS[months-1]} + else: + raise TypeError(f"{months!r} is not a valid type.") + + df = pd.DataFrame() + + for year in years: + filename = f"{year}_stats.xlsx" + filepath = pathlib.Path(path, str(year), filename) + df_sheets = get_data(filepath) + if df_sheets is not None: + for i, month_name in enumerate(MONTHS, start=1): + months_dict: dict[str | int, int] = {v: k for k, v + in months_list.items()} + if i not in months_dict.values(): + continue + else: + sheetname = f"ANTE - {month_name.upper()} {year}" + # If `df`` comes from Excel, it is a dict of DataFrames + # so we need to filter the sheets + if isinstance(df_sheets, dict): + df_temp = df_sheets[sheetname] + months_dict = {v: v for _, v in months_dict.items()} + month = pd_to_dt(months_dict[i], format='%m').month + df_temp = df_temp.assign(year=year, month=month) + date = pd_to_dt(df_temp[['year', 'month']].assign(day=1), + format='%Y-%m') + df_temp = df_temp.assign(date=date) + df = pd.concat([df, df_temp], ignore_index=True) + step("Done", start=f"{filename}/{sheetname}", color=GREEN) + step("[Closing]", start=filename, color=YELLOW) + + if verbose: + df.info() + print() + sample_size = min(10, len(df)) + print(df.sample(sample_size)) + + return df + + +def preprocess_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame: + """Perform preprocessing of the data for statistical analysis.""" + + if verbose: + print("=" * CLI_WIDTH, "Initial DataFrame:", sep="\n", end="\n\n") + print("-" * CLI_WIDTH, "INFO:", "=" * CLI_WIDTH, sep="\n") + df.info() + print() + print("-" * CLI_WIDTH, "SAMPLE:", "=" * CLI_WIDTH, sep="\n") + sample_size = min(10, len(df)) + print(df.sample(sample_size)) + + # Remove unnecessary columns + df = df.drop(['operator', 'fdr_no', 'source_lang', 'target_lang'], axis=1) + + # Convert 'requester_code' and 'doc_type' to categorical + df['requester_code'] = df['requester_code'].astype('category') + df['doc_type'] = df['doc_type'].astype('category') + + # Convert 'result' and 'pdf' to boolean + df['improved'] = df['result'].map( + {'OK': True, 'Improved': False}).astype('bool') + df = df.drop('result', axis=1) + df['pdf'] = df['pdf'].map({'yes': True, 'no': False}).astype('bool') + + if verbose: + print("=" * CLI_WIDTH, "Final DataFrame:", sep="\n", end="\n\n") + print("-" * CLI_WIDTH, "INFO:", "=" * CLI_WIDTH, sep="\n") + df.info() + print() + print("-" * CLI_WIDTH, "SAMPLE:", "=" * CLI_WIDTH, sep="\n") + sample_size = min(10, len(df)) + print(df.sample(sample_size)) + + return df + + +def main() -> int: + args = get_args() + path = preprocess_path(args.path) + years = set(args.years) + months = args.months + verbose = args.verbose + output = args.output + + df = pd.DataFrame() + pd.set_option('display.width', CLI_WIDTH) + + current_year = datetime.now().year + if current_year in years: + df = get_current_data(path, months=months, verbose=verbose) + years.remove(current_year) + + df = pd.concat([df, get_history_data(path, years=years, months=months, + verbose=verbose)]) + df = preprocess_data(df, verbose=verbose) + + if output: + output = pathlib.Path(output) + if not output.parent.exists(): + output.parent.mkdir(parents=True, exist_ok=True) + + if output.suffix in ALLOWED_EXCEL: + df.to_excel(output, index=False, sheet_name='TCC') + elif output.suffix == '.csv': + df.to_csv(output, index=False, encoding='utf-8', sep=';') + else: + raise ValueError( + f"""{output} does not have a valid file extension. + Only Excel and CSV files are supported.""" + ) + print() + step("Saved", start=f"Saving {output.absolute().relative_to(CWD)}", + color=GREEN) + return 0 + + +if '__main__' == __name__: + raise SystemExit(main())