Skip to content

Commit

Permalink
Scrape all known pruefis for each format version (#259)
Browse files Browse the repository at this point in the history
* 🎨 Improve the structure of the collect_pruefis script

* 🎨 Change the script to scrape for all known format versions

* 🎨 use parents

* 📝 Add all known pruefis for all known edifact format versions

* 🩹 Remove deprecation warning by replacing datetime.utcnow()

* 📝 foo

* 🎨 Apply review suggestions

* 🎨 Add --edi-energy-mirror-path flag
  • Loading branch information
hf-krechan authored Mar 27, 2024
1 parent 9aeaf8d commit 56f1f82
Show file tree
Hide file tree
Showing 11 changed files with 1,399 additions and 68 deletions.
63 changes: 0 additions & 63 deletions src/kohlrahbi/collect_pruefis.py

This file was deleted.

5 changes: 5 additions & 0 deletions src/kohlrahbi/format_versions/FV2104_all_known_pruefis.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[meta_data]
updated_on = 2024-03-27

[pruefidentifikatoren]
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2104."
5 changes: 5 additions & 0 deletions src/kohlrahbi/format_versions/FV2110_all_known_pruefis.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[meta_data]
updated_on = 2024-03-27

[pruefidentifikatoren]
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2110."
364 changes: 364 additions & 0 deletions src/kohlrahbi/format_versions/FV2210_all_known_pruefis.toml

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions src/kohlrahbi/format_versions/FV2304_all_known_pruefis.toml

Large diffs are not rendered by default.

481 changes: 481 additions & 0 deletions src/kohlrahbi/format_versions/FV2310_all_known_pruefis.toml

Large diffs are not rendered by default.

272 changes: 272 additions & 0 deletions src/kohlrahbi/format_versions/FV2404_all_known_pruefis.toml

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/kohlrahbi/format_versions/FV2410_all_known_pruefis.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[meta_data]
updated_on = 2024-03-27

[pruefidentifikatoren]
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2410."
4 changes: 2 additions & 2 deletions src/kohlrahbi/read_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

import re
from datetime import datetime
from datetime import datetime, timezone
from typing import Generator, Optional, Union

import pytz
Expand Down Expand Up @@ -63,7 +63,7 @@ def _get_format_version_from_ahbfile_name(ahb_docx_name: str) -> EdifactFormatVe
local_date_str = match.groupdict()["germanLocalTimeStartDate"]
berlin_local_time = datetime.strptime(local_date_str, "%Y%m%d").astimezone(berlin)
else:
berlin_local_time = datetime.utcnow().astimezone(berlin)
berlin_local_time = datetime.now(timezone.utc).astimezone(berlin)
edifact_format_version = get_edifact_format_version(berlin_local_time)
return edifact_format_version

Expand Down
90 changes: 90 additions & 0 deletions src/kohlrahbi/scrape_pruefis/collect_pruefis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from datetime import date
from pathlib import Path

import click
import docx # type:ignore[import]
import tomlkit
from docx.table import Table # type:ignore[import]
from maus.edifact import EdifactFormatVersion

from kohlrahbi.docxfilefinder import DocxFileFinder
from kohlrahbi.logger import logger
from kohlrahbi.read_functions import does_the_table_contain_pruefidentifikatoren, get_all_paragraphs_and_tables
from kohlrahbi.seed import Seed


def validate_path(ctx, param, value):
    """
    Click option callback that turns the raw option value into a ``Path``.

    ``ctx`` and ``param`` belong to the click callback signature and are not used here.
    Returns the ``Path`` if it exists on disk; otherwise raises ``click.BadParameter``.
    """
    candidate = Path(value)
    if candidate.exists():
        return candidate
    raise click.BadParameter(f"Path does not exist: {value}")


def _collect_pruefis(path_to_ahb_documents: Path) -> dict[str, str]:
    """
    Collect the Prüfidentifikatoren from the AHB docx files below ``path_to_ahb_documents``.

    The docx files are discovered with ``DocxFileFinder`` and narrowed down by its
    ``filter_for_latest_ahb_docx_files()`` before they are read.

    Returns a dict that maps each Prüfidentifikator to the name of the docx file it was found in,
    sorted by Prüfidentifikator. The dict is empty if no Prüfidentifikator was found.
    """
    pruefis: dict[str, str] = {}

    ahb_file_finder = DocxFileFinder.from_input_path(input_path=path_to_ahb_documents)
    ahb_file_finder.filter_for_latest_ahb_docx_files()

    for ahb_file_path in ahb_file_finder.paths_to_docx_files:
        doc = docx.Document(ahb_file_path)
        for item in get_all_paragraphs_and_tables(parent=doc):
            if not isinstance(item, Table) or not does_the_table_contain_pruefidentifikatoren(table=item):
                continue
            # Only use tables where the last paragraph of the last cell in the first row
            # starts with "Prüfidentifikator".
            if not item.row_cells(0)[-1].paragraphs[-1].text.startswith("Prüfidentifikator"):
                continue
            seed = Seed.from_table(docx_table=item)
            logger.info("Found a table with the following pruefis: %s", seed.pruefidentifikatoren)
            for pruefi in seed.pruefidentifikatoren:
                # If a pruefi shows up in several files, the file processed last wins.
                pruefis[pruefi] = ahb_file_path.name

    return dict(sorted(pruefis.items()))


@click.command()
@click.option(
    "--format-version",
    multiple=True,
    default=[e.value for e in EdifactFormatVersion],  # Set default to all known format versions
    show_default="All",
    type=click.Choice([e.value for e in EdifactFormatVersion], case_sensitive=False),
    help="Format version(s) of the AHB documents. Default is all known format versions.",
)
@click.option(
    "--edi-energy-mirror-path",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
    callback=validate_path,
    help="The root path to the edi_energy_mirror repository.",
    required=True,
)
def update_pruefis(format_version: tuple[str, ...], edi_energy_mirror_path: Path):
    """
    This CLI tool updates the all_known_pruefis.toml files with Prüfidentifikatoren from AHB documents.
    If no specific format version is provided, it processes all known format versions.
    """
    # NOTE: click renders the docstring above as the --help text of this command, so the
    # implementation notes are kept in comments instead of the docstring.
    #
    # format_version: the selected format version strings; click passes a tuple because the
    #   option is declared with multiple=True.
    # edi_energy_mirror_path: root of the edi_energy_mirror checkout, already converted to a Path
    #   by the validate_path callback.
    #
    # For every format version one file "<version>_all_known_pruefis.toml" is written into the
    # "format_versions" directory next to this package directory.
    # Raises click.ClickException if a required directory does not exist. An explicit raise is
    # used instead of `assert` because asserts are removed when Python runs with -O.

    # The output directory does not depend on the format version, so it is checked only once.
    output_directory = Path(__file__).parents[1] / "format_versions"
    if not output_directory.exists():
        raise click.ClickException(f"The specified path {output_directory} does not exist.")

    for version in format_version:  # Iterate over each provided format version
        path_to_ahb_documents = edi_energy_mirror_path / "edi_energy_de" / f"{version}"
        if not path_to_ahb_documents.exists():
            raise click.ClickException(f"The specified path {path_to_ahb_documents.absolute()} does not exist.")

        output_filename = f"{version}_all_known_pruefis.toml"
        output_file_path = output_directory / output_filename

        all_pruefis = _collect_pruefis(path_to_ahb_documents)
        if not all_pruefis:
            logger.warning("No Prüfidentifikatoren found in the AHB documents for format version %s.", version)
            # Write a placeholder entry so that the TOML file states why it has no pruefis.
            all_pruefis = {
                "⚠️ No Prüfidentifikatoren found": f"No AHB documents found. Probably there are no AHB in the docx format in the provided path {path_to_ahb_documents}."
            }

        toml_data = {
            "meta_data": {"updated_on": date.today()},
            "pruefidentifikatoren": all_pruefis,
        }

        with open(output_file_path, "w", encoding="utf-8") as f:
            tomlkit.dump(toml_data, f)
        logger.info("🎉 Successfully updated %s and saved it at %s.", output_filename, output_file_path)

# Script entry point: run the click command only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    update_pruefis()
6 changes: 3 additions & 3 deletions unittests/test_read_functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone
from typing import Optional

import pytest # type:ignore[import]
Expand All @@ -20,8 +20,8 @@ class TestReadFunctions:
"COMDISAHB-informatorischeLesefassung1.0c_99991231_20221001.docx", EdifactFormatVersion.FV2210
),
pytest.param("REQOTEQUOTESORDERSORDRSPORDCHGAHB2.1_99991231_20230401.docx", EdifactFormatVersion.FV2304),
pytest.param("foo", get_edifact_format_version(datetime.utcnow().astimezone(tz=pytz.UTC))),
pytest.param("bar", get_edifact_format_version(datetime.utcnow().astimezone(tz=pytz.UTC))),
pytest.param("foo", get_edifact_format_version(datetime.now(timezone.utc))),
pytest.param("bar", get_edifact_format_version(datetime.now(timezone.utc))),
],
)
def test_get_format_version_from_filename(self, filename: str, expected_result: Optional[EdifactFormatVersion]):
Expand Down

0 comments on commit 56f1f82

Please sign in to comment.