generated from Hochfrequenz/python_template_repository
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scrape all known pruefis for each format version (#259)
* 🎨 Improve the structure of the collect_pruefis script * 🎨 Change the script to scrape for all known format versions * 🎨 use parents * 📝 Add all known pruefis for all known edifact format versions * 🩹 Remove deprecation warning by replacing datetime.utcnow() * 📝 foo * 🎨 Apply review suggestions * 🎨 Add --edi-energy-mirror-path flag
- Loading branch information
1 parent
9aeaf8d
commit 56f1f82
Showing
11 changed files
with
1,399 additions
and
68 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[meta_data] | ||
updated_on = 2024-03-27 | ||
|
||
[pruefidentifikatoren] | ||
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2104." |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[meta_data] | ||
updated_on = 2024-03-27 | ||
|
||
[pruefidentifikatoren] | ||
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2110." |
364 changes: 364 additions & 0 deletions
364
src/kohlrahbi/format_versions/FV2210_all_known_pruefis.toml
Large diffs are not rendered by default.
Oops, something went wrong.
172 changes: 172 additions & 0 deletions
172
src/kohlrahbi/format_versions/FV2304_all_known_pruefis.toml
Large diffs are not rendered by default.
Oops, something went wrong.
481 changes: 481 additions & 0 deletions
481
src/kohlrahbi/format_versions/FV2310_all_known_pruefis.toml
Large diffs are not rendered by default.
Oops, something went wrong.
272 changes: 272 additions & 0 deletions
272
src/kohlrahbi/format_versions/FV2404_all_known_pruefis.toml
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
[meta_data] | ||
updated_on = 2024-03-27 | ||
|
||
[pruefidentifikatoren] | ||
"⚠️ No Prüfidentifikatoren found" = "No AHB documents found. Probably there are no AHB in the docx format in the provided path /Users/kevin/workspaces/hochfrequenz/edi_energy_mirror/edi_energy_de/FV2410." |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from datetime import date | ||
from pathlib import Path | ||
|
||
import click | ||
import docx # type:ignore[import] | ||
import tomlkit | ||
from docx.table import Table # type:ignore[import] | ||
from maus.edifact import EdifactFormatVersion | ||
|
||
from kohlrahbi.docxfilefinder import DocxFileFinder | ||
from kohlrahbi.logger import logger | ||
from kohlrahbi.read_functions import does_the_table_contain_pruefidentifikatoren, get_all_paragraphs_and_tables | ||
from kohlrahbi.seed import Seed | ||
|
||
|
||
def validate_path(ctx, param, value):
    """
    Click option callback that converts the raw option value into a ``Path``.

    ``ctx`` and ``param`` are accepted because click passes them to every callback; they are not used here.
    Returns the ``Path`` built from ``value`` if something exists at that location,
    otherwise raises ``click.BadParameter``.
    """
    candidate = Path(value)
    if candidate.exists():
        return candidate
    raise click.BadParameter(f"Path does not exist: {value}")
|
||
|
||
def _collect_pruefis(docx_file_paths: list[Path]) -> dict[str, str]:
    """
    Map every Prüfidentifikator found in the given AHB docx files to the name of the file it was found in.

    The entries of ``docx_file_paths`` are opened with ``docx.Document`` and their ``.name`` is stored as value.
    If the same Prüfidentifikator shows up in several files, the file processed last wins.
    """
    found: dict[str, str] = {}
    for ahb_file_path in docx_file_paths:
        doc = docx.Document(ahb_file_path)
        for item in get_all_paragraphs_and_tables(parent=doc):
            if not (isinstance(item, Table) and does_the_table_contain_pruefidentifikatoren(table=item)):
                continue
            # Only tables whose last header cell starts with "Prüfidentifikator" are turned into a seed.
            if not item.row_cells(0)[-1].paragraphs[-1].text.startswith("Prüfidentifikator"):
                continue
            seed = Seed.from_table(docx_table=item)
            logger.info("Found a table with the following pruefis: %s", seed.pruefidentifikatoren)
            for pruefi in seed.pruefidentifikatoren:
                found[pruefi] = ahb_file_path.name
    return found


@click.command()
@click.option(
    "--format-version",
    multiple=True,
    default=[e.value for e in EdifactFormatVersion],  # Set default to all known format versions
    show_default="All",
    type=click.Choice([e.value for e in EdifactFormatVersion], case_sensitive=False),
    help="Format version(s) of the AHB documents. Default is all known format versions.",
)
@click.option(
    "--edi-energy-mirror-path",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
    callback=validate_path,
    help="The root path to the edi_energy_mirror repository.",
    required=True,
)
def update_pruefis(format_version: list[EdifactFormatVersion], edi_energy_mirror_path: Path):
    """
    This CLI tool updates the all_known_pruefis.toml files with Prüfidentifikatoren from AHB documents.
    If no specific format version is provided, it processes all known format versions.

    For every format version the directory ``<edi_energy_mirror_path>/edi_energy_de/<version>`` is scanned and the
    result is written to ``format_versions/<version>_all_known_pruefis.toml`` next to this package.
    Raises a ``click.ClickException`` if the output directory or one of the AHB directories does not exist.
    """
    output_directory = Path(__file__).parents[1] / "format_versions"
    # Raise explicitly instead of using ``assert``: assertions are stripped when Python runs with ``-O``.
    # The output directory is the same for every version, so it is checked once up front.
    if not output_directory.exists():
        raise click.ClickException(f"The specified path {output_directory} does not exist.")

    for version in format_version:  # Iterate over each provided format version
        relative_ahb_path = Path(f"edi_energy_de/{version}")
        path_to_ahb_documents = edi_energy_mirror_path / relative_ahb_path
        if not path_to_ahb_documents.exists():
            raise click.ClickException(f"The specified path {path_to_ahb_documents.absolute()} does not exist.")

        output_filename = f"{version}_all_known_pruefis.toml"
        output_file_path = output_directory / output_filename

        ahb_file_finder = DocxFileFinder.from_input_path(input_path=path_to_ahb_documents)
        ahb_file_finder.filter_for_latest_ahb_docx_files()

        all_pruefis = dict(sorted(_collect_pruefis(ahb_file_finder.paths_to_docx_files).items()))
        # Check for emptiness; ``any(dict)`` would test the truthiness of the keys instead.
        if not all_pruefis:
            logger.warning("No Prüfidentifikatoren found in the AHB documents for format version %s.", version)
            # Write the path relative to the mirror root so that the generated file
            # does not depend on (or leak) the directory layout of the machine it was created on.
            all_pruefis = {
                "⚠️ No Prüfidentifikatoren found": (
                    "No AHB documents found. Probably there are no AHB in the docx format "
                    f"in the provided path {relative_ahb_path.as_posix()}."
                )
            }

        toml_data = {
            "meta_data": {"updated_on": date.today()},
            "pruefidentifikatoren": all_pruefis,
        }

        with open(output_file_path, "w", encoding="utf-8") as f:
            tomlkit.dump(toml_data, f)
        logger.info("🎉 Successfully updated %s and saved it at %s.", output_filename, output_file_path)
|
||
|
||
if __name__ == "__main__":
    # Called without arguments on purpose: update_pruefis is a click command,
    # so the options are parsed from the command line by click.
    update_pruefis()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters