From 3e2c1279f0b7765b85ba742a5906df0481b3fa17 Mon Sep 17 00:00:00 2001 From: Daniel <139119540+DeltaDaniel@users.noreply.github.com> Date: Wed, 20 Nov 2024 09:13:39 +0100 Subject: [PATCH] fix: sorting documents which have been replaced before they became valid (#242) * changed valid until date for documents which have been replaced before they became valid Co-authored-by: konstantin --- src/edi_energy_scraper/__init__.py | 7 +++++-- unittests/test_edienergyscraper.py | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/edi_energy_scraper/__init__.py b/src/edi_energy_scraper/__init__.py index fac6a31..af4ab31 100644 --- a/src/edi_energy_scraper/__init__.py +++ b/src/edi_energy_scraper/__init__.py @@ -252,7 +252,7 @@ def get_epoch_links(document_soup) -> Dict[Epoch, str]: return result @staticmethod - def get_epoch_file_map(epoch_soup: BeautifulSoup) -> Dict[str, str]: + def get_epoch_file_map(epoch_soup: BeautifulSoup, epoch: Epoch) -> Dict[str, str]: """ Extracts a dictionary from the epoch soup (e.g. soup of "future.html") that maps file basenames as keys (e.g. "APERAKCONTRLAHB2.3h_99993112_20210104") to URLs of the documents as value. @@ -285,6 +285,9 @@ def get_epoch_file_map(epoch_soup: BeautifulSoup) -> Dict[str, str]: # there's a special case: "Offen" means the document is valid until further notice. if table_cells[2].text.strip() == "Offen": valid_to_date = datetime.datetime(9999, 12, 31) + document_has_been_replaced_before_it_became_valid = epoch == Epoch.PAST + if document_has_been_replaced_before_it_became_valid: + valid_to_date = publication_date else: raise value_error # the 4th column contains a download link for the PDF. @@ -351,7 +354,7 @@ async def mirror(self): epoch_path: Path = Path(self._root_dir, f"{_epoch}.html") # e.g. 
"future.html" with open(epoch_path, "w+", encoding="utf8") as outfile: outfile.write(epoch_soup.prettify()) - file_map = EdiEnergyScraper.get_epoch_file_map(epoch_soup) + file_map = EdiEnergyScraper.get_epoch_file_map(epoch_soup, _epoch) download_tasks: List[Awaitable[Optional[List[Path]]]] = [] file_counter = itertools.count() for file_basename, link in file_map.items(): diff --git a/unittests/test_edienergyscraper.py b/unittests/test_edienergyscraper.py index a2d9213..a5a3c2f 100644 --- a/unittests/test_edienergyscraper.py +++ b/unittests/test_edienergyscraper.py @@ -123,7 +123,7 @@ def test_epoch_file_map_future_20210210(self, datafiles): with open(datafiles / "future_20210210.html", "r", encoding="utf8") as infile: response_body = infile.read() soup = BeautifulSoup(response_body, "html.parser") - actual = EdiEnergyScraper.get_epoch_file_map(soup) + actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.FUTURE) assert len(actual.keys()) == 76 for file_basename in actual.keys(): # all the future names should contain 99991231 as "valid to" date @@ -140,7 +140,7 @@ def test_epoch_file_map_current_20210210(self, datafiles): with open(datafiles / "current_20210210.html", "r", encoding="utf8") as infile: response_body = infile.read() soup = BeautifulSoup(response_body, "html.parser") - actual = EdiEnergyScraper.get_epoch_file_map(soup) + actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.CURRENT) assert len(actual.keys()) == 81 for file_basename in actual.keys(): # all the current documents are either "open" or valid until April 2021 @@ -157,7 +157,7 @@ def test_epoch_file_map_past_20210210(self, datafiles): with open(datafiles / "past_20210210.html", "r", encoding="utf8") as infile: response_body = infile.read() soup = BeautifulSoup(response_body, "html.parser") - actual = EdiEnergyScraper.get_epoch_file_map(soup) + actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.PAST) assert len(actual.keys()) == 705 @pytest.mark.parametrize(