Skip to content

Commit

Permalink
fix: sorting documents which have been replaced before they became valid (#242)
Browse files Browse the repository at this point in the history

* changed the valid-until date for documents which have been replaced before they became valid

Co-authored-by: konstantin <konstantin.klein@hochfrequenz.de>
  • Loading branch information
DeltaDaniel and hf-kklein authored Nov 20, 2024
1 parent 7e73873 commit 3e2c127
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
7 changes: 5 additions & 2 deletions src/edi_energy_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def get_epoch_links(document_soup) -> Dict[Epoch, str]:
return result

@staticmethod
def get_epoch_file_map(epoch_soup: BeautifulSoup) -> Dict[str, str]:
def get_epoch_file_map(epoch_soup: BeautifulSoup, epoch: Epoch) -> Dict[str, str]:
"""
Extracts a dictionary from the epoch soup (e.g. soup of "future.html") that maps file basenames as keys
(e.g. "APERAKCONTRLAHB2.3h_99993112_20210104") to URLs of the documents as value.
Expand Down Expand Up @@ -285,6 +285,9 @@ def get_epoch_file_map(epoch_soup: BeautifulSoup) -> Dict[str, str]:
# there's a special case: "Offen" means the document is valid until further notice.
if table_cells[2].text.strip() == "Offen":
valid_to_date = datetime.datetime(9999, 12, 31)
document_has_been_replaced_before_it_became_valid = epoch == Epoch.PAST
if document_has_been_replaced_before_it_became_valid:
valid_to_date = publication_date
else:
raise value_error
# the 4th column contains a download link for the PDF.
Expand Down Expand Up @@ -351,7 +354,7 @@ async def mirror(self):
epoch_path: Path = Path(self._root_dir, f"{_epoch}.html") # e.g. "future.html"
with open(epoch_path, "w+", encoding="utf8") as outfile:
outfile.write(epoch_soup.prettify())
file_map = EdiEnergyScraper.get_epoch_file_map(epoch_soup)
file_map = EdiEnergyScraper.get_epoch_file_map(epoch_soup, _epoch)
download_tasks: List[Awaitable[Optional[List[Path]]]] = []
file_counter = itertools.count()
for file_basename, link in file_map.items():
Expand Down
6 changes: 3 additions & 3 deletions unittests/test_edienergyscraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_epoch_file_map_future_20210210(self, datafiles):
with open(datafiles / "future_20210210.html", "r", encoding="utf8") as infile:
response_body = infile.read()
soup = BeautifulSoup(response_body, "html.parser")
actual = EdiEnergyScraper.get_epoch_file_map(soup)
actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.FUTURE)
assert len(actual.keys()) == 76
for file_basename in actual.keys():
# all the future names should contain 99991231 as "valid to" date
Expand All @@ -140,7 +140,7 @@ def test_epoch_file_map_current_20210210(self, datafiles):
with open(datafiles / "current_20210210.html", "r", encoding="utf8") as infile:
response_body = infile.read()
soup = BeautifulSoup(response_body, "html.parser")
actual = EdiEnergyScraper.get_epoch_file_map(soup)
actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.CURRENT)
assert len(actual.keys()) == 81
for file_basename in actual.keys():
# all the current documents are either "open" or valid until April 2021
Expand All @@ -157,7 +157,7 @@ def test_epoch_file_map_past_20210210(self, datafiles):
with open(datafiles / "past_20210210.html", "r", encoding="utf8") as infile:
response_body = infile.read()
soup = BeautifulSoup(response_body, "html.parser")
actual = EdiEnergyScraper.get_epoch_file_map(soup)
actual = EdiEnergyScraper.get_epoch_file_map(soup, Epoch.PAST)
assert len(actual.keys()) == 705

@pytest.mark.parametrize(
Expand Down

0 comments on commit 3e2c127

Please sign in to comment.