Skip to content

Commit

Permalink
fix(parsing): handle page breaks within tables (#455)
Browse files Browse the repository at this point in the history
* fixed routine for broken lines

* remove comments

* fix typecheck and linter

* fixed small errors

* fix: merging messed with snapshots

* moved conditions text and fixed tests

* fix whitespace

* updated snapshot

---------

Co-authored-by: kevin <68426071+hf-krechan@users.noreply.github.com>
  • Loading branch information
DeltaDaniel and hf-krechan authored Oct 2, 2024
1 parent fb1e8d3 commit a6869c1
Show file tree
Hide file tree
Showing 7 changed files with 4,651 additions and 4,203 deletions.
110 changes: 107 additions & 3 deletions src/kohlrahbi/ahbtable/ahbsubtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
This module contains the AhbSubTable class.
"""

from typing import Generator
from typing import Generator, Union

import numpy as np
import pandas as pd
from docx.table import Table as DocxTable
from docx.table import _Cell, _Row
from docx.text.paragraph import Paragraph
from numpy.typing import NDArray
from pydantic import BaseModel, ConfigDict

from kohlrahbi.ahbtable.ahbtablerow import AhbTableRow
from kohlrahbi.docxtablecells.bodycell import INDEX_OF_CODES_AND_QUALIFIER_COLUMN, KNOW_SUFFIXES
from kohlrahbi.enums import RowType
from kohlrahbi.row_type_checker import get_row_type
from kohlrahbi.seed import Seed
Expand All @@ -30,6 +34,7 @@ class AhbSubTable(BaseModel):
def _parse_docx_table(
table_meta_data: Seed, ahb_table_dataframe: pd.DataFrame, docx_table: DocxTable
) -> pd.DataFrame:
"""Parse the docx table and add the information to the dataframe."""
for row in docx_table.rows:
sanitized_cells = list(AhbSubTable.iter_visible_cells(row=row))

Expand Down Expand Up @@ -58,16 +63,45 @@ def _parse_docx_table(

if ahb_table_row_dataframe is not None:
ahb_table_dataframe = pd.concat([ahb_table_dataframe, ahb_table_row_dataframe], ignore_index=True)
# this case covers the page break situation
else:
# this case covers the page break situation

# check for conditions_text
contains_condition_texts = any(paragraph.text != "" for paragraph in bedingung_cell.paragraphs)
# conditions are always at the top of a dataelement
# add condition texts
if contains_condition_texts:
AhbSubTable.combine_condition_text(ahb_table_dataframe, bedingung_cell)

# add new row regularly
ahb_table_row = AhbTableRow(
seed=table_meta_data,
edifact_struktur_cell=edifact_struktur_cell,
middle_cell=middle_cell,
bedingung_cell=bedingung_cell,
)
ahb_table_row_dataframe = ahb_table_row.parse(row_type=current_row_type)

ahb_table_row.parse(row_type=table_meta_data.last_two_row_types[1])
# look at first line to determine if it is broken
first_paragraph = middle_cell.paragraphs[0]

if ahb_table_row_dataframe is not None:
if AhbSubTable.is_broken_line(
table=ahb_table_dataframe,
table_meta_data=table_meta_data,
paragraph=first_paragraph,
):
AhbSubTable.add_broken_line(ahb_table_dataframe, ahb_table_row_dataframe)
# we have a broken line
ahb_table_dataframe = pd.concat(
[ahb_table_dataframe, ahb_table_row_dataframe.iloc[1:]],
ignore_index=True,
)
else:
ahb_table_dataframe = pd.concat(
[ahb_table_dataframe, ahb_table_row_dataframe],
ignore_index=True,
)

# An AhbSubTable can span over two pages.
# But after every page break, even if we're still in the same subtable,
Expand Down Expand Up @@ -131,3 +165,73 @@ def iter_visible_cells(row: _Row) -> Generator[_Cell, None, None]:
table_row = row._tr # pylint:disable=protected-access
for table_column in table_row.tc_lst:
yield _Cell(table_column, row.table)

@staticmethod
def add_text_to_last_row(ahb_table_dataframe: pd.DataFrame, row_index: int, column_index: int, text: str) -> None:
    """
    Append ``text`` in place to the dataframe cell at position (``row_index``, ``column_index``).

    Nothing is written for an empty ``text``. A single space separates the existing cell content from
    ``text``, unless the cell is still empty or ``text`` starts with one of ``KNOW_SUFFIXES`` followed by a space.
    Despite the name, the target row is whatever ``row_index`` the caller passes in.
    """
    continues_with_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
    if not text:
        return

    current_content = ahb_table_dataframe.iat[row_index, column_index]
    needs_separator = len(current_content) > 0 and not continues_with_suffix
    separator = " " if needs_separator else ""
    ahb_table_dataframe.iat[row_index, column_index] = current_content + separator + text

@staticmethod
def add_broken_line(ahb_table_dataframe: pd.DataFrame, broken_line: pd.DataFrame) -> None:
    """
    Append the first row of ``broken_line`` to the last row of ``ahb_table_dataframe``, in place.

    Only the columns from ``INDEX_OF_CODES_AND_QUALIFIER_COLUMN`` up to the last column are merged; each value
    is converted to ``str`` and appended via ``AhbSubTable.add_text_to_last_row``.
    """
    # NOTE(review): the maximum index *label* is used as a row *position*; this assumes a default
    # 0..n-1 integer index on the dataframe - confirm against the callers.
    target_row = ahb_table_dataframe.index.max()
    column_count = len(ahb_table_dataframe.columns)
    for column_position in range(INDEX_OF_CODES_AND_QUALIFIER_COLUMN, column_count):
        continuation = str(broken_line.iat[0, column_position])
        AhbSubTable.add_text_to_last_row(ahb_table_dataframe, target_row, column_position, continuation)

@staticmethod
def combine_condition_text(ahb_table_dataframe: pd.DataFrame, bedingung_cell: _Cell) -> None:
    """
    Merge the condition text already stored in the dataframe with the text of ``bedingung_cell``.

    The "Bedingung" value of the last row holding a non-NA value is removed from the dataframe and put in front
    of the space-joined, non-empty paragraphs of ``bedingung_cell``. The merged text is written into the last
    paragraph of the cell and all other paragraphs are emptied. Both arguments are modified in place.

    If the "Bedingung" column holds no non-NA value, there is nothing to merge and both arguments are left
    untouched.
    """
    last_valid_row = ahb_table_dataframe["Bedingung"].last_valid_index()
    if last_valid_row is None:
        # `last_valid_index` returns None for an empty / all-NA column; `.at[None, ...]` would raise.
        return

    paragraphs = bedingung_cell.paragraphs
    continued_text = " ".join(paragraph.text for paragraph in paragraphs if paragraph.text != "")
    conditions_text = ahb_table_dataframe.at[last_valid_row, "Bedingung"] + " " + continued_text

    # The text is moved into the cell, so it is removed from the dataframe
    # (presumably it is added again when the cell is parsed afterwards - confirm against the caller).
    ahb_table_dataframe.at[last_valid_row, "Bedingung"] = ""

    # Empty every paragraph first to avoid misplaced leftovers, then store the merged text in the last one.
    for paragraph in paragraphs:
        paragraph.text = ""
    paragraphs[-1].text = conditions_text

@staticmethod
def is_broken_line(
    table: pd.DataFrame,
    table_meta_data: Seed,
    paragraph: Paragraph,
) -> bool:
    """
    Check whether ``paragraph`` (first paragraph of a middle cell) continues the last row of ``table``.

    The paragraph is treated as a broken line if at least one of the following holds:

    * every tab-separated part of its text is empty,
    * its text contains at least one tab and its left indent differs from
      ``table_meta_data.middle_cell_left_indent_position``,
    * it has a left indent that differs from ``table_meta_data.middle_cell_left_indent_position`` while the
      last row of ``table`` has a non-empty "Beschreibung" and at least one non-empty cell to the right of it
      (a broken code / qualifier).

    Raises:
        KeyError: if ``table`` has no column "Beschreibung".
        ValueError: if the position of the column "Beschreibung" is not a single integer.
    """
    tabsplit_text = paragraph.text.split("\t")

    loc: Union[int, slice, NDArray[np.bool_]] = table.columns.get_loc("Beschreibung")
    # `get_loc` only yields a plain int for a unique column label
    if not isinstance(loc, int):
        raise ValueError("The location of the column 'Beschreibung' is not an integer.")
    beschreibung_index: int = loc

    left_indent = paragraph.paragraph_format.left_indent
    has_deviating_indent = left_indent != table_meta_data.middle_cell_left_indent_position

    is_empty_middle_line = all(text == "" for text in tabsplit_text)
    there_are_conditions = len(tabsplit_text) > 1 and has_deviating_indent
    is_broken_code_qualifier = (
        left_indent is not None
        and has_deviating_indent
        and table.iat[-1, beschreibung_index] != ""
        and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
    )

    # `.any()` may hand back a numpy bool; coerce so the annotated return type holds.
    return bool(is_empty_middle_line or there_are_conditions or is_broken_code_qualifier)
5 changes: 1 addition & 4 deletions src/kohlrahbi/ahbtable/ahbtablerow.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ class AhbTableRow(BaseModel):

model_config = ConfigDict(arbitrary_types_allowed=True)

def parse(
self,
row_type: RowType,
) -> Optional[pd.DataFrame]:
def parse(self, row_type: RowType) -> Optional[pd.DataFrame]:
"""
Writes the current row of the current table into the DataFrame depending on the type of the row.
If the row is a header row, it will be skipped and None will be returned.
Expand Down
13 changes: 7 additions & 6 deletions src/kohlrahbi/docxtablecells/bedinungscell.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
"""
Parses a cell in the Bedingung column and puts the information into the appropriate column of the dataframe.
"""

bedingung = self.beautify_bedingungen()
bedingung = self.table_cell.text
bedingung = self.beautify_bedingungen(bedingung)

row_index = ahb_row_dataframe.index.max()
ahb_row_dataframe.at[row_index, "Bedingung"] += bedingung
return ahb_row_dataframe

# pylint: disable=line-too-long
def beautify_bedingungen(self) -> str:
@staticmethod
def beautify_bedingungen(bedingung: str) -> str:
"""
Beautifies the Bedingungen by removing the given line breaks and insert the line breaks at the correct places.
Expand All @@ -41,11 +42,11 @@ def beautify_bedingungen(self) -> str:
[494] Das hier genannte Datum muss der Zeitpunkt sein, zu dem das Dokument erstellt wurde, oder ein Zeitpunkt, der davor liegt
[931] Format: ZZZ = +00
"""
beautified_bedingung = self.table_cell.text.replace("\n", " ")
beautified_bedingung = bedingung.replace("\n", " ")

matches = re.findall(r"\[\d+\]", beautified_bedingung)
for match in matches[1:]:
index = beautified_bedingung.find(match)
beautified_bedingung = beautified_bedingung[:index] + "\n" + beautified_bedingung[index:]
beautified_bedingung = beautified_bedingung[:index].rstrip() + "\n" + beautified_bedingung[index:]

return beautified_bedingung
return beautified_bedingung.lstrip()
6 changes: 5 additions & 1 deletion src/kohlrahbi/docxtablecells/bodycell.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
starts_with_known_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
if len(text) > 0:
if len(ahb_row_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
if (
len(ahb_row_dataframe.iat[row_index, column_index]) > 0
and not starts_with_known_suffix
and len(text) > 1
):
text = " " + text
ahb_row_dataframe.iat[row_index, column_index] += text

Expand Down
Loading

0 comments on commit a6869c1

Please sign in to comment.