fix(parsing): handle page breaks within tables (#455)

* fixed routine for broken lines * remove comments * fix typecheck and linter * fixed small errors * fix: merging messed with snapshots * moved conditions text and fixed tests * fix whitespace * updated snapshot --------- Co-authored-by: kevin <68426071+hf-krechan@users.noreply.github.com>
Hochfrequenz · Oct 2, 2024 · a6869c1 · a6869c1
1 parent fb1e8d3
commit a6869c1
Show file tree

Hide file tree

Showing 7 changed files with 4,651 additions and 4,203 deletions.
diff --git a/src/kohlrahbi/ahbtable/ahbsubtable.py b/src/kohlrahbi/ahbtable/ahbsubtable.py
@@ -2,14 +2,18 @@
 This module contains the AhbSubTable class.
 """
 
-from typing import Generator
+from typing import Generator, Union
 
+import numpy as np
 import pandas as pd
 from docx.table import Table as DocxTable
 from docx.table import _Cell, _Row
+from docx.text.paragraph import Paragraph
+from numpy.typing import NDArray
 from pydantic import BaseModel, ConfigDict
 
 from kohlrahbi.ahbtable.ahbtablerow import AhbTableRow
+from kohlrahbi.docxtablecells.bodycell import INDEX_OF_CODES_AND_QUALIFIER_COLUMN, KNOW_SUFFIXES
 from kohlrahbi.enums import RowType
 from kohlrahbi.row_type_checker import get_row_type
 from kohlrahbi.seed import Seed
@@ -30,6 +34,7 @@ class AhbSubTable(BaseModel):
     def _parse_docx_table(
         table_meta_data: Seed, ahb_table_dataframe: pd.DataFrame, docx_table: DocxTable
     ) -> pd.DataFrame:
+        """Parse the docx table and add the information to the dataframe."""
         for row in docx_table.rows:
             sanitized_cells = list(AhbSubTable.iter_visible_cells(row=row))
 
@@ -58,16 +63,45 @@ def _parse_docx_table(
 
                 if ahb_table_row_dataframe is not None:
                     ahb_table_dataframe = pd.concat([ahb_table_dataframe, ahb_table_row_dataframe], ignore_index=True)
-            # this case covers the page break situation
             else:
+                # this case covers the page break situation
+
+                # check for conditions_text
+                contains_condition_texts = any(paragraph.text != "" for paragraph in bedingung_cell.paragraphs)
+                # conditions are always at the top of a dataelement
+                # add condition texts
+                if contains_condition_texts:
+                    AhbSubTable.combine_condition_text(ahb_table_dataframe, bedingung_cell)
+
+                # add new row regularly
                 ahb_table_row = AhbTableRow(
                     seed=table_meta_data,
                     edifact_struktur_cell=edifact_struktur_cell,
                     middle_cell=middle_cell,
                     bedingung_cell=bedingung_cell,
                 )
+                ahb_table_row_dataframe = ahb_table_row.parse(row_type=current_row_type)
 
-                ahb_table_row.parse(row_type=table_meta_data.last_two_row_types[1])
+                # look at first line to determine if it is broken
+                first_paragraph = middle_cell.paragraphs[0]
+
+                if ahb_table_row_dataframe is not None:
+                    if AhbSubTable.is_broken_line(
+                        table=ahb_table_dataframe,
+                        table_meta_data=table_meta_data,
+                        paragraph=first_paragraph,
+                    ):
+                        AhbSubTable.add_broken_line(ahb_table_dataframe, ahb_table_row_dataframe)
+                        # we have a broken line
+                        ahb_table_dataframe = pd.concat(
+                            [ahb_table_dataframe, ahb_table_row_dataframe.iloc[1:]],
+                            ignore_index=True,
+                        )
+                    else:
+                        ahb_table_dataframe = pd.concat(
+                            [ahb_table_dataframe, ahb_table_row_dataframe],
+                            ignore_index=True,
+                        )
 
             # An AhbSubTable can span over two pages.
             # But after every page break, even if we're still in the same subtable,
@@ -131,3 +165,73 @@ def iter_visible_cells(row: _Row) -> Generator[_Cell, None, None]:
         table_row = row._tr  # pylint:disable=protected-access
         for table_column in table_row.tc_lst:
             yield _Cell(table_column, row.table)
+
+    @staticmethod
+    def add_text_to_last_row(ahb_table_dataframe: pd.DataFrame, row_index: int, column_index: int, text: str) -> None:
+        """Add a text to the last row of the dataframe."""
+        starts_with_known_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
+        if len(text) > 0:
+            if len(ahb_table_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
+                text = " " + text
+            ahb_table_dataframe.iat[row_index, column_index] += text
+
+    @staticmethod
+    def add_broken_line(ahb_table_dataframe: pd.DataFrame, broken_line: pd.DataFrame) -> None:
+        """Add a broken line to the dataframe."""
+        for col_index in range(INDEX_OF_CODES_AND_QUALIFIER_COLUMN, len(ahb_table_dataframe.columns)):
+            AhbSubTable.add_text_to_last_row(
+                ahb_table_dataframe, ahb_table_dataframe.index.max(), col_index, str(broken_line.iat[0, col_index])
+            )
+
+    @staticmethod
+    def combine_condition_text(ahb_table_dataframe: pd.DataFrame, bedingung_cell: _Cell) -> None:
+        """Add the condition text to the dataframe."""
+        conditions_text = " " + " ".join(
+            paragraph.text for paragraph in bedingung_cell.paragraphs if paragraph.text != ""
+        )
+        last_valid_row = ahb_table_dataframe["Bedingung"].last_valid_index()
+        conditions_text = ahb_table_dataframe.at[last_valid_row, "Bedingung"] + conditions_text
+        # remove existing text
+        ahb_table_dataframe.at[last_valid_row, "Bedingung"] = ""
+        # remove remaining text to avoid misplacements
+        for paragraph in bedingung_cell.paragraphs:
+            paragraph.text = ""
+        bedingung_cell.paragraphs[-1].text = conditions_text
+
+    @staticmethod
+    def is_broken_line(
+        table: pd.DataFrame,
+        table_meta_data: Seed,
+        paragraph: Paragraph,
+    ) -> bool:
+        """
+        Check for broken lines in the middle cell.
+        """
+        tabsplit_text = paragraph.text.split("\t")
+
+        loc: Union[int, slice, NDArray[np.bool_]] = table.columns.get_loc("Beschreibung")
+
+        # Ensure loc is an int
+        if isinstance(loc, int):
+            beschreibung_index: int = loc
+        else:
+            raise ValueError("The location of the column 'Beschreibung' is not an integer.")
+
+        is_empty_middle_line = all(text == "" for text in tabsplit_text)
+        is_broken_code_qualifier = (
+            paragraph.paragraph_format.left_indent is not None
+            and paragraph.paragraph_format.left_indent != table_meta_data.middle_cell_left_indent_position
+            and table.iat[-1, beschreibung_index] != ""
+            and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
+        )
+        if is_broken_code_qualifier and len(tabsplit_text) == 1:
+            # only broken code / qualifier
+            assert (
+                table.iat[-1, beschreibung_index] != "" and table.iloc[-1, beschreibung_index + 1 :].ne("").any()
+            ), "no condition expected in broken line"
+        there_are_conditions = (
+            len(tabsplit_text) > 1
+            and paragraph.paragraph_format.left_indent != table_meta_data.middle_cell_left_indent_position
+        )
+
+        return is_empty_middle_line or there_are_conditions or is_broken_code_qualifier
diff --git a/src/kohlrahbi/ahbtable/ahbtablerow.py b/src/kohlrahbi/ahbtable/ahbtablerow.py
@@ -27,10 +27,7 @@ class AhbTableRow(BaseModel):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def parse(
-        self,
-        row_type: RowType,
-    ) -> Optional[pd.DataFrame]:
+    def parse(self, row_type: RowType) -> Optional[pd.DataFrame]:
         """
         Writes the current row of the current table into the DataFrame depending on the type of the row.
         If the row is a header row, it will be skipped and None will be returned.

diff --git a/src/kohlrahbi/docxtablecells/bedinungscell.py b/src/kohlrahbi/docxtablecells/bedinungscell.py
@@ -23,15 +23,16 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
         """
         Parses a cell in the Bedingung column and puts the information into the appropriate column of the dataframe.
         """
-
-        bedingung = self.beautify_bedingungen()
+        bedingung = self.table_cell.text
+        bedingung = self.beautify_bedingungen(bedingung)
 
         row_index = ahb_row_dataframe.index.max()
         ahb_row_dataframe.at[row_index, "Bedingung"] += bedingung
         return ahb_row_dataframe
 
     # pylint: disable=line-too-long
-    def beautify_bedingungen(self) -> str:
+    @staticmethod
+    def beautify_bedingungen(bedingung: str) -> str:
         """
         Beautifies the Bedingungen by removing the given line breaks and insert the line breaks at the correct places.
 
@@ -41,11 +42,11 @@ def beautify_bedingungen(self) -> str:
         [494] Das hier genannte Datum muss der Zeitpunkt sein, zu dem das Dokument erstellt wurde, oder ein Zeitpunkt, der davor liegt
         [931] Format: ZZZ = +00
         """
-        beautified_bedingung = self.table_cell.text.replace("\n", " ")
+        beautified_bedingung = bedingung.replace("\n", " ")
 
         matches = re.findall(r"\[\d+\]", beautified_bedingung)
         for match in matches[1:]:
             index = beautified_bedingung.find(match)
-            beautified_bedingung = beautified_bedingung[:index] + "\n" + beautified_bedingung[index:]
+            beautified_bedingung = beautified_bedingung[:index].rstrip() + "\n" + beautified_bedingung[index:]
 
-        return beautified_bedingung
+        return beautified_bedingung.lstrip()
diff --git a/src/kohlrahbi/docxtablecells/bodycell.py b/src/kohlrahbi/docxtablecells/bodycell.py
@@ -52,7 +52,11 @@ def parse(self, ahb_row_dataframe: pd.DataFrame) -> pd.DataFrame:
         def add_text_to_column(row_index: int, column_index: int, text: str) -> None:
             starts_with_known_suffix = any(text.startswith(suffix + " ") for suffix in KNOW_SUFFIXES)
             if len(text) > 0:
-                if len(ahb_row_dataframe.iat[row_index, column_index]) > 0 and not starts_with_known_suffix:
+                if (
+                    len(ahb_row_dataframe.iat[row_index, column_index]) > 0
+                    and not starts_with_known_suffix
+                    and len(text) > 1
+                ):
                     text = " " + text
                 ahb_row_dataframe.iat[row_index, column_index] += text