From 610b047d10afc5a95a6159bbb93d02734c3395ff Mon Sep 17 00:00:00 2001
From: Arthur Taylor
Date: Wed, 24 Jan 2024 16:56:22 +0100
Subject: [PATCH] Revert "Revert "Fix for wggesucht crawler to only consider the desired listings""

---
 flathunter/crawler/wggesucht.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/flathunter/crawler/wggesucht.py b/flathunter/crawler/wggesucht.py
index f20a2a00..234297ed 100644
--- a/flathunter/crawler/wggesucht.py
+++ b/flathunter/crawler/wggesucht.py
@@ -148,12 +148,16 @@ def parse_expose_element_to_details(row: Tag, crawler: str) -> Optional[Dict]:
 
 
 def liste_attribute_filter(element: Union[Tag, str]) -> bool:
-    """Return true for elements whose 'id' attribute starts with 'liste-'"""
+    """Return true for elements whose 'id' attribute starts with 'liste-'
+    and are not contained in the 'premium_user_extra_list' container"""
     if not isinstance(element, Tag):
         return False
-    if "id" not in element.attrs:
+    if not element.attrs or "id" not in element.attrs:
         return False
-    return element.attrs["id"].startswith('liste-')
+    if not element.parent or not element.parent.attrs or "class" not in element.parent.attrs:
+        return False
+    return element.attrs["id"].startswith('liste-') and \
+        'premium_user_extra_list' not in element.parent.attrs["class"]
 
 
 class WgGesucht(Crawler):
@@ -175,7 +179,6 @@ def extract_data(self, soup: BeautifulSoup):
             e for e in findings
             if isinstance(e, Tag) and e.has_attr('class') and not 'display-none' in e['class']
         ]
-
         for row in existing_findings:
             details = parse_expose_element_to_details(row, self.get_name())
             if details is None: