From d3eae42e09404f23b323e1ffb2cda19f8e239cb8 Mon Sep 17 00:00:00 2001 From: ntkog <1333901+ntkog@users.noreply.github.com> Date: Mon, 5 Feb 2024 23:35:08 +0100 Subject: [PATCH 1/2] Fix scraper: problems with some fields. --- src/etls/bocm/utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/etls/bocm/utils.py b/src/etls/bocm/utils.py index dbaaf54..a36861f 100644 --- a/src/etls/bocm/utils.py +++ b/src/etls/bocm/utils.py @@ -15,8 +15,11 @@ def _get_url_from_cve(cve: str) -> str: # Metadata from head tags def metadata_from_head_tags(soup) -> tp.List[str]: - fecha_publicacion = soup.select_one('meta[property="article:published_time"]')["content"].split("T")[0] - cve = soup.select_one('meta[property="og:title"]')["content"] + # extract cve from meta[name="TituloGSA"] + cve = soup.select_one('meta[name="TituloGSA"]')["content"] + fecha = cve.split("-")[1:2][0] + fecha_publicacion = f'{fecha[:4]}-{fecha[4:6]}-{fecha[6:8]}' + html_link = soup.select_one('meta[property="og:url"]')["content"] return [fecha_publicacion, cve, html_link] @@ -30,16 +33,21 @@ def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]: apartado, tipo, anunciante, organo, rango = ["", "", "", "", ""] # get headers - seccion_name, *paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras #cabeceras p")] + paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras p")][:3] # Metadata from article description - desc = soup.select_one('meta[name="description"]')["content"] + desc_attempt = soup.select_one('meta[name="description"]') + if (desc_attempt is not None): + desc = desc_attempt["content"] + else: + desc = '' num_art = re.sub(r"BOCM-\d{8}-(\d{1,3})", r"\1", cve) try: if seccion == "1": subseccion_letter = ["A", "B", "C", "D"][int(seccion) - 1] - subseccion_name, organo = paras + subseccion_name = paras[0] + organo = paras[2] # Some articles don't have filled description needed for rango field extraction if len(desc) > 10: rango = re.sub(r"^(\b[^\s]+\b)(.*)", r"\1", desc.split(num_art)[1], flags=re.ASCII).upper() From e86a3de49a0bc58a764d5c45de358ab010dcf762 Mon Sep 17 00:00:00 2001 From: ntkog <1333901+ntkog@users.noreply.github.com> Date: Tue, 6 Feb 2024 00:10:28 +0100 Subject: [PATCH 2/2] =?UTF-8?q?Fix=20valor=20subsecci=C3=B3n=20para=20la?= =?UTF-8?q?=20secci=C3=B3n=204.=20Siempre=20es=20el=20mismo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/etls/bocm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/etls/bocm/utils.py b/src/etls/bocm/utils.py index a36861f..296e5da 100644 --- a/src/etls/bocm/utils.py +++ b/src/etls/bocm/utils.py @@ -66,7 +66,7 @@ def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]: organo = paras[0] if seccion == "4": - subseccion_name = paras[0] + subseccion_name = "ADMINISTRACIÓN DE JUSTICIA" if seccion == "5": subseccion_name = "OTROS ANUNCIOS" anunciante = paras[0]