Skip to content

Commit

Permalink
Merge pull request #62 from ntkog/fix_bocm_meta_published_time
Browse files Browse the repository at this point in the history
Fix scraper: problems with some fields.
  • Loading branch information
ntkog authored Feb 6, 2024
2 parents e45a181 + e86a3de commit 6bbe0a4
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions src/etls/bocm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ def _get_url_from_cve(cve: str) -> str:

# Metadata from head tags
def metadata_from_head_tags(soup) -> tp.List[str]:
    """Extract the publication date, CVE and HTML link from the page's meta tags.

    Args:
        soup: Parsed document exposing ``select_one(css_selector)``; each
            matched tag must support ``tag["content"]`` (presumably a
            BeautifulSoup object -- confirm against the caller).

    Returns:
        A three-item list ``[fecha_publicacion, cve, html_link]`` where
        ``fecha_publicacion`` is formatted as ``YYYY-MM-DD``.

    Raises:
        TypeError: If the ``TituloGSA`` or ``og:url`` meta tag is missing
            (``select_one`` returned ``None``).
        IndexError: If the CVE contains no ``-`` separator.
    """
    # The CVE is taken from meta[name="TituloGSA"].
    cve = soup.select_one('meta[name="TituloGSA"]')["content"]

    # The publication date is derived from the CVE itself instead of a
    # separate meta tag: the second dash-separated field is sliced as
    # YYYYMMDD (assumes a CVE shaped like "BOCM-20240206-1").
    fecha = cve.split("-")[1]
    fecha_publicacion = f'{fecha[:4]}-{fecha[4:6]}-{fecha[6:8]}'

    html_link = soup.select_one('meta[property="og:url"]')["content"]

    return [fecha_publicacion, cve, html_link]
Expand All @@ -30,16 +33,21 @@ def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]:
apartado, tipo, anunciante, organo, rango = ["", "", "", "", ""]

# get headers
seccion_name, *paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras #cabeceras p")]
paras = [f.get_text().strip().upper() for f in soup.select("#cabeceras p")][:3]

# Metadata from article description
desc = soup.select_one('meta[name="description"]')["content"]
desc_attempt = soup.select_one('meta[name="description"]')
if (desc_attempt is not None):
desc = desc_attempt["content"]
else:
desc = ''
num_art = re.sub(r"BOCM-\d{8}-(\d{1,3})", r"\1", cve)

try:
if seccion == "1":
subseccion_letter = ["A", "B", "C", "D"][int(seccion) - 1]
subseccion_name, organo = paras
subseccion_name = paras[0]
organo = paras[2]
# Some articles don't have filled description needed for rango field extraction
if len(desc) > 10:
rango = re.sub(r"^(\b[^\s]+\b)(.*)", r"\1", desc.split(num_art)[1], flags=re.ASCII).upper()
Expand All @@ -58,7 +66,7 @@ def metadata_from_doc(soup, seccion: str, cve: str) -> tp.List[str]:
organo = paras[0]

if seccion == "4":
subseccion_name = paras[0]
subseccion_name = "ADMINISTRACIÓN DE JUSTICIA"
if seccion == "5":
subseccion_name = "OTROS ANUNCIOS"
anunciante = paras[0]
Expand Down

0 comments on commit 6bbe0a4

Please sign in to comment.