linter
bukosabino committed Jan 30, 2024
1 parent c861680 commit 0b8aee4
Showing 8 changed files with 45 additions and 92 deletions.
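The changes below are consistent with an automated formatter pass: multi-line calls collapsed into single lines of roughly 120 characters, single quotes normalized to double quotes, and trailing commas added. The commit message does not name the tool or its settings; a minimal sketch of how such a pass might be run, assuming black with an extended line length:

    # Hypothetical formatting pass; the tool and the line length are assumptions,
    # inferred only from the diff style (double quotes, ~120-character lines).
    import subprocess

    subprocess.run(["black", "--line-length", "120", "src/"], check=True)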
8 changes: 2 additions & 6 deletions src/etls/boe/load.py
@@ -14,9 +14,7 @@

@app.command()
def today(collection_name: str):
-etl_job = ETL(
-config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name]
-)
+etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
boe_scrapper = BOEScrapper()
day = date.today()
docs = boe_scrapper.download_day(day)
@@ -35,9 +33,7 @@ def today(collection_name: str):

@app.command()
def dates(collection_name: str, date_start: str, date_end: str):
-etl_job = ETL(
-config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name]
-)
+etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
boe_scrapper = BOEScrapper()
docs = boe_scrapper.download_days(
date_start=datetime.strptime(date_start, "%Y/%m/%d").date(),
2 changes: 1 addition & 1 deletion src/etls/boe/loading/defs_id_largos.py
@@ -98,7 +98,7 @@
"BOE-A-1999-637",
"BOE-A-1999-6568",
"BOE-A-1999-8910",
-"BOE-A-1999-8994"
+"BOE-A-1999-8994",
]


42 changes: 14 additions & 28 deletions src/etls/boe/loading/documents.py
@@ -15,11 +15,7 @@

initialize_logging()

-QDRANT_CLIENT = QdrantClient(
-url=os.environ['QDRANT_API_URL'],
-api_key=os.environ['QDRANT_API_KEY'],
-timeout=1000
-)
+QDRANT_CLIENT = QdrantClient(url=os.environ["QDRANT_API_URL"], api_key=os.environ["QDRANT_API_KEY"], timeout=1000)


def load_important_ids(filename):
@@ -29,66 +25,56 @@ def load_important_ids(filename):


def filter_documents_by_year(documents: tp.List[str]) -> tp.List[str]:
"""
"""
documents_filtered = []
for document_id in documents:
-id_split = document_id.split('-')
-if id_split[0] != 'BOE' or int(id_split[2]) < 2000:
+id_split = document_id.split("-")
+if id_split[0] != "BOE" or int(id_split[2]) < 2000:
documents_filtered.append(document_id)
return documents_filtered


def filter_documents_loaded(documents: tp.List[str]) -> tp.List[str]:
-"""Filters a list of document IDs that are not loaded on Embedding database.
-"""
+"""Filters a list of document IDs that are not loaded on Embedding database."""
logger = lg.getLogger(filter_documents_loaded.__name__)
query_vector = np.random.rand(768)
documents_filtered = []
for document_id in documents:
-logger.info('Checking if document id is already loaded: %s', document_id)
+logger.info("Checking if document id is already loaded: %s", document_id)
search_result = QDRANT_CLIENT.search(
collection_name="justicio",
query_vector=query_vector,
query_filter=Filter(
-must=[
-FieldCondition(
-key="metadata.identificador",
-match=MatchValue(value=document_id)
-)
-]
+must=[FieldCondition(key="metadata.identificador", match=MatchValue(value=document_id))]
),
-limit=1
+limit=1,
)
if not search_result:
documents_filtered.append(document_id)
-logger.info('Document id: %s is added', document_id)
+logger.info("Document id: %s is added", document_id)

return documents_filtered


if __name__ == "__main__":
logger = lg.getLogger("__main__")
INIT_OBJECTS = initialize_app()
-etl_job = ETL(
-config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store
-)
+etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store)
boe_scrapper = BOEScrapper()

-documents = load_important_ids('src/etls/boe/load/defs_ids_importantes.txt')
+documents = load_important_ids("src/etls/boe/load/defs_ids_importantes.txt")
documents += BOE_IDS
-logger.info('Documents size: %s', len(documents))
+logger.info("Documents size: %s", len(documents))
documents_filtered = list(set(documents))
-logger.info('Documents filtered size: %s', len(documents_filtered))
+logger.info("Documents filtered size: %s", len(documents_filtered))
documents_filtered = filter_documents_by_year(documents_filtered)
-logger.info('Documents filtered size: %s', len(documents_filtered))
+logger.info("Documents filtered size: %s", len(documents_filtered))
logger.info(documents_filtered)
# documents_filtered = filter_documents_loaded(documents_filtered)
# logger.info('Documents filtered size: %s', len(documents_filtered))

docs = []
for boe_id in documents_filtered:
-logger.info('Loading BOE Id: %s', boe_id)
+logger.info("Loading BOE Id: %s", boe_id)
url = f"https://www.boe.es/diario_boe/xml.php?id={boe_id}"
try:
meta_document = boe_scrapper.download_document(url)
40 changes: 10 additions & 30 deletions src/etls/boe/scrapper.py
@@ -48,17 +48,11 @@ def _extract_metadata(soup) -> tp.Dict:
if fecha_disposicion := soup.documento.metadatos.fecha_disposicion:
metadata_dict["fecha_disposicion"] = fecha_disposicion.get_text()

-metadata_dict["anio"] = datetime.strptime(
-fecha_publicacion.get_text(), "%Y%m%d"
-).strftime("%Y")
+metadata_dict["anio"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%Y")

-metadata_dict["mes"] = datetime.strptime(
-fecha_publicacion.get_text(), "%Y%m%d"
-).strftime("%m")
+metadata_dict["mes"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%m")

-metadata_dict["dia"] = datetime.strptime(
-fecha_publicacion.get_text(), "%Y%m%d"
-).strftime("%d")
+metadata_dict["dia"] = datetime.strptime(fecha_publicacion.get_text(), "%Y%m%d").strftime("%d")

# Analisis
if observaciones := soup.documento.analisis.observaciones:
@@ -74,35 +68,25 @@ def _extract_metadata(soup) -> tp.Dict:
metadata_dict["tipo"] = tipo.get_text()

metadata_dict["materias"] = [
-materia.get_text()
-for materia in soup.select("documento > analisis > materias > materia")
-]
-metadata_dict["alertas"] = [
-alerta.get_text()
-for alerta in soup.select("documento > analisis > alertas > alerta")
-]
-metadata_dict["notas"] = [
-nota.get_text() for nota in soup.select("documento > analisis > notas > nota")
+materia.get_text() for materia in soup.select("documento > analisis > materias > materia")
]
+metadata_dict["alertas"] = [alerta.get_text() for alerta in soup.select("documento > analisis > alertas > alerta")]
+metadata_dict["notas"] = [nota.get_text() for nota in soup.select("documento > analisis > notas > nota")]
metadata_dict["ref_posteriores"] = [
BOEMetadataReferencia(
id=ref["referencia"],
palabra=ref.palabra.get_text(),
texto=ref.texto.get_text(),
)
-for ref in soup.select(
-"documento > analisis > referencias > posteriores > posterior"
-)
+for ref in soup.select("documento > analisis > referencias > posteriores > posterior")
]
metadata_dict["ref_anteriores"] = [
BOEMetadataReferencia(
id=ref["referencia"],
palabra=ref.palabra.get_text(),
texto=ref.texto.get_text(),
)
-for ref in soup.select(
-"documento > analisis > referencias > anteriores > anterior"
-)
+for ref in soup.select("documento > analisis > referencias > anteriores > anterior")
]
return metadata_dict

@@ -147,13 +131,9 @@ def download_day(self, day: date) -> tp.List[BOEMetadataDocument]:
metadata_doc = self.download_document(url_document)
metadata_documents.append(metadata_doc)
except HTTPError:
-logger.error(
-"Not scrapped document %s on day %s", url_document, day_url
-)
+logger.error("Not scrapped document %s on day %s", url_document, day_url)
except AttributeError:
-logger.error(
-"Not scrapped document %s on day %s", url_document, day_url
-)
+logger.error("Not scrapped document %s on day %s", url_document, day_url)
except HTTPError:
logger.error("Not scrapped document on day %s", day_url)
logger.info("Downloaded BOE content for day %s", day)
4 changes: 1 addition & 3 deletions src/etls/common/scrapper.py
@@ -11,9 +11,7 @@


class BaseScrapper(ABC):
-def download_days(
-self, date_start: date, date_end: date
-) -> tp.List[MetadataDocument]:
+def download_days(self, date_start: date, date_end: date) -> tp.List[MetadataDocument]:
"""Download all the documents between two dates (from date_start to date_end)"""
logger = lg.getLogger(self.download_days.__name__)
logger.info("Downloading content from day %s to %s", date_start, date_end)
8 changes: 2 additions & 6 deletions src/etls/template/load.py
@@ -14,9 +14,7 @@

@app.command()
def today(collection_name: str):
-etl_job = ETL(
-config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name]
-)
+etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
boe_scrapper = TemplateScrapper()
day = date.today()
docs = boe_scrapper.download_day(day)
@@ -35,9 +33,7 @@ def today(collection_name: str):

@app.command()
def dates(collection_name: str, date_start: str, date_end: str):
-etl_job = ETL(
-config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name]
-)
+etl_job = ETL(config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store[collection_name])
scrapper = TemplateScrapper()
docs = scrapper.download_days(
date_start=datetime.strptime(date_start, "%Y/%m/%d").date(),
12 changes: 4 additions & 8 deletions src/initialize.py
@@ -28,9 +28,7 @@ def initialize_logging():
logger.info("Initializing logging")
logger.handlers = []
handler = lg.StreamHandler()
-formatter = lg.Formatter(
-"[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s"
-)
+formatter = lg.Formatter("[%(asctime)s] [%(process)d] [%(levelname)s] [%(name)s] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(lg.INFO)
@@ -45,7 +43,7 @@ def initialize_app():
config_loader = _init_config()
vector_store = _init_vector_store(config_loader)
openai_client = _init_openai_client()
-tavily_client = TavilyClient(api_key=os.environ['TAVILY_API_KEY'])
+tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
# retrieval_qa = _init_retrieval_qa_llm(vector_store, config_loader)
logger.info("Initialized application")
init_objects = collections.namedtuple(
@@ -138,7 +136,7 @@ def _init_vector_stores_qdrant(config_loader):
qdrant_client.recreate_collection(
collection_name=collection_name,
vectors_config=VectorParams(
-size=config_loader['embeddings_model_size'], distance=config_loader['distance_type']
+size=config_loader["embeddings_model_size"], distance=config_loader["distance_type"]
),
on_disk_payload=True,
)
@@ -172,9 +170,7 @@ def _init_retrieval_qa_llm(vector_store, config_loader):
# DEPRECATED
logger = lg.getLogger(_init_retrieval_qa_llm.__name__)
logger.info("Initializing RetrievalQA LLM")
-retriever = vector_store.as_retriever(
-search_type="similarity", search_kwargs={"k": config_loader["top_k_results"]}
-)
+retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": config_loader["top_k_results"]})
system_template = f"{config_loader['prompt_system']}----------------\n{{context}}"
messages = [
SystemMessagePromptTemplate.from_template(system_template),
21 changes: 11 additions & 10 deletions src/service/main.py
@@ -19,7 +19,8 @@
"¿Es de aplicación la ley de garantía integral de la libertad sexual a niños (varones) menores de edad "
"víctimas de violencias sexuales o solo a niñas y mujeres?"
)
-DEFAULT_COLLECTION_NAME = 'justicio'
+DEFAULT_COLLECTION_NAME = "justicio"
+

@APP.get("/healthcheck")
@timeit
@@ -53,7 +54,7 @@ async def semantic_search_tavily(input_query: str = DEFAULT_INPUT_QUERY):
max_results=10,
topic="general",
include_raw_content=False,
-include_answer=False
+include_answer=False,
)
logger.info(docs)
return docs
@@ -68,7 +69,11 @@ async def a_request_get(url):

@APP.get("/qa")
@timeit
-async def qa(input_query: str = DEFAULT_INPUT_QUERY, collection_name: str = DEFAULT_COLLECTION_NAME, model_name: str = INIT_OBJECTS.config_loader["llm_model_name"]):
+async def qa(
+input_query: str = DEFAULT_INPUT_QUERY,
+collection_name: str = DEFAULT_COLLECTION_NAME,
+model_name: str = INIT_OBJECTS.config_loader["llm_model_name"],
+):
logger = lg.getLogger(qa.__name__)
logger.info(input_query)

@@ -78,9 +83,7 @@ async def qa(input_query: str = DEFAULT_INPUT_QUERY, collection_name: str = DEFA
)

# Generate response using a LLM (OpenAI)
-context_preprocessed = [
-{"context": doc[0].page_content, "score": doc[1]} for doc in docs
-]
+context_preprocessed = [{"context": doc[0].page_content, "score": doc[1]} for doc in docs]
messages = [
{"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]},
{
@@ -129,13 +132,11 @@ async def qa_tavily(input_query: str = DEFAULT_INPUT_QUERY):
max_results=10,
topic="general",
include_raw_content=False,
-include_answer=False
+include_answer=False,
)

# Generate response using a LLM (OpenAI)
-context_preprocessed = [
-{"context": doc['content'], "score": doc['score']} for doc in docs['results']
-]
+context_preprocessed = [{"context": doc["content"], "score": doc["score"]} for doc in docs["results"]]

response = await INIT_OBJECTS.openai_client.chat.completions.create(
model=INIT_OBJECTS.config_loader["llm_model_name"],
