Skip to content

Commit

Permalink
Merge pull request #44 from bukosabino/develop-update-config
Browse files Browse the repository at this point in the history
Modify config to save some money
  • Loading branch information
bukosabino authored Dec 21, 2023
2 parents fa7b53f + 6b137d5 commit e95b6c4
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 9 deletions.
4 changes: 2 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ prompt_system_context: |
collection_name: justicio

# Openai
llm_model_name: 'gpt-4-1106-preview' # 'gpt-3.5-turbo'
llm_model_name: 'gpt-3.5-turbo-1106' # 'gpt-4-1106-preview'
temperature: 0
seed: 42
max_tokens: 512
max_tokens: 1024

# Deprecated

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ supabase==1.0.2
pinecone-client==2.2.2
sentence_transformers==2.2.2
openai==1.2.3
tavily-python==0.2.6
openai==1.3.8
tavily-python==0.2.9

sendgrid==6.10.0

Expand Down
93 changes: 88 additions & 5 deletions src/etls/boe/load/documents.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,109 @@
import os
import typing as tp
import logging as lg

from requests.exceptions import HTTPError
from qdrant_client.http.models import Filter, FieldCondition, MatchValue
from qdrant_client import QdrantClient
import numpy as np

from src.email.send_email import send_email
from src.etls.boe.scrapper import BOEScrapper
from src.etls.boe.scripts.defs import boe_ids
from src.etls.boe.load.defs_id_largos import BOE_IDS
from src.etls.common.etl import ETL
from src.initialize import initialize_app
from src.initialize import initialize_app, initialize_logging

initialize_logging()

QDRANT_CLIENT = QdrantClient(
url=os.environ['QDRANT_API_URL'],
api_key=os.environ['QDRANT_API_KEY'],
timeout=1000
)


def load_important_ids(filename):
    """Read *filename* and return its lines as a list of strings.

    Line terminators are stripped (``str.splitlines``); blank lines are
    preserved as empty strings.

    :param filename: path to a plain-text file, one document ID per line.
    :return: list of the file's lines without newline characters.
    """
    with open(filename) as handle:
        return handle.read().splitlines()


def filter_documents_by_year(documents: tp.List[str]) -> tp.List[str]:
    """Drop modern BOE identifiers, keeping everything else.

    A document ID is removed when its first dash-separated field is
    ``'BOE'`` AND its third field (the year in IDs shaped like
    ``'BOE-A-2023-12345'``) is 2000 or later.  Any other ID — a non-BOE
    prefix, or a BOE entry dated before 2000 — is kept.  Short-circuit
    evaluation guarantees the year field is only parsed for IDs that do
    start with ``'BOE'``, mirroring the original loop's behavior.

    NOTE(review): the filter keeps *old* BOE docs and discards recent
    ones, which looks inverted for a loader — confirm intent with caller.

    :param documents: document identifiers, e.g. ``'BOE-A-1998-1234'``.
    :return: the kept identifiers, original order preserved.
    """
    def _keep(document_id: str) -> bool:
        # Same predicate as the original `if`; raises ValueError/IndexError
        # for malformed BOE-prefixed IDs, exactly like the original.
        parts = document_id.split('-')
        return parts[0] != 'BOE' or int(parts[2]) < 2000

    return [document_id for document_id in documents if _keep(document_id)]


def filter_documents_loaded(documents: tp.List[str]) -> tp.List[str]:
    """Filters a list of document IDs that are not loaded on Embedding database.

    Each ID is probed with a filtered Qdrant search (random query vector,
    ``limit=1``) against the ``justicio`` collection; an empty result means
    the document is absent, so the ID is kept.

    :param documents: candidate document identifiers.
    :return: the identifiers not yet present in the vector store.
    """
    logger = lg.getLogger(filter_documents_loaded.__name__)
    # Any vector works here: the match filter does the real selection.
    probe_vector = np.random.rand(768)
    missing = []
    for document_id in documents:
        logger.info('Checking if document id is already loaded: %s', document_id)
        identifier_filter = Filter(
            must=[
                FieldCondition(
                    key="metadata.identificador",
                    match=MatchValue(value=document_id),
                )
            ]
        )
        hits = QDRANT_CLIENT.search(
            collection_name="justicio",
            query_vector=probe_vector,
            query_filter=identifier_filter,
            limit=1,
        )
        if not hits:
            missing.append(document_id)
            logger.info('Document id: %s is added', document_id)

    return missing


if __name__ == "__main__":
logger = lg.getLogger("__main__")
INIT_OBJECTS = initialize_app()
etl_job = ETL(
config_loader=INIT_OBJECTS.config_loader, vector_store=INIT_OBJECTS.vector_store
)
boe_scrapper = BOEScrapper()

documents = load_important_ids('src/etls/boe/load/defs_ids_importantes.txt')
documents += BOE_IDS
logger.info('Documents size: %s', len(documents))
documents_filtered = list(set(documents))
logger.info('Documents filtered size: %s', len(documents_filtered))
documents_filtered = filter_documents_by_year(documents_filtered)
logger.info('Documents filtered size: %s', len(documents_filtered))
logger.info(documents_filtered)
# documents_filtered = filter_documents_loaded(documents_filtered)
# logger.info('Documents filtered size: %s', len(documents_filtered))

docs = []
for boe_id in boe_ids:
for boe_id in documents_filtered:
logger.info('Loading BOE Id: %s', boe_id)
url = f"https://www.boe.es/diario_boe/xml.php?id={boe_id}"
docs.append(boe_scrapper.download_document(url))
try:
meta_document = boe_scrapper.download_document(url)
docs.append(meta_document)
except HTTPError:
logger.error("Not scrapped document %s", url)
except AttributeError:
logger.error("Not scrapped document %s", url)
if docs:
etl_job.run(docs)

subject = "[BOE] Documents ETL executed"
content = f"""
Documents ETL executed
- Documents loaded (BOE ids): {len(boe_ids)}
- Documents loaded (BOE ids): {len(documents_filtered)}
- Documents loaded: {len(docs)}
- Database used: {INIT_OBJECTS.config_loader['vector_store']}
"""
Expand Down
51 changes: 50 additions & 1 deletion src/service/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,56 @@ async def qa_35(input_query: str = DEFAULT_INPUT_QUERY):
]
# logger.info(messages)
response = await INIT_OBJECTS.openai_client.chat.completions.create(
model='gpt-3.5-turbo',
model='gpt-3.5-turbo-1106',
messages=messages,
temperature=INIT_OBJECTS.config_loader["temperature"],
seed=INIT_OBJECTS.config_loader["seed"],
max_tokens=INIT_OBJECTS.config_loader["max_tokens"],
)
answer = response.choices[0].message.content
logger.info(answer)
logger.info(response.usage)

response_payload = dict(
scoring_id=str(uuid.uuid4()),
context=docs,
answer=answer,
)
return response_payload


@APP.get("/qa_4")
@timeit
async def qa_4(input_query: str = DEFAULT_INPUT_QUERY):
logger = lg.getLogger(qa_35.__name__)
logger.info(input_query)

# Getting context from embedding database (Qdrant)
docs = await INIT_OBJECTS.vector_store.asimilarity_search_with_score(
query=input_query, k=INIT_OBJECTS.config_loader["top_k_results"]
)

# Generate response using a LLM (OpenAI)
context_preprocessed = [
{"context": doc[0].page_content, "score": doc[1]} for doc in docs
]
messages = [
{"role": "system", "content": INIT_OBJECTS.config_loader["prompt_system"]},
{
"role": "system",
"content": INIT_OBJECTS.config_loader["prompt_system_context"],
},
{"role": "system", "content": "A continuación se proporciona el contexto:"},
{"role": "system", "content": str(context_preprocessed)},
{
"role": "system",
"content": "A continuación se proporciona la pregunta del usuario:",
},
{"role": "user", "content": input_query},
]
# logger.info(messages)
response = await INIT_OBJECTS.openai_client.chat.completions.create(
model='gpt-4-1106-preview',
messages=messages,
temperature=INIT_OBJECTS.config_loader["temperature"],
seed=INIT_OBJECTS.config_loader["seed"],
Expand Down

0 comments on commit e95b6c4

Please sign in to comment.