-
Notifications
You must be signed in to change notification settings - Fork 0
/
lib.py
67 lines (57 loc) · 2.23 KB
/
lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from secret import OLLAMA_EXTERNAL_BASEURL
from typing import DefaultDict
import uuid
# The environment variables below are set before `import embedchain`;
# presumably embedchain (or a backend it loads) reads them during
# import/initialisation -- TODO confirm before reordering these lines.
os.environ["OLLAMA_HOST"] = OLLAMA_EXTERNAL_BASEURL
# NOTE(review): "dummykey" looks like a placeholder, presumably only there to
# satisfy a key-presence check -- confirm.
os.environ["OPENAI_API_KEY"] = "dummykey"
import embedchain
# docs: https://docs.embedchain.ai/
def print_calling_params(args, kwargs):
    """Print the positional and keyword arguments of a call to stdout.

    Args:
        args: The positional arguments (printed as-is after ``[*] Args:``).
        kwargs: The keyword arguments (printed as-is after ``[*] Kwargs:``).
    """
    for label, value in (("[*] Args:", args), ("[*] Kwargs:", kwargs)):
        print(label, value)
# Load the LLM configuration from a YAML config file (the default path is set in EmbedApp.__init__).
class EmbedApp:
    """Thin wrapper around an ``embedchain.App`` built from a config file.

    ``add`` and ``query`` print their arguments and forward them to the
    wrapped app (``self.app``). The remaining helpers read everything back
    from the app's vector store (``self.app.db``) and organise the stored
    document chunks by the ``url`` recorded in each chunk's metadata.
    """

    def __init__(self, config_path="secret.embedchain_config.yaml"):
        """Create the wrapped app via ``embedchain.App.from_config``.

        Args:
            config_path: Path of the config file handed to embedchain.
        """
        self.app = embedchain.App.from_config(config_path=config_path)

    def add(self, *args, **kwargs):
        """Print the call arguments, then forward them to ``self.app.add``.

        Returns:
            Whatever ``self.app.add`` returns.
        """
        print("[*] Adding to index:")
        print_calling_params(args, kwargs)
        return self.app.add(*args, **kwargs)

    def query(self, *args, **kwargs) -> str:
        """Print the call arguments, then forward them to ``self.app.query``.

        Returns:
            Whatever ``self.app.query`` returns.
        """
        print("[*] Performing query:")
        print_calling_params(args, kwargs)
        return self.app.query(*args, **kwargs)

    def get_all_data_from_chromadb(self) -> dict:
        """Return the result of an unfiltered ``self.app.db.get()`` call.

        The other helpers of this class read the ``"metadatas"`` and
        ``"documents"`` entries of the returned mapping.
        """
        return self.app.db.get()

    def check_url_is_added(self, url: str) -> bool:
        """Return True if any stored metadata dict has a ``url`` equal to *url*.

        Entries that are not dicts are skipped; a store that reports no
        metadata at all (``None``) is treated as empty.
        """
        metadatas = self.get_all_data_from_chromadb()["metadatas"] or []
        return any(
            isinstance(metadata, dict) and url == metadata.get("url")
            for metadata in metadatas
        )

    @staticmethod
    def get_url_from_metadata(metadata):
        """Return the ``url`` stored in *metadata*, or a random fallback key.

        Args:
            metadata: A metadata entry; anything that is not a dict is
                treated as having no url.

        Returns:
            The ``url`` value when it is a string; otherwise a freshly
            generated UUID4 string, so entries without a usable url never
            share a key with each other.
        """
        url = metadata.get("url") if isinstance(metadata, dict) else None
        return url if isinstance(url, str) else str(uuid.uuid4())

    def get_all_document_chunks(self) -> list[dict[str, str]]:
        """Return every stored chunk as ``{"url": ..., "document": ...}``.

        Metadata and documents are paired positionally; the url comes from
        ``get_url_from_metadata``. Missing (``None``) metadata or document
        lists are treated as empty.
        """
        embed_data = self.get_all_data_from_chromadb()
        metadatas = embed_data["metadatas"] or []
        documents = embed_data["documents"] or []
        return [
            {"url": self.get_url_from_metadata(metadata), "document": document}
            for metadata, document in zip(metadatas, documents)
        ]

    def get_all_documents_by_url(self) -> dict[str, list[str]]:
        """Group all stored document chunks by url.

        Returns:
            A plain dict mapping each url to the list of its document
            chunks, in the order the chunks were returned by the store.
        """
        grouped: dict[str, list[str]] = {}
        for chunk in self.get_all_document_chunks():
            grouped.setdefault(chunk["url"], []).append(chunk["document"])
        return grouped