import_knowledge.py
from lib import EmbedApp
from func_timeout import func_timeout, FunctionTimedOut
import traceback
import subprocess
from bs4 import BeautifulSoup
# import re


def cleanup_text(text: str):
    ret = ""
    for line in text.splitlines():
        line = line.strip()
        if line:
            ret += line + "\n"
    return ret


# Embedchain alone cannot retrieve all of a repo's content from the GitHub repo homepage.
# TODO: use the GitHub API to inspect the repo and get the default branch name and the
# README file path (a hedged sketch of that approach follows the mirror constants below).
WEBPAGE_TIMEOUT = 15
ENCODING = "utf-8"

REPLACE_MIRROR_PAIRS = [
    ("huggingface.co", "hf-mirror.com"),
]
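

# Hedged sketch for the TODO above, not yet wired into the pipeline: the public
# GitHub REST API exposes the default branch name and the README path directly,
# which would avoid scraping the repo homepage. Assumes an unauthenticated
# (rate-limited) call to api.github.com and a URL of the form
# https://github.com/<owner>/<repo>; the function name and shape are illustrative.
def get_github_repo_info_sketch(url: str, timeout=WEBPAGE_TIMEOUT):
    import json
    import urllib.request

    owner, repo = url.rstrip("/").split("/")[-2:]
    api_base = f"https://api.github.com/repos/{owner}/{repo}"
    # GET /repos/{owner}/{repo} returns repo metadata, including "default_branch"
    with urllib.request.urlopen(api_base, timeout=timeout) as resp:
        default_branch = json.load(resp)["default_branch"]
    # GET /repos/{owner}/{repo}/readme returns README metadata, including its "path"
    with urllib.request.urlopen(f"{api_base}/readme", timeout=timeout) as resp:
        readme_path = json.load(resp)["path"]
    return default_branch, readme_path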


def replace_with_mirror_site(url: str):
    for source, target in REPLACE_MIRROR_PAIRS:
        url = url.replace(source, target)
    return url


def add_webpage_url(app: EmbedApp, url: str, timeout=WEBPAGE_TIMEOUT):
    try:  # need a timeout: app.add may block on slow pages
        args = (url,)
        kwargs = dict(data_type="web_page")
        func_timeout(timeout, app.add, args=args, kwargs=kwargs)
        print(f"[+] Added webpage '{url}' successfully")
    except FunctionTimedOut:
        print(f"[-] Adding webpage '{url}' timed out after {timeout} seconds")
    except Exception:
        traceback.print_exc()
        print(f"[-] Failed to add webpage '{url}'")


def retrieve_html_from_url(url: str, timeout=WEBPAGE_TIMEOUT):
    # scrape the page with the "spider" CLI and return the raw HTML
    cmdlist = [
        "spider",
        "--url",
        url,
        "scrape",
        "--depth",
        "1",
        "--output-html",
        "--block-images",
    ]
    ret = subprocess.check_output(cmdlist, timeout=timeout, encoding=ENCODING)
    return ret


def extract_github_readme_from_html(html: str):
    # on a GitHub repo homepage, the rendered README lives inside the <article> element
    soup = BeautifulSoup(html, features="lxml")
    tag = "article"
    target_elem = soup.find(tag)
    readme_html = str(target_elem)
    readme_text = target_elem.text
    readme_text = cleanup_text(readme_text)
    print("[*] HTML:")
    print(readme_html)
    print("[*] Text:")
    print(readme_text)
    return readme_text


def get_github_readme_from_url(url: str):
    html = retrieve_html_from_url(url)
    ret = extract_github_readme_from_html(html)
    return ret


def add_readme_from_github_url(app: EmbedApp, url: str):
    print("[*] Getting README from GitHub URL:", url)
    try:
        readme_text = get_github_readme_from_url(url)
        app.add(readme_text, data_type="text", metadata=dict(url=url))
        print("[+] README added")
    except subprocess.TimeoutExpired:
        print("[-] Failed to get HTML within timeout")
    except Exception:
        traceback.print_exc()
        print("[-] Failed to add README")


def import_all_urls(urls: list[str]):
    app = EmbedApp()
    for index, it in enumerate(urls):
        print("[*] Progress:", f"{index+1}/{len(urls)}")
        it = it.strip()
        if it:
            print("[*] Processing URL:", it)
            found_in_index = app.check_url_is_added(it)
            if found_in_index:
                print("[*] Skipping already added URL")
                continue
            add_webpage_url(app, it)
            # crawl the raw page, then import it as text
            # TODO: add url metadata to the imported text content, for duplicate removal
            # app.add(page_content, data_type="text")
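            # Hedged sketch for the TODO above (page_content is hypothetical and not
            # produced anywhere yet): tagging the text with its source URL, as
            # add_readme_from_github_url() already does, would let check_url_is_added()
            # skip it on later runs:
            #   app.add(page_content, data_type="text", metadata=dict(url=it))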
        else:
            print("[*] Skipping empty URL")
            continue


def filter_urls(urls: list[str]):
    ret = []
    added_urls_lowercase_set = set()
    for it in urls:
        it = it.strip(".")  # drop stray leading/trailing dots
        it = replace_with_mirror_site(it)
        # deduplicate case-insensitively, keeping the first occurrence
        url_lower = it.lower()
        if url_lower not in added_urls_lowercase_set:
            added_urls_lowercase_set.add(url_lower)
            ret.append(it)
    return ret


def import_urls_from_file(filepath: str):
    print("[*] Loading URLs from file:", filepath)
    with open(filepath, "r") as f:
        urls = f.read().strip().splitlines()
    urls = filter_urls(urls)
    import_all_urls(urls)


def main():
    # input_filepath = "devon_agents_urls.txt"
    input_filepath = "test_run_all.txt"
    # input_filepath = "test_grouped_urls_reprocessed.txt"
    # input_filepath = "test_urls.txt"
    import_urls_from_file(input_filepath)


if __name__ == "__main__":
    main()