-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimages_layout.py
executable file
·174 lines (158 loc) · 6.07 KB
/
images_layout.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import logging
import re
from urllib.parse import urljoin, urlsplit, urlunsplit

import html2text
from bs4 import BeautifulSoup
from tldextract import extract
# Translation table (code point -> code point) that maps typographic quotes
# and dashes to their plain ASCII counterparts; usable with str.translate().
transl_table = {ord(x): ord(y) for x, y in zip(u"‘’´“”–-—", u"'''\"\"---")}

# Shared html2text converter; get_snippets() reconfigures it before each pass.
h = html2text.HTML2Text()

# pre-defined regex
# leading marker of an ordered markdown list item, e.g. "1. "
RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
# leading marker of an unordered markdown list item: "- ", "* " or "+ "
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
# one or more whitespace characters (was r"\s\+", which only matched a single
# whitespace character followed by a literal "+")
RE_SPACE = re.compile(r"\s+")
# markdown links: "[text](target)" or "[text]:target"
RE_LINK = re.compile(r"((\[.*?\]) ?(\(.*?\)))|((\[.*?\]):(.*?))")
# "**bold**" (group 1) or an unterminated "**bold" running to end of line (group 2)
RE_STRONG = re.compile(r"\*\*((?!\*\*).+?)\*\*|\*\*((?!\*\*).+)")
# every character treated as markdown syntax and stripped from snippets
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
# self-closing <img ... /> tags
RE_IMAGES = re.compile(r"<img.*?/>")
def clean_html(html):
    """
    Remove the last ``<footer>`` element from an HTML document.

    Only the final footer returned by the parser is dropped; any earlier
    footers and all other elements are left in place.

    :param html: HTML markup to clean
    :return: the remaining markup serialised back to a string
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the supported name; findAll is a deprecated alias of it
    footers = soup.find_all("footer")
    if footers:
        # extract() detaches the element from the tree in place
        footers[-1].extract()
    return str(soup)
def create_text_snippet(text, limit=150):
    """
    Build a plain-text snippet of at most ``limit`` words from html2text output.

    Lines without any word character are skipped. Whole lines are kept until
    the word budget is used up; the line that would exceed it is cropped to the
    remaining number of words. Finally every character matched by
    ``RE_MD_CHARS_MATCHER_ALL`` is removed and the result is stripped.

    :param text: markdown text, as produced by html2text
    :param limit: maximum number of whitespace-separated words to keep
    :return: the snippet, lines joined with ``"\\n"``
    """
    # Remove literal backslash-n escape sequences. The group keeps the
    # quantifier on the whole two-character sequence; the previous pattern
    # (backslash followed by "n+") also swallowed the leading "n" of the next word.
    text = re.sub(r'(?:\\n)+', '', text)
    # collapse runs of real newlines into one
    text = re.sub(r'\n+', '\n', text)

    kept_lines = []
    word_count = 0
    for line in text.split('\n'):
        if not re.search(r'\w', line):
            continue
        words = line.split()
        # clamp at zero so a non-positive limit can never yield a negative
        # slice bound (which would keep words instead of dropping them)
        remaining = max(limit - word_count, 0)
        word_count += len(words)
        if word_count <= limit:
            kept_lines.append(line)
            continue
        # this line overflows the budget: keep only the words that still fit
        if remaining:
            kept_lines.append(' '.join(words[:remaining]))
        break

    return RE_MD_CHARS_MATCHER_ALL.sub('', '\n'.join(kept_lines)).strip()
def _wrap_list_item(line, marker_re):
    """Convert one markdown list line to ``<li>`` wrapped in 0-2 ``<ul>`` levels."""
    # nesting depth comes from the leading spaces: two per level, capped at 2
    depth = min(2, (len(line) - len(line.lstrip(' '))) // 2)
    # count=1: only the leading marker is list syntax; a later "2. " or "- "
    # inside the item text must stay ordinary text
    item = marker_re.sub('<li>', line.strip(), count=1) + '</li>'
    return '<ul>' * depth + item.strip() + '</ul>' * depth


def _markdown_line_to_html(line):
    """Wrap a single markdown line in the matching ``<h4>``, list or ``<p>`` tag."""
    stripped = line.strip()
    if stripped.startswith('#'):
        # any heading level becomes <h4>
        return re.sub(r'^\s*#+', '<h4>', line) + '</h4>'
    # ordered lists are rendered as unordered lists too
    if RE_ORDERED_LIST_MATCHER.match(stripped):
        return _wrap_list_item(line, RE_ORDERED_LIST_MATCHER)
    if RE_UNORDERED_LIST_MATCHER.match(stripped):
        return _wrap_list_item(line, RE_UNORDERED_LIST_MATCHER)
    return '<p>' + stripped + '</p>'


def create_html_snippet(text, limit=150):
    """
    Build an HTML snippet of at most ``limit`` words from html2text output.

    ``**bold**`` spans become ``<b>``, headings become ``<h4>``, list items
    become ``<li>`` (nested in ``<ul>`` according to their indentation) and
    every other line becomes a ``<p>``. Lines without any word character are
    skipped. The line that would exceed the word budget is cropped and emitted
    as a ``<p>``. Finally every character matched by
    ``RE_MD_CHARS_MATCHER_ALL`` is removed.

    :param text: markdown text, as produced by html2text
    :param limit: maximum number of words (tags excluded) to keep
    :return: the snippet, one HTML element per line
    """
    # Remove literal backslash-n escape sequences as a unit; the previous
    # pattern (backslash followed by "n+") also ate the leading "n" of the
    # following word.
    text = re.sub(r'(?:\\n)+', '', text)
    # collapse runs of real newlines into one
    text = re.sub(r'\n+', '\n', text)
    # replace highlighted elements with <b> tag (only one of the two groups matches)
    marked_up = RE_STRONG.sub('<b>\\1\\2</b>', text)

    html_lines = []
    word_count = 0
    for line in marked_up.split('\n'):
        # only include lines that have valid text
        if not re.search(r'\w', line):
            continue
        # clamp so a non-positive limit never produces a negative slice bound
        remaining = max(limit - word_count, 0)
        # tags inserted above must not count towards the word budget
        word_count += len(re.sub(r'<.*?>', '', line).split())
        if word_count <= limit:
            html_lines.append(_markdown_line_to_html(line))
            continue

        # this line overflows the budget: crop it to the words that still fit
        cropped = ' '.join(line.split()[:remaining])
        # skip the paragraph entirely when nothing readable is left, instead
        # of emitting an empty <p></p>
        if re.search(r'\w', re.sub(r'<.*?>', '', cropped)):
            # cropping may have cut through a <b>...</b> pair; close it again
            if cropped.count('<b>') > cropped.count('</b>'):
                cropped += '</b>'
            html_lines.append('<p>' + cropped + '</p>')
        break

    return RE_MD_CHARS_MATCHER_ALL.sub('', '\n'.join(html_lines)).strip()
def get_images(text, url):
    """
    Collect the image URLs found in html2text output.

    Every self-closing ``<img ... />`` tag in ``text`` is parsed and its
    ``src`` attribute is resolved against the page URL, so relative sources
    come back as absolute links. Tags without a usable ``src`` are skipped.

    :param text: html2text output that still contains raw ``<img>`` tags
    :param url: URL of the page the markup came from
    :return: list of image URLs, in order of appearance
    """
    # Remove literal backslash-n escape sequences as a unit (the previous
    # pattern also ate a following "n"), then collapse real newlines.
    text = re.sub(r'(?:\\n)+', '', text)
    text = re.sub(r'\n+', '\n', text)

    # Base used to resolve relative sources. A missing scheme is tolerated,
    # and https is forced because this function has always emitted https
    # links for sources it had to complete.
    page = urlsplit(url if "://" in url else "https://" + url)
    base = urlunsplit(("https", page.netloc, page.path, "", ""))

    images = []
    for tag_markup in RE_IMAGES.findall(text):
        img = BeautifulSoup(tag_markup, 'html.parser').img
        src = img.get('src') if img is not None else None
        if not src:
            continue
        if page.netloc:
            # urljoin leaves absolute and data: URLs alone and resolves
            # root-relative and path-relative ones against the page
            src = urljoin(base, src)
        images.append(src)
    return images
def get_snippets(html, url):
    """
    Main entry point: extract the html snippet, text snippet and image list.

    The cleaned markup is run through the shared module-level html2text
    converter ``h`` three times with different options. Because ``h`` is
    reconfigured between passes, overlapping calls would interfere with each
    other.

    :param html: HTML markup of the page
    :param url: URL of the page, used to complete relative image links
    :return: tuple ``(html_snippet, text_snippet, images)``; ``('', '', [])``
        when any step raises
    """
    try:
        html = clean_html(html)
        h.body_width = 1000

        # pass 1: ignore any non text element to generate the text snippet
        h.ignore_links = True
        h.ignore_images = True
        h.ignore_tables = False
        h.bypass_tables = False
        text_snippet = create_text_snippet(h.handle(html))

        # pass 2: keep images, emitted as raw <img> tags, so they can be extracted
        h.bypass_tables = True
        h.ignore_images = False
        h.images_as_html = True
        images = get_images(h.handle(html), url)

        # pass 3: drop images again to generate the html snippet without them
        h.ignore_images = True
        html_snippet = create_html_snippet(h.handle(html))

        return html_snippet, text_snippet, images
    except Exception:
        # best effort by design: callers get an empty result, but the failure
        # is logged instead of disappearing silently
        logging.getLogger(__name__).exception(
            "snippet extraction failed for %s", url
        )
        return '', '', []