-
Notifications
You must be signed in to change notification settings - Fork 5
/
create_vocabulary.py
44 lines (32 loc) · 1.38 KB
/
create_vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
import os
def create_vocabulary(filepath):
    """Build a word vocabulary from a BioASQ-style ``train_images.tsv`` file.

    Reads ``<filepath>/train_images.tsv`` (tab-separated: image id, caption),
    cleans and tokenizes each caption, writes the set of unique words to
    ``<filepath>/vocabulary.txt`` (one word per line, sorted for
    reproducibility), and prints corpus statistics.

    :param filepath: directory containing ``train_images.tsv``; the
        vocabulary file is written into the same directory.
    :return: mean caption length in words (0.0 if no captions were read).
    """
    # Punctuation to strip. NOTE: the original pattern wrote `:-\[` inside the
    # class, which the regex engine read as a character RANGE (':' .. '['), so
    # literal hyphens were never removed (and ;<=>?@ were stripped by
    # accident). Placing '-' last makes it the intended literal hyphen.
    # Compiled once here instead of rebuilding the pattern on every line.
    punct = re.compile(r'[.,?;*!%^&_+():\[\]{}-]')

    def bioclean(text):
        # Standard BioASQ cleaning: drop quote/slash characters, trim,
        # lowercase, strip punctuation, then tokenize on whitespace.
        text = (text.replace('"', '').replace('/', '').replace('\\', '')
                    .replace("'", '').strip().lower())
        return punct.sub('', text).split()

    total_words = []
    pr_captions = []
    train_path = os.path.join(filepath, "train_images.tsv")
    with open(train_path, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                continue  # skip malformed lines instead of raising IndexError
            tokens = bioclean(fields[1])
            total_words.extend(tokens)
            pr_captions.append(" ".join(tokens))

    print("Total number of captions is", len(pr_captions))
    unique_captions = set(pr_captions)
    print("Total number of unique captions is", len(unique_captions))
    # Guard against an empty corpus (the original divided by zero here).
    mean_length = len(total_words) / len(pr_captions) if pr_captions else 0.0
    print("The average caption length is", mean_length, "words")
    # create vocabulary of unique words
    vocabulary = set(total_words)
    print("Unique words are", len(vocabulary))
    # Sorted output makes the vocabulary file deterministic across runs
    # (set iteration order varies with hash randomization).
    with open(os.path.join(filepath, "vocabulary.txt"), 'w',
              encoding="utf-8") as output_file:
        for word in sorted(vocabulary):
            output_file.write(word)
            output_file.write("\n")
    return mean_length