import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence


class TokenizerWrapper:
    def __init__(self, dataset_csv_file, class_name, max_caption_length, tokenizer_num_words=None):
        # Read the captions from the given column of the dataset CSV and fit a tokenizer on them.
        dataset_df = pd.read_csv(dataset_csv_file)
        sentences = dataset_df[class_name].tolist()
        self.max_caption_length = max_caption_length
        self.tokenizer_num_words = tokenizer_num_words
        self.init_tokenizer(sentences)

    def clean_sentence(self, sentence):
        # Lowercase, strip punctuation, and split the sentence into a list of words.
        return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                     lower=True, split=' ')

    def init_tokenizer(self, sentences):
        # Replace missing captions with empty strings and clean every sentence.
        for i in range(len(sentences)):
            if pd.isna(sentences[i]):
                sentences[i] = ""
            sentences[i] = self.clean_sentence(sentences[i])

        # Tokenize the captions, giving each word a unique integer id.
        print("Tokenizing dataset..")
        self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
        self.tokenizer.fit_on_texts(sentences)
        print("number of tokens: {}".format(len(self.tokenizer.word_index)))
        print("Tokenizing is complete.")

    def get_tokenizer_num_words(self):
        return self.tokenizer_num_words

    def get_token_of_word(self, word):
        return self.tokenizer.word_index[word]

    def get_word_from_token(self, token):
        # Return the empty string for ids not in the vocabulary (e.g. the padding id 0).
        try:
            return self.tokenizer.index_word[token]
        except KeyError:
            return ""

    def get_sentence_from_tokens(self, tokens):
        # Decode a batch of one tokenized caption back into a word list,
        # dropping the 'startseq' marker and stopping at 'endseq'.
        sentence = []
        for token in tokens[0]:
            word = self.get_word_from_token(token)
            if word == 'endseq':
                return sentence
            if word != 'startseq':
                sentence.append(word)
        return sentence

    def get_string_from_word_list(self, word_list):
        return " ".join(word_list)

    def get_word_tokens_list(self):
        return self.tokenizer.word_index

    def tokenize_sentences(self, sentences):
        # Convert an array of captions of shape (N, 1) into an (N, max_caption_length)
        # array of token ids, zero-padded after the end of each caption.
        tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int)
        for index, caption in enumerate(sentences):
            tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
            tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
                                                       padding='post')
        return tokenized_sentences
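

# A minimal usage sketch, not part of the original file. It assumes a hypothetical
# "dataset.csv" with a caption column named "findings" whose captions are wrapped
# in 'startseq' / 'endseq' markers; adjust the path, column name, and lengths to
# your data.
if __name__ == "__main__":
    tokenizer_wrapper = TokenizerWrapper("dataset.csv", "findings",
                                         max_caption_length=40, tokenizer_num_words=5000)

    # Tokenize a batch of one caption; tokenize_sentences expects shape (N, 1).
    captions = np.array([["startseq the heart is normal in size endseq"]])
    tokens = tokenizer_wrapper.tokenize_sentences(captions)

    # Decode the token ids back into a plain string (markers are stripped).
    words = tokenizer_wrapper.get_sentence_from_tokens(tokens)
    print(tokenizer_wrapper.get_string_from_word_list(words))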