forked from vanangamudi/solthiruthi-sothanaikal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
trie.py
161 lines (130 loc) · 4.6 KB
/
trie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- coding: utf-8 -*-
from pprint import pprint, pformat
import pdb
from tqdm import tqdm
from tamil.utf8 import get_letters
import csv
import utils
from resources import DEFAULT_DICTIONARY_FILES, XSV_DELIMITER
class Node(object):
def __repr__(self):
if self.level < 160:
return "Node({}, {}, {})\n{} {}".format(
''.join(self.value) if self.value else self.value,
self.count,
self.is_complete,
'\t' * self.level,
self.children,)
else:
return ''
def __str__(self):
return self.__repr__()
def __init__(self, value=None, count=1, level=0):
self.value = value
self.count = count
self.children = {}
self.is_complete = False
self.level = level
class Trie(object):
def __init__(self):
self.root = Node(value=None, level=0)
def __repr__(self): return self.root.__repr__()
def __str__(self): return self.__repr__()
def add(self, item):
node, i = self.find_prefix(item)
if i < len(item):
#increment count
j = 0
tnode = self.root
while j < i:
tnode.children[item[j]].count += 1
tnode = tnode.children[item[j]]
j += 1
# add new nodes
while i < len(item):
#new_node = Node(item[:i+1], count=1, level=i+1)
new_node = Node(item[i], count=1, level=i+1)
node.children[item[i]] = new_node
node = new_node
i += 1
node.is_complete = True
def find_prefix(self, prefix, default=None):
i = 0
prev_node = node = self.root
while i < len(prefix) and node:
prev_node = node
node = node.children.get(prefix[i], None)
i += 1
if i <= len(prefix):
return prev_node, i-1
else:
return self.root, 0
def prefix_exists_p(self, prefix):
node, index = self.find_prefix(prefix)
if not node == self.root:
node = node.children.get(prefix[-1], None)
if node:
return node.is_complete
def get_all_suffixes(self, prefix):
suffixes = []
node, level = self.find_prefix(prefix)
if len(prefix):
node = node.children[prefix[-1]]
branches = list([ ('' + k, v) for k,v in node.children.items()])
while branches:
prefix, node = branches.pop(0)
branches = list([ (prefix + k, v) for k,v in node.children.items()]) + branches
if node.is_complete:
suffixes.append(prefix)
return suffixes
def words(self):
return self.get_all_suffixes(self.root)
def build_trie(filepaths,
pbarp = False):
trie = Trie()
for filepath in filepaths:
print('loading {}...'.format(filepath))
if pbarp:
pbar = tqdm.tqdm(utils.openfile(filepath), ncols=100)
else:
pbar = utils.openfile(filepath)
for item in csv.reader(pbar, delimiter=XSV_DELIMITER):
token, count = item
if token:
trie.add(get_letters(token))
if pbarp:
pbar.set_description(token)
return trie
if __name__ == '__main__':
trie = Trie()
trie.add("hell")
trie.add("hello")
trie.add("hey")
trie.add("trie")
pprint (trie)
pprint (trie.find_prefix("hey"))
pprint (trie.get_all_suffixes("h"))
pprint (trie.prefix_exists_p("hel"))
pprint (trie.prefix_exists_p("hell"))
pprint (trie.prefix_exists_p("heyl"))
pprint (trie.prefix_exists_p("hey"))
pprint (trie.prefix_exists_p("tri"))
pprint (trie.prefix_exists_p("trie"))
pprint (trie.prefix_exists_p("Trie"))
tamil_trie = Trie()
for filepath in DEFAULT_DICTIONARY_FILES:
print('loading {}...'.format(filepath))
with utils.openfile(filepath) as f:
csvf = csv.reader(f, delimiter=XSV_DELIMITER)
for line in tqdm(csvf):
token, count = line
if token:
tamil_trie.add(get_letters(token))
word = input('> ')
while word:
print('இருக்குதா? {}'.format(
'இருக்கு' if tamil_trie.prefix_exists_p(get_letters(word)) \
else 'இல்லை'))
word = input('> ')
with utils.openfile('tamil_trie_output.txt', 'w') as of:
of.write(pformat(tamil_trie))