forked from vanangamudi/solthiruthi-sothanaikal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
collocation_freq.py
102 lines (76 loc) · 2.71 KB
/
collocation_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#** coding: utf-8 **
import logging
from pprint import pprint, pformat
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Runs at import time: applies logging's default basic configuration.
logging.basicConfig()
# This logger is set to DEBUG, so the per-word log.debug() calls in
# build_freqdict() pass this logger's level check.
log.setLevel(logging.DEBUG)
import gzip
from tamil import utf8
from tqdm import tqdm
from collections import Counter, defaultdict
# Path of the table read by read_uyirmei_map(): a header line followed by
# lines of three '|'-separated fields unpacked as letter, mei, uyir.
# NOTE(review): the reader opens this path with gzip.open() despite the
# .csv extension -- confirm the file on disk is gzip-compressed.
UYIRMEI_MAP_FILEPATH = 'uyir-mei.csv'
def read_uyirmei_map(filepath=None):
    """Load the letter decomposition table and its inverse.

    The table is a gzip-compressed UTF-8 text file. Its first line is a
    header and is skipped; every other non-blank line holds three
    '|'-separated fields: ``letter|mei|uyir``.

    Args:
        filepath: Path of the table to read. When omitted, the module
            constant ``UYIRMEI_MAP_FILEPATH`` is used.

    Returns:
        A ``(uyirmei_map, uyirmei_map_reverse)`` tuple of dicts.
        ``uyirmei_map`` maps each letter to ``mei + uyir`` when both
        fields are non-empty, to ``mei`` when only ``mei`` is non-empty,
        and to the letter itself otherwise. ``uyirmei_map_reverse`` maps
        that value back to the letter (the last row wins on duplicates).

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    if filepath is None:
        filepath = UYIRMEI_MAP_FILEPATH

    uyirmei_map = {}
    uyirmei_map_reverse = {}
    # Text mode is required: gzip.open() defaults to binary, which yields
    # bytes lines that cannot be split on the str separator '|'.
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        next(f, None)  # skip the header row
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the three-way unpack below.
                continue
            letter, mei, uyir = line.split('|')
            if mei and uyir:
                decomposed = mei + uyir  # not fusing, just concatenating
            elif mei:
                decomposed = mei
            else:
                decomposed = letter
            uyirmei_map[letter] = decomposed
            uyirmei_map_reverse[decomposed] = letter
    return uyirmei_map, uyirmei_map_reverse
# Build both lookup tables once, at import time; split_uyirmei() reads
# uyirmei_map. Importing this module therefore reads the table file.
uyirmei_map, uyirmei_map_reverse = read_uyirmei_map()
def split_uyirmei(string):
    """Return *string* with every mapped letter replaced by its decomposition.

    The input is stripped of surrounding whitespace and split into letters
    with ``utf8.get_letters``. A letter that has an entry in ``uyirmei_map``
    is replaced by that entry; any other letter is kept unchanged. The
    resulting pieces are concatenated into one string.
    """
    pieces = (
        uyirmei_map.get(letter, letter)
        for letter in utf8.get_letters(string.strip())
    )
    return ''.join(pieces)
def build_freqdict(filepath, ngram_size=2, line_limit=100000, offset=1000):
    """Count letter n-grams by their position relative to the word end.

    Reads a window of lines from a gzip-compressed UTF-8 text file, splits
    each line on whitespace, decomposes every word with ``split_uyirmei``
    and slides a window of ``ngram_size`` letters over the result.

    Args:
        filepath: Path of the gzip-compressed corpus.
        ngram_size: Number of letters per n-gram.
        line_limit: Maximum number of lines to process.
        offset: Number of leading lines to skip before processing.

    Returns:
        A ``defaultdict(Counter)`` mapping each n-gram (letters joined into
        one string) to a ``Counter`` keyed by ``i - len(letters)``, where
        ``i`` is the n-gram's start index in the word. Keys are therefore
        negative offsets measured from the end of the word.
    """
    # Local import keeps this block self-contained.
    from itertools import islice

    def ngram_zipper(letters, size=2):
        # `size` progressively shifted suffixes; zipping them yields the
        # sliding n-grams and stops at the shortest suffix.
        return [letters[i:] for i in range(size)]

    counter = defaultdict(Counter)
    # Text mode: gzip.open() defaults to binary, which would hand bytes
    # words to the letter tokenizer.
    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        if offset >= 0 and line_limit >= 0:
            # Stream only the requested window instead of loading the whole
            # corpus with readlines().
            lines = islice(f, offset, offset + line_limit)
            # Upper bound: the bar stops short if the file has fewer lines.
            expected = line_limit
        else:
            # islice() rejects negative bounds; keep the original slicing
            # semantics for those.
            lines = f.readlines()[offset:line_limit + offset]
            expected = len(lines)

        debug_enabled = log.isEnabledFor(logging.DEBUG)
        for line in tqdm(lines, total=expected):
            for word in line.split():
                sletters = utf8.get_letters(split_uyirmei(word))
                n_letters = len(sletters)

                if debug_enabled:
                    # The undecomposed letters are needed only for this log.
                    oletters = utf8.get_letters(word)
                    log.debug('word: %s', word)
                    log.debug('len: oletters/sletters: %s/%s',
                              len(oletters), n_letters)

                for i, ngram in enumerate(
                        zip(*ngram_zipper(sletters, ngram_size))):
                    counter[''.join(ngram)][i - n_letters] += 1
    return counter
if __name__ == '__main__':
    # Local import: only the script entry point needs argv.
    import sys

    # Quick visual check of the decomposition on mixed Tamil/Latin text.
    print(split_uyirmei('போன்றself'))

    # An optional first command-line argument overrides the default corpus,
    # so the script is not tied to one machine's directory layout.
    if len(sys.argv) > 1:
        corpus_path = sys.argv[1]
    else:
        corpus_path = '/home/vanangamudi/data/datasets/text/tamiltext-6M-10lines.txt'

    freq = build_freqdict(
        filepath=corpus_path,
        ngram_size=3,
        line_limit=100,
    )
    pprint(freq)

    # Total occurrences of each n-gram over all positions, ascending by count,
    # printed as "ngram:count".
    totals = sorted(
        ((ngram, sum(positions.values())) for ngram, positions in freq.items()),
        key=lambda pair: pair[1],
    )
    pprint(['{}:{}'.format(ngram, total) for ngram, total in totals])