" this module is a tool for automatic creation of n-grams for newsela data "
#TODO Get Tok Paragraphs
import newselautil as nutils
import classpaths as path
import subprocess
import sys
import io
import os.path
is_py2 = sys.version[0] == '2'
if is_py2:
import Queue as queue
else:
import queue as queue
MERGE_PER_TIME = 200 # the maximum amount of ngram-files to merge in one call. Developers of SRLIM suggest that
# it is better to merge fewer files at a time (file: doc/intro-lm). There is also a limit for input
# parameters for one call, which does not allow to merge all articles at once
MINUS_INFINITY = '-1000000' # the value that will be assigned to the log of the probability if the probability is zero
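
# The SRILM tools write their results to stdout, and capturing that output differs between
# Python 2 and 3. A minimal sketch of the pattern that the functions below inline (this helper
# name is illustrative; it is not referenced elsewhere in this file):
def _srilm_output(cmd):
    """Run an SRILM command line and return its stdout as a str."""
    if is_py2:
        return subprocess.check_output(cmd, shell=False)
    return subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8')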
def build_ngrams(outputFile, nToProcess=-1, levels=[0, 1, 2, 3, 4, 5], mingrade=0, maxgrade=12,
                 exclude=None, onlyEnglish=True, usePrecalculated=True):
    """
    Count the ngrams for nToProcess slugs (and the indicated levels), put them into
    path.OUTPUT_NGRAMS/ngrams_by_file/, and then merge them all into one file. Finally, create a
    language model using Kneser-Ney smoothing. All the files in path.OUTPUT_NGRAMS/toDelete/ (but
    not the folder itself) can be deleted after the program finishes.
    :param outputFile: the name of the files that will contain the merged output.
    outputFile.ngrams will contain the ngrams and outputFile.bo the language model
    :param nToProcess: the number of slugs to process. If nToProcess is -1, all slugs are processed
    :param levels: the list of levels for which to build the models
    :param mingrade: the lowest grade for which to calculate the ngrams
    :param maxgrade: the highest grade for which to calculate the ngrams
    :param exclude: the name of a file listing the articles to exclude (the first field on each
    line, except for the first line, should be the name of a level-0 article). The names should
    appear in the same order as in the metafile (that is, in alphabetical order)
    :param onlyEnglish: if True, only English articles will be processed
    :param usePrecalculated: if True, the program will not recompute the ngrams for files that
    are already in the ngrams_by_file folder
    :return: None
    """
    info = nutils.loadMetafile()
    nSlugs = 0  # the number of slugs processed so far
    q = queue.Queue()  # the program first creates n-gram counts for every article separately,
    # then merges them all together, and only afterwards creates the language model. The queue
    # stores the names of all files that are still to be merged
    if (nToProcess == 0) or (nToProcess < -1):
        print("nToProcess parameter takes either a positive value or -1")
        return
    if len(levels) < 1:
        print("at least one level should be indicated")
        return
    if mingrade > maxgrade:
        mingrade, maxgrade = maxgrade, mingrade
    if exclude is not None:
        with io.open(exclude) as file:
            excluded = file.readlines()
        for i in range(len(excluded)):
            excluded[i] = excluded[i].split(" ")
        e_index = 1  # the position of the next article in the excluded array to take into account
    else:
        excluded = []
        e_index = 1
    i = 0
    while (i < len(info)) and ((nToProcess == -1) or (nSlugs < nToProcess)):
        artLow = i  # first article with this slug
        slug = info[i]['slug']
        while i < len(info) and slug == info[i]['slug']:
            i += 1
        artHi = i  # one past the index of the last article with this slug
        if (e_index >= len(excluded)) or (info[artLow]["filename"] != excluded[e_index][0] + ".txt"):
            # this article should not be excluded from the model
            nSlugs += 1
            if (info[artLow]["language"] == "en") or not onlyEnglish:
                for level in levels:  # the level of adaptation (0 is the original, etc.)
                    if level >= artHi - artLow:
                        continue
                    grade = float(info[artLow + level]["grade_level"])  # the grade level (not the
                    # level of adaptation)
                    if (grade < mingrade) or (grade > maxgrade):
                        continue
                    q.put(path.OUTDIR_PRECALCULATED + info[artLow + level]["filename"] + ".ngrams")
                    if usePrecalculated and os.path.isfile(
                            path.OUTDIR_PRECALCULATED + info[artLow + level]["filename"] + ".ngrams"):
                        continue
                    subprocess.call(["ngram-count", "-text", path.OUTDIR_TOK_NGRAMS +
                                     info[artLow + level]["filename"] + ".tok", "-sort", "-write",
                                     path.OUTDIR_PRECALCULATED + info[artLow + level]["filename"] + ".ngrams"])
                    # create the n-gram counts for this particular file
            if nToProcess == -1:
                print("Processing slug... " + slug + ' ' +
                      str(round(i / float(len(info)) * 100, 3)) + '% completed')
            else:
                print("Processing slug... " + slug + ' ' +
                      str(round(nSlugs / float(nToProcess) * 100, 3)) + '% completed')
        else:
            e_index += 1
    extraFilesCount = 0  # the number of extra files that can be deleted after the program
    # completes (these are temporary merges located in path.OUTPUT_NGRAMS/toDelete/)
    while q.qsize() > MERGE_PER_TIME:  # merge MERGE_PER_TIME files per call
        next_input = ["ngram-merge"]
        print("Files left to merge: " + str(q.qsize()))
        for i in range(MERGE_PER_TIME):
            next_input.append(q.get())
        with open(path.OUTDIR_TO_DELETE + str(extraFilesCount) + ".ngrams", 'w') as file:
            if is_py2:
                file.write(subprocess.check_output(next_input, shell=False))
            else:
                file.write(subprocess.run(next_input, stdout=subprocess.PIPE).stdout.decode('utf-8'))
        q.put(path.OUTDIR_TO_DELETE + str(extraFilesCount) + ".ngrams")
        extraFilesCount += 1
    next_input = ["ngram-merge"]
    for i in range(q.qsize()):
        next_input.append(q.get())
    with open(path.OUTDIR_NGRAMS + outputFile + ".ngrams", 'w') as file:
        if is_py2:
            file.write(subprocess.check_output(next_input, shell=False))
        else:
            file.write(subprocess.run(next_input, stdout=subprocess.PIPE).stdout.decode('utf-8'))
        # this creates outputFile.ngrams, which contains all the merged ngrams
    subprocess.call(["ngram-count", "-read", path.OUTDIR_NGRAMS + outputFile + ".ngrams",
                     "-lm", path.OUTDIR_NGRAMS + outputFile + ".bo", "-kndiscount"])
def delete_pars_symbols():
    """Read all tokenized articles and delete @PGPH lines because they contaminate the ngrams.
    Also, replace ## tags with @TITLE tags, because otherwise SRILM does not process them."""
    info = nutils.loadMetafile()
    i = 0
    while i < len(info):
        artLow = i  # first article with this slug
        slug = info[i]['slug']
        while i < len(info) and slug == info[i]['slug']:
            i += 1
        artHi = i  # one past the index of the last article with this slug
        for level in range(artHi - artLow):
            with io.open(path.BASEDIR + "/articles/" + info[artLow + level]["filename"] + ".tok") as file:
                lines = file.readlines()
            with io.open(path.OUTDIR_TOK_NGRAMS + info[artLow + level]["filename"] + ".tok", 'w') as file:
                for line in lines:
                    splitted = line.split()
                    if splitted[0] == '@PGPH':
                        continue
                    if splitted[0] == '##' or splitted[0] == '###':
                        file.write('@TITLE' + line[len(splitted[0]):])  # strip the whole tag
                    else:
                        file.write(line)
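
# For illustration (the input here is hypothetical), a tokenized file such as
#     @PGPH 2
#     ## Some Headline
#     The first sentence .
# is rewritten by delete_pars_symbols as
#     @TITLE Some Headline
#     The first sentence .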
def article_perplexity(articleName, lmName):
    """
    Calculate the probability of each word in an article according to the designated language
    model and write the results to a file in the perplexity folder. The first line contains the
    overall perplexity of the article as given by SRILM: "Perplexity is given with two different
    normalizations: counting all input tokens and excluding end-of-sentence tags". Both of these
    values are on the first line. Each of the following lines represents a sentence and contains
    the probability of every word in that sentence as a base-10 logarithm. If a probability
    equals the MINUS_INFINITY value, the word is out of vocabulary.
    :param articleName: the name of the article for which to calculate perplexity
    :param lmName: the name of the language model (.bo extension), found in OUTDIR_NGRAMS
    :return: the perplexity (as a tuple)
    """
    if is_py2:
        output = subprocess.check_output(["ngram", "-lm", path.OUTDIR_NGRAMS + lmName, "-ppl",
                                          path.OUTDIR_TOK_NGRAMS + articleName, "-debug", "2"],
                                         shell=False).split('\n')
    else:
        output = subprocess.run(["ngram", "-lm", path.OUTDIR_NGRAMS + lmName, "-ppl",
                                 path.OUTDIR_TOK_NGRAMS + articleName, "-debug", "2"],
                                stdout=subprocess.PIPE).stdout.decode('utf-8').split('\n')
    with io.open(path.OUTDIR_TOK_NGRAMS + articleName) as file:
        lines = file.readlines()
    for i in range(len(lines)):
        lines[i] = lines[i].split(' ')
    i = 1  # per-word probability lines start after the echoed sentence text
    with open(path.OUTDIR_PERPLEX + articleName + ".prob", 'w') as file:
        perplexity = output[-2].split(' ')
        file.write(str(perplexity[-3]) + " " + str(perplexity[-1]) + "\n")
        for sent in range(len(lines)):
            for word in range(len(lines[sent])):
                value = output[i].split('\t=')[1].split(' ')[4]
                if value == '-inf':  # replace -inf with MINUS_INFINITY
                    file.write(MINUS_INFINITY + ' ')
                else:
                    file.write(value + ' ')
                i += 1
            value = output[i].split('\t=')[1].split(' ')[4]  # the end-of-sentence token
            if value == '-inf':  # replace -inf with MINUS_INFINITY
                file.write(MINUS_INFINITY + '\n')
            else:
                file.write(value + '\n')
            i += 5  # skip the per-sentence statistics and the next echoed sentence
    return float(perplexity[-3]), float(perplexity[-1])
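
# The index arithmetic in article_perplexity assumes the per-word lines that SRILM's
# "ngram -ppl ... -debug 2" emits, which look roughly like (illustrative, not verbatim output;
# the character before '=' is a tab):
#     p( the | <s> ) 	= [2gram] 0.123456 [ -0.908 ]
# Splitting such a line on '\t=' and then on spaces leaves the bracketed base-10 log-probability
# at index 4; SRILM prints '-inf' there for zero-probability (out-of-vocabulary) words.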
def calculate_all_perplexities(lmName, nToProcess=-1, levels=[0, 1, 2, 3, 4, 5], mingrade=0,
                               maxgrade=12, include=None, onlyEnglish=True):
    """
    Calculate perplexities for articles designated by adaptation level, grade, language or slug.
    :param lmName: the name of the language model (.bo extension), found in OUTDIR_NGRAMS
    :param nToProcess: the number of slugs to calculate perplexities for
    :param levels: the list of levels for which to calculate perplexities
    :param mingrade: the lowest grade for which to calculate perplexities
    :param maxgrade: the highest grade for which to calculate perplexities
    :param include: the name of a file listing the articles for which to calculate perplexities
    (the first field on each line, except for the first line, should be the name of a level-0
    article). The names should appear in the same order as in the metafile (that is, in
    alphabetical order). If include is None, the perplexities will be calculated for the first
    nToProcess slugs
    :param onlyEnglish: if True, only English articles will be processed
    :return: None
    """
    info = nutils.loadMetafile()
    nSlugs = 0  # the number of slugs processed so far
    if (nToProcess == 0) or (nToProcess < -1):
        print("nToProcess parameter takes either a positive value or -1")
        return
    if len(levels) < 1:
        print("at least one level should be indicated")
        return
    if mingrade > maxgrade:
        mingrade, maxgrade = maxgrade, mingrade
    if include is not None:
        with io.open(include) as file:
            included = file.readlines()
        for i in range(len(included)):
            included[i] = included[i].split(" ")
        i_index = 1  # the position of the next article in the included array to take into account
    else:
        included = []
        i_index = 1
    average_perpl = (.0, .0)
    num_of_files = .0
    i = 0
    while (i < len(info)) and ((nToProcess == -1) or (nSlugs < nToProcess)):
        artLow = i  # first article with this slug
        slug = info[i]['slug']
        while i < len(info) and slug == info[i]['slug']:
            i += 1
        artHi = i  # one past the index of the last article with this slug
        if (include is not None) and ((i_index >= len(included)) or
                                      (info[artLow]["filename"] != included[i_index][0] + ".txt")):
            # the perplexities for this article should not be calculated
            continue
        i_index += 1
        nSlugs += 1
        if (info[artLow]["language"] == "en") or not onlyEnglish:
            for level in levels:  # the level of adaptation (0 is the original, etc.)
                if level >= artHi - artLow:
                    continue
                grade = float(info[artLow + level]["grade_level"])  # the grade level (not the
                # level of adaptation)
                if (grade < mingrade) or (grade > maxgrade):
                    continue
                curr_perpl = article_perplexity(info[artLow + level]["filename"] + ".tok", lmName)
                num_of_files += 1
                average_perpl = (average_perpl[0] + curr_perpl[0], average_perpl[1] + curr_perpl[1])
        if nToProcess == -1:
            print("Processing slug... " + slug + ' ' +
                  str(round(i / float(len(info)) * 100, 3)) + '% completed')
        else:
            print("Processing slug... " + slug + ' ' +
                  str(round(nSlugs / float(nToProcess) * 100, 3)) + '% completed')
    print(str(average_perpl[0] / num_of_files) + " " + str(average_perpl[1] / num_of_files))
if __name__ == "__main__":
delete_pars_symbols()
build_ngrams("test", levels=[1,2,3,4], mingrade=2, maxgrade=9, exclude=path.BASEDIR+"/NewselaSimple03test.idx")
calculate_all_perplexities("test.bo", levels=[0], include=path.BASEDIR+"/NewselaSimple03test.idx")