# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk
import re
import pyaramorph
import SinaiCorpus.load as SinaiCorpusload


class Arabycia:
    def __init__(self):
        # pyaramorph morphological analyzer for Arabic
        self.analyzer = pyaramorph.Analyzer()
        # NLTK's ISRI root-based Arabic stemmer
        self.stemmer = nltk.ISRIStemmer()
        # WordNet lemmatizer (English-oriented; see lemmatization())
        self.lemmatizer = nltk.WordNetLemmatizer()
        # Punkt sentence segmenter
        self.segmenter = nltk.data.load("tokenizers/punkt/english.pickle")

    def analyze(self):
        """
        Run the full pipeline: morphological analysis, ambiguity detection,
        candidate generation, corpus-based disambiguation, and printing.
        """
        self.analyze_text()
        self.find_ambiguity()
        self.generate_candidates()
        self.load_corpus("SinaiCorpus/src/Sinai-corpus.zip", 60)
        self.select_candidate()
        self.print_result()

    def tokenization(self, txt):
        """
        Tokenize an Arabic text.
        :param txt: string : arabic text
        :return: tokens : array : array containing the tokens
        """
        tokens = nltk.word_tokenize(txt)
        return tokens

    def stemming(self, txt):
        """
        Apply Arabic stemming without a root dictionary, using NLTK's ISRIStemmer.
        :param txt: string : arabic text
        :return: stems : array : array containing a stem for each word in the text
        """
        stems = [self.stemmer.stem(w) for w in self.tokenization(txt)]
        return stems

    def lemmatization(self, txt):
        """
        Lemmatize using WordNet's morphy function.
        Returns the input word unchanged if it cannot be found in WordNet
        (which is the usual case for Arabic, since WordNet is English).
        :param txt: string : arabic text
        :return: lemmas : array : array containing a lemma for each word in the text.
        """
        lemmas = [self.lemmatizer.lemmatize(w) for w in self.tokenization(txt)]
        return lemmas

    def set_raw_text(self, text):
        """
        Set the raw input text to be analyzed.
        """
        self.raw_text = text

    def segmentation(self, txt):
        """
        Apply NLTK sentence segmentation.
        :param txt: string : arabic text
        :return: sents : array : array containing the sentences.
        """
        sents = self.segmenter.tokenize(txt)
        return sents

    @staticmethod
    def transliteration(word):
        """
        Buckwalter word transliteration.
        :param word: string : arabic word
        :return: trans : string : word transliteration.
        """
        trans = pyaramorph.buckwalter.uni2buck(word)
        return trans

    @staticmethod
    def reverse_transliteration(word):
        """
        Convert a Buckwalter transliteration back to the original Arabic word.
        :param word: string : word transliteration.
        :return: trans : string : original word
        """
        trans = pyaramorph.buckwalter.buck2uni(word)
        return trans

    def stem(self, word):
        """
        Get a word's stem (NLTK ISRIStemmer).
        :param word: string : word.
        :return: stem : string : stem
        """
        stem = str(self.stemmer.stem(word))
        return stem

    def analyze_text(self):
        """
        Run pyaramorph's morphological analysis over 'raw_text'.
        :return: array : the analysis data
        """
        if len(self.raw_text):
            self.full_analyzed_data = self.analyzer.analyze_text(self.raw_text)
            return self.full_analyzed_data

    def text_search(self, key):
        """
        Search for words that have the same root as 'key' (text search).
        :param key: string : search keyword.
        :return: result: array : original words from the text with the same root.
        """
        result = []
        text = self.raw_text.split()
        for word in text:
            if key == self.stem(word):
                result.append(word)
        return list(set(result))
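    # Example from the demo at the bottom of this file: with the sentence
    # 'يستجمع المؤرخ أفكاره' loaded, text_search('جمع') returns the words
    # whose ISRI stem is 'جمع' ("to gather").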

    def load_corpus(self, path, filenum=50):
        """
        Load the Sinai-Corpus content.
        :param path: string : path to the corpus archive
        :param filenum: int : number of corpus files to load
        :return: the corpus, split into lines
        """
        self.corpus = SinaiCorpusload.load_corpus(path, filenum)
        self.corpus = self.corpus.split('\n')
        return self.corpus

    def find_ambiguity(self):
        """
        Find all the ambiguous words.
        (After analyze_text, a word with exactly one solution is considered
        unambiguous; otherwise it is ambiguous.)
        :return: the list of ambiguous words
        """
        self.ambiguous_words = []
        for word in self.full_analyzed_data:
            if len(word['solution']) > 1:
                self.ambiguous_words.append(word['arabic'])
        self.ambiguous_words = list(set(self.ambiguous_words))
        self.solve_unambiguity()
        return self.ambiguous_words
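    # For instance, a form like 'ذهب' can be analyzed as a verb ("went") or
    # a noun ("gold"); pyaramorph would return multiple solutions for it,
    # so it would be collected in self.ambiguous_words.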

    def solve_unambiguity(self):
        """
        Resolve every unambiguous word and add its diacritics.
        (After analyze_text, a word with exactly one solution is considered
        unambiguous; otherwise it is ambiguous.)
        Ambiguous words get a "?" placeholder in the POS string, to be
        resolved later by select_candidate().
        :return: the (partially) diacritized text
        """
        self.analyzed_text_result = []
        self.diacritized_text = self.raw_text
        self.diacritized_text_pos = ""
        raw = self.raw_text
        for word in raw.split():
            if word not in self.ambiguous_words:
                for possible_word in self.full_analyzed_data:
                    if possible_word['arabic'] == word:
                        diacritized_word = possible_word['solution'][0]['word'][0]
                        diacritized_word_pos = possible_word['solution'][0]['pos'][1]
                        self.diacritized_text_pos += diacritized_word_pos + " "
                        self.diacritized_text = self.diacritized_text.replace(word, diacritized_word)
                        self.analyzed_text_result.append({'transl': possible_word['transl'],
                                                          'arabic': possible_word['arabic'],
                                                          'word': possible_word['solution'][0]['word'],
                                                          'pos': possible_word['solution'][0]['pos'],
                                                          'gloss': possible_word['solution'][0]['gloss']
                                                          })
            else:
                self.diacritized_text_pos += "? "
        return self.diacritized_text

    def generate_candidates(self):
        """
        For each ambiguous word, collect all of its analyses as candidate
        solutions (one of which is selected later by select_candidate()).
        :return: all candidates
        """
        candidates = []
        for word in self.ambiguous_words:
            temp = []
            for possible_word in self.full_analyzed_data:
                if possible_word['arabic'] == word:
                    if possible_word not in temp:
                        temp.append(possible_word)
            candidates.append(temp)
        self.candidates = candidates
        return self.candidates

    def find_index(self, word):
        """
        Return the index of the first occurrence of 'word' in the raw text.
        """
        text = self.raw_text.split()
        return text.index(word)

    def select_candidate(self):
        """
        Select the best candidate for each ambiguous word, based on the POS
        of the previous (already resolved) word, then substitute the chosen
        diacritized form into the text.
        :return: the accumulated analysis results
        """
        candidates = self.candidates
        ambiguous = self.ambiguous_words
        words = self.raw_text.split()
        pos = self.diacritized_text_pos.split()
        for i in range(0, len(words)):
            # Only handle an ambiguous word ("?") whose previous word is
            # already resolved; the bounds check must come first.
            if (i - 1) >= 0 and pos[i] == "?" and pos[i - 1] != "?":
                NEXT = self.find_index(words[i])
                PREV = NEXT - 1
                cand_index = ambiguous.index(words[NEXT])
                prob = 0
                cand_best = -1
                occurrence_count_best = -1
                transl = candidates[cand_index][0]['transl']
                arabic = candidates[cand_index][0]['arabic']
                for solution in candidates[cand_index][0]['solution']:
                    cand_pos = solution['pos'][1]
                    current_cand_prob, occurrence_count = self.prob(pos[PREV], cand_pos)
                    # Prefer the higher bigram probability; break ties with
                    # the candidate's corpus occurrence count.
                    if current_cand_prob > prob or (current_cand_prob == prob and occurrence_count > occurrence_count_best):
                        prob = current_cand_prob
                        cand_best = solution
                        occurrence_count_best = occurrence_count
                self.diacritized_text = self.diacritized_text.replace(words[NEXT], cand_best['word'][0])
                pos[i] = cand_best['pos'][1]
                self.diacritized_text_pos = " ".join(pos)
                self.analyzed_text_result.append({'transl': transl,
                                                  'arabic': arabic,
                                                  'word': cand_best['word'],
                                                  'pos': cand_best['pos'],
                                                  'gloss': cand_best['gloss']})
        return self.analyzed_text_result

    def search(self, text, key):
        """
        Return the sentences in 'text' that match the regex 'key'.
        """
        return [sent for sent in text if re.search(key, sent)]

    def get_subsentences(self, sents, key):
        """
        For every occurrence of 'key' in a sentence, collect the word that
        immediately follows it.
        """
        subsentences = []
        for sent in sents:
            words = sent.split()
            for i in range(0, len(words)):
                if key in words[i] and i < len(words) - 1:
                    subsentences.append(words[i + 1])
        return subsentences

    def split(self, token, returnval="pos"):
        """
        Split a 'word/POS' token and return either its POS tag or its word.
        """
        parts = token.split('/')
        if returnval == "pos":
            return parts[1]
        else:
            return parts[0]

    def prob(self, word1, word2):
        """
        Compute the probability that the POS tag of 'word1' is followed by
        the POS tag of 'word2' in the corpus:
            prob = count(w1 followed by w2) / count(w1)
        :param word1: 'word/POS' token of the previous word
        :param word2: 'word/POS' token of the candidate
        :return: (prob, occurrence count of word2's word in the corpus)
        """
        w1 = self.split(word1, "pos")
        w2 = self.split(word2, "pos")
        count_word2 = len(self.search(self.corpus, self.split(word2, "word")))
        filtered = self.search(self.corpus, w1)
        followers_w1 = self.get_subsentences(filtered, w1)
        followers_w1_w2 = self.search(followers_w1, w2)
        if not followers_w1:
            # Avoid division by zero when w1 never occurs in the corpus.
            return 0.0, count_word2
        prob = len(followers_w1_w2) / float(len(followers_w1))
        if w1 == w2:
            prob /= 2
        return prob, count_word2
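    # Worked example with hypothetical counts: if the tag w1 occurs 12 times
    # in the corpus and is followed by a word matching w2 in 3 of those
    # cases, prob = 3 / 12 = 0.25.  The second return value (how often the
    # candidate word itself occurs) is only used as a tie-breaker in
    # select_candidate().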

    def print_result(self):
        """
        Reformat the output & print it.
        :return: the analysis results
        """
        print('Sentence :')
        print(self.raw_text)
        print('With Diacritics :')
        print(self.diacritized_text)
        print('POS :')
        print(self.diacritized_text_pos)
        for result in self.analyzed_text_result:
            word = '\nWord : \t' + '\t'.join(filter(None, result['word']))
            root = '\nRoot : \t' + self.stemming(result['word'][0])[0]
            gloss = '\nGloss : \t' + result['gloss'][1]
            pos = '\nPOS : \t' + '\t'.join(filter(None, result['pos']))
            print(word, root, pos, gloss)
        return self.analyzed_text_result


if __name__ == '__main__':
    arabycia = Arabycia()
    # "The writer recalls in this novel how it turned from a city of lights
    # into a city of ghosts."
    text = 'يستعيد الكاتب في هذه الرواية كيف تحولت من مدينة للانوار الي مدينة للاشباح'
    arabycia.set_raw_text(text)
    arabycia.analyze()
    # "The historian collects his thoughts."
    text = 'يستجمع المؤرخ أفكاره'
    arabycia.set_raw_text(text)
    arabycia.analyze()
    # Find the words in the current text whose root is 'جمع' ("to gather").
    search_result = arabycia.text_search("جمع")
    print(search_result)
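
    # A hedged sketch of the standalone helpers (independent of analyze()):
    print(arabycia.tokenization(text))               # word tokens
    print(arabycia.stemming(text))                   # ISRI stems, one per token
    print(Arabycia.transliteration('كتاب'))          # Buckwalter: expected 'ktAb'
    print(Arabycia.reverse_transliteration('ktAb'))  # back to 'كتاب'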