-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
78 lines (58 loc) · 1.96 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import nltk
import numpy as np
import scipy
import random
import re
from config import conf
from russian_tagsets import converters
# Shared resources taken from the project configuration object (config.conf).
morph = conf.MORPH  # morphological analyser; used below as morph.parse(word)
punct = conf.PUNCT  # punctuation characters; must be a str (passed to str.strip below)
model = conf.MODEL  # word -> vector lookup; indexed by vectorize_word
# Tag converter; presumably OpenCorpora (internal) tags -> Universal
# Dependencies 2.0, judging by the identifiers passed -- TODO confirm.
to_ud = converters.converter('opencorpora-int', 'ud20')
def clean_numbers(text):
    """Return ``text`` with every run of ASCII digits (0-9) removed."""
    return re.sub(r'[0-9]+', '', text)
def pymorphy_tagger(text, stops):
    """Lemmatise ``text`` and tag each kept token with a part of speech.

    Square brackets are replaced by spaces, the text is tokenised with
    ``nltk.word_tokenize`` and every token is stripped of leading and
    trailing ``punct`` characters. A token is discarded when it

    * is contained in ``stops``,
    * is a substring of ``punct`` (in practice: empty after stripping),
    * consists only of digits once all ``punct`` characters are removed
      (e.g. ``1,000``), or
    * is the literal string ``'nan'``.

    Args:
        text: Input string.
        stops: Container of tokens to drop (tested with ``in``).

    Returns:
        The surviving tokens as ``lemma_POS`` items joined by single spaces,
        where the lemma is the normal form of the first ``morph.parse``
        result and POS is the first whitespace-separated field returned by
        ``to_ud`` for that parse's ``tag.POS``.
    """
    text = text.replace('[', ' ').replace(']', ' ')
    # Deletion table built once per call. Unlike interpolating ``punct`` into
    # a regex character class, this cannot be broken by metacharacters such
    # as ']', '\\', '^' or '-' inside ``punct``.
    drop_punct = str.maketrans('', '', punct)
    parsed = []
    for token in nltk.word_tokenize(text):
        word = token.strip(punct)
        if word in stops or word in punct or word == 'nan':
            continue
        if word.translate(drop_punct).isdigit():
            continue
        # Parse once and reuse the analysis for both lemma and tag.
        analysis = morph.parse(word)[0]
        lemma = str(analysis.normal_form)
        pos = to_ud(str(analysis.tag.POS)).split()[0]
        parsed.append(lemma + '_' + pos)
    return ' '.join(parsed)
def cosine(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    Computed as ``dot(a, b.T) / (norm(a) * norm(b))``, where ``norm`` is
    ``np.linalg.norm`` with default arguments.

    Args:
        a: Array or array-like (lists are accepted).
        b: Array or array-like with a shape compatible with ``dot(a, b.T)``.

    Returns:
        The similarity with the same shape as ``dot(a, b.T)`` (a scalar for
        1-D inputs). If either input has zero norm the result is 0.0 rather
        than NaN, so all-zero vectors do not propagate NaN to callers.
    """
    a = np.asanyarray(a)
    b = np.asanyarray(b)
    dot = np.dot(a, b.T)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if scale == 0:
        # 0/0 would yield NaN plus a RuntimeWarning; define it as "no similarity".
        if np.ndim(dot) == 0:
            return np.float64(0.0)
        return np.zeros_like(dot, dtype=float)
    return dot / scale
def cos_sim(a, b):
    """Return the cosine similarity ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        a: Array or array-like.
        b: Array or array-like with a shape compatible with ``np.dot(a, b)``.

    Returns:
        The similarity with the same shape as ``np.dot(a, b)`` (a scalar for
        1-D inputs). If either input has zero norm the result is 0.0 rather
        than NaN.
    """
    dot = np.dot(a, b)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    if scale == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        if np.ndim(dot) == 0:
            return np.float64(0.0)
        return np.zeros_like(dot, dtype=float)
    return dot / scale
def text_cleaner(text, stopwords):
    """Lemmatise ``text`` and drop stop words, punctuation and numbers.

    Square and round brackets are replaced by spaces, the text is tokenised
    with ``nltk.word_tokenize`` and every token is stripped of leading and
    trailing ``punct`` characters. A token is discarded when it

    * is contained in ``stopwords``,
    * is a substring of ``punct`` (in practice: empty after stripping),
    * consists only of digits once all ``punct`` characters are removed
      (e.g. ``1,000``), or
    * is the literal string ``'nan'``.

    Args:
        text: Input string.
        stopwords: Container of tokens to drop (tested with ``in``).

    Returns:
        The normal forms (first ``morph.parse`` result) of the surviving
        tokens, joined by single spaces.
    """
    # One pass instead of four chained ``replace`` calls.
    text = text.translate(str.maketrans({bracket: ' ' for bracket in '[]()'}))
    # Deletion table built once per call. Unlike interpolating ``punct`` into
    # a regex character class, this cannot be broken by metacharacters such
    # as ']', '\\', '^' or '-' inside ``punct``.
    drop_punct = str.maketrans('', '', punct)
    cleaned = []
    for token in nltk.word_tokenize(text):
        word = token.strip(punct)
        if word in stopwords or word in punct or word == 'nan':
            continue
        if word.translate(drop_punct).isdigit():
            continue
        cleaned.append(str(morph.parse(word)[0].normal_form))
    return ' '.join(cleaned)
def vectorize_word(word):
    """Return the vector for ``word``, or a zero vector if it is unknown.

    Args:
        word: Key to look up in ``model``.

    Returns:
        ``model[word]`` when the lookup succeeds. On ``KeyError``, a
        ``np.zeros`` vector (float64) whose length equals that of the vector
        stored for the first entry of ``model.index_to_key``.
    """
    try:
        return model[word]
    except KeyError:
        # Only the vector length is needed, so read a fixed vocabulary entry
        # instead of a random one: this keeps the fallback deterministic and
        # leaves the global ``random`` state untouched.
        # Assumes every vector in ``model`` has the same length.
        return np.zeros(len(model[model.index_to_key[0]]))
def getList(dct):
    """Return the keys of ``dct`` as a new list, in iteration order.

    Args:
        dct: Any object exposing ``keys()`` (typically a dict).

    Returns:
        A list containing every key yielded by ``dct.keys()``.
    """
    # ``list(...)`` replaces the manual append loop and avoids rebinding the
    # builtin name ``list`` as a local variable.
    return list(dct.keys())