-
Notifications
You must be signed in to change notification settings - Fork 6
/
text_tonality_classifier.py
118 lines (81 loc) · 3.19 KB
/
text_tonality_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re
import numpy as np
from fuzzywuzzy import process, fuzz
from torch import nn, tensor
from slovnet.model.emb import NavecEmbedding
class TextTonalityClassifierByRules:
    """
    Classifier of the tonality of the text according to the rules.

    A row of words is labelled 1 when at least one of its words fuzzily
    matches an entry of the bad-word list, and 0 otherwise.
    """
    # Matches every character that is NOT a Latin letter or a Cyrillic letter
    # in the range а-я (case-insensitive); clear_text() removes such characters.
    # NOTE(review): the range а-я does not include 'ё'/'Ё', so those letters
    # are stripped as well - confirm that this is intended.
    valid_symbols_re = re.compile('[^a-zа-я]', flags=re.IGNORECASE)

    def __init__(self, bad_words: list, bad_word_threshold=0.75) -> None:
        """
        :param bad_words: list of bad words
        :param bad_word_threshold: float in the range 0 to 1; a word counts as
            bad when its best similarity score is strictly greater than
            ``bad_word_threshold * 100``
        """
        self.list_of_bad_words = bad_words
        self.bad_word_threshold = bad_word_threshold

    def clear_text(self, text: str) -> str:
        """
        Remove every character that is not matched as a letter by
        ``valid_symbols_re`` (digits, punctuation, whitespace, ...).

        :param text: str
        :return: clean text
        """
        return self.valid_symbols_re.sub('', text)

    def _is_bad_word(self, word: str, min_score: float) -> bool:
        """
        Tell whether the cleaned ``word`` is close enough to any bad word.

        :param word: raw word, cleaned with clear_text() before matching
        :param min_score: similarity score that must be strictly exceeded
        :return: True if the best match scores above ``min_score``
        """
        clear_word = self.clear_text(word)
        if clear_word == '':
            # Nothing but non-letters: there is nothing to compare.
            return False
        bad_words = self.list_of_bad_words
        if bad_words is None or len(bad_words) == 0:
            # Without candidates no word can match; skipping the lookup also
            # avoids indexing the empty result of extractOne.
            return False
        best_match = process.extractOne(clear_word, bad_words, scorer=fuzz.ratio)
        if best_match is None:
            return False
        # best_match[1] is the similarity score of the closest bad word.
        return best_match[1] > min_score

    def predict(self, x: list) -> np.ndarray:
        """
        :param x: input 2d list with the str. Example [['Hello','my','friends'],['My','name','is','Jack']]
        :return: numpy array with predictions (1 - the row contains a bad word, 0 - it does not).
            Example np.array([0,0])
        """
        # The threshold is given as a fraction, the compared scores are scaled by 100.
        min_score = self.bad_word_threshold * 100
        y = [
            1 if any(self._is_bad_word(word, min_score) for word in row) else 0
            for row in x
        ]
        return np.array(y)
class TextTonalityClassifierNN(nn.Module):
    """
    Neural network model for the classification of text tonality.

    Pipeline: Navec embedding -> three 1-D convolutions with ReLU -> GRU ->
    maximum over the sequence axis -> two fully connected layers.
    """

    def __init__(self, embedding_dim: int, gru_hidden_size: int, fc_hidden_size: int, output_size: int, navec) -> None:
        """
        :param embedding_dim: number of input channels of the first convolution
            (presumably the size of the navec vectors - confirm against the navec model)
        :param gru_hidden_size: hidden size of the GRU layer
        :param fc_hidden_size: size of the hidden fully connected layer
        :param output_size: number of values produced for every input row
        :param navec: navec model passed to NavecEmbedding
        """
        super().__init__()

        # Parameter-free activations shared by several stages.
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=-1)

        # Pretrained embedding built from the navec model.
        self.embedding = NavecEmbedding(navec)

        # Convolutional stack; each padding keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(embedding_dim, 512, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(512, 1024, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(1024, 2048, kernel_size=5, padding=2)

        # Recurrent layer over the convolved sequence, batch axis first.
        self.gru = nn.GRU(2048, gru_hidden_size, batch_first=True)

        # Classification head.
        self.fc1 = nn.Linear(gru_hidden_size, fc_hidden_size)
        self.fc2 = nn.Linear(fc_hidden_size, output_size)

    def forward(self, x: tensor) -> tensor:
        """
        Compute unnormalized scores for a batch.

        :param x: input accepted by the embedding layer
            (assumes a (batch_size, L) batch of token ids - TODO confirm)
        :return: output of the last linear layer, one row per batch item
        """
        # Conv1d convolves over the last axis, so the channel axis goes second.
        features = self.embedding(x).permute(0, 2, 1)
        for conv in (self.conv1, self.conv2, self.conv3):
            features = self.relu(conv(features))

        # Back to sequence-major layout for the batch_first GRU.
        sequence, _ = self.gru(features.permute(0, 2, 1))  # (batch_size, L, hidden_size)

        # Keep the largest activation of every hidden unit along the sequence.
        pooled = sequence.max(dim=1).values

        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

    def predict(self, x: tensor) -> tensor:
        """
        Run forward() and apply softmax over the last axis.

        :param x: same input as for forward()
        :return: softmax-normalized scores
        """
        logits = self.forward(x)
        return self.softmax(logits)