post_processings.py
# Copyright Software Engineering Analytics Lab (SEAL), Wayne State University, 2023
# Authors: Jaydeb Sarker <jaydebsarker@wayne.edu> and Amiangshu Bosu <abosu@wayne.edu>
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# version 3 as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import numpy as np
import pandas as pd
from itertools import chain

# Binarize the model's probability outputs: each value at or above the
# threshold (a float in [0, 1]) becomes 1, everything else stays 0.
def classify_by_threshold(prediction, thresh):
    predictions = np.zeros(prediction.shape)
    for i in range(prediction.shape[0]):
        for j in range(prediction.shape[1]):
            if prediction[i, j] >= thresh:
                predictions[i, j] = 1
    return predictions
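
# A minimal usage sketch (the probabilities below are made-up values, not
# outputs from the actual model):
#
#   probs = np.array([[0.2, 0.9, 0.55],
#                     [0.7, 0.1, 0.40]])
#   classify_by_threshold(probs, 0.5)
#   # -> array([[0., 1., 1.],
#   #           [1., 0., 0.]])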

# Decode the token ids back to text and wrap the predicted toxic spans in
# <toxic> ... </toxic> tags, producing an explainable output.
def convert_to_original_text_with_toxicity(encoded_text, pred, tokenizer):
    only_decoded_text = tokenizer.convert_ids_to_tokens(encoded_text)
    decode = [tokenizer.convert_tokens_to_string([t]) for t in only_decoded_text]
    if len(pred) == 0:
        # No toxic tokens were predicted; return the plain decoded text.
        return decode
    decoded_new = []
    prev = 0
    for i in range(len(pred)):
        pred_val = int(pred[i])
        # Guard against predicted indices that fall outside the sequence.
        if pred_val >= len(decode):
            print("Missing here", only_decoded_text, pred)
            break
        # A gap before this toxic token closes the previous toxic run.
        if pred_val > prev and prev > 0:
            decoded_new.append(['</toxic>'])
        # Copy the non-toxic tokens that precede this toxic token.
        decoded_new.append(decode[prev:pred_val])
        # Open a new toxic run after a gap, or at the very first token.
        if pred_val > prev or prev == 0:
            decoded_new.append(['<toxic>'])
        decoded_new.append([decode[pred_val]])
        prev = pred_val + 1
    # Close the last toxic run, then append the remaining tokens.
    if prev > 0:
        decoded_new.append(['</toxic>'])
    decoded_new.append(decode[prev:len(decode)])
    return list(chain.from_iterable(decoded_new))
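
# A minimal usage sketch, assuming a Hugging Face tokenizer (hypothetical;
# the original calling code is not shown in this file). The exact token
# strings depend on the tokenizer's vocabulary:
#
#   from transformers import BertTokenizer
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   ids = tokenizer.encode("you are an idiot", add_special_tokens=False)
#   # Suppose the model flags token index 3 ("idiot") as toxic:
#   convert_to_original_text_with_toxicity(ids, [3], tokenizer)
#   # -> ['you', 'are', 'an', '<toxic>', 'idiot', '</toxic>']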

# Collapse BIO-style tags to binary labels: any non-zero tag (B or I)
# becomes 1. We do not use this in our experiments.
def classify_bio_output(prediction):
    predictions = np.zeros((prediction.shape[0], prediction.shape[1]))
    for i in range(prediction.shape[0]):
        for j in range(prediction.shape[1]):
            if prediction[i, j] != 0:
                predictions[i, j] = 1
    return predictions
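
# A minimal usage sketch (the BIO tag matrix below is made-up, with 0 = O
# and non-zero values encoding B/I tags):
#
#   bio_tags = np.array([[0, 1, 2, 0],
#                        [1, 2, 2, 0]])
#   classify_bio_output(bio_tags)
#   # -> array([[0., 1., 1., 0.],
#   #           [1., 1., 1., 0.]])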