forked from vivek8031/CFI-Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
HelperGUI.py
executable file
·157 lines (121 loc) · 5 KB
/
HelperGUI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import mysql.connector
from threading import *
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
class ThreadWithReturnValue(Thread):
def __init__(self, group=None, target=None, name=None,
args=(), kwargs={}, Verbose=None):
Thread.__init__(self, group, target, name, args, kwargs)
self._return = None
def run(self):
#print(type(self._target))
if self._target is not None:
self._return = self._target(*self._args,
**self._kwargs)
def join(self, *args):
Thread.join(self, *args)
return self._return
def EditDistDP(str1, str2):
len1 = len(str1);
len2 = len(str2);
#To fill the DP array with 0
DP = [[0 for i in range(len1 + 1)] for j in range(2)]
#Base condition when second string is empty then we remove all characters
for i in range(len1 + 1):
DP[0][i] = i
#Start filling the DP This loop run for every character in second string
for i in range(1,len2 + 1):
#This loop compares the char from second string with first string characters
for j in range(len1 + 1):
#if first string is empty then we have to perform add character operation to get second string
if(j == 0):
DP[i % 2][j] = i
#if character from both string is same then we do not perform any operation . here i % 2 is for bound the row number.
elif(str1[j-1] == str2[i-1]):
DP[i % 2][j] = DP[(i - 1) % 2][j - 1]
#if character from both string is not same then we take the minimum from three specified operation
else:
DP[i % 2][j] = 1 + min(DP[(i - 1) % 2][j], min(DP[i % 2][j - 1], DP[(i - 1) % 2][j - 1]))
#after complete fill the DP array if the len2 is even then we end up in the 0th row else we end up
#in the 1th row so we take len2 % 2 to get row
S = (1 - (DP[len2 % 2][len1])/(max(len1,len2)))*100
return(S)
def Sentencing(sent1,sent2):
#Extracting words from the sentences
words1 = [w.lower() for w in word_tokenize(sent1) if w not in punctuation]
words2 = [w.lower() for w in word_tokenize(sent2) if w not in punctuation]
#Common words giving zero edit distance
words = set(words1) & set(words2)
common_words = [w for w in words]
common_words.sort()
sen = " ".join(common_words)
#Sentence 1 with alphabetically arranged words excluding common words
sen1 = [w for w in words1 if w not in words]
sen1.sort()
sent1 = " ".join(sen1)
#Similarly sentence 2
sen2 = [w for w in words2 if w not in words]
sen2.sort()
sent2 = " ".join(sen2)
#Sentences with common words at the beginning followed by alphabetically arranged words
sent1 = sen + ' ' + sent1
sent2 = sen + ' ' + sent2
return(EditDistDP(sent1,sent2))
def stripComments(sent):
#When flag =1 indicates the characters part of comment which need to be omitted
flag = 0
sentence = ''
for token in sent:
if token == '#':
flag = 1
if '\n' in token :
flag = 0
if flag == 1:
token = ''
sentence = sentence + token
return sentence
#Opening the query doc that needs to checked(User enter in *.txt form)
f1 = input("Enter the name of the file to be inspected: ")
with open (f1) as f:
tokens1 = sent_tokenize(f.read())
f.close()
sent1 = ' '.join(tokens1)
sent1 = stripComments(sent1)
num = int(input("Enter the Number of files needed to be checked with: ")) #Number of files to be checked with query doc
files =[] #To store file contents in the list
filename=[] #To store file names in the list
sqlcmd = "INSERT INTO DATAS (FILENAME, SIMILARITY) VALUES (%s,%s)"
#Opening the files
for i in range(num):
print("Enter the name of the ",i+1,"th file: ")
f2 = input()
inp = (f2,0)
mycursor.execute(sqlcmd, inp) #Entering the names of files into the database
filename.append(f2) #Store the file contents
with open (f2) as f:
tokens2 = sent_tokenize(f.read())
sent2 = ' '.join(tokens2)
sent2 = stripComments(sent2)
files.append(sent2)
f.close()
print()
mydb.commit()
#Creating threads for simultaneous checking of various documents
Tobj=[] #To store the returned percentages
for fil in files:
a = ThreadWithReturnValue(target=Sentencing, args=(sent1,fil))
a.start()
Tobj.append(a)
#Entering the returned %ofsimilarity into the database
for (val,fname) in zip(Tobj,filename):
perc=val.join()
sql = "UPDATE DATAS SET SIMILARITY = "+str(perc)+" WHERE FILENAME = '"+fname+"'"
mycursor.execute(sql)
mydb.commit()
#Retrieving data from database to be displayed to user
mycursor.execute("SELECT * FROM DATAS")
data = mycursor.fetchall()
for row in data:
print(row)
#mycursor.execute() - used to execute SQL commands
#mydb.commit() - Used to save changes made to the database