-
Notifications
You must be signed in to change notification settings - Fork 6
/
X_except_its_Y.py
executable file
·63 lines (57 loc) · 1.51 KB
/
X_except_its_Y.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
import gensim
from gensim.models import word2vec
import sys
# Load the cached Word2Vec model if one exists; otherwise train a new one on
# several NLTK corpora and cache it as "bigcorpus.wv" for later runs.
try:
    # Probe for the cache file first so a missing file fails before gensim is
    # asked to load it. The handle is closed as soon as the probe succeeds.
    with open("bigcorpus.wv", 'r'):
        pass
    model = gensim.models.Word2Vec.load("bigcorpus.wv")
except Exception:
    # Any failure to open or load the cache falls back to retraining.
    # Catching Exception (not a bare except) lets Ctrl-C / SystemExit abort
    # the script instead of silently kicking off a long training run.
    sys.stderr.write("Corpus not found; training...")
    sys.stderr.flush()
    import nltk
    # The NLTK corpora must already be installed locally; if they are not,
    # run nltk.download("all-corpora") once before using this script.
    from nltk.corpus import webtext, inaugural, abc, genesis, state_union, gutenberg

    class corpusIterator(object):
        """Re-iterable stream over the items inside every paragraph of several corpora.

        Each corpus object must provide a ``paras()`` method returning an
        iterable of paragraphs, each of which is itself iterable. The items
        of those paragraphs are yielded one by one. Progress markers are
        written to stderr while iterating.
        """

        def __init__(self, c):
            """Remember the sequence of corpus objects ``c`` to iterate over."""
            self.corpora = c

        def __iter__(self):
            """Yield every item of every paragraph of every corpus, in order."""
            for corpus in self.corpora:
                sys.stderr.write("\nNew corpus.")
                sys.stderr.flush()
                for paragraph in corpus.paras():
                    sys.stderr.write(".")
                    sys.stderr.flush()
                    for item in paragraph:
                        # Backspace then dot: redraws the last progress dot
                        # in place rather than printing one dot per item.
                        sys.stderr.write("\b.")
                        sys.stderr.flush()
                        yield item

    # gutenberg is imported above but is not part of the training set.
    sentences = corpusIterator([webtext, inaugural, abc, genesis, state_union])
    model = gensim.models.Word2Vec(sentences)
    model.save("bigcorpus.wv")
    sys.stderr.write("\ttrained!\n")
    sys.stderr.flush()
def doubleSplit(fname):
    """Read a text file and split it into lines, then each line into words.

    Args:
        fname: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in order. Each entry is
        the list of whitespace-separated tokens on that line; a blank line
        yields an empty list.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the file is closed even if reading fails.
    with open(fname, 'r') as f:
        return [line.split() for line in f]
# Blend two texts: for every word of the first file, print the word the model
# ranks most similar to the pair (word from file 1, word at the corresponding
# position in file 2). File 2 is wrapped around, line-wise and word-wise,
# whenever it is shorter than file 1.
if len(sys.argv) < 3:
    sys.stderr.write("usage: %s TEXT1 TEXT2\n" % sys.argv[0])
    sys.exit(1)
corpus1 = doubleSplit(sys.argv[1])
corpus2 = doubleSplit(sys.argv[2])
if not corpus2:
    # An empty second file would make the wrap-around modulo divide by zero.
    # Treat it as a single line holding one empty word, the same stand-in
    # used below for blank lines.
    corpus2 = [[""]]
# Newer gensim releases expose similarity queries on model.wv, older ones on
# the model itself; use whichever this installation provides.
vectors = getattr(model, "wv", model)
for l, line1 in enumerate(corpus1):
    # Wrap the line index; a blank partner line becomes one empty word so the
    # word index below can still be wrapped.
    line2 = corpus2[l % len(corpus2)] or [""]
    for w, word1 in enumerate(line1):
        word2 = line2[w % len(line2)]
        try:
            blended = vectors.most_similar(positive=[word1, word2])[0][0]
        except Exception:
            # Best effort: if the lookup fails for any reason (for example a
            # word the model does not know), echo the original word unchanged.
            blended = word1
        sys.stdout.write(blended)
        sys.stdout.write(" ")
        sys.stdout.flush()
    sys.stdout.write("\n")