Commit: source folder renamed
hank110 committed Mar 17, 2019
1 parent 59063f8 commit 9944b52
Showing 4 changed files with 179 additions and 0 deletions.
1 change: 1 addition & 0 deletions bagofconcepts/__init__.py
@@ -0,0 +1 @@
from .boc import BOCModel
120 changes: 120 additions & 0 deletions bagofconcepts/boc.py
@@ -0,0 +1,120 @@
from collections import Counter


import numpy as np
from scipy.sparse import csr_matrix
import scipy.sparse
from sklearn.utils.extmath import safe_sparse_dot
from gensim.models import Word2Vec, KeyedVectors
from spherecluster import SphericalKMeans


class BOCModel:

def __init__(self, doc_path=None, model_path=None, embedding_dim=200,
context=8, min_freq=100, num_concept=100, iterations=5):
        # Either a raw corpus (doc_path) or a pre-trained word2vec model
        # (model_path) must be supplied; both paths end in a numpy ndarray
        # of word vectors, so other embedding methods can be swapped in.
if doc_path is None and model_path is None:
raise ValueError("Must specify either the document path or pre-trained word2vec path")

self.doc_path=doc_path
self.model_path=model_path
self.embedding_dim=embedding_dim
self.context=context
self.min_freq=min_freq
self.num_concept=num_concept
self.iterations=iterations


def fit(self, save_path=""):

        if self.model_path is not None:
            wv, idx2word=load_w2v(self.model_path)
else:
wv, idx2word=train_w2v(self.doc_path, self.embedding_dim,
self.context, self.min_freq, self.iterations, save_path)

wv_cluster_id=_cluster_wv(wv, self.num_concept)
bow=_create_bow(idx2word, self.doc_path)
w2c=_create_w2c(idx2word, wv_cluster_id, self.num_concept)
boc=_apply_cfidf(safe_sparse_dot(bow, w2c))

if save_path:
_save_boc(save_path, boc, idx2word, wv_cluster_id)

        return boc, list(zip(idx2word, wv_cluster_id)), idx2word


def _save_boc(filepath, boc, idx2word, wv_cluster_id):
scipy.sparse.save_npz(filepath+'/boc_matrix.npz', boc)
with open(filepath+'/word2context.txt', 'w') as f:
for wc_pair in zip(idx2word, wv_cluster_id):
f.write(str(wc_pair)+'\n')


def _cluster_wv(wv, num_concept):
skm=SphericalKMeans(n_clusters=num_concept)
skm.fit(wv)
return skm.labels_


def _create_bow(idx2word, doc_path):
    rows=[]
    cols=[]
    vals=[]
    num_docs=0
    word2idx={word:idx for idx, word in enumerate(idx2word)}
    with open(doc_path, "r") as f:
        for i, doc in enumerate(f):
            num_docs=i+1
            tokens=doc.rstrip().split(" ")
            tokens_count=Counter(word2idx[token] for token in tokens if token in word2idx)
            for idx, count in tokens_count.items():
                rows.append(i)
                cols.append(idx)
                vals.append(float(count))
    # num_docs (rather than i+1) keeps the shape well-defined for an empty corpus
    return csr_matrix((vals, (rows, cols)), shape=(num_docs, len(word2idx)))


def _create_w2c(idx2word, cluster_label, num_concept):
    if len(idx2word)!=len(cluster_label):
        raise IndexError("Dimensions between words and labels mismatched")

    rows=list(range(len(idx2word)))
    cols=list(cluster_label)
    vals=[1.0]*len(idx2word)

    return csr_matrix((vals, (rows, cols)), shape=(len(idx2word), num_concept))


def _apply_cfidf(doc_concept_matrix):
    # cf: number of documents in which each concept appears at least once;
    # icf = log(N / cf), with never-occurring concepts weighted zero.
    num_docs, num_concepts=doc_concept_matrix.shape
    _, nz_concept_idx=doc_concept_matrix.nonzero()
    cf=np.bincount(nz_concept_idx, minlength=num_concepts)
    with np.errstate(divide='ignore'):
        icf=np.log(num_docs / cf)
    icf[np.isinf(icf)]=0
    return safe_sparse_dot(doc_concept_matrix, scipy.sparse.diags(icf))


def tokenize(doc_path):
with open(doc_path, "r") as f:
for doc in f:
yield doc.rstrip().split(" ")


class Corpus(object):
    """Restartable wrapper around tokenize(): Word2Vec iterates over the
    corpus once for the vocabulary scan and once per epoch, so a one-shot
    generator would be exhausted after the first pass."""
    def __init__(self, doc_path):
        self.doc_path=doc_path

    def __iter__(self):
        return tokenize(self.doc_path)


def train_w2v(doc_path, embedding_dim, context, min_freq, iterations, save_path=""):
    tokenized_docs=Corpus(doc_path)
    model=Word2Vec(size=embedding_dim, window=context, min_count=min_freq, sg=1)
    model.build_vocab(tokenized_docs)
    model.train(tokenized_docs, total_examples=model.corpus_count, epochs=iterations)

    if save_path:
        model_name="/w2v_model_d%d_w%d" % (embedding_dim, context)
        model.wv.save_word2vec_format(save_path+model_name)

    return model.wv.vectors, model.wv.index2word


def load_w2v(model_path):
    wv=KeyedVectors.load_word2vec_format(model_path)
    # match train_w2v's return signature: (vectors, index2word)
    return wv.vectors, wv.index2word
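
A minimal usage sketch of the BOCModel API above, assuming a corpus file with one whitespace-tokenized document per line; the file name and parameter values are illustrative, not part of the commit.

from bagofconcepts import BOCModel

# "corpus.txt" is an illustrative name: one document per line,
# tokens separated by single spaces
model = BOCModel(doc_path="corpus.txt", embedding_dim=200, context=8,
                 min_freq=100, num_concept=100, iterations=5)

# boc: sparse document-by-concept matrix with CF-IDF weights
# word2concept: list of (word, concept_id) pairs
# idx2word: vocabulary in the column order of the bag-of-words step
boc, word2concept, idx2word = model.fit()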
41 changes: 41 additions & 0 deletions bagofconcepts/pagerank.py
@@ -0,0 +1,41 @@
import math
from collections import defaultdict

import numpy as np

# NOTE: this script expects two precomputed structures that are not defined
# in this file: `concept_to_words` (concept id -> list of member words) and
# `M_cooccurrence` (word -> {co-occurring word: count}).

max_iteration=1000
epsilon=0.00001
damping_factor=0.15
output_name="pagerank.csv"


for concept, words in concept_to_words.items():
    # Initialization: each word starts with its share of the concept's
    # total co-occurrence mass
M_PageRank = defaultdict(int)
total_coocc=0
for word in words:
M_PageRank[word]=sum(M_cooccurrence[word].values())
total_coocc+=sum(M_cooccurrence[word].values())
for word, pr_value in M_PageRank.items():
M_PageRank[word]=pr_value/total_coocc
#M_PageRank[word]=pr_value//(get_tf(word)/get_df(word))

# Iteration
for _ in range(max_iteration):
        old_PageRank=dict(M_PageRank)  # snapshot copy; plain assignment would alias the dict being updated
old_PageRank_vector=np.array(list(old_PageRank.values()))
for word, pr_value in old_PageRank.items():
update_pr_value=0
for linked_word in M_cooccurrence[word].keys():
#update_pr_value+=(M_cooccurrence[word][linked_word]*old_PageRank[linked_word])/sum(M_cooccurrence[linked_word].values())
update_pr_value+=old_PageRank[linked_word]/len(M_cooccurrence[linked_word].keys())
            ## Should be divided by the number of documents
            if len(M_cooccurrence[word])==0:
                alpha=0
            else:
                # the +0.0000001 guard is redundant once the empty case is handled
                alpha=damping_factor/len(M_cooccurrence[word])
            #alpha = 1.0 / float(len(words)+0.0000001) * damping_factor
            M_PageRank[word]=update_pr_value*(1-damping_factor)+alpha
        delta=np.linalg.norm(np.array(list(M_PageRank.values()))-old_PageRank_vector)
if delta < epsilon:
print("...PageRank converged after %s iterations" %str(_))
break
if _ % 100 == 0: print("...number of iterations: %s" %str(_))
print("PageRank calculation completed")
print("delta: %s" %str(delta))
    # open the output once per concept rather than once per word
    with open(output_name, "a") as f:
        for word, pr_value in sorted(M_PageRank.items(), key=lambda x: x[1], reverse=True):
            f.write('%d, %s, %.5f\n' % (concept, word, pr_value))
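
Since pagerank.py consumes structures built elsewhere, here is a minimal sketch of one plausible shape for its inputs, purely illustrative; the real pipeline presumably derives them from the concept clusters and corpus co-occurrence counts.

# Hypothetical inputs -- the names match pagerank.py, the shapes are assumed
concept_to_words = {0: ["cat", "dog"], 1: ["car", "road"]}  # concept id -> member words
M_cooccurrence = {                                          # word -> {neighbor: count}
    "cat": {"dog": 3},
    "dog": {"cat": 3},
    "car": {"road": 5},
    "road": {"car": 5},
}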
17 changes: 17 additions & 0 deletions bagofconcepts/utils.py
@@ -0,0 +1,17 @@
import os
import psutil


# This code was provided by Hyunjoong Kim.

def get_available_memory():
"""It returns remained memory as percentage"""

mem = psutil.virtual_memory()
return 100 * mem.available / (mem.total)

def get_process_memory():
"""It returns the memory usage of current process"""

process = psutil.Process(os.getpid())
return process.memory_info().rss / (1024 ** 3)
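
A small sketch of how these helpers might be used to guard a memory-heavy step; the 10% threshold is an arbitrary assumption.

from bagofconcepts.utils import get_available_memory, get_process_memory

if get_available_memory() < 10:  # assumed threshold: less than 10% of RAM free
    print("Low memory: this process is using %.2f GiB" % get_process_memory())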
