Showing 4 changed files with 179 additions and 0 deletions.
@@ -0,0 +1 @@
from .boc import BOCModel
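
This is presumably the package's `__init__.py`, re-exporting BOCModel at the package root. A minimal import sketch, assuming the package directory is named `boc` (the diff does not show file names):

from boc import BOCModel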
@@ -0,0 +1,120 @@
from collections import Counter

import numpy as np
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import safe_sparse_dot
from gensim.models import Word2Vec, KeyedVectors
from spherecluster import SphericalKMeans


class BOCModel:

    def __init__(self, doc_path=None, model_path=None, embedding_dim=200,
                 context=8, min_freq=100, num_concept=100, iterations=5):
        # Either a raw corpus (doc_path) or a pre-trained word2vec model
        # (model_path) must be supplied; both routes yield a numpy ndarray
        # of word embeddings.
        if doc_path is None and model_path is None:
            raise ValueError("Must specify either the document path or pre-trained word2vec path")

        self.doc_path = doc_path
        self.model_path = model_path
        self.embedding_dim = embedding_dim
        self.context = context
        self.min_freq = min_freq
        self.num_concept = num_concept
        self.iterations = iterations

    def fit(self, save_path=""):
        if self.model_path is not None:
            # Bug fix: load the pre-trained vectors from model_path, not doc_path.
            wv, idx2word = load_w2v(self.model_path)
        else:
            wv, idx2word = train_w2v(self.doc_path, self.embedding_dim,
                                     self.context, self.min_freq, self.iterations, save_path)

        wv_cluster_id = _cluster_wv(wv, self.num_concept)
        bow = _create_bow(idx2word, self.doc_path)
        w2c = _create_w2c(idx2word, wv_cluster_id, self.num_concept)
        boc = _apply_cfidf(safe_sparse_dot(bow, w2c))

        if save_path:
            _save_boc(save_path, boc, idx2word, wv_cluster_id)

        return boc, list(zip(idx2word, wv_cluster_id)), idx2word


def _save_boc(filepath, boc, idx2word, wv_cluster_id):
    scipy.sparse.save_npz(filepath + '/boc_matrix.npz', boc)
    with open(filepath + '/word2context.txt', 'w') as f:
        for wc_pair in zip(idx2word, wv_cluster_id):
            f.write(str(wc_pair) + '\n')


def _cluster_wv(wv, num_concept):
    # Cluster word vectors on the unit sphere; each cluster is one concept.
    skm = SphericalKMeans(n_clusters=num_concept)
    skm.fit(wv)
    return skm.labels_


def _create_bow(idx2word, doc_path):
    rows = []
    cols = []
    vals = []
    word2idx = {word: idx for idx, word in enumerate(idx2word)}
    num_docs = 0
    with open(doc_path, "r") as f:
        for i, doc in enumerate(f):
            num_docs += 1
            tokens = doc.rstrip().split(" ")
            token_counts = Counter(word2idx[token] for token in tokens if token in word2idx)
            for idx, count in token_counts.items():
                rows.append(i)
                cols.append(idx)
                vals.append(float(count))
    # num_docs (rather than i + 1) keeps the shape well-defined for an empty file.
    return csr_matrix((vals, (rows, cols)), shape=(num_docs, len(word2idx)))


def _create_w2c(idx2word, cluster_label, num_concept):
    # One-hot word-to-concept assignment matrix (|vocab| x num_concept).
    if len(idx2word) != len(cluster_label):
        raise IndexError("Dimensions between words and labels mismatched")

    rows = list(range(len(idx2word)))
    cols = list(cluster_label)
    vals = [1.0] * len(idx2word)

    return csr_matrix((vals, (rows, cols)), shape=(len(idx2word), num_concept))


def _apply_cfidf(boc_matrix):
    # CF-IDF: weight each concept count by log(num_docs / concept_frequency),
    # analogous to TF-IDF with concepts in place of terms. The parameter is
    # renamed from `csr_matrix` so it no longer shadows the scipy import.
    num_docs, num_concepts = boc_matrix.shape
    _, nz_concept_idx = boc_matrix.nonzero()
    cf = np.bincount(nz_concept_idx, minlength=num_concepts)
    with np.errstate(divide='ignore'):
        icf = np.log(num_docs / cf)
    icf[np.isinf(icf)] = 0
    return safe_sparse_dot(boc_matrix, scipy.sparse.diags(icf))


def tokenize(doc_path):
    # Re-opens the corpus on each call, so every invocation yields a fresh iterator.
    with open(doc_path, "r") as f:
        for doc in f:
            yield doc.rstrip().split(" ")


def train_w2v(doc_path, embedding_dim, context, min_freq, iterations, save_path=""):
    # Call tokenize() separately for each pass: a single generator would be
    # exhausted by build_vocab() and leave nothing for train().
    model = Word2Vec(size=embedding_dim, window=context, min_count=min_freq, sg=1)
    model.build_vocab(tokenize(doc_path))
    model.train(tokenize(doc_path), total_examples=model.corpus_count, epochs=iterations)

    if save_path:
        model_name = "/w2v_model_d%d_w%d" % (embedding_dim, context)
        model.wv.save_word2vec_format(save_path + model_name)

    return model.wv.vectors, model.wv.index2word


def load_w2v(model_path):
    # Return the same (vectors, vocabulary) pair as train_w2v so fit()
    # can use either path interchangeably.
    kv = KeyedVectors.load_word2vec_format(model_path)
    return kv.vectors, kv.index2word
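
A minimal usage sketch of the model above, assuming a whitespace-tokenized corpus file with one document per line; the file and directory names here are hypothetical:

from boc import BOCModel

# "corpus.txt": one pre-tokenized document per line (hypothetical path).
# The "output" directory must already exist for _save_boc to write into it.
model = BOCModel(doc_path="corpus.txt", embedding_dim=200, num_concept=100)
boc, word2concept, vocab = model.fit(save_path="output")
print(boc.shape)  # (number of documents, num_concept)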
@@ -0,0 +1,41 @@
import math
from collections import defaultdict

import numpy as np

# Assumes `concept_to_words` (concept id -> list of member words) and
# `M_cooccurrence` (word -> Counter of co-occurring words) are built
# earlier in the pipeline.
max_iteration = 1000
epsilon = 0.00001
damping_factor = 0.15
output_name = "pagerank.csv"


for concept, words in concept_to_words.items():
    # Initialize each word's score as its share of the concept's total
    # co-occurrence mass.
    M_PageRank = defaultdict(float)
    total_coocc = 0
    for word in words:
        M_PageRank[word] = sum(M_cooccurrence[word].values())
        total_coocc += sum(M_cooccurrence[word].values())
    for word, pr_value in M_PageRank.items():
        M_PageRank[word] = pr_value / total_coocc
        # Alternative initializer: pr_value / (get_tf(word) / get_df(word))

    # Power iteration until the L2 change between sweeps drops below epsilon.
    for iteration in range(max_iteration):
        # Snapshot the previous scores: the original aliased the dict, so
        # "old" values were overwritten mid-sweep and the delta vector could
        # change length as the defaultdict grew.
        old_PageRank = dict(M_PageRank)
        old_PageRank_vector = np.array(list(old_PageRank.values()))
        for word in old_PageRank:
            update_pr_value = 0.0
            for linked_word in M_cooccurrence[word]:
                # Alternative: weight by co-occurrence counts,
                # M_cooccurrence[word][linked_word] / sum(M_cooccurrence[linked_word].values()).
                # Should arguably be divided by the number of documents.
                update_pr_value += old_PageRank.get(linked_word, 0.0) / len(M_cooccurrence[linked_word])
            # Damping term spread over the word's out-links; zero for words
            # with no co-occurrence links.
            if len(M_cooccurrence[word]) == 0:
                alpha = 0.0
            else:
                alpha = damping_factor / len(M_cooccurrence[word])
            M_PageRank[word] = update_pr_value * (1 - damping_factor) + alpha
        delta = math.sqrt(np.sum((np.array(list(M_PageRank.values())) - old_PageRank_vector) ** 2))
        if delta < epsilon:
            print("...PageRank converged after %d iterations" % iteration)
            break
        if iteration % 100 == 0:
            print("...number of iterations: %d" % iteration)
    print("PageRank calculation completed")
    print("delta: %s" % str(delta))
    # Open the output file once per concept rather than once per word.
    with open(output_name, "a") as f:
        for word, pr_value in sorted(M_PageRank.items(), key=lambda x: x[1], reverse=True):
            f.write('%d, %s, %.5f\n' % (concept, word, pr_value))
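
For context, a minimal sketch of the inputs the script above expects; these toy structures are inferred from usage and are not part of the commit:

from collections import Counter

# Hypothetical inputs: concept id -> member words, word -> co-occurrence counts.
concept_to_words = {0: ["cat", "dog"], 1: ["car", "road"]}
M_cooccurrence = {
    "cat": Counter({"dog": 3}),
    "dog": Counter({"cat": 3, "car": 1}),
    "car": Counter({"dog": 1, "road": 2}),
    "road": Counter({"car": 2}),
}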
@@ -0,0 +1,17 @@
import os

import psutil


# This code is provided by Hyunjoong Kim

def get_available_memory():
    """Return the available system memory as a percentage of the total."""
    mem = psutil.virtual_memory()
    return 100 * mem.available / mem.total


def get_process_memory():
    """Return the resident memory usage of the current process in GiB."""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 ** 3)
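
A quick usage sketch of the two helpers:

print("available memory: %.1f%%" % get_available_memory())
print("process memory: %.2f GiB" % get_process_memory())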