forked from mfejzer/reviewers_recommendation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tversky_non_tie_breaking_profile.py
133 lines (98 loc) · 3.97 KB
/
tversky_non_tie_breaking_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import collections
import json
import psutil
import sys
from operator import itemgetter
from tqdm import tqdm
def tversky_params(reviewers, commit_count_words):
    """Return the three multiset sizes that feed the Tversky index.

    Both arguments are combined with the ``-`` and ``&`` multiset operators,
    so they are expected to be ``collections.Counter`` instances.

    Returns a tuple ``(i, d1, d2)``:

    * ``i``  -- summed counts of the multiset intersection,
    * ``d1`` -- summed surplus counts of ``reviewers`` over
      ``commit_count_words``,
    * ``d2`` -- summed surplus counts of ``commit_count_words`` over
      ``reviewers``.
    """
    shared = commit_count_words & reviewers
    profile_surplus = reviewers - commit_count_words
    commit_surplus = commit_count_words - reviewers
    return (
        sum(shared.values()),
        sum(profile_surplus.values()),
        sum(commit_surplus.values()),
    )
def callculate_tversky_for_reviewers(reviewers, commit_count_words, alpha=0.0):
    """Score every reviewer profile against a commit and group by score.

    Args:
        reviewers: mapping of reviewer id -> word-count profile (a
            ``collections.Counter``).
        commit_count_words: ``collections.Counter`` of the commit's words.
        alpha: weight of the profile-only surplus ``d1`` in the denominator;
            the commit-only surplus ``d2`` is weighted ``1 - alpha``. The
            default ``0.0`` reproduces the previously hard-coded value.

    Returns:
        ``(top, sorted_top)`` where ``top`` is a ``defaultdict(list)`` mapping
        each score to the reviewer ids that obtained it, and ``sorted_top`` is
        the list of distinct scores in descending order.
    """
    top = collections.defaultdict(list)
    for reviewer_id, profile in reviewers.items():
        i, d1, d2 = tversky_params(profile, commit_count_words)
        denominator = i + alpha * d1 + (1 - alpha) * d2
        # A commit with no words yields a zero denominator (and i == 0);
        # treat that as "no similarity" instead of raising ZeroDivisionError.
        score = float(i) / denominator if denominator else 0.0
        top[score].append(reviewer_id)

    sorted_top = sorted(top, reverse=True)
    return top, sorted_top
def get_top(sorted_top, top, max_rank=10):
    """Build cumulative top-1 .. top-``max_rank`` reviewer lists.

    Reviewers sharing a score are kept together (ties are not broken), so
    rank ``k`` holds every reviewer from the ``k`` best strictly positive
    score groups.

    Args:
        sorted_top: distinct scores in descending order.
        top: mapping of score -> list of reviewer ids with that score.
        max_rank: highest rank to produce (default 10, the value that used
            to be hard-coded). Values below 1 still produce rank 1.

    Returns:
        A ``defaultdict(list)`` keyed by rank (1-based). It is empty when
        there are no scores or when the best score is 0. Otherwise every
        rank from 1 to ``max_rank`` is present; once the positive scores run
        out, later ranks repeat the previous rank's reviewers.
    """
    ret_top = collections.defaultdict(list)

    # Nothing to recommend if there are no scores or the best score is zero.
    if not sorted_top or sorted_top[0] == 0:
        return ret_top

    ret_top[1].extend(top[sorted_top[0]])

    available = len(sorted_top)
    for rank in range(2, max_rank + 1):
        position = rank - 1
        # Add this rank's own score group when one exists and is positive ...
        if position < available and sorted_top[position] > 0:
            ret_top[rank].extend(top[sorted_top[position]])
        # ... then carry over everything already recommended at the rank above.
        ret_top[rank].extend(ret_top[rank - 1])
    return ret_top
def callculate_commit_files_to_words(files):
    """Count the "/"-separated path segments across all given file paths.

    ``files`` is an iterable of path strings; the result is a
    ``collections.Counter`` mapping each segment to its number of occurrences.
    """
    segment_counts = collections.Counter()
    for path in files:
        for segment in path.split("/"):
            segment_counts[segment] += 1
    return segment_counts
def parse_file(f_in):
    """Replay a JSON-lines review history and print recommendation metrics.

    Every non-blank line of ``f_in`` is parsed as a JSON object that must
    provide ``changeId``, ``files`` (path strings) and ``approve_history``
    (objects carrying a ``userId``). Reviews are processed in ``changeId``
    order. For each review the reviewers are ranked using only the profiles
    accumulated from earlier reviews, the ranking is compared with the actual
    approvers, and then each approver's profile is extended with the review's
    path words.

    Printed, in order: "Top k" hit ratios, MRR, precision and recall per
    rank, the process memory info reported by psutil, a ``####`` separator
    and the list of rank-1 suggestion sizes.

    Returns None.
    """
    with open(f_in) as f:
        # Skip blank lines (e.g. a trailing newline) rather than letting
        # json.loads fail on them.
        data = [json.loads(line) for line in f if line.strip()]
    data.sort(key=itemgetter('changeId'))

    prediction = collections.Counter()
    reviewers = collections.defaultdict(collections.Counter)
    suggested_reviewers_count = collections.defaultdict(list)
    mrr_sum = 0.0
    mrr_count = 0.0
    reviews_size = 0.0

    for d in tqdm(data):
        commit_count_words = callculate_commit_files_to_words(d["files"])
        top, sorted_top = callculate_tversky_for_reviewers(reviewers, commit_count_words)
        top_dict = get_top(sorted_top, top)
        approvers = [hist['userId'] for hist in d["approve_history"]]

        for k in top_dict:
            suggested = top_dict[k]
            suggested_reviewers_count[k].append(len(suggested))
            # A rank scores at most one hit per review: it is enough that
            # one actual approver appears among its suggestions.
            if any(user in suggested for user in approvers):
                prediction[k] += 1

        for reviewer in approvers:
            # Profiles are updated only after the ranking above was computed,
            # so a review never predicts its own approvers.
            reviewers[reviewer] += commit_count_words

            # Reciprocal rank uses the first rank that contains the approver.
            first_rank = next(
                (k for k in top_dict if reviewer in top_dict[k]), None
            )
            if first_rank is not None:
                mrr_sum += 1.0 / first_rank
                mrr_count += 1

        reviews_size += 1

    precision = collections.Counter()
    recall = collections.Counter()
    for key, value in prediction.items():
        precision[key] = float(value) / sum(suggested_reviewers_count[key])
        recall[key] = float(value) / reviews_size

    for p in sorted(prediction):
        print("Top %d = %f" % (p, float(prediction[p]) / reviews_size))

    # With empty input, or when no approver was ever suggested, there is
    # nothing to average; report 0 instead of dividing by zero.
    mrr = mrr_sum / mrr_count if mrr_count else 0.0
    print("MRR %f" % mrr)
    print_precision(precision)
    print_recall(recall)

    current_process = psutil.Process()
    current_memory_info = current_process.memory_info()
    print(current_memory_info)
    print('####')
    print(suggested_reviewers_count[1])
def print_precision(precision_top):
    """Print a "Precision" header, then each value in ascending key order.

    ``precision_top`` maps a rank to its precision; every value is printed on
    its own line in ``%f`` format.
    """
    print("Precision")
    ordered_ranks = sorted(precision_top)
    for rank in ordered_ranks:
        value = float(precision_top[rank])
        print("%f" % value)
def print_recall(recall_top):
    """Print a "Recall" header, then each value in ascending key order.

    ``recall_top`` maps a rank to its recall; every value is printed on its
    own line in ``%f`` format.
    """
    print("Recall")
    ordered_ranks = sorted(recall_top)
    for rank in ordered_ranks:
        value = float(recall_top[rank])
        print("%f" % value)
# Script driver: evaluate every input file named on the command line.
# parse_file prints its report and has no return statement, so ``users``
# is always None.
for f in sys.argv[1:]:
    users = parse_file(f)