-
Notifications
You must be signed in to change notification settings - Fork 5
/
jaccard_profile.py
228 lines (173 loc) · 7.69 KB
/
jaccard_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import collections
import json
import psutil
import sys
from operator import itemgetter
from tqdm import tqdm
from timeit import default_timer
def file_path_to_list(file_path):
    """Split a path string into its ``/``-separated components.

    Args:
        file_path: Path string using ``/`` as the separator.

    Returns:
        List of components; empty components (leading, trailing or doubled
        slashes) are kept as empty strings.
    """
    components = file_path.split("/")
    return components
def fast_file_path_to_list(file_path):
    """Split a path into components and intern each component string.

    The original author's note reports roughly a 5% speed boost from
    interning the components.

    Args:
        file_path: Path string using ``/`` as the separator.

    Returns:
        List of interned component strings.
    """
    # On Python 3 the JSON parser already yields ``str`` (not ``bytes``),
    # so the components can be interned without an ``.encode()`` step.
    return list(map(sys.intern, file_path_to_list(file_path)))
def parse_file(f_in):
    """Load reviews from a JSON-lines file, ordered by ``changeId``.

    Args:
        f_in: Path of a file holding one JSON document per line.

    Returns:
        List of review dicts sorted by their ``'changeId'`` value. Each
        review's ``'files'`` entry is replaced by a ``map`` object that
        yields every path split into interned components. On Python 3 that
        object is lazy and can be iterated only once.
    """
    with open(f_in) as handle:
        records = [json.loads(raw_line) for raw_line in handle]
    records.sort(key=itemgetter('changeId'))
    for record in records:
        record['files'] = map(fast_file_path_to_list, record['files'])
    return records
def jaccard_params(reviewer_profiles, commit_count_words):
    """Compute the multiset intersection and union sizes of two Counters.

    Args:
        reviewer_profiles: ``collections.Counter`` to compare against (the
            caller in this file passes a single reviewer's profile).
        commit_count_words: ``collections.Counter`` of the commit's words.

    Returns:
        Tuple ``(intersection, union)`` of floats: the summed counts of
        ``commit_count_words & reviewer_profiles`` and of
        ``commit_count_words | reviewer_profiles``.
    """
    shared = commit_count_words & reviewer_profiles
    combined = commit_count_words | reviewer_profiles
    return float(sum(shared.values())), float(sum(combined.values()))
def jaccard(intersection, union):
    """Return the Jaccard similarity ``intersection / union``.

    Args:
        intersection: Size of the multiset intersection.
        union: Size of the multiset union.

    Returns:
        ``intersection / union``, or ``0.0`` when ``union`` is zero.
    """
    # An empty union means both multisets are empty (e.g. a commit without
    # files compared with an empty profile). Treat that as "no similarity"
    # instead of raising ZeroDivisionError.
    if not union:
        return 0.0
    return intersection / union
def jaccard_for_reviewer_profiles(reviewer_profiles, commit_count_words):
    """Group reviewer ids by their Jaccard score against a commit.

    Args:
        reviewer_profiles: Mapping of reviewer id to that reviewer's
            ``collections.Counter`` profile.
        commit_count_words: ``collections.Counter`` of the commit's words.

    Returns:
        ``defaultdict(list)`` mapping each score to the reviewer ids that
        obtained it, in the iteration order of ``reviewer_profiles``.
    """
    reviewers_by_score = collections.defaultdict(list)
    for reviewer_id, profile in reviewer_profiles.items():
        intersection, union = jaccard_params(profile, commit_count_words)
        reviewers_by_score[jaccard(intersection, union)].append(reviewer_id)
    return reviewers_by_score
def commit_files_to_words(file_paths):
    """Build the word multiset of a commit from its split file paths.

    Args:
        file_paths: Iterable of paths, each already split into a sequence
            of components.

    Returns:
        ``collections.Counter`` giving how many times each component occurs
        across all the paths.
    """
    return collections.Counter(
        component for path_parts in file_paths for component in path_parts
    )
def process_reviews(reviews):
    """Replay reviews in order, scoring reviewers and then updating profiles.

    For every review the file paths are turned into a word multiset, every
    existing reviewer profile is scored against it, the scores are handed to
    a ``TopN`` accumulator, and only then are the profiles of the review's
    actual approvers extended with the multiset.

    Args:
        reviews: Iterable of review dicts providing ``"files"`` (iterable of
            split paths) and ``"approve_history"`` (iterable of dicts with a
            ``'userId'`` key).

    Returns:
        A 3-tuple:
          * the value of ``TopN.results()`` (which also prints its report),
          * ``(total, total_transformation, total_update, total_similarity)``
            wall-clock timings measured with ``default_timer``,
          * ``(all_profiles_size, average_profile_size, memory_info)`` where
            the sizes come from ``sys.getsizeof`` and ``memory_info`` from
            ``psutil.Process().memory_info()``.
    """
    reviewer_profiles = collections.defaultdict(collections.Counter)
    topN = TopN()
    total_transformation = 0.0
    total_update = 0.0
    total_similarity = 0.0

    before = default_timer()
    for review in tqdm(reviews):
        # Phase 1: review -> multiset of path components.
        before_transformation = default_timer()
        commit_count_words = commit_files_to_words(review["files"])
        after_transformation = default_timer()
        total_transformation += after_transformation - before_transformation

        # Phase 2: similarity of every known profile to this review.
        before_similarity = default_timer()
        top = jaccard_for_reviewer_profiles(reviewer_profiles, commit_count_words)
        after_similarity = default_timer()
        total_similarity += after_similarity - before_similarity

        # Evaluate the ranking before the profiles learn from this review.
        topN.update(review, top)

        # Phase 3: fold the review into the profiles of its approvers.
        before_update = default_timer()
        for hist in review["approve_history"]:
            reviewer_profiles[hist['userId']] += commit_count_words
        after_update = default_timer()
        total_update += after_update - before_update
    after = default_timer()
    total = after - before

    # sys.getsizeof is shallow: it measures each Counter object itself, not
    # the strings and integers it refers to.
    all_profiles_size = float(
        sum(sys.getsizeof(profile) for profile in reviewer_profiles.values())
    )
    # No profiles (e.g. no reviews, or no approvers at all) must not end in
    # a ZeroDivisionError; report an average of 0.0 instead.
    if reviewer_profiles:
        average_profile_size = all_profiles_size / len(reviewer_profiles)
    else:
        average_profile_size = 0.0

    current_memory_info = psutil.Process().memory_info()
    return (
        topN.results(),
        (total, total_transformation, total_update, total_similarity),
        (all_profiles_size, average_profile_size, current_memory_info),
    )
class TopN:
    """Accumulate top-k hits, precision/recall inputs and MRR over reviews.

    ``update`` is called once per review with the scores of all reviewers;
    ``results`` prints and returns the aggregated metrics.
    """

    def __init__(self):
        # k -> number of reviews where an actual reviewer was in the top-k list
        self.prediction = collections.Counter()
        # k -> length of the top-k list produced for each review
        self.suggested_reviewers_count = collections.defaultdict(list)
        # reviewer id -> 'changeId' of the latest review they took part in
        self.reviewer_last_review_dates = collections.defaultdict(collections.Counter)
        # number of (review, reviewer) pairs processed
        self.reviewer_count = 0.0
        # sum of 1/k over reviewers found in a suggestion list ...
        self.mrr_sum = 0.0
        # ... and how many reviewers were found
        self.mrr_count = 0.0

    def update(self, review, top):
        """Score one review's suggestions against its actual reviewers.

        Args:
            review: Review dict with ``"approve_history"`` (iterable of
                dicts with a ``'userId'`` key) and ``'changeId'``.
            top: Mapping of similarity score to list of reviewer ids.
        """
        top_dict = get_top_by_date(top, self.reviewer_last_review_dates)
        reviewers = [hist['userId'] for hist in review["approve_history"]]

        for k, suggested in top_dict.items():
            self.suggested_reviewers_count[k].append(len(suggested))
            # At most one hit per review and k, however many reviewers match.
            if any(reviewer in suggested for reviewer in reviewers):
                self.prediction[k] += 1

        for reviewer in reviewers:
            self.reviewer_last_review_dates[reviewer] = review['changeId']
            # The first (smallest) k whose list contains the reviewer is
            # used as that reviewer's rank.
            for k, suggested in top_dict.items():
                if reviewer in suggested:
                    self.mrr_sum += 1.0 / k
                    self.mrr_count += 1
                    break
            self.reviewer_count += 1

    def results(self):
        """Print the evaluation report and return the aggregated metrics.

        Returns:
            Tuple ``(suggested_reviewers_count, precision, mrr)`` where
            ``precision`` is a Counter keyed by k and ``mrr`` is
            ``mrr_sum / mrr_count`` (``0.0`` when no reviewer was ever
            found in a suggestion list).
        """
        precision = collections.Counter()
        recall = collections.Counter()
        suggested_totals = {
            key: sum(self.suggested_reviewers_count[key])
            for key in self.prediction
        }
        for key, hits in self.prediction.items():
            precision[key] = float(hits) / suggested_totals[key]
            recall[key] = float(hits) / self.reviewer_count

        # Without any ranked hit there is nothing to average; avoid dividing
        # by zero and report 0.0.
        mrr = self.mrr_sum / self.mrr_count if self.mrr_count else 0.0

        print_topN(recall)
        print_mrr(mrr)
        print('----')
        for key, suggested_total in suggested_totals.items():
            print(key, suggested_total)
        print('----')
        print('----')
        for key, hits in self.prediction.items():
            print(key, hits)
        print('----')
        print_precision(precision)
        print_recall(recall)
        return self.suggested_reviewers_count, precision, mrr
def get_top_by_date(top, users_last_date, max_n=10):
    """Build the top-1 .. top-``max_n`` reviewer lists from scored reviewers.

    Reviewers are ordered by descending score; reviewers sharing a score are
    ordered by ``sorted_list_by_date``. Only strictly positive scores are
    considered.

    Args:
        top: Mapping of score to list of reviewer ids.
        users_last_date: Mapping of reviewer id to the value used to order
            reviewers that share a score.
        max_n: Largest list size to build (defaults to 10).

    Returns:
        ``defaultdict(list)`` mapping each ``n`` in ``1..max_n`` to the first
        ``n`` ranked reviewers. It is empty when ``top`` is empty or its best
        score is 0.
    """
    ret_top = collections.defaultdict(list)
    sorted_scores = sorted(top, reverse=True)

    # Nothing to suggest when there are no scores or the best score is 0.
    if not sorted_scores or sorted_scores[0] == 0:
        return ret_top

    ranked = []
    for score in sorted_scores:
        # Scores are visited in descending order, so the first non-positive
        # score ends the useful part; more than max_n entries are never read.
        if score <= 0 or len(ranked) >= max_n:
            break
        ranked.extend(sorted_list_by_date(top[score], users_last_date))

    # top-n is the first n entries of the overall ranking.
    for n in range(1, max_n + 1):
        ret_top[n].extend(ranked[:n])
    return ret_top
def sorted_list_by_date(users, users_last_date):
    """Order users from the highest to the lowest ``users_last_date`` value.

    Args:
        users: Iterable of user ids; duplicates are collapsed.
        users_last_date: Mapping of user id to a sortable value.

    Returns:
        List of distinct user ids, highest value first. Users with equal
        values keep their first-seen order.
    """
    last_date_of = {}
    for user in users:
        last_date_of[user] = users_last_date[user]
    return sorted(last_date_of, key=last_date_of.get, reverse=True)
def print_topN(prediction):
    """Print one ``Top <k> = <value>`` line per key, in ascending key order.

    Args:
        prediction: Mapping of k to a numeric value.
    """
    for rank in sorted(prediction):
        value = float(prediction[rank])
        print("Top %d = %f" % (rank, value))
def print_mrr(mrr):
    """Print the mean reciprocal rank on a single line.

    Args:
        mrr: Numeric mean reciprocal rank.
    """
    line = "Mean reciprocal rank = %f" % mrr
    print(line)
def print_precision(precision_top):
    """Print a ``Precision`` header, then each value in ascending key order.

    Args:
        precision_top: Mapping of k to a numeric precision value.
    """
    print("Precision")
    for k in sorted(precision_top):
        value = float(precision_top[k])
        print("%f" % value)
def print_recall(recall_top):
    """Print a ``Recall`` header, then each value in ascending key order.

    Args:
        recall_top: Mapping of k to a numeric recall value.
    """
    print("Recall")
    for k in sorted(recall_top):
        value = float(recall_top[k])
        print("%f" % value)
def main():
    """Evaluate every input file named on the command line.

    Each argument after the script name is parsed with ``parse_file`` and
    replayed with ``process_reviews``; the timing report, the profile sizes
    and the process memory info are then printed.
    """
    for input_path in sys.argv[1:]:
        metrics, timings, memory = process_reviews(parse_file(input_path))
        # Unpacked for shape checking; the metric values are not used here.
        suggested_reviewers_count, prediction, mrr = metrics
        total, transformation, update, similarity = timings
        all_profiles_size, average_profile_size, memory_info = memory
        print_time(total, transformation, update, similarity)
        print_size(all_profiles_size, average_profile_size)
        print(memory_info)
def print_time(total, transformation, update, similarity):
    """Print the four timing totals, one per line.

    Args:
        total: Overall processing time.
        transformation: Time spent turning reviews into multisets.
        update: Time spent updating reviewer profiles.
        similarity: Time spent computing profile-to-review similarity.
    """
    report = (
        ("Total time = %f", total),
        ("Total review to multiset transformation time = %f", transformation),
        ("Total profile update time = %f", update),
        ("Total profile to review similarity time = %f", similarity),
    )
    for template, seconds in report:
        print(template % seconds)
def print_size(all_profiles, average_profile):
    """Print the total and the average profile size, one per line.

    Args:
        all_profiles: Combined size of all profiles in bytes.
        average_profile: Average size of one profile in bytes.
    """
    report = (
        ("All profiles size in bytes = %f", all_profiles),
        ("Average profile size in bytes = %f", average_profile),
    )
    for template, size in report:
        print(template % size)
# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()