-
Notifications
You must be signed in to change notification settings - Fork 1
/
Lopacity.py
572 lines (509 loc) · 18 KB
/
Lopacity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
from collections import Counter
import pandas as pd
import networkx as nx
import random
def init(g, degs, verbose=False):
"""
:bool verbose: Print graph stats
:array degs: Degree distibution of a graph
:Graph g: networkx graph
"""
degree_count = Counter(degs.values())
if verbose:
print("Totally " + str(len(degree_count)) + " distinct degrees")
print("Min degree " + str(min(degree_count)))
print("Max degree " + str(max(degree_count)))
print("Mode degree " + str(degree_count.most_common(1)[0][0]))
print("Avg degree " + str(np.mean(list(degs.values()))))
opacity = {}
for k in degree_count.keys():
opacity[k] = {}
for i in degree_count.keys():
opacity[k][i] = 0.0
opacity = pd.DataFrame(opacity)
return [degree_count, opacity]
def __opacity_calc(a1, a2, degs, deg_count, opacity, inv_opacity):
"""
Add to opacity matrix edge a1, a2
:param a1:
:param a2:
:param degs:
:param deg_count:
:param opacity:
:param inv_opacity:
"""
d1 = degs[a1]
d2 = degs[a2]
if d1 == d2:
added_value = deg_count[d1] * (deg_count[d1] - 1) / 2
else:
added_value = deg_count[d1] * deg_count[d2]
opacity[max(d1, d2)][min(d1, d2)] += 1.0 / added_value
if opacity[max(d1, d2)][min(d1, d2)] > 1.0:
opacity[max(d1, d2)][min(d1, d2)] = 1.0
if (max(d1, d2), min(d1, d2)) not in inv_opacity.keys():
inv_opacity[(max(d1, d2), min(d1, d2))] = [(a1, a2)]
else:
inv_opacity[(max(d1, d2), min(d1, d2))].append((a1, a2))
def calc_lopacity_matrix(g, L, degs, deg_count, opacity, inv_opacity):
"""
Calucalte opacity matrix with shortest path with level L
:param g: Graph
:param L:
:param degs:
:param deg_count:
:param opacity:
:param inv_opacity:
"""
opacity[opacity > 0.0] = 0.0
lapsp = nx.all_pairs_shortest_path_length(g, cutoff=L)
for i in lapsp:
for z in lapsp[i].keys():
if i < z:
__opacity_calc(i, z, degs, deg_count, opacity, inv_opacity)
def get_max_violated(g, theta, opacity, inv_opacity):
"""
Get the edes with maximum opacity value, exceeding $theta$
:param g:
:param theta:
:param opacity:
:param inv_opacity:
:return: Edges with maximum opacity
"""
kf = opacity[(opacity > theta) & (opacity == opacity.max().max())]
df_notnull = kf.notnull().unstack()
deg_vals = df_notnull[df_notnull].keys()
edges_viol = []
for (d1, d2) in deg_vals:
edges_viol.append(inv_opacity[(d1, d2)])
return edges_viol
def get_all_violated(g, theta, opacity, inv_opacity):
"""
Get all edges with with opacity value exceeding $theta$
:param g:
:param theta:
:param opacity:
:param inv_opacity:
:return: Edges with maximum opacity
"""
kf = opacity[opacity > theta]
df_notnull = kf.notnull().unstack()
deg_vals = df_notnull[df_notnull].keys()
edges_viol = []
for (d1, d2) in deg_vals:
edges_viol.append(inv_opacity[(d1, d2)])
return edges_viol
def edge_to_path(g, buckets):
"""
Transform path to edges, calcaulte inverted index
:param g:
:param buckets:
:return:
"""
cand = {}
inv_cand = {}
bucket_id = 0
for bucket in buckets:
cand_id = 0
cand[bucket_id] = {}
for edge in bucket:
# Get all shortest paths from A to B
edge_paths = [p for p in nx.all_shortest_paths(g, source=edge[0], target=edge[1])]
full_paths = []
# Transfrom shortest path into edge list:
# To remove the connection we need to remove everything
# withing one vertex pair of one cell in opacity matrix
for path in edge_paths:
full_path = []
for i in range(len(path) - 1):
full_path.append((path[i], path[i + 1]))
full_paths.append(full_path)
# Add the number of shortest path
cand[bucket_id][cand_id] = len(full_paths)
# Inverted candidates
f_id = 0
for path in full_paths:
for e in path:
if e not in inv_cand.keys():
inv_cand[e] = {}
if bucket_id not in inv_cand[e].keys():
inv_cand[e][bucket_id] = {}
if cand_id not in inv_cand[e][bucket_id].keys():
inv_cand[e][bucket_id][cand_id] = []
inv_cand[e][bucket_id][cand_id].append(f_id)
# Add some statistics maybe ?
f_id += 1
cand_id += 1
bucket_id += 1
return [cand, inv_cand]
def set_score_func(inv_cand, exceptSet=set([])):
"""
Apply score function to inverted indexes with edges
:param inv_cand:
"""
for e in inv_cand:
score_func = 1.0
for b_id in inv_cand[e]:
if b_id == 'score':
continue
for c_id in inv_cand[e][b_id]:
if b_id in exceptSet:
score_func /= (len(inv_cand[e][b_id][c_id]) + 1)
else:
score_func *= (len(inv_cand[e][b_id][c_id]) + 1)
if b_id in exceptSet:
score_func /= (len(inv_cand[e][b_id]) + 1) * (len(inv_cand[e][b_id]) + 1)
else:
score_func *= (len(inv_cand[e][b_id]) + 1) * (len(inv_cand[e][b_id]) + 1)
if b_id in exceptSet:
score_func /= (len(inv_cand[e]) + 1) * (len(inv_cand[e]) + 1) * (len(inv_cand[e]) + 1)
else:
score_func *= (len(inv_cand[e]) + 1) * (len(inv_cand[e]) + 1) * (len(inv_cand[e]) + 1)
inv_cand[e]['score'] = score_func
def opacity_diff(op, new_op):
pass
def remove_edge_lopacity_orig(g, L, degs, deg_count, opacity, edge_pairs_buckets, consider_all=False):
"""
Original lopacity model
:param consider_all:
:param g:
:param L:
:param degs:
:param deg_count:
:param opacity:
:param edge_pairs_buckets:
:return:
"""
cand, inv_cand = edge_to_path(g, edge_pairs_buckets)
edges = list(inv_cand.keys())
best_edge = edges[0]
best_lo = 1.1
best_pop = g.number_of_edges()
t = 0
rand_dec = False
if consider_all:
edges = g.edges()
for edge in edges:
g.remove_edge(edge[0], edge[1])
new_opacity = opacity.copy()
inv_op = {}
calc_lopacity_matrix(g, L, degs, deg_count, new_opacity, inv_op)
kf = new_opacity[(new_opacity == new_opacity.max().max())]
df_notnull = kf.notnull().unstack()
n_lo = len(df_notnull[df_notnull].keys())
max_lo = new_opacity.max().max()
if max_lo < best_lo:
best_lo = max_lo
t = 1
best_pop = n_lo
best_edge = edge
rand_dec = False
elif (max_lo == best_lo) and (n_lo < best_pop):
best_pop = n_lo
best_edge = edge
rand_dec = False
t = 1
elif (max_lo == best_lo) and (n_lo == best_pop):
rand = random.random()
t += 1
if rand < 1 / t:
best_edge = edge
rand_dec = True
g.add_edge(edge[0], edge[1])
# g.remove_edge(best_edge[0], best_edge[1])
print("Best edge to remove: ", best_edge)
print("Is the decision is random ? " + str(rand_dec))
return best_edge
def get_edges_intersect(g, edge_pairs_buckets, verbose=False):
"""
Inverted index model
:param g:
:param edge_pairs_buckets:
:return:
"""
cand, inv_cand = edge_to_path(g, edge_pairs_buckets)
set_score_func(inv_cand)
# get top candidates
top_cand = sorted(inv_cand.items(), key=lambda x: (x[1]['score'], random.random()), reverse=True)
if verbose:
print("Top candidates for removal: ")
print(top_cand)
print(" ")
print(" In each cell how many candidates and how many shortest paths there are")
print(cand)
for k in cand:
for k1 in cand[k]:
cand[k][k1] = set(range(cand[k][k1]))
rem_edges = []
finished = False
s_id = 0
while not finished:
# Take the first top candidate
s = top_cand[s_id]
# Get the edge id
edge = s[0]
# Get cell
for b_id in s[1].keys():
# Get candidate
if b_id == 'score':
continue
for c_id in s[1][b_id].keys():
# Remove from all that it has effect
cand[b_id][c_id] = cand[b_id][c_id] - set(s[1][b_id][c_id])
finished = True
for b_id in cand.keys():
completed = False
for c_id in cand[b_id].keys():
if len(cand[b_id][c_id]) == 0:
completed = True
if not completed:
finished = False
rem_edges.append(edge)
s_id += 1
return rem_edges
def get_edges_intersect_dynamic(g, edge_pairs_buckets):
"""
Inverted index model
:param g:
:param edge_pairs_buckets:
:return:
"""
exceptSet = set([])
cand, inv_cand = edge_to_path(g, edge_pairs_buckets)
set_score_func(inv_cand)
# get top candidates
top_cand = sorted(inv_cand.items(), key=lambda x: (x[1]['score'], random.random()), reverse=True)
# print("Top 10 candidates for removal: ")
# print(top_cand[:10])
# print(" ")
# print(" In each cell how many candidates and how many shortest paths there are")
# print(cand)
for k in cand:
for k1 in cand[k]:
cand[k][k1] = set(range(cand[k][k1]))
rem_edges = set([])
finished = False
s_id = 0
while not finished:
# Take the first top candidate
s = top_cand[s_id]
# Get the edge id
edge = s[0]
# Get cell
if edge in rem_edges:
s_id += 1
continue
for b_id in s[1].keys():
# Get candidate
if b_id == 'score':
continue
for c_id in s[1][b_id].keys():
# Remove from all that it has effect
cand[b_id][c_id] = cand[b_id][c_id] - set(s[1][b_id][c_id])
finished = True
for b_id in cand.keys():
completed = False
for c_id in cand[b_id].keys():
if len(cand[b_id][c_id]) == 0:
completed = True
if not completed:
finished = False
else:
exceptSet.add(b_id)
set_score_func(inv_cand, exceptSet)
top_cand = sorted(inv_cand.items(), key=lambda x: (x[1]['score'], random.random()), reverse=True)
# print("Scores are updated: ")
# print(top_cand)
# print(" --------- ")
s_id = -1
rem_edges.add(edge)
s_id += 1
return rem_edges
def set_score_func_opt(cand, inv_cand):
"""
Apply score function to inverted indexes with edges
:param inv_cand:
"""
for e in inv_cand:
score_func = [0, 0] # cells removed, cand removed, closer to remove
for b_id in inv_cand[e]:
if b_id == 'score':
continue
has_effect = False
for c_id in inv_cand[e][b_id]:
temp_set = cand[b_id][c_id] - set(inv_cand[e][b_id][c_id])
# The removal of this edge leads to removal of all shortest path => decrease of cell b_id
score_func[1] += -1
if len(temp_set) == 0:
has_effect = True
if has_effect:
score_func[0] += 1 # number of cell it affects
inv_cand[e]['score'] = score_func
def set_score_func_opt_2(cand, inv_cand):
"""
Apply score function to inverted indexes with edges
:param inv_cand:
"""
for e in inv_cand:
score_func = [0, 0, 0] # cells removed, cand removed, closer to remove
for b_id in inv_cand[e]:
if b_id == 'score':
continue
has_effect = False
for c_id in inv_cand[e][b_id]:
temp_set = cand[b_id][c_id] - set(inv_cand[e][b_id][c_id])
# The removal of this edge leads to removal of all shortest path => decrease of cell b_id
score_func[1] += -1
if len(temp_set) == 0:
has_effect = True
else:
score_func[2] += -len(temp_set)
if has_effect:
score_func[0] += 1 # number of cell it affects
inv_cand[e]['score'] = score_func
def get_edges_intersect_opt(g, edge_pairs_buckets, mode ='1', verbose=False):
"""
Inverted index model
:param verbose:
:param g:
:param edge_pairs_buckets:
:return:
"""
cand, inv_cand = edge_to_path(g, edge_pairs_buckets)
for k in cand:
for k1 in cand[k]:
cand[k][k1] = set(range(cand[k][k1]))
if mode == '1':
set_score_func_opt(cand, inv_cand)
else:
set_score_func_opt_2(cand, inv_cand)
top_cand = sorted(inv_cand.items(), key=lambda x: (x[1]['score'], random.random()), reverse=True)
if verbose:
print("Top candidates for removal: ")
print(top_cand)
print(" ")
print(" In each cell how many candidates and how many shortest paths there are")
print(cand)
s = top_cand[0] # get first top candidate
return [s[0]]
def intersect_anonimize(g, L, theta, score='stat', mode='val'):
"""
Anonimize graph g with intersect method and inverted index
:param mode: Mode to calculate :
'max' - edges that contribute to maximum of the opacity matrix
'val' - all violated edges
:param score:
'stat' - simple static score function
'dyn' - dynamically changing score
'opt' - local optimally
:param g: Graph for anonymization
:param L: Maximum significant distance
:param theta: The maximum allowed opacity value
"""
degs = g.degree(g)
deg_count, opacity = init(g, degs)
finished = False
g_hat = g.copy()
total_removed = 0
results = {}
while not finished:
# Main steps:
# 1. Calculate L-APSP:
inv_opacity = {}
calc_lopacity_matrix(g_hat, L, degs, deg_count, opacity, inv_opacity)
if total_removed == 0:
results['old_opacity'] = opacity.copy()
# 5. If the max is lower than theta - exit
if opacity.max().max() <= theta:
break
# 2. Get all violated vertex-pairs that have the maximum opacity value
if mode == 'val':
edge_pairs_buckets = get_all_violated(g_hat, theta, opacity, inv_opacity)
elif mode == 'max':
edge_pairs_buckets = get_max_violated(g_hat, theta, opacity, inv_opacity)
else:
raise NameError('Not valid parameter mode')
# 3. Get best edges for removal
if score == 'dyn':
rem_edges = get_edges_intersect_dynamic(g_hat, edge_pairs_buckets)
elif score == 'stat':
rem_edges = get_edges_intersect(g_hat, edge_pairs_buckets)
elif score == 'opt':
rem_edges = get_edges_intersect_opt(g_hat, edge_pairs_buckets, verbose=False)
elif score == 'opt2':
rem_edges = get_edges_intersect_opt(g_hat, edge_pairs_buckets, verbose=False, mode='2')
else:
raise NameError('Not valid parameter score')
# 4. Remove edges
g_hat.remove_edges_from(rem_edges)
print("Edges removed: ")
print(rem_edges)
total_removed += len(rem_edges)
#print("Total Removed edges : ")
#print(total_removed)
print('Finished')
results['new_opacity'] = opacity
results['new_graph'] = g_hat
return results
def anonimize_lopacity(g, L, theta, mode='val'):
"""
:param g: Graph to anonymize
:param L: Maximum length
:param theta: maximum opacity value
:param mode: Mode to calculate : 'all' - consider all possible edges,
'max' - edges that contribute to maximum of the opacity matrix
'val' - all violated edges
:return:
"""
degs = g.degree(g)
inv_opacity = {}
deg_count, opacity = init(g, degs)
finished = False
g_hat = g.copy()
total_rem = 0
results = {}
while not finished:
# 1. Calculate L-APSP:
calc_lopacity_matrix(g_hat, L, degs, deg_count, opacity, inv_opacity)
if total_rem == 0:
results['old_opacity'] = opacity.copy()
if opacity.max().max() <= theta:
break
consider_all = False
if mode == 'val':
edge_pairs_buckets = get_all_violated(g_hat, theta, opacity, inv_opacity)
elif mode == 'max':
edge_pairs_buckets = get_max_violated(g_hat, theta, opacity, inv_opacity)
elif mode == 'all':
edge_pairs_buckets = get_all_violated(g_hat, theta, opacity, inv_opacity)
consider_all = True
else:
raise NameError('Not valid parameter mode')
rem_edge = remove_edge_lopacity_orig(g_hat, L, degs, deg_count, opacity, edge_pairs_buckets,
consider_all=consider_all)
g_hat.remove_edges_from([rem_edge])
total_rem += 1
results['new_opacity'] = opacity
results['new_graph'] = g_hat
print("Total Removed edges : ")
print(total_rem)
return results
def lopacity_step(g, degs, L, theta, init=True, verbose=False):
"""
:param g:
:param degs:
:param L:
:param theta:
:param init:
:param verbose:
:return:
"""
deg_count, opacity = init(g, degs, verbose)
inv_opacity = {}
# 1. Calcaulte L-APSP:
calc_lopacity_matrix(g, L, degs, deg_count, opacity, inv_opacity)
if verbose:
print(opacity)
# 2. Get all violated vertex-pairs by the maximum
edge_pairs_buckets = get_max_violated(g, theta, opacity, inv_opacity)
return remove_edge_lopacity_orig(g, L, degs, deg_count, opacity, edge_pairs_buckets)