-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_clinical_data.py
62 lines (57 loc) · 2.01 KB
/
get_clinical_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import pickle
import operator
import numpy as np
import pandas as pd
# Maps the operator codes used in category criteria to comparison functions.
OP_MAP = {
    "lt": operator.lt,
    "le": operator.le,
    "eq": operator.eq,
    "ne": operator.ne,
    "ge": operator.ge,
    "gt": operator.gt,
}


def build_sets(data, categories):
    """Build a mapping from category name to the set of matching ``projid`` values.

    Args:
        data: DataFrame containing a ``projid`` column and every trait column
            referenced by ``categories``.
        categories: Iterable of ``(name, criteria)`` pairs. ``criteria`` is an
            iterable of ``(trait, val, op)`` triples where ``op`` is a key of
            ``OP_MAP``; the comparison applied is ``op(data[trait], val)``.
            A row belongs to the category only if every criterion holds.
            An empty ``criteria`` selects every row.

    Returns:
        dict mapping each category name to a ``set`` of ``projid`` values
        converted to ``str``.

    Raises:
        KeyError: if ``op`` is not in ``OP_MAP`` or a column is missing.

    Side effects:
        Prints each category name and the size of its set to stdout.
    """
    clinical_sets = {}
    for name, criteria in categories:
        # Start from an all-True mask and AND each criterion in. Reducing an
        # empty list with np.all(..., axis=0) yields a scalar rather than a
        # per-row mask, so the empty-criteria case is handled explicitly here.
        mask = np.ones(len(data), dtype=bool)
        for trait, val, op in criteria:
            mask &= np.asarray(OP_MAP[op](data[trait], val), dtype=bool)
        ids = set(data.loc[mask, "projid"].astype(str))
        print(name, len(ids))
        clinical_sets[name] = ids
    return clinical_sets
def cast_num_no_plus(val):
    """Convert ``val`` to ``float``, ignoring any trailing ``+`` characters.

    String input has surrounding whitespace removed and then trailing ``+``
    characters stripped (so ``"90+"`` becomes ``90.0``). Non-string input is
    passed to ``float`` directly.

    Args:
        val: The raw value to convert, typically a CSV cell string.

    Returns:
        The parsed ``float``, or ``np.nan`` when the value cannot be parsed
        (e.g. an empty string, ``None``, or non-numeric text).
    """
    if isinstance(val, str):
        # Strip whitespace first so that a value such as "90+ " still loses
        # its "+" instead of silently turning into NaN.
        val = val.strip().rstrip("+")
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        return np.nan
def get_clinical_data(data_path, categories, out_path):
    """Read the clinical CSV, build the category sets, and pickle them.

    Args:
        data_path: Path of the CSV file to read.
        categories: Category definitions, forwarded unchanged to
            ``build_sets``.
        out_path: Path of the pickle file to write; it is opened in binary
            write mode.

    Returns:
        None. The result of ``build_sets`` is written to ``out_path``.
    """
    # These columns are parsed with cast_num_no_plus instead of the default
    # pandas type inference.
    age_columns = ("age_first_ad_dx", "age_death", "age_at_visit_max")
    table = pd.read_csv(
        data_path,
        converters=dict.fromkeys(age_columns, cast_num_no_plus),
    )
    grouped_ids = build_sets(table, categories)
    with open(out_path, "wb") as handle:
        pickle.dump(grouped_ids, handle)
if __name__ == '__main__':
    # Fixed input CSV and output pickle locations for this script run.
    clinical_csv = "/agusevlab/awang/sc_kellis/rosmap_clinical/ROSMAP_clinical.csv"
    pickle_path = "/agusevlab/awang/sc_kellis/rosmap_clinical/clinical_sets.pickle"

    # Every entry is (set name, criteria), where criteria is a tuple of
    # (column, value, operator code) triples that must all hold.
    sex_groups = [
        ("Female", (("msex", 0, "eq"),)),
        ("Male", (("msex", 1, "eq"),)),
    ]
    age_groups = [
        ("AgeUnder80", (("age_death", 80, "lt"),)),
        ("Age80To90", (("age_death", 80, "ge"), ("age_death", 90, "lt"),)),
        ("AgeOver90", (("age_death", 90, "ge"),)),
    ]
    reagan_groups = [
        ("ReaganNeg", (("ad_reagan", 0, "eq"),)),
        ("ReaganPos", (("ad_reagan", 1, "eq"),)),
    ]
    cerad_groups = [
        ("Cerad1", (("ceradsc", 1, "eq"),)),
        ("Cerad2", (("ceradsc", 2, "eq"),)),
        ("Cerad3", (("ceradsc", 3, "eq"),)),
        ("Cerad4", (("ceradsc", 4, "eq"),)),
    ]

    # Concatenation keeps the original category order.
    get_clinical_data(
        clinical_csv,
        sex_groups + age_groups + reagan_groups + cerad_groups,
        pickle_path,
    )