-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_parser.py
139 lines (118 loc) · 4.25 KB
/
file_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import pandas as pd
import os
import json
attribute_source = "infores:biothings-multiomics-drugapprovals"
attribute_data_source = "infores:isb-wellness"
faers = "infores:faers"
dailymed = "infores:dailymed"
kgInfoUrl = "https://db.systemsbiology.net/gestalt/cgi-pub/KGinfo.pl?id="
def load_data(data_folder):
edges_file_path = os.path.join(data_folder, "drug_approvals_kg_edges_v0.3.5.tsv")
nodes_file_path = os.path.join(data_folder, "drug_approvals_kg_nodes_v0.3.5.tsv")
nodes_data = pd.read_csv(nodes_file_path, sep='\t')
id_name_mapping = {}
id_type_mapping = {}
for index,row in nodes_data.iterrows():
id_name_mapping[row["id"]] = row["name"]
id_type_mapping[row["id"]] = row["category"]
edges_data = pd.read_csv(edges_file_path, sep='\t')
for index,line in edges_data.iterrows():
subj = line['subject']
pred = line['predicate']
obj = line['object']
if subj and pred and subj.split(':')[0] and obj.split(':')[0]:
prefix = subj.split(':')[0].replace(".","_")
subject = {
"id": subj,
prefix.lower(): subj,
"name": id_name_mapping[subj],
"type": id_type_mapping[subj]
}
prefix = obj.split(':')[0].replace(".","_")
object_ = {
"id": obj,
prefix.lower(): obj,
"name": id_name_mapping[obj],
"type": id_type_mapping[obj]
}
# properties for predicate/association
edge_attributes = []
# knowledge level
edge_attributes.append(
{
"attribute_type_id": "biolink:knowledge_level",
"value": line['knowledge_level'],
}
)
# agent type
edge_attributes.append(
{
"attribute_type_id": "biolink:agent_type",
"value": line['agent_type'],
}
)
# approval status
edge_attributes.append(
{
"attribute_type_id": "clinical_approval_status",
"value": "approved_for_condition" if pred=="treats" else "not_approved_for_condition",
}
)
# approval NDAs
#edge_attributes.append(
# {
# "attribute_type_id": "approvals",
# "value": line['approval'],
# }
#)
# supporting SPLs
#edge_attributes.append(
# {
# "attribute_type_id": "supporting SPLs",
# "value": line['supporting_spls'],
# }
#)
# sources
edge_sources = [
{
"resource_id": attribute_source,
"resource_role": "aggregator_knowledge_source",
"source_record_urls": [ kgInfoUrl + line['id'] ]
},
{
"resource_id": dailymed if pred == 'treats' else faers,
"resource_role": "primary_knowledge_source"
},
{
"resource_id": faers if pred == 'treats' else dailymed,
"resource_role": "supporting_data_source"
}
]
association = {
"label": pred,
"attributes": edge_attributes,
"sources": edge_sources
}
# Yield subject, predicate, and object properties
data = {
"_id": line['id'],
"subject": subject,
"association": association,
"object": object_
}
yield data
else:
print(f"Cannot find prefix for {line} !")
def main():
testing = False #True
done = 0
gen = load_data('test')
while not testing or done < 10:
try: entry = next(gen)
except:
break
else:
print(json.dumps(entry, sort_keys=True, indent=2))
done = done + 1
if __name__ == '__main__':
main()