-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_stringDB.py
95 lines (79 loc) · 3 KB
/
get_stringDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import pandas as pd
import requests
import io
# Output folder for results (adjust before running).
path = 'path/to/folder'

# Query proteins whose interaction partners should be retrieved.
# STRING ENSP identifiers were looked up manually via UniProt:
#   bArr1 -> 9606.ENSP00000409581
#   bArr2 -> 9606.ENSP00000403701
my_genes = ["9606.ENSP00000409581", "9606.ENSP00000403701"]

# STRING API help: https://string-db.org/help/api/
# Endpoint used: /api/tsv/interaction_partners
#   -> returns all STRING interaction partners of the query proteins.
string_api_url = "https://string-db.org/api"
output_format = "tsv"
method = "interaction_partners"

# Assemble the request URL:
# https://string-db.org/api/[output-format]/interaction_partners?identifiers=[...]&[options]
request_url = f"{string_api_url}/{output_format}/{method}"
print(request_url)

# Request parameters (see the API help page for all available options).
params = {
    "identifiers": "%0d".join(my_genes),  # query ids separated by %0d
    "species": 9606,                      # NCBI taxon id for human
    "limit": 1000,                        # max partners retrieved per query protein
    "caller_identity": "Mona"             # caller identifier for STRING
}
# Call the STRING API (POST keeps long identifier lists off the URL).
response = requests.post(request_url, data=params)

# Parse the TSV response directly with pandas: the first line of the
# response is the header row (stringId_A, stringId_B, preferredName_A,
# preferredName_B, ncbiTaxonId, score, nscore, ..., tscore), so there is
# no need to re-serialize it line by line.
# NOTE(review): no HTTP error handling here (matches original behavior);
# consider response.raise_for_status() if the API can fail.
df_full = pd.read_csv(io.StringIO(response.text), sep="\t")

# Keep only the columns of interest, in query/partner order.
# 'score' is the combined score; the text-mining score ('tscore') and the
# other evidence channels remain available in df_full if needed.
df_raw = df_full[["stringId_A", "preferredName_A",
                  "stringId_B", "preferredName_B",
                  "score"]]
# Keep only interactions with combined score >= 0.5.
# .copy() avoids pandas' SettingWithCopyWarning on later assignments.
df = df_raw[df_raw['score'] >= 0.5].copy()
# The boolean filter keeps the original row labels; reset them so the
# positional lookups below work. drop=True prevents the old index from
# being added as an extra column (which would otherwise leak into the
# exported spreadsheet).
df.reset_index(drop=True, inplace=True)

# Flag partners that interact with both query proteins:
# keep=False marks *every* occurrence of a duplicated stringId_B as True.
all_dupl = df.duplicated(subset="stringId_B", keep=False)

# Per-row label: "both" for shared partners, otherwise the name of the
# single query protein (e.g. "bArr1" or "bArr2").
unique = pd.Series(
    ["both" if dup else df['preferredName_A'][i]
     for i, dup in enumerate(all_dupl)]
)

# Attach the labels as a properly named column (replaces the old
# concat + rename-of-column-0 dance).
df_uniq = df.copy()
df_uniq["uniqueness"] = unique
# Export the annotated table as xlsx.
# os.makedirs(exist_ok=True) creates the folder if missing and is a no-op
# otherwise — replaces the try/except FileExistsError around os.mkdir.
out_dir = os.path.join(path, "OG_stringDB_data")
os.makedirs(out_dir, exist_ok=True)
df_uniq.to_excel(os.path.join(out_dir, "interactors_stringDB.xlsx"))
## about the combined_score
# The combined score is computed by combining the probabilities from the different evidence channels and
# corrected for the probability of randomly observing an interaction. For a more detailed description please
# see von Mering, et al. Nucleic Acids Res. 2005