-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathYoY_data_process.py
148 lines (143 loc) · 9.42 KB
/
YoY_data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import pandas as pd
import os
from time import time
import pdb
def process_data(data_version):
    """
    Processes a .csv of sequences and their respective O-GlcNAcylation status for input into YoY, then can be run again to process the results.
    First, run this file with only the data_version input to create a folder called f'YoY_data_{data_version}'. Send each .txt file in that folder to YoY.
    After each YoY run is done, copy and paste all the results into the Results.txt file (that is, Ctrl+A then Ctrl+C in the YoY results webpage, then Ctrl+V in the .txt file).
    Run this file with the --analyze_results flag to generate a list of thresholds for the central AAs (position 11), which then are compared with the real values.

    Parameters
    ----------
    data_version : str
        Dataset version; only 'v3' and 'v5' are supported.

    Raises
    ------
    ValueError
        If data_version is not 'v3' or 'v5'.
    """
    if data_version == 'v3':
        data = pd.read_csv('all_sites_fixed.csv')
    elif data_version == 'v5':
        raw_file = pd.read_csv('OVSlab_allSpecies_O-GlcNAcome_PS.csv')
        # Some pre-processing to make the v5 data in the same form as the v3 data
        positive_sites = set()
        for idx in range(raw_file.shape[0]):
            temp_sites = raw_file.iat[idx, 2].split(';') # A single protein may have multiple positive sites, which are separated with a ;
            for site in temp_sites:
                positive_sites.add( (raw_file.iat[idx, 0], int(site[1:]), raw_file.iat[idx, 3]) ) # site[1:] to not add the letter
        data = pd.DataFrame(positive_sites)
    else:
        # Fail early with a clear message instead of a NameError on `data` further below
        raise ValueError(f'Unsupported data_version {data_version!r}; expected "v3" or "v5"')
    # NOTE(review): y_boolean is never used below; the read is kept only so a missing window file still fails loudly — confirm whether this check is intentional
    y_boolean = pd.read_csv(f'OH_data_{data_version}_10-window.csv') # Hardcoding the 10-window since YoY demands at least 21 AAs per window
    # Creating a folder and saving the results
    dir_path = f'YoY_data_{data_version}'
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    to_file = []
    used_prot = set() # Do not take duplicate proteins
    skipped_prot = set()
    for prot_idx in range(data.shape[0]):
        if len(data.iat[prot_idx, 2]) <= 4000 and data.iat[prot_idx, 0] not in used_prot:
            header = f'>{data.iat[prot_idx, 0]}\n' # FASTA-style header expected by YoY
            to_file.append(header + data.iat[prot_idx, 2] + '\n')
            used_prot.add(data.iat[prot_idx, 0])
        elif len(data.iat[prot_idx, 2]) > 4000: # YoY rejects sequences longer than 4,000 AAs
            skipped_prot.add(data.iat[prot_idx, 0])
    print(f'{len(skipped_prot)} proteins were skipped due to length > 4000: {skipped_prot}')
    proteins_per_file = 175 # YoY has limits on input size. From experience, the 200,000 amino acids limit gets hit with some 200 proteins
    # Separating the proteins into multiple files. Stepping by proteins_per_file avoids the empty
    # trailing file the old int(len/n)+1 logic created when len(to_file) was an exact multiple of n
    for file_idx, start in enumerate(range(0, len(to_file), proteins_per_file)):
        with open(os.path.join(dir_path, f'YoY_file_{data_version}_{file_idx}.txt'), 'w') as f:
            f.writelines(to_file[start : start + proteins_per_file]) # One batched write instead of a per-protein loop
    if os.path.isfile(os.path.join(dir_path, 'Results.txt')): # Ensuring Results.txt doesn't get overwritten if the user mistakenly runs this function instead of process_results()
        print(f'Note: the file {os.path.join(dir_path, "Results.txt")} already exists, so it will not be recreated nor modified.')
    else:
        with open(os.path.join(dir_path, 'Results.txt'), 'w') as f: # The empty .txt file in which the YoY results will be pasted
            f.write('')
def process_results(data_version):
    """
    Called when this file is run with the --analyze_results flag.
    Generates a list of thresholds for the central AAs (position 11), then compares them with the real values and prints the coordinates for plotting.

    Reads f'YoY_data_{data_version}/Results.txt', writes the filtered per-site thresholds
    to YoY_thresholds.csv in the same folder, and prints recall (x), precision (y), F1, and
    MCC values (all in %) for each cutoff.

    Parameters
    ----------
    data_version : str
        Dataset version; 'v3' and 'v5' are supported.
    """
    dir_path = f'YoY_data_{data_version}'
    # Reading and filtering the file generated by process_data()
    with open(os.path.join(dir_path, 'Results.txt')) as f:
        lines = f.readlines()
    thresholds = []
    for line in lines:
        # Keep only per-residue prediction lines; skip the DTU Health Tech page boilerplate
        if '+' in line or (' - ' in line and 'DTU Health Tech' not in line):
            temp = [elem for elem in line.split(' ') if elem]
            thresholds.append((temp[0], temp[1], temp[4])) # [0] = protein idx, [1] = position, [4] = threshold
    thresholds = pd.DataFrame(thresholds, columns = ['Prot_Idx', 'Position', 'YoY Threshold'])
    thresholds.to_csv(os.path.join(dir_path, 'YoY_thresholds.csv'), index = False)
    # Reading the real data
    if data_version == 'v3':
        real_data = pd.read_csv('all_sites_fixed.csv')
        cutoff = np.concatenate( ([0.15], np.linspace(0.2, 0.8, 7)) ) # No points with a threshold < 0.15 or > 0.8
    elif data_version == 'v5':
        temp_data = pd.read_csv('OVSlab_allSpecies_O-GlcNAcome_PS.csv').iloc[:, [0, 2]]
        real_data = []
        for idx in range(temp_data.shape[0]):
            sites = temp_data.iat[idx, -1].replace('S', '').replace('T', '').split(';')
            real_data.extend([temp_data.iat[idx, 0], int(elem)] for elem in sites) # Note both data are 1-indexed, so we do not need to apply any corrections
        real_data = pd.DataFrame(real_data)
        cutoff = np.concatenate( ([0.12], np.linspace(0.2, 0.8, 7)) ) # No points with a threshold < 0.12 or > 0.8
    # Setup and helper variables
    prot_idx_array = thresholds.iloc[:, 0].to_numpy() # Hoisted out of the loop below to avoid a pandas Series comparison per row
    numerical_yoy_position = np.array(thresholds.iloc[:, 1], dtype = np.uint32) # Length up to 4,294,967,296
    numerical_thresholds = np.array(thresholds.iloc[:, 2], dtype = float)
    pos_idxs = set()
    true_pos = np.zeros(cutoff.shape, dtype = np.uint32) # Up to 4,294,967,296 - 1 positive samples
    false_neg = np.zeros_like(true_pos)
    # Going through data that are real positives, but can be labeled as negative or positive by the model
    for idx in range(real_data.shape[0]):
        if not idx%200:
            print(f'Current idx: {idx:,}/{real_data.shape[0]:,}', end = '\r')
        temp = np.where((prot_idx_array == real_data.iat[idx, 0]) & (numerical_yoy_position == real_data.iat[idx, 1]))[0]
        # BUG FIX: the old `np.any(temp)` was False when the only match was at index 0
        # (np.any([0]) == False), silently dropping that site. Check the match count instead.
        if temp.size:
            true_pos += numerical_thresholds[temp[0]] >= cutoff
            false_neg += numerical_thresholds[temp[0]] < cutoff
            pos_idxs.add(temp[0])
    # Going through data that are real negatives, but can be labeled as negative or positive by the model
    neg_idxs = list(set(range(thresholds.shape[0])) - pos_idxs)
    false_pos = np.sum(numerical_thresholds[neg_idxs, None] >= cutoff, axis = 0)
    true_neg = np.sum(numerical_thresholds[neg_idxs, None] < cutoff, axis = 0)
    x_unrounded = true_pos / (true_pos + false_neg)*100 # Recall, in %
    x_values = [round(elem, 2) for elem in x_unrounded]
    print(f'x_values = {x_values}')
    y_unrounded = true_pos / (true_pos + false_pos)*100 # Precision, in %
    y_values = [round(elem, 2) for elem in y_unrounded]
    print(f'y_values = {y_values}')
    f1 = 2*(1/x_unrounded + 1/y_unrounded)**-1 # Harmonic mean of recall and precision
    f1 = [round(elem, 2) for elem in f1]
    print(f'f1 = {f1}')
    # The denominator becomes pretty large as the number of samples increases. The v5 dataset gets too close to overflowing and leads to incorrect MCC values if not converted to float
    TP = np.array(true_pos, dtype = float)
    FP = np.array(false_pos, dtype = float)
    TN = np.array(true_neg, dtype = float)
    FN = np.array(false_neg, dtype = float)
    MCC = (TP*TN - FP*FN) / np.sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))*100
    MCC = [round(elem, 2) for elem in MCC]
    print(f'MCC = {MCC}')
def truncate_intermediate_results(data_version = 'v5'):
    """
    A helper function to reduce the size of Results.txt as multiple runs of YoY are done.

    Reads f'YoY_data_{data_version}/Results_temp.txt', keeps only the per-residue
    prediction lines (those containing '+' or a ' - ' separator, excluding the
    'DTU Health Tech' page header), and appends them to Results.txt in the same folder.

    Parameters
    ----------
    data_version : str, optional
        Dataset version used to locate the YoY_data folder (default 'v5').
    """
    dir_path = f'YoY_data_{data_version}'
    # Reading and filtering the file generated by process_data()
    with open(os.path.join(dir_path, 'Results_temp.txt')) as f:
        lines = f.readlines()
    # Same filter as process_results(): keep prediction lines, drop page boilerplate
    useful_lines = [line for line in lines
                    if '+' in line or (' - ' in line and 'DTU Health Tech' not in line)]
    with open(os.path.join(dir_path, 'Results.txt'), 'a') as f: # Append mode: accumulate across multiple YoY runs
        f.writelines(useful_lines)
if __name__ == '__main__':
    # Input setup
    import argparse
    parser = argparse.ArgumentParser(description = 'Processes a .csv of sequences and their respective O-GlcNAcylation status for input into YoY, then can be run again to process the results. '
        'First, run this file with only the data_version input to create a folder called "YoY_data_{data_version}". Send each .txt file in that folder to YoY. '
        'After each YoY run is done, copy and paste all the results into the Results.txt file (that is, Ctrl+A then Ctrl+C in the YoY results webpage, then Ctrl+V in the .txt file). '
        'Run this file with the --analyze_results flag to generate a list of thresholds for the central AAs (position 11), which can be compared to the real values.')
    parser.add_argument('data_version', type = str, nargs = 1, help = 'The version of the dataset you are using. Should be "v#", where # is an integer')
    # BUG FIX: the old `type = bool` form parsed "-ar False" as True, because bool('False') is True.
    # action='store_true' is the canonical on/off flag: absent -> False, present -> True.
    parser.add_argument('-ar', '--analyze_results', action = 'store_true',
                        help = 'Use this flag to filter and analyze the outputs of YoY')
    my_args = parser.parse_args()
    if my_args.analyze_results:
        process_results(my_args.data_version[0]) # [0] to convert from list to string
    else:
        process_data(my_args.data_version[0]) # [0] to convert from list to string