-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathYoY_data_process.py
148 lines (143 loc) · 9.42 KB
/
YoY_data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import pandas as pd
import os
from time import time
import pdb
def process_data(data_version):
    """
    Processes a .csv of sequences and their respective O-GlcNAcylation status for input into YoY, then can be run again to process the results.
    First, run this file with only the data_version input to create a folder called f'YoY_data_{data_version}'. Send each .txt file in that folder to YoY.
    After each YoY run is done, copy and paste all the results into the Results.txt file (that is, Ctrl+A then Ctrl+C in the YoY results webpage, then Ctrl+V in the .txt file).
    Run this file with the --analyze_results flag to generate a list of thresholds for the central AAs (position 11), which then are compared with the real values.

    Parameters
    ----------
    data_version : str
        Dataset version; only 'v3' and 'v5' are supported.

    Raises
    ------
    ValueError
        If data_version is not 'v3' or 'v5'.
    """
    if data_version == 'v3':
        data = pd.read_csv('all_sites_fixed.csv')
    elif data_version == 'v5':
        raw_file = pd.read_csv('OVSlab_allSpecies_O-GlcNAcome_PS.csv')
        # Some pre-processing to make the v5 data in the same form as the v3 data
        positive_sites = set()
        for idx in range(raw_file.shape[0]):
            temp_sites = raw_file.iat[idx, 2].split(';') # A single protein may have multiple positive sites, which are separated with a ;
            for site in temp_sites:
                positive_sites.add( (raw_file.iat[idx, 0], int(site[1:]), raw_file.iat[idx, 3]) ) # site[1:] to not add the letter
        data = pd.DataFrame(positive_sites)
    else:
        # Fail early with a clear message instead of a NameError on `data` further below
        raise ValueError(f'Unsupported data_version {data_version!r}; expected "v3" or "v5"')
    # NOTE(review): y_boolean is never used below; the read is kept only so a missing window file still fails loudly — confirm whether this check is intentional
    y_boolean = pd.read_csv(f'OH_data_{data_version}_10-window.csv') # Hardcoding the 10-window since YoY demands at least 21 AAs per window
    # Creating a folder and saving the results
    dir_path = f'YoY_data_{data_version}'
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    to_file = []
    used_prot = set() # Do not take duplicate proteins
    skipped_prot = set()
    for prot_idx in range(data.shape[0]):
        if len(data.iat[prot_idx, 2]) <= 4000 and data.iat[prot_idx, 0] not in used_prot:
            header = f'>{data.iat[prot_idx, 0]}\n' # FASTA-style header expected by YoY
            to_file.append(header + data.iat[prot_idx, 2] + '\n')
            used_prot.add(data.iat[prot_idx, 0])
        elif len(data.iat[prot_idx, 2]) > 4000: # YoY rejects sequences longer than 4,000 AAs
            skipped_prot.add(data.iat[prot_idx, 0])
    print(f'{len(skipped_prot)} proteins were skipped due to length > 4000: {skipped_prot}')
    proteins_per_file = 175 # YoY has limits on input size. From experience, the 200,000 amino acids limit gets hit with some 200 proteins
    # Separating the proteins into multiple files. Stepping by proteins_per_file avoids the empty
    # trailing file the old int(len/n)+1 logic created when len(to_file) was an exact multiple of n
    for file_idx, start in enumerate(range(0, len(to_file), proteins_per_file)):
        with open(os.path.join(dir_path, f'YoY_file_{data_version}_{file_idx}.txt'), 'w') as f:
            f.writelines(to_file[start : start + proteins_per_file]) # One batched write instead of a per-protein loop
    if os.path.isfile(os.path.join(dir_path, 'Results.txt')): # Ensuring Results.txt doesn't get overwritten if the user mistakenly runs this function instead of process_results()
        print(f'Note: the file {os.path.join(dir_path, "Results.txt")} already exists, so it will not be recreated nor modified.')
    else:
        with open(os.path.join(dir_path, 'Results.txt'), 'w') as f: # The empty .txt file in which the YoY results will be pasted
            f.write('')
def process_results(data_version):
    """
    Called when this file is run with the --analyze_results flag.
    Generates a list of thresholds for the central AAs (position 11), then compares them with the real values and prints the coordinates for plotting.

    Reads f'YoY_data_{data_version}/Results.txt', writes the filtered per-site thresholds
    to YoY_thresholds.csv in the same folder, and prints recall (x), precision (y), F1, and
    MCC values (all in %) for each cutoff.

    Parameters
    ----------
    data_version : str
        Dataset version; 'v3' and 'v5' are supported.
    """
    dir_path = f'YoY_data_{data_version}'
    # Reading and filtering the file generated by process_data()
    with open(os.path.join(dir_path, 'Results.txt')) as f:
        lines = f.readlines()
    thresholds = []
    for line in lines:
        # Keep only per-residue prediction lines; skip the DTU Health Tech page boilerplate
        if '+' in line or (' - ' in line and 'DTU Health Tech' not in line):
            temp = [elem for elem in line.split(' ') if elem]
            thresholds.append((temp[0], temp[1], temp[4])) # [0] = protein idx, [1] = position, [4] = threshold
    thresholds = pd.DataFrame(thresholds, columns = ['Prot_Idx', 'Position', 'YoY Threshold'])
    thresholds.to_csv(os.path.join(dir_path, 'YoY_thresholds.csv'), index = False)
    # Reading the real data
    if data_version == 'v3':
        real_data = pd.read_csv('all_sites_fixed.csv')
        cutoff = np.concatenate( ([0.15], np.linspace(0.2, 0.8, 7)) ) # No points with a threshold < 0.15 or > 0.8
    elif data_version == 'v5':
        temp_data = pd.read_csv('OVSlab_allSpecies_O-GlcNAcome_PS.csv').iloc[:, [0, 2]]
        real_data = []
        for idx in range(temp_data.shape[0]):
            sites = temp_data.iat[idx, -1].replace('S', '').replace('T', '').split(';')
            real_data.extend([temp_data.iat[idx, 0], int(elem)] for elem in sites) # Note both data are 1-indexed, so we do not need to apply any corrections
        real_data = pd.DataFrame(real_data)
        cutoff = np.concatenate( ([0.12], np.linspace(0.2, 0.8, 7)) ) # No points with a threshold < 0.12 or > 0.8
    # Setup and helper variables
    prot_idx_array = thresholds.iloc[:, 0].to_numpy() # Hoisted out of the loop below to avoid a pandas Series comparison per row
    numerical_yoy_position = np.array(thresholds.iloc[:, 1], dtype = np.uint32) # Length up to 4,294,967,296
    numerical_thresholds = np.array(thresholds.iloc[:, 2], dtype = float)
    pos_idxs = set()
    true_pos = np.zeros(cutoff.shape, dtype = np.uint32) # Up to 4,294,967,296 - 1 positive samples
    false_neg = np.zeros_like(true_pos)
    # Going through data that are real positives, but can be labeled as negative or positive by the model
    for idx in range(real_data.shape[0]):
        if not idx%200:
            print(f'Current idx: {idx:,}/{real_data.shape[0]:,}', end = '\r')
        temp = np.where((prot_idx_array == real_data.iat[idx, 0]) & (numerical_yoy_position == real_data.iat[idx, 1]))[0]
        # BUG FIX: the old `np.any(temp)` was False when the only match was at index 0
        # (np.any([0]) == False), silently dropping that site. Check the match count instead.
        if temp.size:
            true_pos += numerical_thresholds[temp[0]] >= cutoff
            false_neg += numerical_thresholds[temp[0]] < cutoff
            pos_idxs.add(temp[0])
    # Going through data that are real negatives, but can be labeled as negative or positive by the model
    neg_idxs = list(set(range(thresholds.shape[0])) - pos_idxs)
    false_pos = np.sum(numerical_thresholds[neg_idxs, None] >= cutoff, axis = 0)
    true_neg = np.sum(numerical_thresholds[neg_idxs, None] < cutoff, axis = 0)
    x_unrounded = true_pos / (true_pos + false_neg)*100 # Recall, in %
    x_values = [round(elem, 2) for elem in x_unrounded]
    print(f'x_values = {x_values}')
    y_unrounded = true_pos / (true_pos + false_pos)*100 # Precision, in %
    y_values = [round(elem, 2) for elem in y_unrounded]
    print(f'y_values = {y_values}')
    f1 = 2*(1/x_unrounded + 1/y_unrounded)**-1 # Harmonic mean of recall and precision
    f1 = [round(elem, 2) for elem in f1]
    print(f'f1 = {f1}')
    # The denominator becomes pretty large as the number of samples increases. The v5 dataset gets too close to overflowing and leads to incorrect MCC values if not converted to float
    TP = np.array(true_pos, dtype = float)
    FP = np.array(false_pos, dtype = float)
    TN = np.array(true_neg, dtype = float)
    FN = np.array(false_neg, dtype = float)
    MCC = (TP*TN - FP*FN) / np.sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))*100
    MCC = [round(elem, 2) for elem in MCC]
    print(f'MCC = {MCC}')
def truncate_intermediate_results(data_version = 'v5'):
    """
    A helper function to reduce the size of Results.txt as multiple runs of YoY are done.

    Reads f'YoY_data_{data_version}/Results_temp.txt', keeps only the per-residue
    prediction lines (those containing '+' or a ' - ' separator, excluding the
    'DTU Health Tech' page header), and appends them to Results.txt in the same folder.

    Parameters
    ----------
    data_version : str, optional
        Dataset version used to locate the YoY_data folder (default 'v5').
    """
    dir_path = f'YoY_data_{data_version}'
    # Reading and filtering the file generated by process_data()
    with open(os.path.join(dir_path, 'Results_temp.txt')) as f:
        lines = f.readlines()
    # Same filter as process_results(): keep prediction lines, drop page boilerplate
    useful_lines = [line for line in lines
                    if '+' in line or (' - ' in line and 'DTU Health Tech' not in line)]
    with open(os.path.join(dir_path, 'Results.txt'), 'a') as f: # Append mode: accumulate across multiple YoY runs
        f.writelines(useful_lines)
if __name__ == '__main__':
    # Input setup
    import argparse
    parser = argparse.ArgumentParser(description = 'Processes a .csv of sequences and their respective O-GlcNAcylation status for input into YoY, then can be run again to process the results. '
        'First, run this file with only the data_version input to create a folder called "YoY_data_{data_version}". Send each .txt file in that folder to YoY. '
        'After each YoY run is done, copy and paste all the results into the Results.txt file (that is, Ctrl+A then Ctrl+C in the YoY results webpage, then Ctrl+V in the .txt file). '
        'Run this file with the --analyze_results flag to generate a list of thresholds for the central AAs (position 11), which can be compared to the real values.')
    parser.add_argument('data_version', type = str, nargs = 1, help = 'The version of the dataset you are using. Should be "v#", where # is an integer')
    # BUG FIX: the old `type = bool` form parsed "-ar False" as True, because bool('False') is True.
    # action='store_true' is the canonical on/off flag: absent -> False, present -> True.
    parser.add_argument('-ar', '--analyze_results', action = 'store_true',
                        help = 'Use this flag to filter and analyze the outputs of YoY')
    my_args = parser.parse_args()
    if my_args.analyze_results:
        process_results(my_args.data_version[0]) # [0] to convert from list to string
    else:
        process_data(my_args.data_version[0]) # [0] to convert from list to string