Ligress:Sample specific priors (#244)

* added new N definition for lioness in ligress * renamed to ligress * added ligress test Co-authored-by: violafanfani <viola.fanfani@gmail.com>
netZoo · May 20, 2022 · 4224f2a · 4224f2a
1 parent 05bbc5f
commit 4224f2a
Show file tree

Hide file tree

Showing 14 changed files with 162,237 additions and 1 deletion.
diff --git a/netZooPy/cli.py b/netZooPy/cli.py
@@ -11,3 +11,4 @@ def cli():
 cli.add_command(cl.panda)
 cli.add_command(cl.lioness)
 cli.add_command(cl.condor)
+cli.add_command(cl.ligress)
diff --git a/netZooPy/command_line.py b/netZooPy/command_line.py
@@ -5,6 +5,7 @@
 import click
 from netZooPy.panda.panda import Panda
 from netZooPy.lioness import Lioness
+from netZooPy.ligress import Ligress
 from netZooPy.condor import condor_object
 
 #############################################################################
@@ -152,6 +153,9 @@ def lioness(expression, motif, ppi, output_panda, output_lioness, fmt, computing
     print('All done!')
 
 
+################################################
+###### CONDOR ##################################
+################################################
 
 @click.command()
 @click.option('-n', '--network_file', 'network', type=str, required=True,
@@ -205,3 +209,58 @@ def condor(
     co.tar_memb.to_csv(tar_output)
     co.reg_memb.to_csv(reg_output)
 
+
+
+#############################################################################
+# TIGRESS ###################################################################
+#############################################################################
+
+@click.command()
+@click.option('-e', '--expression', 'expression', type=str, required=True,
+              help='Path to file containing the gene expression data. By default, \
+                  the expression file does not have a header, and the cells are separated by a tab.')
+@click.option('-p', '--ppi', 'ppi', type=str, required=True,
+              help='PPI filename.')
+@click.option('-pt', '--priors-table', 'priors_table', type=str, required=True,
+              help='Path to file containing the sample-prior table.' )
+@click.option('-ol', '--out-lioness', 'output_lioness', type=str, required=True,
+              help='Output lioness folder')
+@click.option('--fmt', type=str, show_default=True, default='npy',
+              help='Lioness network files output format. Choose one between .npy,.txt,.mat')
+@click.option('--computing', type=str, show_default=True, default='cpu',
+              help='computing option, choose one between cpu and gpu')
+@click.option('--precision', type=str, show_default=True, default='double',
+              help='precision option')
+@click.option('--ncores', type=int, show_default=True, default=1,
+              help='Number of cores. Lioness CPU parallelizes over ncores')
+@click.option('--save_memory', is_flag=True, show_default=False,
+              help='panda option. When true the result network is weighted adjacency matrix of size (nTFs, nGenes).\
+                  when false The result network has 4 columns in the form gene - TF - weight in motif prior - PANDA edge.')
+@click.option('--save_coexpression', is_flag=True, show_default=True,
+              help='save ')
+@click.option('--rm_missing', is_flag=True, show_default=False,
+              help='Removes the genes and TFs that are not present in one of the priors. Works only if modeProcess=legacy')
+@click.option('--mode_process', type=str, default='union', show_default=True,
+              help='panda option for input data processing. Choose between union(default), \
+                  legacy and intersection')
+@click.option('--alpha', type=float, default=0.1, show_default=True,
+              help='panda and lioness first sample')
+@click.option('--output_type', type=str, default='txt', show_default=True,
+              help='Output type. Now unused')
+@click.option('--th_motifs', type=int, show_default=True, default=3,
+              help='Threshold for the motifs. Reads motif only once if possible.')      
+def ligress(expression, priors_table, ppi, output_lioness, fmt, computing, precision, ncores, save_memory, save_coexpression, rm_missing, mode_process,output_type, alpha, th_motifs):
+    """Run Lioness to extract single-sample coexpression networks. 
+    Then run Panda on each sample with sample-specific priors.
+    """
+
+    print('Input data:')
+    print('Expression:', expression)
+
+    # read expression data, prepare ppi+motif+expression universes
+    print('Initialise ligress...')
+    ligress_obj = Ligress(expression, priors_table, ppi, output_folder=output_lioness, mode_process=mode_process)
+    print('Running ligress computations ...')
+    ligress_obj.run_ligress(keep_coexpression=save_coexpression, save_memory=save_memory,computing_panda = computing, precision=precision, alpha = alpha, th_motifs=th_motifs)
+    print('All done!')
+
diff --git a/netZooPy/ligress/__init__.py b/netZooPy/ligress/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+
+from .ligress import Ligress
diff --git a/netZooPy/ligress/io.py b/netZooPy/ligress/io.py
@@ -0,0 +1,264 @@
+from __future__ import print_function
+import math
+from random import sample
+import time
+import pandas as pd
+from scipy.stats import zscore
+from .timer import Timer
+import numpy as np
+from netZooPy.panda import calculations as calc
+import sys
+
+
+def read_ppi(ppi_fn):
+
+    with open(ppi_fn, 'r') as f:
+        ppi_data = pd.read_csv(f, sep="\t", header=None)
+        ppi_data.columns = ['tf1','tf2','exists']
+
+    # get all tfs from first and second column
+    ppi_tfs = sorted(set(ppi_data.iloc[:,0].values.tolist()).union(set(ppi_data.iloc[:,1].values.tolist())))
+    # create adjacency matrix
+    df = pd.DataFrame(np.eye(len(ppi_tfs)), index=ppi_tfs, columns=ppi_tfs)
+    z = ppi_data.pivot_table(columns='tf2',index = 'tf1',values = 'exists', fill_value=0)
+    df = df.add(z, fill_value=0).add(z.T, fill_value=0)
+    df = 1*(df>0)
+    return(df, ppi_tfs)
+
+
+def read_priors_table(table_filename, sample_col = 'sample', prior_col = 'prior'):
+    """Read the table detailing the sample-prior dyads
+
+    Parameters
+    ----------
+        table_filename :str
+            A csv that couples each sample with a prior
+            filename. The csv file can have any number of columns, but only 
+            the sample_col and prior_col are read. 
+        sample_col: str 
+            name of the column storing the sample name (default: 'sample')
+        prior_col: str
+            name of the column storing the prior filename  (default: 'prior')
+
+    Returns
+    -------
+        samples: list
+            list of samples that are defined
+        prior_dict: dict
+            dictionary sample:prior_filename
+    """    
+
+    with open(table_filename, 'r') as f:
+        df = pd.read_csv(table_filename, usecols=[sample_col, prior_col])
+
+    # check that sample-prior are unique
+    if len(df.drop_duplicates(subset = [sample_col, prior_col]))!=len(df):
+        print('Some samples are reported twice')
+        print(df[df.duplicated(subset = [sample_col, prior_col])])
+        df = df.drop_duplicates(subset = [sample_col, prior_col])
+
+    # check that there are no more samples duplicated
+    if len(df.drop_duplicates(subset = [sample_col]))!=len(df):
+        sys.exit('No unique sample-prior assignment, there are duplicated sample columns')
+
+    samples = df[sample_col].values.tolist()
+    sample2prior_dict = {i[1]:i[2] for i in df.itertuples()}
+
+    prior2sample_dict = {}
+    for p,tab in df.groupby(prior_col):
+        prior2sample_dict[p] = tab.loc[:,sample_col].values.tolist()
+
+    return(samples,  sample2prior_dict,  prior2sample_dict)
+
+def read_motif(motif_fn, tf_names = None, gene_names = None, pivot = True):
+    """ Read a motif edgelist, generates
+
+    Args:
+        motif_fn (_type_): filename of the motif edgelist
+        tf_names (_type_, optional): list of tf_names to be used. Defaults to None.
+        gene_names (_type_, optional): list of gene_names to be used. Defaults to None.
+        pivot (bool): if true returns a pivot tfs X genes table. Otherwise keeps the edgelist
+    """
+
+    with open(motif_fn, 'r') as f:
+        df = pd.read_csv(f, sep= '\t', header = None)
+
+    tftoadd = None
+    genetoadd = None
+
+    presenttf = df.iloc[:,0].unique()
+    presentgene = df.iloc[:,1].unique()
+    ntfs = len(presenttf)
+    ngenes = len(presentgene)
+
+    if isinstance(tf_names, list):
+
+        # Check how many are removed
+        dtf = len(set(presenttf).difference(set(tf_names)))
+        if (dtf!=0):
+            print('Note: %d/%d tfs in the prior are not in the tf universe ' %(int(dtf),int(ntfs)))
+
+        # tfs to add
+        tftoadd = set(tf_names).difference(set(presenttf))
+
+    if isinstance(gene_names, list):
+
+
+        # Check how many are removed
+        dgene = len(set(presentgene).difference(set(gene_names)))
+        if (dgene!=0):
+            print('Note: %d/%d genes in the prior are not in the gene universe ' %(int(dgene), int(ngenes)))
+
+        # genes to add
+        genetoadd = set(gene_names).difference(set(presentgene))
+
+    print('hello')
+    # now if one or both tftoadd/genetoadd are not None, we add rows with zeros at the end
+    if (tftoadd or genetoadd):
+        print('here')
+        # if one of the two is None, add all the existing ones
+        if tftoadd==None:
+            tftoadd = presenttf
+        if genetoadd==None:
+            genetoadd = presentgene
+
+        toadd = pd.DataFrame(tftoadd, columns = [0]).merge(pd.DataFrame(genetoadd, columns = [1]), how='cross')
+        toadd[2] = 0
+
+        df = pd.concat([df,toadd])
+
+    if pivot:
+        piv = df.pivot_table(values=2, index=0, columns=1, fill_value=0)
+        if isinstance(tf_names, list):
+            piv = piv.loc[tf_names,:]
+        if isinstance(gene_names, list):
+            piv = piv.loc[:,gene_names]
+        return(piv)
+    else:
+        if isinstance(tf_names, list):
+            df = df[df[0].isin(tf_names)]
+        if isinstance(gene_names, list):
+            df = df[df[1].isin(gene_names)]
+        return(df)
+
+def read_expression(expression_fn, header = 0, usecols = None):
+    """Read expression data.
+
+    Parameters
+    -----------
+        expression_fn: str
+            filename of the expression file
+        header: str or int
+            header row
+        usecols:list
+            pass a list of the columns that need to be read
+    """
+    with open(expression_fn, 'r') as f:
+        if expression_fn.endswith('.txt'):
+            df = pd.read_csv(f, sep = '\t', usecols = usecols, index_col=0)
+        elif expression_fn.endswith('.csv'):
+            df = pd.read_csv(f, sep = ' ', usecols = usecols, index_col=0)
+        elif expression_fn.endswith('.tsv'):
+            df = pd.read_csv(f, sep = '\t', usecols = usecols, index_col=0)
+        else:
+            sys.exit("Format of expression filename not recognised %s" %str(expression_fn))
+
+    return(df)
+
+
+def prepare_expression(expression_filename, samples = None):
+
+    """ Prepare main coexpression network by reading the expression file.
+    
+    Parameters
+    ----------
+        expression_filename :str
+            A table (tsv, csv, or txt) where each column is a sample 
+            and each row is a gene. Values are expression.
+        samples: list
+            list of sample names. If None all samples are read (default: None)
+
+	Returns
+	---------
+		expression_data: pd.DataFrame
+		expression_genes:set
+
+    """    
+    # expression file is properly annotated with the sample name and 
+    # a list of sample of interest is passed
+
+    if type(expression_filename) is str:
+        if (isinstance(sample, list)):
+            expression_data = read_expression(expression_filename, usecols = samples)
+        else:
+            expression_data = read_expression(expression_filename)
+
+
+    elif isinstance(expression_filename, pd.DataFrame):
+
+        expression_data = expression_filename.loc[:,samples]
+
+    else: 
+        sys.exit('Expression filename needs to be either a table string or a panda dataframe')
+
+
+    # keep names of expression genes
+    expression_genes = set(expression_data.index.tolist())
+
+    if len(expression_data) != len(expression_genes):
+        print(
+            "Duplicate gene symbols detected. Consider averaging before running PANDA"
+        )
+
+    return(expression_data, expression_genes)
+
+def read_motif_universe(priors_dict, mode = 'union', tf_col = 0, gene_col = 1):
+
+    """ Read tf and gene names for all the priors and generate 
+    a universe. By default the union of all gene names and tfs 
+    is considered. Tables needs to be tab separated.
+
+    Parameters
+    ----------- 
+    priors_dict: dict
+        dictionary sample:prior_filename
+    mode: str
+        how to generate the universe. Default: union
+	tf_col:str or int
+		Column used to store the TF names
+	gene_col: str or int
+		Column used to store the gene names
+
+    Returns
+    --------
+    tfs:set
+        names of tfs
+	genes:set
+		names of genes
+	universe:set
+		name of both genes and tfs
+    """
+
+    files = set(priors_dict.values())
+    tfs = set()
+    genes = set()
+    for fff in files:
+        with open(fff, 'r') as f:
+            df = pd.read_csv(f, sep= '\t', header = None)
+            tf = df.iloc[:,0].unique().tolist()
+            gene = df.iloc[:,1].unique().tolist()
+            if mode=='union':
+                tfs = tfs.union(set(tf))
+                genes = genes.union(set(gene))
+                universe = genes.union(tfs)
+            elif mode=='intersection':
+                tfs = tfs.intersection(set(tf))
+                genes = genes.intersection(set(gene))   
+                universe = genes.intersection(tfs)       
+            else:
+                sys.exit('Name %s is not a valid mode for prior tf/gene universe. Use union or intersection' %str(mode))   
+    return(tfs, genes)
+
+
+
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from __future__ import absolute_import

		from .ligress import Ligress