Merge pull request #51 from ncats/pw_enrich_dev

Pull request for pw_enrich_dev into main, package version 2.3.0
ncats · Apr 26, 2023 · e24ce93 · e24ce93
2 parents 800d9bc + 0dbdf33
commit e24ce93
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 29 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: RaMP
 Title: RaMP (Relational Database of Metabolomic Pathways)
 Type: Package
-Version: 2.2.1
+Version: 2.3.0
 License: GPL-2
 Depends: R (>= 3.6.0)
 Authors@R: c(
@@ -36,7 +36,7 @@ Imports:
     methods,
     tibble
 Encoding: UTF-8
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/R/ReturnAnalytes_InputPathways.R b/R/ReturnAnalytes_InputPathways.R
@@ -5,6 +5,7 @@
 #' @param analyte_type a string denoting the type of analyte to return ("gene", "metabolite", "both")
 #' @param match type of matching to use, options are "exact" or "fuzzy".  The default is "exact".
 #' @param max_pathway_size (default Inf), trims returned results to pathways that have fewer than this number
+#' @param names_or_ids are the input pathways input as pathway names or as pathway ids
 #' of genes and metabolites
 #' @return a data.frame that contains all search results
 #' @examples
@@ -18,7 +19,7 @@
 #'	"sphingolipid metabolism"))
 #' }
 #' @export
-getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf) {
+getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf, names_or_ids="names") {
   now <- proc.time()
   print("fired")
   if(is.character(pathway)){
@@ -39,6 +40,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
   list_pathway <- sapply(list_pathway,shQuote)
   list_pathway <- paste(list_pathway,collapse = ",")
 
+  pathwayMatchCol = 'pathwayName'
+  if(names_or_ids == 'ids') {
+    pathwayMatchCol = 'sourceId'
+    match = 'exact'
+  }
+
   # Retrieve pathway RaMP ids
   if (match=='exact') {
 
@@ -48,34 +55,35 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
     group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs,
     s.geneOrCompound as geneOrCompound,
     p.pathwayName as pathwayName,
+    p.sourceId as pathwayId,
     p.pathwayCategory as pathwayCategory,
     p.type as pathwayType
     from pathway p, analytehaspathway ap, source s
     where s.rampId = ap.rampID
     and ap.pathwayRampId = p.pathwayRampId
     and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null)
-    and p.pathwayName in (",list_pathway,") ",
+    and p.",pathwayMatchCol," in (",list_pathway,") ",
     "group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
-    order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;"
-                 )
+    order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;")
     con <- connectToRaMP()
     df <- RMariaDB::dbGetQuery(con,sql)
     RMariaDB::dbDisconnect(con)
   } else if(match == 'fuzzy') {
     df = data.frame(matrix(nrow=0, ncol=6))
-    sql = "select
+    sql = paste0("select
     group_concat(distinct s.commonName order by s.commonName asc separator '; ') as analyteName,
     group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs,
     s.geneOrCompound as geneOrCompound,
     p.pathwayName as pathwayName,
+    p.sourceId as pathwayId,
     p.pathwayCategory as pathwayCategory,
     p.type as pathwayType
     from pathway p, analytehaspathway ap, source s
     where s.rampId = ap.rampID
     and ap.pathwayRampId = p.pathwayRampId
     and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null)
-    and p.pathwayName like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
-    order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;"
+    and p.",pathwayMatchCol," like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
+    order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;")
 
     con <- connectToRaMP()
     for(p in pathway) {
@@ -88,9 +96,9 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
 
   # if we have a result and max_pathway size is not Infinite, filter pathway results by pathway size
   if(nrow(df) > 0 && max_pathway_size != Inf) {
-    pwAnalyteCounts <- data.frame(table(df$`Pathway Name`))
-    pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq < max_pathway_size,]
-    df <- df[df$`Pathway Name` %in% unlist(pwAnalyteCounts$Var1),]
+    pwAnalyteCounts <- data.frame(table(df$`pathwayName`))
+    pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq <= max_pathway_size,]
+    df <- df[df$`pathwayName` %in% unlist(pwAnalyteCounts$Var1),]
   }
 
   if(analyte_type=="gene") {
@@ -101,13 +109,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
     allout <- df[which(df$`geneOrCompound`=="compound"),]
   } else {
     allout <- df
-
   }
 
   print("Timing ..")
   print(proc.time() - now)
 
-    return(allout)
+  return(allout)
 }
 
 

diff --git a/R/ReturnPathwaysEnrich_InputAnalytes.R b/R/ReturnPathwaysEnrich_InputAnalytes.R
@@ -75,7 +75,7 @@ runFisherTest <- function(analytes,
                                             NameOrIds = NameOrIds
                                             )
       print("Custom background specified, genes will be discarded")
-      
+
     } else if (background_type=="file" & analyte_type == "metabolites") {
       userbkg <- utils::read.table(background, header=F)[,1]
       backgrounddf <- getPathwayFromAnalyte(userbkg,
@@ -131,9 +131,9 @@ runFisherTest <- function(analytes,
       # do nothing, it's handled down below in if statements
     }else{
       stop("Only custom backgrounds are supported for custom pathway definitions. Please provide a 'list' or 'file' containing the analyte background")
-    } 
+    }
   }
-    
+
   ## Check that all metabolites of interest are in the background
   if (background_type != "database") {
     if (length(setdiff(pathwaydf$rampId, backgrounddf$rampId) != 0)) {
@@ -151,7 +151,8 @@ runFisherTest <- function(analytes,
   list_pid <- paste(list_pid, collapse = ",")
 
   # Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background)
-  query <- "select * from analytehaspathway"
+  # added conditional to not pull hmdb ids
+  query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
   con <- connectToRaMP()
   allids <- RMariaDB::dbGetQuery(con, query)
 
@@ -318,6 +319,12 @@ runFisherTest <- function(analytes,
     totinpath <- c(totinpath, tot_in_pathway)
     pidused <- c(pidused, i)
   } # end for loop
+
+  print("")
+  print(now - proc.time())
+  print("before optional MCall")
+  print("")
+
   # Now run fisher's tests for all other pids (all pathways not covered in dataset)
   if (MCall == T) {
     # Now run fisher's tests for all other pids
@@ -337,7 +344,8 @@ runFisherTest <- function(analytes,
     restcids <- RMariaDB::dbGetQuery(con, query2) # [[1]]
     RMariaDB::dbDisconnect(con)
 
-    query1 <- paste0("select rampId,pathwayRampId from analytehaspathway;")
+    # modify to not take hmdb pathways
+    query1 <- paste0("select rampId,pathwayRampId from analytehaspathway where pathwaySource != 'hmdb';")
 
     con <- connectToRaMP()
     allcids <- RMariaDB::dbGetQuery(con, query1) # [[1]]
@@ -474,16 +482,20 @@ runFisherTest <- function(analytes,
       Num_In_Path = userinpath[keepers],
       Total_In_Path = totinpath[keepers]
     )
-  } # End else if MCall (when False)
-  # Remove duplicate pathways between wikipathways and KEGG
-  duplicate_pathways <- find_duplicate_pathways()
+  }
+  # End else if MCall (when False)
+
+  # Remove duplicate pathways between wikipathways and reactome, only perfect overlaps
+  # only make the dup list if it doesn't exist from a previous run in the session
+  if( !exists('duplicate_pathways')) {
+    duplicate_pathways <<- findDuplicatePathways()
+  }
   if (any(out$pathwayRampId %in% duplicate_pathways)) {
     out <- out[-which(out$pathwayRampId %in% duplicate_pathways), ]
   }
 
   out <- out[!duplicated(out), ]
-  print(dim(out))
-  print(colnames(out))
+
   # for user is the output needed, based on what user input
   return(list(out, pathwaydf))
 }
@@ -547,6 +559,7 @@ runCombinedFisherTest <- function(analytes,
   ## fishmetab <- pathwaydf[grep("RAMP_C_", pathwaydf$rampId), ]
 
   print("Running Fisher's tests on metabolites")
+
   outmetab <- runFisherTest(
     analytes = analytes,
     analyte_type = "metabolites",
@@ -564,7 +577,8 @@ runCombinedFisherTest <- function(analytes,
     M <- 1
   }
 
-  # Grab pathways that contain genes to run Fisher on genes
+
+  ## Grab pathways that contain genes to run Fisher on genes
   ## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
   ## Genes are not evaluated if custom background is specified
   if (background_type == "database" & pathway_definitions == "RaMP") {
@@ -579,7 +593,7 @@ runCombinedFisherTest <- function(analytes,
     )
     pathwaydf_gene <- outgene[[2]]
     outgene <- outgene[[1]]
-  } else if (pathway_definitions != "RaMP"){
+  } else if (pathway_definitions != "RaMP") {
     outgene <- runFisherTest(
       analytes = analytes,
       analyte_type = "genes",

diff --git a/R/processNewRamp.R b/R/processNewRamp.R
@@ -485,10 +485,13 @@ processData <- function(){
 #username = <username>
 #conpass = <connection_password>
 
+
+
 # run these 3 methods, these generate files in the R RaMP library area
 # if commiting to git, then copy the new files into your R git project inst/extdata
 
 # pkg.globals <- setConnectionToRaMP(dbname=dbname,username=username,conpass=conpass,host=hostname)
+
 # RaMP:::updateOverlapMatrices(method="balanced" ,all="all")
 # RaMP:::updateOverlapMatrices(method="balanced" ,all="analyte")
 # RaMP:::processData()

diff --git a/R/rampQueryHelper.R b/R/rampQueryHelper.R
@@ -675,6 +675,9 @@ segregateDataBySource<-function(input_RampIds){
 ##' @return List of duplicate Wikipathway IDs from Reactome.
 ##' @author Andrew Patt
 find_duplicate_pathways <- function(){
+
+  .Deprecated("findDuplicatPathways")
+
   pathway_overlap = analyte_result
   duplicate_pairs = data.frame(Pathway1=character(),Pathway2=character())
   for(i in 1:ncol(pathway_overlap)){
@@ -686,7 +689,7 @@ find_duplicate_pathways <- function(){
                                           Pathway2=colnames(pathway_overlap)[i]))
     }
   }
-  query <- "select * from analytehaspathway;"
+  query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
   con <- connectToRaMP()
   allpids <- RMariaDB::dbGetQuery(con, query)
   RMariaDB::dbDisconnect(con)
@@ -713,6 +716,53 @@ find_duplicate_pathways <- function(){
   return(duplicate_pathways)
 }
 
+##' Return list of duplicate Wikipathway IDs from Reactome. This may be unnecessary in the future
+##' @return List of duplicate Wikipathway IDs from Reactome.
+##' @author John Braisted
+findDuplicatePathways <- function() {
+  query <- "select pathwayRampId from pathway where type = 'reactome';"
+  con <- connectToRaMP()
+  reactomePIDs <- RMariaDB::dbGetQuery(con, query)
+  RMariaDB::dbDisconnect(con)
+
+  ar <- RaMP:::analyte_result
+  diag(ar) <- 0.0
+  ar[ar != 1.0] <- 0.0
+  colHits <- colnames(ar)[colSums(ar) >= 1.0]
+  rowHits <- colnames(ar)[rowSums(ar) >= 1.0]
+  ar2 <- ar[rowHits, colHits]
+  n = 0
+
+  for(r in rownames(ar2)) {
+    colHits <- colnames(ar2)[ar2[r,]==1.0]
+    rowHits <- rep(r, length(colHits))
+    df <- data.frame(colHits)
+    df <- cbind(df, rowHits)
+    if(n == 0) {
+      df2 <- df
+    } else {
+      df2 <- rbind(df2, df)
+    }
+    n = n + 1
+  }
+
+  dupReturnList <- list(nrow(df2))
+  # preference for reactome over wiki or kegg
+  for(r in 1:nrow(df2)) {
+    if(df2[r,1] %in% reactomePIDs[,1]) {
+      dupReturnList[[r]] <- df2[r,2]
+    } else if(df2[r,2] %in% reactomePIDs[,1]) {
+      dupReturnList[[r]] <- df2[r,1]
+    } else {
+      dupReturnList[[r]] <- df2[r,2]
+    }
+  }
+
+  return(unlist(dupReturnList))
+}
+
+
+
 #' Filter pathways by p-value cutoff for display and clustering
 #' @param fishers_df The data frame generated by runFisherTest
 #' @param pval_type Specifies which p-value to use as the filter threshold.

diff --git a/man/findDuplicatePathways.Rd b/man/findDuplicatePathways.Rd
diff --git a/man/getAnalyteFromPathway.Rd b/man/getAnalyteFromPathway.Rd