diff --git a/DESCRIPTION b/DESCRIPTION index fc7b21e1..65ac9c10 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RaMP Title: RaMP (Relational Database of Metabolomic Pathways) Type: Package -Version: 2.2.1 +Version: 2.3.0 License: GPL-2 Depends: R (>= 3.6.0) Authors@R: c( @@ -36,7 +36,7 @@ Imports: methods, tibble Encoding: UTF-8 -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.3 Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/ReturnAnalytes_InputPathways.R b/R/ReturnAnalytes_InputPathways.R index 4c282300..7f179a39 100644 --- a/R/ReturnAnalytes_InputPathways.R +++ b/R/ReturnAnalytes_InputPathways.R @@ -5,6 +5,7 @@ #' @param analyte_type a string denoting the type of analyte to return ("gene", "metabolite", "both") #' @param match type of matching to use, options are "exact" or "fuzzy". The default is "exact". #' @param max_pathway_size (default Inf), trims returned results to pathways that have fewer than this number #' of genes and metabolites +#' @param names_or_ids are the input pathways input as pathway names or as pathway ids #' @return a data.frame that contains all search results #' @examples @@ -18,7 +19,7 @@ #' "sphingolipid metabolism")) #' } #' @export -getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf) { +getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf, names_or_ids="names") { now <- proc.time() print("fired") if(is.character(pathway)){ @@ -39,6 +40,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m list_pathway <- sapply(list_pathway,shQuote) list_pathway <- paste(list_pathway,collapse = ",") + pathwayMatchCol = 'pathwayName' + if(names_or_ids == 'ids') { + pathwayMatchCol = 'sourceId' + match = 'exact' + } + # Retrieve pathway RaMP ids if (match=='exact') { @@ -48,34 +55,35 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m group_concat(distinct 
s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs, s.geneOrCompound as geneOrCompound, p.pathwayName as pathwayName, + p.sourceId as pathwayId, p.pathwayCategory as pathwayCategory, p.type as pathwayType from pathway p, analytehaspathway ap, source s where s.rampId = ap.rampID and ap.pathwayRampId = p.pathwayRampId and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null) - and p.pathwayName in (",list_pathway,") ", + and p.",pathwayMatchCol," in (",list_pathway,") ", "group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound - order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;" - ) + order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;") con <- connectToRaMP() df <- RMariaDB::dbGetQuery(con,sql) RMariaDB::dbDisconnect(con) } else if(match == 'fuzzy') { df = data.frame(matrix(nrow=0, ncol=6)) - sql = "select + sql = paste0("select group_concat(distinct s.commonName order by s.commonName asc separator '; ') as analyteName, group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs, s.geneOrCompound as geneOrCompound, p.pathwayName as pathwayName, + p.sourceId as pathwayId, p.pathwayCategory as pathwayCategory, p.type as pathwayType from pathway p, analytehaspathway ap, source s where s.rampId = ap.rampID and ap.pathwayRampId = p.pathwayRampId and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null) - and p.pathwayName like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound - order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;" + and p.",pathwayMatchCol," like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound + order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;") con <- connectToRaMP() for(p in pathway) { @@ -88,9 +96,9 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m # if we have a result and max_pathway size is not 
Infinite, filter pathway results by pathway size if(nrow(df) > 0 && max_pathway_size != Inf) { - pwAnalyteCounts <- data.frame(table(df$`Pathway Name`)) - pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq < max_pathway_size,] - df <- df[df$`Pathway Name` %in% unlist(pwAnalyteCounts$Var1),] + pwAnalyteCounts <- data.frame(table(df$`pathwayName`)) + pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq <= max_pathway_size,] + df <- df[df$`pathwayName` %in% unlist(pwAnalyteCounts$Var1),] } if(analyte_type=="gene") { @@ -101,13 +109,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m allout <- df[which(df$`geneOrCompound`=="compound"),] } else { allout <- df - } print("Timing ..") print(proc.time() - now) - return(allout) + return(allout) } diff --git a/R/ReturnPathwaysEnrich_InputAnalytes.R b/R/ReturnPathwaysEnrich_InputAnalytes.R index 81bde4da..86834ef1 100644 --- a/R/ReturnPathwaysEnrich_InputAnalytes.R +++ b/R/ReturnPathwaysEnrich_InputAnalytes.R @@ -75,7 +75,7 @@ runFisherTest <- function(analytes, NameOrIds = NameOrIds ) print("Custom background specified, genes will be discarded") - + } else if (background_type=="file" & analyte_type == "metabolites") { userbkg <- utils::read.table(background, header=F)[,1] backgrounddf <- getPathwayFromAnalyte(userbkg, @@ -131,9 +131,9 @@ runFisherTest <- function(analytes, # do nothing, it's handled down below in if statements }else{ stop("Only custom backgrounds are supported for custom pathway definitions. 
Please provide a 'list' or 'file' containing the analyte background") - } + } } - + ## Check that all metabolites of interest are in the background if (background_type != "database") { if (length(setdiff(pathwaydf$rampId, backgrounddf$rampId) != 0)) { @@ -151,7 +151,8 @@ runFisherTest <- function(analytes, list_pid <- paste(list_pid, collapse = ",") # Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background) - query <- "select * from analytehaspathway" + # added conditional to not pull hmdb ids + query <- "select * from analytehaspathway where pathwaySource != 'hmdb';" con <- connectToRaMP() allids <- RMariaDB::dbGetQuery(con, query) @@ -318,6 +319,12 @@ runFisherTest <- function(analytes, totinpath <- c(totinpath, tot_in_pathway) pidused <- c(pidused, i) } # end for loop + + print("") + print(now - proc.time()) + print("before optional MCall") + print("") + # Now run fisher's tests for all other pids (all pathways not covered in dataset) if (MCall == T) { # Now run fisher's tests for all other pids @@ -337,7 +344,8 @@ runFisherTest <- function(analytes, restcids <- RMariaDB::dbGetQuery(con, query2) # [[1]] RMariaDB::dbDisconnect(con) - query1 <- paste0("select rampId,pathwayRampId from analytehaspathway;") + # modify to not take hmdb pathways + query1 <- paste0("select rampId,pathwayRampId from analytehaspathway where pathwaySource != 'hmdb';") con <- connectToRaMP() allcids <- RMariaDB::dbGetQuery(con, query1) # [[1]] @@ -474,16 +482,20 @@ runFisherTest <- function(analytes, Num_In_Path = userinpath[keepers], Total_In_Path = totinpath[keepers] ) - } # End else if MCall (when False) - # Remove duplicate pathways between wikipathways and KEGG - duplicate_pathways <- find_duplicate_pathways() + } + # End else if MCall (when False) + + # Remove duplicate pathways between wikipathways and reactome, only perfect overlaps + # only make the dup list if it doesn't exist from a previous run in the session + if( 
!exists('duplicate_pathways')) { + duplicate_pathways <<- findDuplicatePathways() + } if (any(out$pathwayRampId %in% duplicate_pathways)) { out <- out[-which(out$pathwayRampId %in% duplicate_pathways), ] } out <- out[!duplicated(out), ] - print(dim(out)) - print(colnames(out)) + # for user is the output needed, based on what user input return(list(out, pathwaydf)) } @@ -547,6 +559,7 @@ runCombinedFisherTest <- function(analytes, ## fishmetab <- pathwaydf[grep("RAMP_C_", pathwaydf$rampId), ] print("Running Fisher's tests on metabolites") + outmetab <- runFisherTest( analytes = analytes, analyte_type = "metabolites", @@ -564,7 +577,8 @@ runCombinedFisherTest <- function(analytes, M <- 1 } - # Grab pathways that contain genes to run Fisher on genes + + ## Grab pathways that contain genes to run Fisher on genes ## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ] ## Genes are not evaluated if custom background is specified if (background_type == "database" & pathway_definitions == "RaMP") { @@ -579,7 +593,7 @@ runCombinedFisherTest <- function(analytes, ) pathwaydf_gene <- outgene[[2]] outgene <- outgene[[1]] - } else if (pathway_definitions != "RaMP"){ + } else if (pathway_definitions != "RaMP") { outgene <- runFisherTest( analytes = analytes, analyte_type = "genes", diff --git a/R/processNewRamp.R b/R/processNewRamp.R index 5b5507d5..93b6ce8d 100644 --- a/R/processNewRamp.R +++ b/R/processNewRamp.R @@ -485,10 +485,13 @@ processData <- function(){ #username = #conpass = + + # run these 3 methods, these generate files in the R RaMP library area # if commiting to git, then copy the new files into your R git project inst/extdata # pkg.globals <- setConnectionToRaMP(dbname=dbname,username=username,conpass=conpass,host=hostname) + # RaMP:::updateOverlapMatrices(method="balanced" ,all="all") # RaMP:::updateOverlapMatrices(method="balanced" ,all="analyte") # RaMP:::processData() diff --git a/R/rampQueryHelper.R b/R/rampQueryHelper.R index c31cf571..7551402c 100644 
--- a/R/rampQueryHelper.R +++ b/R/rampQueryHelper.R @@ -675,6 +675,9 @@ segregateDataBySource<-function(input_RampIds){ ##' @return List of duplicate Wikipathway IDs from Reactome. ##' @author Andrew Patt find_duplicate_pathways <- function(){ + + .Deprecated("findDuplicatePathways") + pathway_overlap = analyte_result duplicate_pairs = data.frame(Pathway1=character(),Pathway2=character()) for(i in 1:ncol(pathway_overlap)){ @@ -686,7 +689,7 @@ find_duplicate_pathways <- function(){ Pathway2=colnames(pathway_overlap)[i])) } } - query <- "select * from analytehaspathway;" + query <- "select * from analytehaspathway where pathwaySource != 'hmdb';" con <- connectToRaMP() allpids <- RMariaDB::dbGetQuery(con, query) RMariaDB::dbDisconnect(con) @@ -713,6 +716,53 @@ find_duplicate_pathways <- function(){ return(duplicate_pathways) } +##' Return list of duplicate Wikipathway IDs from Reactome. This may be unnecessary in the future +##' @return List of duplicate Wikipathway IDs from Reactome. +##' @author John Braisted +findDuplicatePathways <- function() { + query <- "select pathwayRampId from pathway where type = 'reactome';" + con <- connectToRaMP() + reactomePIDs <- RMariaDB::dbGetQuery(con, query) + RMariaDB::dbDisconnect(con) + + ar <- RaMP:::analyte_result + diag(ar) <- 0.0 + ar[ar != 1.0] <- 0.0 + colHits <- colnames(ar)[colSums(ar) >= 1.0] + rowHits <- colnames(ar)[rowSums(ar) >= 1.0] + ar2 <- ar[rowHits, colHits] + n = 0 + + for(r in rownames(ar2)) { + colHits <- colnames(ar2)[ar2[r,]==1.0] + rowHits <- rep(r, length(colHits)) + df <- data.frame(colHits) + df <- cbind(df, rowHits) + if(n == 0) { + df2 <- df + } else { + df2 <- rbind(df2, df) + } + n = n + 1 + } + + dupReturnList <- list(nrow(df2)) + # preference for reactome over wiki or kegg + for(r in 1:nrow(df2)) { + if(df2[r,1] %in% reactomePIDs[,1]) { + dupReturnList[[r]] <- df2[r,2] + } else if(df2[r,2] %in% reactomePIDs[,1]) { + dupReturnList[[r]] <- df2[r,1] + } else { + dupReturnList[[r]] <- df2[r,2] + } + } 
+ + return(unlist(dupReturnList)) +} + + + #' Filter pathways by p-value cutoff for display and clustering #' @param fishers_df The data frame generated by runFisherTest #' @param pval_type Specifies which p-value to use as the filter threshold. diff --git a/man/findDuplicatePathways.Rd b/man/findDuplicatePathways.Rd new file mode 100644 index 00000000..f1af4b6c --- /dev/null +++ b/man/findDuplicatePathways.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rampQueryHelper.R +\name{findDuplicatePathways} +\alias{findDuplicatePathways} +\title{Return list of duplicate Wikipathway IDs from Reactome. This may be unnecessary in the future} +\usage{ +findDuplicatePathways() +} +\value{ +List of duplicate Wikipathway IDs from Reactome. +} +\description{ +Return list of duplicate Wikipathway IDs from Reactome. This may be unnecessary in the future +} +\author{ +John Braisted +} diff --git a/man/getAnalyteFromPathway.Rd b/man/getAnalyteFromPathway.Rd index fbe646d9..f3e7f988 100644 --- a/man/getAnalyteFromPathway.Rd +++ b/man/getAnalyteFromPathway.Rd @@ -8,7 +8,8 @@ getAnalyteFromPathway( pathway, match = "exact", analyte_type = "both", - max_pathway_size = Inf + max_pathway_size = Inf, + names_or_ids = "names" ) } \arguments{ @@ -18,7 +19,9 @@ getAnalyteFromPathway( \item{analyte_type}{a string denoting the type of analyte to return ("gene", "metabolite", "both")} -\item{max_pathway_size}{(default Inf), trims returned results to pathways that have fewer than this number +\item{max_pathway_size}{(default Inf), trims returned results to pathways that have fewer than this number} + +\item{names_or_ids}{are the input pathways input as pathway names or as pathway ids of genes and metabolites} } \value{