Skip to content

Commit

Permalink
Merge pull request #51 from ncats/pw_enrich_dev
Browse files Browse the repository at this point in the history
Pull request for pw_enrich_dev into main, package version 2.3.0
  • Loading branch information
Mathelab authored Apr 26, 2023
2 parents 800d9bc + 0dbdf33 commit e24ce93
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 29 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: RaMP
Title: RaMP (Relational Database of Metabolomic Pathways)
Type: Package
Version: 2.2.1
Version: 2.3.0
License: GPL-2
Depends: R (>= 3.6.0)
Authors@R: c(
Expand Down Expand Up @@ -36,7 +36,7 @@ Imports:
methods,
tibble
Encoding: UTF-8
RoxygenNote: 7.1.2
RoxygenNote: 7.2.3
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
31 changes: 19 additions & 12 deletions R/ReturnAnalytes_InputPathways.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#' @param analyte_type a string denoting the type of analyte to return ("gene", "metabolite", "both")
#' @param match type of matching to use, options are "exact" or "fuzzy". The default is "exact".
#' @param max_pathway_size (default Inf), trims returned results to pathways that have no more than this number
#' of genes and metabolites
#' @param names_or_ids whether the input pathways are given as pathway names ("names", the default) or as
#' pathway ids ("ids"); when "ids" is used, matching is always exact
#' @return a data.frame that contains all search results
#' @examples
Expand All @@ -18,7 +19,7 @@
#' "sphingolipid metabolism"))
#' }
#' @export
getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf) {
getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf, names_or_ids="names") {
now <- proc.time()
print("fired")
if(is.character(pathway)){
Expand All @@ -39,6 +40,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
list_pathway <- sapply(list_pathway,shQuote)
list_pathway <- paste(list_pathway,collapse = ",")

pathwayMatchCol = 'pathwayName'
if(names_or_ids == 'ids') {
pathwayMatchCol = 'sourceId'
match = 'exact'
}

# Retrieve pathway RaMP ids
if (match=='exact') {

Expand All @@ -48,34 +55,35 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs,
s.geneOrCompound as geneOrCompound,
p.pathwayName as pathwayName,
p.sourceId as pathwayId,
p.pathwayCategory as pathwayCategory,
p.type as pathwayType
from pathway p, analytehaspathway ap, source s
where s.rampId = ap.rampID
and ap.pathwayRampId = p.pathwayRampId
and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null)
and p.pathwayName in (",list_pathway,") ",
and p.",pathwayMatchCol," in (",list_pathway,") ",
"group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;"
)
order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;")
con <- connectToRaMP()
df <- RMariaDB::dbGetQuery(con,sql)
RMariaDB::dbDisconnect(con)
} else if(match == 'fuzzy') {
df = data.frame(matrix(nrow=0, ncol=6))
sql = "select
sql = paste0("select
group_concat(distinct s.commonName order by s.commonName asc separator '; ') as analyteName,
group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs,
s.geneOrCompound as geneOrCompound,
p.pathwayName as pathwayName,
p.sourceId as pathwayId,
p.pathwayCategory as pathwayCategory,
p.type as pathwayType
from pathway p, analytehaspathway ap, source s
where s.rampId = ap.rampID
and ap.pathwayRampId = p.pathwayRampId
and (p.pathwayCategory not like 'smpdb%' or p.pathwayCategory is Null)
and p.pathwayName like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;"
and p.",pathwayMatchCol," like '%[SOME_PW_NAME]%' group by s.rampId, p.pathwayName, p.sourceId, p.type, s.geneOrCompound
order by p.type desc, p.pathwayName asc, s.geneOrCompound asc;")

con <- connectToRaMP()
for(p in pathway) {
Expand All @@ -88,9 +96,9 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m

# if we have a result and max_pathway size is not Infinite, filter pathway results by pathway size
if(nrow(df) > 0 && max_pathway_size != Inf) {
pwAnalyteCounts <- data.frame(table(df$`Pathway Name`))
pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq < max_pathway_size,]
df <- df[df$`Pathway Name` %in% unlist(pwAnalyteCounts$Var1),]
pwAnalyteCounts <- data.frame(table(df$`pathwayName`))
pwAnalyteCounts <- pwAnalyteCounts[pwAnalyteCounts$Freq <= max_pathway_size,]
df <- df[df$`pathwayName` %in% unlist(pwAnalyteCounts$Var1),]
}

if(analyte_type=="gene") {
Expand All @@ -101,13 +109,12 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
allout <- df[which(df$`geneOrCompound`=="compound"),]
} else {
allout <- df

}

print("Timing ..")
print(proc.time() - now)

return(allout)
return(allout)
}


Expand Down
38 changes: 26 additions & 12 deletions R/ReturnPathwaysEnrich_InputAnalytes.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ runFisherTest <- function(analytes,
NameOrIds = NameOrIds
)
print("Custom background specified, genes will be discarded")

} else if (background_type=="file" & analyte_type == "metabolites") {
userbkg <- utils::read.table(background, header=F)[,1]
backgrounddf <- getPathwayFromAnalyte(userbkg,
Expand Down Expand Up @@ -131,9 +131,9 @@ runFisherTest <- function(analytes,
# do nothing, it's handled down below in if statements
}else{
stop("Only custom backgrounds are supported for custom pathway definitions. Please provide a 'list' or 'file' containing the analyte background")
}
}
}

## Check that all metabolites of interest are in the background
if (background_type != "database") {
if (length(setdiff(pathwaydf$rampId, backgrounddf$rampId) != 0)) {
Expand All @@ -151,7 +151,8 @@ runFisherTest <- function(analytes,
list_pid <- paste(list_pid, collapse = ",")

# Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background)
query <- "select * from analytehaspathway"
# added conditional to not pull hmdb ids
query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
con <- connectToRaMP()
allids <- RMariaDB::dbGetQuery(con, query)

Expand Down Expand Up @@ -318,6 +319,12 @@ runFisherTest <- function(analytes,
totinpath <- c(totinpath, tot_in_pathway)
pidused <- c(pidused, i)
} # end for loop

print("")
print(now - proc.time())
print("before optional MCall")
print("")

# Now run fisher's tests for all other pids (all pathways not covered in dataset)
if (MCall == T) {
# Now run fisher's tests for all other pids
Expand All @@ -337,7 +344,8 @@ runFisherTest <- function(analytes,
restcids <- RMariaDB::dbGetQuery(con, query2) # [[1]]
RMariaDB::dbDisconnect(con)

query1 <- paste0("select rampId,pathwayRampId from analytehaspathway;")
# modify to not take hmdb pathways
query1 <- paste0("select rampId,pathwayRampId from analytehaspathway where pathwaySource != 'hmdb';")

con <- connectToRaMP()
allcids <- RMariaDB::dbGetQuery(con, query1) # [[1]]
Expand Down Expand Up @@ -474,16 +482,20 @@ runFisherTest <- function(analytes,
Num_In_Path = userinpath[keepers],
Total_In_Path = totinpath[keepers]
)
} # End else if MCall (when False)
# Remove duplicate pathways between wikipathways and KEGG
duplicate_pathways <- find_duplicate_pathways()
}
# End else if MCall (when False)

# Remove duplicate pathways between wikipathways and reactome, only perfect overlaps
# only make the dup list if it doesn't exist from a previous run in the session
if( !exists('duplicate_pathways')) {
duplicate_pathways <<- findDuplicatePathways()
}
if (any(out$pathwayRampId %in% duplicate_pathways)) {
out <- out[-which(out$pathwayRampId %in% duplicate_pathways), ]
}

out <- out[!duplicated(out), ]
print(dim(out))
print(colnames(out))

# for user is the output needed, based on what user input
return(list(out, pathwaydf))
}
Expand Down Expand Up @@ -547,6 +559,7 @@ runCombinedFisherTest <- function(analytes,
## fishmetab <- pathwaydf[grep("RAMP_C_", pathwaydf$rampId), ]

print("Running Fisher's tests on metabolites")

outmetab <- runFisherTest(
analytes = analytes,
analyte_type = "metabolites",
Expand All @@ -564,7 +577,8 @@ runCombinedFisherTest <- function(analytes,
M <- 1
}

# Grab pathways that contain genes to run Fisher on genes

## Grab pathways that contain genes to run Fisher on genes
## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
## Genes are not evaluated if custom background is specified
if (background_type == "database" & pathway_definitions == "RaMP") {
Expand All @@ -579,7 +593,7 @@ runCombinedFisherTest <- function(analytes,
)
pathwaydf_gene <- outgene[[2]]
outgene <- outgene[[1]]
} else if (pathway_definitions != "RaMP"){
} else if (pathway_definitions != "RaMP") {
outgene <- runFisherTest(
analytes = analytes,
analyte_type = "genes",
Expand Down
3 changes: 3 additions & 0 deletions R/processNewRamp.R
Original file line number Diff line number Diff line change
Expand Up @@ -485,10 +485,13 @@ processData <- function(){
#username = <username>
#conpass = <connection_password>



# run these 3 methods, these generate files in the R RaMP library area
# if committing to git, then copy the new files into your R git project inst/extdata

# pkg.globals <- setConnectionToRaMP(dbname=dbname,username=username,conpass=conpass,host=hostname)

# RaMP:::updateOverlapMatrices(method="balanced" ,all="all")
# RaMP:::updateOverlapMatrices(method="balanced" ,all="analyte")
# RaMP:::processData()
Expand Down
52 changes: 51 additions & 1 deletion R/rampQueryHelper.R
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,9 @@ segregateDataBySource<-function(input_RampIds){
##' @return List of duplicate Wikipathway IDs from Reactome.
##' @author Andrew Patt
find_duplicate_pathways <- function(){

.Deprecated("findDuplicatPathways")

pathway_overlap = analyte_result
duplicate_pairs = data.frame(Pathway1=character(),Pathway2=character())
for(i in 1:ncol(pathway_overlap)){
Expand All @@ -686,7 +689,7 @@ find_duplicate_pathways <- function(){
Pathway2=colnames(pathway_overlap)[i]))
}
}
query <- "select * from analytehaspathway;"
query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
con <- connectToRaMP()
allpids <- RMariaDB::dbGetQuery(con, query)
RMariaDB::dbDisconnect(con)
Expand All @@ -713,6 +716,53 @@ find_duplicate_pathways <- function(){
return(duplicate_pathways)
}

##' Find pathways that duplicate another pathway in the analyte overlap matrix.
##'
##' Two pathways are treated as duplicates when their entry in the packaged
##' \code{analyte_result} overlap matrix is exactly 1 (the diagonal is ignored).
##' For every duplicate pair one pathwayRampId is reported for removal. When
##' exactly one member of the pair is a Reactome pathway, the other member is
##' the one reported, so the Reactome pathway is preferred over wiki or kegg.
##' This may be unnecessary in the future.
##' @return Character vector of pathwayRampIds flagged as duplicates (ids may
##'   be repeated); \code{character(0)} when there are no perfect overlaps.
##' @author John Braisted
findDuplicatePathways <- function() {
  query <- "select pathwayRampId from pathway where type = 'reactome';"
  con <- connectToRaMP()
  # Release the connection even when the query fails.
  on.exit(RMariaDB::dbDisconnect(con), add = TRUE)
  reactomePIDs <- RMariaDB::dbGetQuery(con, query)

  # Reduce the overlap matrix to a 0/1 indicator of perfect (== 1),
  # off-diagonal overlaps.
  ar <- RaMP:::analyte_result
  diag(ar) <- 0.0
  ar[ar != 1.0] <- 0.0

  # Keep only rows/columns that take part in at least one perfect overlap.
  # Row labels are looked up through colnames(); this assumes a square matrix
  # with identical row and column names -- TODO confirm.
  colHits <- colnames(ar)[colSums(ar) >= 1.0]
  rowHits <- colnames(ar)[rowSums(ar) >= 1.0]
  # drop = FALSE keeps a matrix even when a single row or column remains.
  ar2 <- ar[rowHits, colHits, drop = FALSE]

  # which() walks a matrix column by column; transposing first yields the
  # pairs row by row of ar2, i.e. in the order a nested row/column loop would.
  hits <- which(t(ar2) == 1.0, arr.ind = TRUE)
  if (nrow(hits) == 0) {
    # No perfect overlaps: nothing to remove.
    return(character(0))
  }
  colIds <- colnames(ar2)[hits[, 1]]
  rowIds <- rownames(ar2)[hits[, 2]]

  colIsReactome <- colIds %in% reactomePIDs[, 1]
  rowIsReactome <- rowIds %in% reactomePIDs[, 1]

  # Preference for reactome over wiki or kegg: report the column pathway only
  # when the row pathway is Reactome and the column pathway is not; in every
  # other case report the row pathway.
  # NOTE(review): if analyte_result is symmetric, each pair is visited in both
  # orientations, so pairs where both or neither pathway is Reactome end up
  # with both ids in the result -- confirm this is intended.
  reportCol <- !colIsReactome & rowIsReactome
  duplicateIds <- rowIds
  duplicateIds[reportCol] <- colIds[reportCol]

  unname(duplicateIds)
}



#' Filter pathways by p-value cutoff for display and clustering
#' @param fishers_df The data frame generated by runFisherTest
#' @param pval_type Specifies which p-value to use as the filter threshold.
Expand Down
17 changes: 17 additions & 0 deletions man/findDuplicatePathways.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions man/getAnalyteFromPathway.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e24ce93

Please sign in to comment.