Skip to content

Commit

Permalink
Merge pull request #29 from BushmanLab/db_schema
Browse files Browse the repository at this point in the history
Db schema: PK is sampleName + refGenome
  • Loading branch information
anatolydryga committed Aug 19, 2015
2 parents 1183308 + e89ebb1 commit c16c5fc
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 57 deletions.
42 changes: 23 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,40 +52,44 @@ Rscript path/to/makeGeneTherapyPatientReport.R #check cu
Rscript path/to/makeGeneTherapyPatientReport.R ~/Frances/run20150505 #check a run folder
```

#### Note
Do NOT run multiple instances within the same folder
Reference genome is specified by `--ref_genome` or `-r` option with default of `hg18`:
```
Rscript path/to/makeGeneTherapyPatientReport.R ~/Frances/run20150505 --ref_genome hg19
```

To connect to databases .my.cnf should be present in ~ and group can be changed with `--sites_group` option
with default of `intsites_miseq` for integration sites DB:
```
Rscript path/to/makeGeneTherapyPatientReport.R ~/Frances/run20150505 --sites_group test_db
```

Metadata for GTSP is held in specimen_management DB and can be changed with `--gtsp_group` option
with default "specimen_management":
```
Rscript path/to/makeGeneTherapyPatientReport.R ~/Frances/run20150505 --gtsp_group gtsp_group_in_my_cnf
```

#### Database connfig file location


#### Database config file location

config file should be in home directory and called .my.cnf,
e.g. ~/.my.cnf

The .my.cnf format is as follows:

```
[intSitesDEV-dev]
[GROUP_NAME]
user=YYYYYYY
password=XXXXXX
host=microb98.med.upenn.edu
host=microbYYYY.med.upenn.edu
port=3309
database=intsitesdev
database=intsites_miseq
```

#### Dependencies

intSiteRetriever : https://github.com/esherm/intSiteRetriever
(at present get the project in current folder:
```
git clone https://github.com/esherm/intSiteRetriever
```)
hiAnnotator
reldist
Cancer Gene list:
```
git clone https://github.com/anatolydryga/CancerGeneList.git
```
...

#### Testing

Expand Down
68 changes: 36 additions & 32 deletions makeGeneTherapyPatientReport.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
options(stringsAsFactors = FALSE)

#### INPUTS: csv file/table GTSP to sampleName ####
csvfile <- "sampleName_GTSP.csv"
library(argparse)

parser <- ArgumentParser(description="Gene Therapy Patient Report for Single Patient")
parser$add_argument("sample_gtsp", nargs='?', default='sampleName_GTSP.csv')
parser$add_argument("-s", action='store_true', help="abundance by sonicLength package (Berry, C. 2012)")
parser$add_argument("--ref_genome", default="hg18", help="reference genome used for all samples")
parser$add_argument("--sites_group", default="intsites_miseq", help="group to use for integration sites db from ~/.my.cnf")
parser$add_argument("--gtsp_group", default="specimen_management", help="group to use for specimen management GTSP db from ~/.my.cnf")
arguments <- parser$parse_args()

args <- commandArgs(trailingOnly=TRUE)

use.sonicLength <- TRUE
# defaults:
use.sonicLength <- ! arguments$s
db_group_sites <- arguments$sites_group
db_group_gtsp <- arguments$gtsp_group
ref_genome <- arguments$ref_genome
#### INPUTS: csv file/table GTSP to sampleName ####
csvfile <- arguments$sample_gtsp

if( length(args)==1 ) {
csvfile <- args[1]
}
if( length(args)==2 & args[2] == "-s"){
csvfile <- args[1]
use.sonicLength <- FALSE
}else if(length(args)==2 & args[2] != "-s"){
message("Incorrect flags.")
stop()
}
if( !file.exists(csvfile) ) stop(csvfile, "not found")

codeDir <- dirname(sub("--file=", "", grep("--file=", commandArgs(trailingOnly=FALSE), value=T)))
Expand Down Expand Up @@ -59,13 +63,17 @@ stopifnot(all(c("sampleName", "GTSP") %in% colnames(sampleName_GTSP)))
message("\nGenerating report from the following sets")
print(sampleName_GTSP)

junk <- sapply(dbListConnections(MySQL()), dbDisconnect)
dbConn <- dbConnect(MySQL(), group="intSitesDev237")
stopifnot(all(setNameExists(sampleName_GTSP$sampleName, dbConn)))
dbConn <- dbConnect(MySQL(), group=db_group_sites)
info <- dbGetInfo(dbConn)
dbConn <- src_sql("mysql", dbConn, info = info)

sampleName_GTSP$refGenome <- rep(ref_genome, nrow(sampleName_GTSP))

stopifnot(all(setNameExists(sampleName_GTSP, dbConn)))

read_sites_sample_GTSP <- get_read_site_totals(sampleName_GTSP, dbConn)

sets <- get_metadata_for_GTSP(unique(sampleName_GTSP$GTSP))
sets <- get_metadata_for_GTSP(unique(sampleName_GTSP$GTSP), db_group_gtsp)
# reports are for a single patient
stopifnot(length(unique(sets$Patient)) == 1)
patient <- sets$Patient[1]
Expand All @@ -84,18 +92,17 @@ sets[sets$Timepoint=="NULL", "Timepoint"] = "d0"
sets <- merge(sets, read_sites_sample_GTSP)
sets$Timepoint <- sortFactorTimepoints(sets$Timepoint)

junk <- sapply(dbListConnections(MySQL()), dbDisconnect)
dbConn <- dbConnect(MySQL(), group="intSitesDev237")
refGenomes <- getRefGenome(sampleName_GTSP$sampleName, dbConn)
# at present the whole report is done for one genome
stopifnot(length(unique(refGenomes$refGenome))==1)
freeze <- refGenomes[1, "refGenome"]
stopifnot(length(unique(sampleName_GTSP$refGenome))==1)
freeze <- sampleName_GTSP[1, "refGenome"]

##==========GET AND PERFORM BASIC DEREPLICATION/SONICABUND ON SITES=============
message("Fetching unique sites and estimating abundance")
junk <- sapply(dbListConnections(MySQL()), dbDisconnect)
dbConn <- dbConnect(MySQL(), group="intSitesDev237")
sites <- merge(getUniquePCRbreaks(sampleName_GTSP$sampleName, dbConn), sampleName_GTSP)
dbConn <- dbConnect(MySQL(), group=db_group_sites)
info <- dbGetInfo(dbConn)
dbConn <- src_sql("mysql", dbConn, info = info)
sites <- merge(getUniquePCRbreaks(sampleName_GTSP, dbConn), sampleName_GTSP)
names(sites)[names(sites)=="position"] <- "integration"

#we really don't care about seqinfo - we just want a GRange object for easy manipulation
uniqueSites.gr <- GRanges(seqnames=Rle(sites$chr),
Expand Down Expand Up @@ -318,9 +325,10 @@ timepointPopulationInfo <- melt(timepointPopulationInfo, "group")

#==================Get abundance for multihit events=====================
message("Fetching multihit sites and estimating abundance")
junk <- sapply(dbListConnections(MySQL()), dbDisconnect)
dbConn <- dbConnect(MySQL(), group="intSitesDev237")
sites.multi <- merge( suppressWarnings(getMultihitLengths(sampleName_GTSP$sampleName, dbConn)), sampleName_GTSP)
dbConn <- dbConnect(MySQL(), group=db_group_sites)
info <- dbGetInfo(dbConn)
dbConn <- src_sql("mysql", dbConn, info = info)
sites.multi <- merge( suppressWarnings(getMultihitLengths(sampleName_GTSP, dbConn)), sampleName_GTSP)
if( nrow(sites.multi) > 0 ) {
sites.multi <- sites.multi %>%
group_by(multihitID) %>%
Expand Down Expand Up @@ -359,10 +367,6 @@ if( nrow(sites.multi) > 0 ) {

}


##write.csv(as.data.frame(standardizedDereplicatedSites),
## file=paste(trial, patient, "uniquehit.csv", sep="."))

save.image(RDataFile)
##end setting variables for markdown report

Expand Down
7 changes: 3 additions & 4 deletions read_site_totals.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
##source("intSiteRetriever/R/intSiteRetriever.R")

#' for a given df with 2 cols: sampleName and GTSP
#' gets counts from intSite DB and collapse replicates.
#' @param df with cols: sampleName, GTSP
Expand All @@ -15,8 +13,9 @@ get_read_site_totals <- function(sampleName_GTSP, connection) {
#' numbers.
#' @return df with 2 cols: GTSP, column_name
get_count_per_GTSP <- function(sampleName_GTSP, database_function, column_name, connection) {
sample_count <- database_function(sampleName_GTSP$sampleName, connection)
names(sample_count) <- c("sampleName", column_name)
# workaround suppressWarnings: type in DB of count int lead to warning when casting to num in R
sample_count <- suppressWarnings(database_function(sampleName_GTSP, connection))
names(sample_count) <- c("sampleName", "refGenome", column_name)
sample_count_GTSP <- merge(sample_count, sampleName_GTSP)
aggregate(sample_count_GTSP[column_name], sample_count_GTSP['GTSP'], FUN=sum)
}
4 changes: 2 additions & 2 deletions specimen_management.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#' @param GTSP vector of GTSP
#' @return df with columns: Trial, GTSP, Patient, Timepoint, CellType, FragMethod, VCN.
#' @note all GTSP in th vector should be unique
get_metadata_for_GTSP <- function(GTSP) {
get_metadata_for_GTSP <- function(GTSP, db_group) {
stopifnot(length(GTSP) == length(unique(GTSP)))
GTSP = paste0("^", GTSP, "$", collapse="|")

GTSPDBconn <- dbConnect(MySQL(), group="specimen_management")
GTSPDBconn <- dbConnect(MySQL(), group=db_group)

query = paste0("SELECT Trial, SpecimenAccNum, Patient, Timepoint, CellType,
SamplePrepMethod, VCN
Expand Down

0 comments on commit c16c5fc

Please sign in to comment.