diff --git a/README.md b/README.md index 0d94519..30258a7 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Introduction -This code is designed to upload integration sites, PCR breakpoints, and multihits identified by `intSiteCaller` and upload them to the intsitesdev database. The database is currently located at `microbxxx.med.upenn.edu:3306` and is described by the included schema, `insitesdev.sql`. +This code is designed to upload integration sites, PCR breakpoints, and multihits identified by `intSiteCaller` to the intsitesdev database. The database is currently located at `microbxxx.med.upenn.edu:3306` and is described by the included schema, `integration_site_schema.sql`. ## Inputs @@ -17,20 +17,26 @@ primaryAnalysisDirectory │   ├── sites.final.RData │   ├── multihitData.RData │   └── allSites.RData -├── processingParams.tsv -└── sampleInfo.tsv +├─miseqid.txt +└─completeMetadata.RData ``` -There can be as few or as many samples as the user desires in the `primaryAnalysisDirectory`, so long as each sample is represented in both `processingParams.csv` and `sampleInfo.csv`. See [intSiteCaller's Documentation](http://www.github.com/esherm/intSiteCaller) for a description of the values contained in these two metadata files. +There can be as few or as many samples as the user desires in the +`primaryAnalysisDirectory`, so long as each sample is represented in `completeMetadata.RData`. +See [intSiteCaller's +Documentation](http://www.github.com/esherm/intSiteCaller) for a description of +the values contained in this metadata file. ## Usage Code example: ``` cd run20150505 # a recent processed run folder -Rscript path/to/intSiteUploader.R . 
-Rscript intSiteUploader.R +Rscript path/to/intSiteUploader.R +Rscript intSiteUploader.R . [mysql_group] ``` +At present the default value of `mysql_group` is `intsites_miseq`. + Note: * Run intSiteUploader.R only after running intSiteCaller, * Only run one instance at a time, @@ -65,3 +71,10 @@ run id from `myseqid.txt` that should be present in primary analysis folder. +## Testing + +A SQLite version of the database can be created with: + +``` +sqlite3 db.sqlite3 < integration_site_schema.sql +``` diff --git a/helper/reset_microb237.R b/helper/reset_microb237.R index b86b148..15f2dee 100644 --- a/helper/reset_microb237.R +++ b/helper/reset_microb237.R @@ -20,7 +20,7 @@ cmd <- sprintf("mysql --defaults-file=%s -e 'CREATE DATABASE IF NOT EXISTS intsi message(cmd) stopifnot( system(cmd)==0 ) -cmd <- sprintf("mysql --defaults-file=%s intsitesdevtest < %s/intsitesdev.sql", test_db_cnf, codeDir) +cmd <- sprintf("mysql --defaults-file=%s intsitesdevtest < %s/integration_site_schema.sql", test_db_cnf, codeDir) message(cmd) stopifnot( system(cmd)==0 ) diff --git a/intSiteUploader.R b/intSiteUploader.R index c058908..4de9ca0 100644 --- a/intSiteUploader.R +++ b/intSiteUploader.R @@ -1,28 +1,28 @@ -## check for presence of R packages -rPackages <- c("stats", "RMySQL", "GenomicRanges", "BiocGenerics", "parallel", "IRanges", "GenomeInfoDb") -rPackagesPresent <- is.element(rPackages, installed.packages()[,1]) -if(any(!rPackagesPresent)){ - stop(paste(rPackages[!rPackagesPresent]), " is not available") -} -stopifnot(sapply(rPackages, require, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)) +library(RMySQL, quietly=TRUE, verbose=FALSE) +library(dplyr, quietly=TRUE, verbose=FALSE) + +codeDir <- dirname(sub("--file=", "", grep("--file=", commandArgs(trailingOnly=FALSE), value=T))) + +source(file.path(codeDir, "utils.R")) +source(file.path(codeDir, "load_tables.R")) +check_presence_packages() options(stringsAsFactors=F) ## check if file exist and permission .my.cnf 
stopifnot(file.exists("~/.my.cnf")) stopifnot(file.info("~/.my.cnf")$mode == as.octmode("600")) - -##check for presence of command line stuff -commandLinePrograms <- c("mysql") -programsPresent <- !sapply(sprintf("which %s > /dev/null 2>&1", commandLinePrograms), system) -if(any(!programsPresent)){ - stop(paste(commandLinePrograms[!programsPresent]), " is not available") -} +check_presence_command_line_tools() ## working directory (i.e. primary analysis directory) is passed in via command line +# and group for MySQL server args <- commandArgs(trailingOnly=TRUE) workingDir <- args[1] +mysql_group <- "intsites_miseq" +if ( ! is.na(args[2])) { + mysql_group <- args[2] +} if( interactive() | is.na(workingDir) ) workingDir <- "." stopifnot(!is.na(workingDir)) workingDir <- normalizePath(workingDir, mustWork=TRUE) @@ -35,69 +35,49 @@ stopifnot(length(miseqid)==1) message("miseqid: ", miseqid) ## get sample information -stopifnot(all(file.exists("sampleInfo.tsv", "completeMetadata.RData"))) +stopifnot(file.exists("completeMetadata.RData")) metadata <- get(load('completeMetadata.RData')) metadata <- subset(metadata, select=c("alias", "gender", "refGenome")) names(metadata) <- c("sampleName", "gender", "refGenome") ## initialize connection to database ## ~/.my.cnf must be present -junk <- sapply(dbListConnections(MySQL()), dbDisconnect) -dbConn <- dbConnect(MySQL(), group="intSitesDev237") -stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) +dbConn <- dbConnect(MySQL(), group=mysql_group) ## stop if any sample is already loaded -allSampleName <- dbGetQuery(dbConn, "SELECT DISTINCT sampleName FROM samples") -is.loaded <- metadata$sampleName %in% allSampleName$sampleName +read_conn <- create_src_mysql(dbConn) +is.loaded <- setNameExists(select(metadata, sampleName, refGenome), read_conn) if(any(is.loaded)) message( paste0("Sets already in the database: ", - paste(metadata$sampleName[is.loaded], collapse="\n"))) + paste(metadata[is.loaded, ], collapse="\n"))) -if( any(grepl("^GTSP", 
metadata$sampleName[is.loaded], ignore.case=TRUE)) ) stop("GTSP sample already loaded, delete from the database or leave them alone") - -metadata <- subset(metadata, !is.loaded) - -## Get max sampleID, and start from max+1 -stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) -currentMaxSampleID <- as.integer(suppressWarnings(dbGetQuery(dbConn, "SELECT MAX(sampleID) AS sampleID FROM samples;"))) -if(is.na(currentMaxSampleID)) { - nrows <- as.integer(dbGetQuery(dbConn, "SELECT count(*) FROM samples;")) - if(nrows==0) currentMaxSampleID<-0 - if(nrows!=0) stop("Failed to get currentMaxSampleID") +if( any(grepl("^GTSP", metadata$sampleName[is.loaded], ignore.case=TRUE)) ) { + stop("GTSP sample already loaded, delete from the database or leave them alone") } -## load table samples -metadata$sampleID <- seq(nrow(metadata))+currentMaxSampleID -metadata$miseqid <- miseqid -stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) -stopifnot( dbWriteTable(dbConn, "samples", metadata, append=T, row.names=F) ) -## check wether load was successful -sample.tab <- suppressWarnings(dbReadTable(dbConn, "samples")) -merged.tab <- merge(metadata, sample.tab, by="sampleName", all.x=TRUE) -if( !all(merged.tab$sampleID.x==merged.tab$sampleID.y) ) { - message("Sample ID error, check the following table") - print(merged.tab) +metadata <- subset(metadata, ! is.loaded) +if (nrow(metadata) == 0) { + message("All samples are already in the DB. 
Nothing is pushed into DB.") + q() } +metadata$miseqid <- miseqid + +dbGetQuery(dbConn, "START TRANSACTION;") +metadata <- write_table_samples(dbConn, metadata) ## Get max siteID, and start from max+1 -stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) -currentMaxSiteID <- as.integer(suppressWarnings(dbGetQuery(dbConn, "SELECT MAX(siteID) AS siteID FROM sites;"))) +currentMaxSiteID <- as.integer(dbGetQuery(dbConn, "SELECT MAX(siteID) AS siteID FROM sites;")) if(is.na(currentMaxSiteID)) { - nrows <- as.integer(dbGetQuery(dbConn, "SELECT count(*) FROM sites;")) - if(nrows==0) currentMaxSiteID<-0 - if(nrows!=0) stop("Failed to get currentMaxSiteID") + currentMaxSiteID<-0 } ## Get max MultihitID, and start from max+1 -stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) -currentMaxMultihitID <- as.integer(suppressWarnings(dbGetQuery(dbConn, "SELECT MAX(multihitID) AS multihitID FROM multihitpositions;"))) +currentMaxMultihitID <- as.integer(dbGetQuery(dbConn, "SELECT MAX(multihitID) AS multihitID FROM multihitpositions;")) if(is.na(currentMaxMultihitID)) { - nrows <- as.integer(dbGetQuery(dbConn, "SELECT count(*) FROM multihitpositions;")) - if(nrows==0) currentMaxMultihitID<-0 - if(nrows!=0) stop("Failed to get currentMaxSiteID") + currentMaxMultihitID<-0 } -## process by sample +## process by sample and upload to sites, pcrbreakpoints, multihitpositions, multihitlengths for(i in seq(nrow(metadata))){ file <- metadata[i,"sampleName"] message("\nProcessing: ", file) @@ -141,49 +121,14 @@ for(i in seq(nrow(metadata))){ pcrBreakpoints$count <- as(pcrBreakpoints$count, "integer") ## load table sites - stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) message("Loading sites: ", nrow(sites), " entries") stopifnot( dbWriteTable(dbConn, "sites", sites, append=T, row.names=F) ) - ## check loaded sites - sql <- sprintf("SELECT * FROM sites WHERE siteID>=%s AND siteID<=%s", - range(sites$siteID)[1], - range(sites$siteID)[2]) - sites.from.db <- suppressWarnings( dbGetQuery(dbConn, sql) ) - 
sites.from.db$siteID <- as(sites.from.db$siteID, "integer") - sites.from.db$sampleID <- as(sites.from.db$sampleID, "integer") - sites.from.db$position <- as(sites.from.db$position, "integer") - sites.from.db$chr <- as(sites.from.db$chr, "character") - sites.from.db$strand <- as(sites.from.db$strand, "character") - - sites <- plyr::arrange(sites, siteID, sampleID, position, chr, strand) - sites.from.db <- plyr::arrange(sites.from.db, siteID, sampleID, position, chr, strand) - if(!identical(sites, sites.from.db)) { - save.image("debug.rdata") - stop("sites, sites.from.db not identical") - } ## load table pcrbreakpoints - stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) message("Loading pcrbreakpoints: ", nrow(pcrBreakpoints), " entries") stopifnot( dbWriteTable(dbConn, "pcrbreakpoints", pcrBreakpoints, append=T, row.names=F) ) - ## check loaded pcrbreakpoints - sql <- sprintf("SELECT * FROM pcrbreakpoints WHERE siteID>=%s AND siteID<=%s", - range(sites$siteID)[1], - range(sites$siteID)[2]) - pcrbreakpoints.from.db <- suppressWarnings( dbGetQuery(dbConn, sql) ) - pcrbreakpoints.from.db$siteID <- as(pcrbreakpoints.from.db$siteID, "integer") - pcrbreakpoints.from.db$breakpoint <- as(pcrbreakpoints.from.db$breakpoint, "integer") - pcrbreakpoints.from.db$count <- as(pcrbreakpoints.from.db$count, "integer") - pcrBreakpoints <- plyr::arrange(pcrBreakpoints, siteID, breakpoint, count) - pcrbreakpoints.from.db <- plyr::arrange(pcrbreakpoints.from.db, siteID, breakpoint, count) - if(!identical(pcrBreakpoints, pcrbreakpoints.from.db)) { - save.image("debug.rdata") - stop("pcrBreakpoints, pcrbreakpoints.from.db not identical") - } - - newMaxSiteID <- as.integer(suppressWarnings(dbGetQuery(dbConn, "SELECT MAX(siteID) AS siteID FROM sites;"))) - stopifnot(newMaxSiteID == currentMaxSiteID + nrow(sites)) + newMaxSiteID = currentMaxSiteID + nrow(sites) currentMaxSiteID <- newMaxSiteID } } @@ -222,53 +167,21 @@ for(i in seq(nrow(metadata))){ multihitLengths$count <- 
as(multihitLengths$count, "integer") ## load table multihitpositions - stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) message("Loading multihitpositions:", nrow(multihitPositions), " entries") stopifnot( dbWriteTable(dbConn, "multihitpositions", multihitPositions, append=T, row.names=F) ) - ## check loaded multihitpositions - sql <- sprintf("SELECT * FROM multihitpositions WHERE multihitID>=%s AND multihitID<=%s", - range(multihitPositions$multihitID)[1], - range(multihitPositions$multihitID)[2]) - multihitpositions.from.db <- suppressWarnings( dbGetQuery(dbConn, sql) ) - multihitpositions.from.db$multihitID <- as(multihitpositions.from.db$multihitID, "integer") - multihitpositions.from.db$sampleID <- as(multihitpositions.from.db$sampleID, "integer") - multihitpositions.from.db$position <- as(multihitpositions.from.db$position, "integer") - multihitpositions.from.db$chr <- as(multihitpositions.from.db$chr, "character") - multihitpositions.from.db$strand <- as(multihitpositions.from.db$strand, "character") - - multihitpositions.from.db <- plyr::arrange(multihitpositions.from.db, multihitID,sampleID, position,chr,strand) - multihitPositions <- plyr::arrange(multihitPositions, multihitID, sampleID,position,chr,strand) - if(!identical(multihitPositions, multihitpositions.from.db)) { - save.image("debug.rdata") - stop("multihitPositions, multihitpositions.from.db not identical") - } ## load table multihitlengths - stopifnot(dbGetQuery(dbConn, "SELECT 1")==1) message("Loading multihitlengths: ", nrow(multihitLengths), " entries") stopifnot( dbWriteTable(dbConn, "multihitlengths", multihitLengths, append=T, row.names=F) ) - ## check loaded multihitlengths - sql <- sprintf("SELECT * FROM multihitlengths WHERE multihitID>=%s AND multihitID<=%s", - range(multihitPositions$multihitID)[1], - range(multihitPositions$multihitID)[2]) - multihitlengths.from.db <- suppressWarnings( dbGetQuery(dbConn, sql) ) - multihitlengths.from.db$multihitID <- 
as(multihitlengths.from.db$multihitID, "integer") - multihitlengths.from.db$length <- as(multihitlengths.from.db$length, "integer") - multihitlengths.from.db$count <- as(multihitlengths.from.db$count, "integer") - - multihitlengths.from.db <- plyr::arrange(multihitlengths.from.db, multihitID, length, count) - multihitLengths <- plyr::arrange(multihitLengths, multihitID, length, count) - if(!identical(multihitLengths, multihitlengths.from.db)) { - save.image("debug.rdata") - stop("multihitLengths, multihitlengths.from.db not identical") - } - - newMaxMultihitID <- as.integer(suppressWarnings(dbGetQuery(dbConn, "SELECT MAX(multihitID) AS multihitID FROM multihitpositions;"))) - stopifnot(newMaxMultihitID == currentMaxMultihitID + length(unique(multihitPositions$multihitID))) + newMaxMultihitID = currentMaxMultihitID + length(unique(multihitPositions$multihitID)) currentMaxMultihitID <- newMaxMultihitID } } } +dbGetQuery(dbConn, "COMMIT;") + +check_write_table_samples(dbConn, metadata) + dbDiscon <- dbDisconnect(dbConn) diff --git a/integration_site_schema.sql b/integration_site_schema.sql new file mode 100644 index 0000000..06f6c0b --- /dev/null +++ b/integration_site_schema.sql @@ -0,0 +1,54 @@ +-- drop tables with FK constraints +DROP TABLE IF EXISTS pcrbreakpoints; +DROP TABLE IF EXISTS sites; +DROP TABLE IF EXISTS multihitlengths; +DROP TABLE IF EXISTS multihitpositions; +DROP TABLE IF EXISTS samples; + +CREATE TABLE samples ( + sampleID int NOT NULL, + sampleName varchar(255) NOT NULL, + refGenome varchar(10) NOT NULL, + gender char(1) NOT NULL, + miseqid varchar(255), + PRIMARY KEY (sampleID), + CONSTRAINT uniq_samples UNIQUE (sampleName, refGenome) +); + +-- unique hits +CREATE TABLE sites ( + siteID int NOT NULL, + sampleID int NOT NULL, + position int NOT NULL, + chr varchar(255) NOT NULL, + strand char(1) NOT NULL, + PRIMARY KEY (siteID), + FOREIGN KEY (sampleID) REFERENCES samples(sampleID) +); + +CREATE TABLE `pcrbreakpoints` ( + siteID int NOT NULL, + 
breakpoint int NOT NULL, + count int NOT NULL, + PRIMARY KEY (siteID, breakpoint), + FOREIGN KEY (siteID) REFERENCES sites(siteID) +); + +-- multihit schema +CREATE TABLE multihitpositions ( + multihitID int NOT NULL, + sampleID int NOT NULL, + position int NOT NULL, + chr varchar(255) NOT NULL, + strand char(1) NOT NULL, + PRIMARY KEY (multihitID, position, chr, strand), + FOREIGN KEY (sampleID) REFERENCES samples(sampleID) +); + +CREATE TABLE multihitlengths ( + multihitID int NOT NULL, + length int NOT NULL, + count int NOT NULL, + PRIMARY KEY (multihitID, length), + FOREIGN KEY (multihitID) REFERENCES multihitpositions(multihitID) +); diff --git a/intsitesdev.sql b/intsitesdev.sql deleted file mode 100644 index 74a8db8..0000000 --- a/intsitesdev.sql +++ /dev/null @@ -1,115 +0,0 @@ --- MySQL dump 10.13 Distrib 5.1.73, for redhat-linux-gnu (x86_64) --- --- Host: microb237.med.upenn.edu Database: intsitesdevtest --- ------------------------------------------------------ --- Server version 5.5.5-10.0.17-MariaDB - -/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; -/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; -/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; -/*!40101 SET NAMES utf8 */; -/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; -/*!40103 SET TIME_ZONE='+00:00' */; -/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; -/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; -/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; -/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; - --- --- Table structure for table `multihitlengths` --- - -DROP TABLE IF EXISTS `multihitlengths`; -/*!40101 SET @saved_cs_client = @@character_set_client */; -/*!40101 SET character_set_client = utf8 */; -CREATE TABLE `multihitlengths` ( - `multihitID` int(11) unsigned NOT NULL, - `length` int(11) unsigned NOT NULL, - `count` int(8) unsigned NOT NULL DEFAULT 
'1', - PRIMARY KEY (`multihitID`,`length`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; -/*!40101 SET character_set_client = @saved_cs_client */; - --- --- Table structure for table `multihitpositions` --- - -DROP TABLE IF EXISTS `multihitpositions`; -/*!40101 SET @saved_cs_client = @@character_set_client */; -/*!40101 SET character_set_client = utf8 */; -CREATE TABLE `multihitpositions` ( - `multihitID` int(11) unsigned NOT NULL, - `sampleID` int(11) unsigned NOT NULL, - `position` int(11) unsigned NOT NULL, - `chr` varchar(20) NOT NULL DEFAULT '', - `strand` char(1) NOT NULL DEFAULT '', - PRIMARY KEY (`multihitID`,`position`,`chr`,`strand`), - KEY `sampleID` (`sampleID`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; -/*!40101 SET character_set_client = @saved_cs_client */; - --- --- Table structure for table `pcrbreakpoints` --- - -DROP TABLE IF EXISTS `pcrbreakpoints`; -/*!40101 SET @saved_cs_client = @@character_set_client */; -/*!40101 SET character_set_client = utf8 */; -CREATE TABLE `pcrbreakpoints` ( - `siteID` int(11) unsigned NOT NULL, - `breakpoint` int(11) unsigned NOT NULL, - `count` int(8) unsigned NOT NULL DEFAULT '1', - PRIMARY KEY (`siteID`,`breakpoint`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; -/*!40101 SET character_set_client = @saved_cs_client */; - --- --- Table structure for table `samples` --- - -DROP TABLE IF EXISTS `samples`; -/*!40101 SET @saved_cs_client = @@character_set_client */; -/*!40101 SET character_set_client = utf8 */; -CREATE TABLE `samples` ( - `sampleID` int(11) unsigned NOT NULL, - `sampleName` varchar(255) NOT NULL DEFAULT '', - `refGenome` varchar(10) NOT NULL DEFAULT '', - `gender` char(1) NOT NULL DEFAULT '', - `miseqid` varchar(255) DEFAULT NULL, - PRIMARY KEY (`sampleID`), - UNIQUE KEY `sampleName` (`sampleName`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; -/*!40101 SET character_set_client = @saved_cs_client */; - --- --- Table structure for table `sites` --- - -DROP TABLE IF EXISTS `sites`; -/*!40101 SET @saved_cs_client = 
@@character_set_client */; -/*!40101 SET character_set_client = utf8 */; -CREATE TABLE `sites` ( - `siteID` int(11) unsigned NOT NULL, - `sampleID` int(11) unsigned NOT NULL, - `position` int(11) unsigned NOT NULL, - `chr` varchar(20) NOT NULL DEFAULT '', - `strand` char(1) NOT NULL DEFAULT '', - PRIMARY KEY (`siteID`), - KEY `sampleID` (`sampleID`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1; -/*!40101 SET character_set_client = @saved_cs_client */; - --- --- Dumping routines for database 'intsitesdevtest' --- -/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; - -/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; -/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; -/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; -/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; -/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; -/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; -/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; - --- Dump completed on 2015-06-19 12:39:12 diff --git a/load_tables.R b/load_tables.R new file mode 100644 index 0000000..63e6a6b --- /dev/null +++ b/load_tables.R @@ -0,0 +1,22 @@ +write_table_samples <- function(dbConn, metadata) { + ## Get max sampleID, and start from max+1 + currentMaxSampleID <- as.integer(dbGetQuery(dbConn, "SELECT MAX(sampleID) AS sampleID FROM samples;")) + if(is.na(currentMaxSampleID)) { # empty DB + currentMaxSampleID<-0 + } + + ## load table samples + metadata$sampleID <- seq(nrow(metadata))+currentMaxSampleID + stopifnot( dbWriteTable(dbConn, "samples", metadata, append=T, row.names=F) ) + metadata +} + +check_write_table_samples <- function(dbConn, metadata) { + ## check wether load was successful + sample.tab <- suppressWarnings(dbReadTable(dbConn, "samples")) + merged.tab <- merge(metadata, sample.tab, by=c("sampleName", "refGenome"), all.x=TRUE) + if( !all(merged.tab$sampleID.x==merged.tab$sampleID.y) ) { + message("Sample ID error, check the following table") + print(merged.tab) + } +} diff --git 
a/utils.R b/utils.R new file mode 100644 index 0000000..5ec4a45 --- /dev/null +++ b/utils.R @@ -0,0 +1,23 @@ +#' create srq_mysql without specifying params in ctor +create_src_mysql <- function(con) { + info <- dbGetInfo(con) + src_sql("mysql", con, info = info) +} + +check_presence_packages <- function() { + rPackages <- c("intSiteRetriever", "stats", "RMySQL", "GenomicRanges", + "BiocGenerics", "parallel", "IRanges", "GenomeInfoDb") + rPackagesPresent <- is.element(rPackages, installed.packages()[,1]) + if(any(!rPackagesPresent)){ + stop(paste(rPackages[!rPackagesPresent]), " is not available") + } + stopifnot(sapply(rPackages, require, character.only=TRUE, quietly=TRUE, warn.conflicts=FALSE)) +} + +check_presence_command_line_tools <- function() { + commandLinePrograms <- c("mysql") + programsPresent <- !sapply(sprintf("which %s > /dev/null 2>&1", commandLinePrograms), system) + if(any(!programsPresent)){ + stop(paste(commandLinePrograms[!programsPresent]), " is not available") + } +}