diff --git a/Rulegraph.png b/Rulegraph.png index 7b9c939..02f5d5e 100644 Binary files a/Rulegraph.png and b/Rulegraph.png differ diff --git a/config/config_annotation.json b/config/config_annotation.json index 8a942c9..378cc18 100644 --- a/config/config_annotation.json +++ b/config/config_annotation.json @@ -30,6 +30,7 @@ "annot_tcc" : "hg19_targated_cancer_care_10262015.txt", "annot_civic" : "hg19_civic_10262015.txt", "blacklisted" : "hg19_blacklistedSites.txt", + "whitelisted" : "hg19_whitelistedSites.txt", "geneList" : "geneLists/combinedList_01032017", "cgc" : "geneLists/CancerGeneCensus.v78.txt", "somaticActSites" : "hg19_SomaticActionableSites.txt", diff --git a/config/config_common_biowulf.json b/config/config_common_biowulf.json index f57112a..a5116d7 100644 --- a/config/config_common_biowulf.json +++ b/config/config_common_biowulf.json @@ -41,7 +41,7 @@ "nextera" : "/data/Clinomics/Ref/khanlab/design/nextera.target.refseq.ensembl.noUTR.notfound.bed", "strexome" : "/data/Clinomics/Ref/khanlab/design/Strexome.target.bed", "sureselect.exon.v5_utr": "/data/Clinomics/Ref/khanlab/design/NIAID.sureselect.exon.v5_utr.target.bed", - "sureselect.exon.v6" : "/data/Clinomics/Ref/khanlab/design/NIAID.sureselect.exon.v6.target.bed", + "sureselect.exon.v6" : "/data/Clinomics/Ref/khanlab/design/NIAID.sureselect.exon.v6.target.bed" }, "hotspot_bed" :{ "wholegenome" : "/data/Clinomics/Ref/khanlab/hotspot/hg19_Hotspot.08.08.16_merged.bed", diff --git a/dag.png b/dag.png index 9c81ff7..ac9834a 100644 Binary files a/dag.png and b/dag.png differ diff --git a/ngs_pipeline.snakefile b/ngs_pipeline.snakefile index fad81ac..2d61401 100644 --- a/ngs_pipeline.snakefile +++ b/ngs_pipeline.snakefile @@ -152,12 +152,11 @@ ALL_QC += expand("{subject}/{TIME}/igv/session_{subject}.xml", TIME=TIME, su for subject in config['subject']: for library in config['subject'][subject]: if config['sample_captures'][library] not in config['Panel_List']: + # any output which is desired on all libraries but Panel goes here, the list of panel captures should be maintained in the Panel_List in config file ALL_QC += [subject+"/"+TIME+"/qc/"+subject+".circos.png"] - if config['sample_type'][library] == 'Normal': - # This might be temporary, if needed on all sample remove line above line. - ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/seq2HLA/"+library+"-ClassI.HLAgenotype4digits"] - ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/HLAminer/HLAminer_HPTASR.csv"] - ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/"+library+".Calls.txt"] + ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/seq2HLA/"+library+"-ClassI.HLAgenotype4digits"] + ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/HLAminer/HLAminer_HPTASR.csv"] + ALL_QC += [subject+"/"+TIME+"/"+library+"/HLA/"+library+".Calls.txt"] if len(config['sample_references']) > 0: for Tumor in config['sample_references']: @@ -172,6 +171,7 @@ if len(config['sample_references']) > 0: seq2HLA = "{subject}/{TIME}/{sample}/HLA/seq2HLA/{sample}-ClassI.HLAgenotype4digits".format(TIME=TIME, subject=SAMPLE_TO_SUBJECT[Normal], sample=Normal) HLAminer = "{subject}/{TIME}/{sample}/HLA/HLAminer/HLAminer_HPTASR.csv".format(TIME=TIME, subject=SAMPLE_TO_SUBJECT[Normal], sample=Normal) if config['sample_captures'][Tumor] not in config['Panel_List']: + # any output which is desired on all somatic libraries but Panel goes here, the list of panel captures should be maintained in the Panel_List in config file HLA[Tumor] = [seq2HLA, HLAminer] SequenzaPairs[Tumor] = ["{subject}/{TIME}/{sample}/{sample}.mpileup.gz".format(TIME=TIME, subject=SAMPLE_TO_SUBJECT[Normal], sample=Normal), "{subject}/{TIME}/{sample}/{sample}.mpileup.gz".format(TIME=TIME, subject=SAMPLE_TO_SUBJECT[Tumor], sample=Tumor) ] ########################################################################### @@ -223,6 +223,7 @@ for sample in config['sample_references'].keys(): UNION_SOM_MUT[sample] = local UNION_SOM_MUT_LIST +=[subject+"/"+TIME+ACT_DIR+sample+".unionSomaticVars.txt"] if config['sample_captures'][sample] not in config['Panel_List']: + # any output which is desired on all libraries but Panel goes here, the list of panel captures should be maintained in the Panel_List in config file UNION_SOM_MUT_LIST +=[subject+"/"+TIME+ACT_DIR+sample+".mutationalSignature.pdf"] ########################################################################## @@ -374,15 +375,13 @@ rule Khanlab_Pipeline: ActionableFiles, UNION_SOM_MUT_LIST, expand ("ngs_pipeline_{NOW}.rnaseq.done", NOW=NOW) - version: "1.0" + version: config["pipeline_version"] wildcard_constraints: NOW="\w+" params: rulename = "Final", batch = config[config['host']]["job_default"], group = config["group"], - wait4job = NGS_PIPELINE + "/scripts/block_for_jobid.pl", - sort = NGS_PIPELINE + "/scripts/awk_sort_withHeader.awk", mail = NGS_PIPELINE + "/scripts/tsv2html.final.sh", email = config["mail"], host = config["host"], @@ -400,7 +399,7 @@ rule Khanlab_Pipeline: chmod g+rw {WORK_DIR}/${{sub}}/{TIME}/successful.txt chgrp {params.group} {WORK_DIR}/${{sub}}/{TIME}/successful.txt done - ssh {params.host} "{params.mail} --location {WORK_DIR} --host {params.host} --head {WORK_DIR}/ngs_pipeline_{NOW}.csv |mutt -e \\\"my_hdr Content-Type: text/html\\\" -s 'Khanlab ngs-pipeline Status' `whoami`@mail.nih.gov {params.email}" + ssh {params.host} "{params.mail} --location {WORK_DIR} --host {params.host} --head --version {version} {WORK_DIR}/ngs_pipeline_{NOW}.csv |mutt -e \\\"my_hdr Content-Type: text/html\\\" -s 'Khanlab ngs-pipeline Status' `whoami`@mail.nih.gov {params.email}" rm -rf {WORK_DIR}/ngs_pipeline_{NOW}.csv ####################### """ @@ -685,6 +684,7 @@ rule CN_LRR: ####################### module load R/{version} module load bedtools/2.25.0 + export LC_ALL=C mkdir -p {wildcards.subject}/{TIME}/Actionable/ echo -e "#Chr\\tStart\\tEnd\\tNormalCoverage\\tTumorCoverage\\tRatio\\tLRR\\tGene(s)\\tStrand(s)" >{output.out} paste {input.files} |cut -f 1-4,8 |awk '{{OFS="\\t"}};{{print $1,$2,$3,$4,$5,($5+1)/($4+1),log(($5+1)/($4+1))/log(2)}}' >{output.out}.temp1 @@ -1004,6 +1004,7 @@ rule FormatInput: shell: """ ####################### module load annovar/{version} + export LC_ALL=C cut -f 1-5 {input.txtFiles} |sort |uniq > {wildcards.subject}/{TIME}/annotation/AnnotationInput perl {input.convertor} {wildcards.subject}/{TIME}/annotation/AnnotationInput rm -rf "{wildcards.subject}/{TIME}/annotation/AnnotationInput.pph" diff --git a/ruleBook/NeoAntigen.snakefile b/ruleBook/NeoAntigen.snakefile index 748cf5c..2cbf33b 100644 --- a/ruleBook/NeoAntigen.snakefile +++ b/ruleBook/NeoAntigen.snakefile @@ -54,6 +54,7 @@ rule MergeHLA: batch = config[config['host']]["job_default"] shell: """ ####################### + export LC_ALL=C perl {input.Tool} {input.B} {input.A} | sort > {output} ####################### """ @@ -79,7 +80,7 @@ rule VEP: perl {input.tool} -vcf {wildcards.base}/{wildcards.TIME}/{wildcards.sample}/calls/{wildcards.sample}.strelka.indels.raw.vcf,{wildcards.base}/{wildcards.TIME}/{wildcards.sample}/calls/{wildcards.sample}.strelka.snvs.raw.vcf,{wildcards.base}/{wildcards.TIME}/{wildcards.sample}/calls/{wildcards.sample}.MuTect.raw.vcf -order {params.normal},{wildcards.sample} -filter REJECT |vcf-subset -u -c {wildcards.sample} >{output.vcf}.tmp variant_effect_predictor.pl -i {output.vcf}.tmp --plugin Downstream --plugin Wildtype --terms SO --offline --cache --dir_cache $VEPCACHEDIR --assembly GRCh37 --output_file {output.vcf} --vcf --force_overwrite rm -rf {output.vcf}.tmp - + export LC_ALL=C perl {input.merge} {input.HLA[0]} {input.HLA[1]} | sort > {wildcards.base}/{wildcards.TIME}/{params.normal}/HLA/{params.normal}.Calls.txt ####################### """ @@ -105,6 +106,7 @@ rule pVACseq: module load pvacseq python/{params.python} pvacseq run --iedb-install-directory {params.IEDB} -e 8,9,10,11 --fasta-size=200 {input} {wildcards.sample} ${{allele}} {{NNalign,NetMHC,NetMHCIIpan,NetMHCcons,NetMHCpan,PickPocket,SMM,SMMPMBEC,SMMalign}} {wildcards.base}/{wildcards.TIME}/{wildcards.sample}/NeoAntigen/ + export LC_ALL=C perl {params.tool} {output} |sort |uniq >{wildcards.base}/{wildcards.TIME}/{wildcards.sample}/NeoAntigen/{wildcards.sample}.final.txt ####################### """ diff --git a/ruleBook/annot.snakefile b/ruleBook/annot.snakefile index c590590..02a8869 100644 --- a/ruleBook/annot.snakefile +++ b/ruleBook/annot.snakefile @@ -255,9 +255,9 @@ rule CombineAnnotation: "{base}/AnnotationInput.sift.out", convertor = NGS_PIPELINE + "/scripts/CombineAnnotations.pl", geneanno = NGS_PIPELINE + "/scripts/GeneAnnotation.pl", - filter = NGS_PIPELINE + "/scripts/filterVariants.v1.pl", - coding = NGS_PIPELINE + "/scripts/ProteinCoding.pl", + coding = NGS_PIPELINE + "/scripts/ProteinCodingRare.pl", blacklisted = config["annovar_data"]+config["blacklisted"], + whitelisted = config["annovar_data"]+config["whitelisted"], ACMG = config["annovar_data"]+config['ACMG'] output: filtered="{base}/AnnotationInput.coding.rare.txt", @@ -286,7 +286,7 @@ echo "{wildcards.base}/AnnotationInput perl {input.convertor} {wildcards.base}/list >{output.all}.tmp perl {input.geneanno} {input.ACMG} {output.all}.tmp >{output.all} - perl {input.coding} {wildcards.base}/AnnotationInput.annotations.final.txt | perl {input.filter} - {input.blacklisted} 0.05 >{output.all}.tmp + perl {input.coding} {input.blacklisted} {input.whitelisted} {wildcards.base}/AnnotationInput.annotations.final.txt 0.05 >{output.all}.tmp grep -P "Chr\\tStart\\tEnd\\tRef\\tAlt" {output.all}.tmp >{output.filtered} grep -v -P "Chr\\tStart\\tEnd\\tRef\\tAlt" {output.all}.tmp >>{output.filtered} rm -rf {output.all}.tmp {wildcards.base}/list diff --git a/ruleBook/bam2mpg.snakefile b/ruleBook/bam2mpg.snakefile index 3a18a1e..704509d 100644 --- a/ruleBook/bam2mpg.snakefile +++ b/ruleBook/bam2mpg.snakefile @@ -25,7 +25,7 @@ rule Bam2MPG: fi module load bam2mpg/{version} - module load vcftools/{params.vcftools} + module load vcftools/{params.vcftools} {params.samtools} for CHR in `seq 1 22` X Y; do echo "bam2mpg --shorten --vcf_spec --qual_filter 20 -bam_filter '-q31' --region chr${{CHR}} --only_nonref --snv_vcf ${{LOCAL}}/chr${{CHR}}{wildcards.sample}.snps.vcf --div_vcf ${{LOCAL}}/chr${{CHR}}{wildcards.sample}.indel.vcf {input.ref} {input.bam}" diff --git a/ruleBook/rnaseq_pipeline.snakefile b/ruleBook/rnaseq_pipeline.snakefile index 5364233..cb6eed0 100644 --- a/ruleBook/rnaseq_pipeline.snakefile +++ b/ruleBook/rnaseq_pipeline.snakefile @@ -19,6 +19,7 @@ for subject in config['RNASeq'].keys(): RNA_QC_ALL +=[subject+"/"+TIME+"/qc/"+subject+".RnaSeqQC.txt"] ALL_QC +=[subject+"/"+TIME+"/qc/"+subject+".transcriptCoverage.png"] for sample in config['RNASeq'][subject]: + ALL_QC += [subject+"/"+TIME+"/qc/"+subject+".circos.png"] ALL_QC += [subject+"/"+TIME+"/"+sample+"/HLA/seq2HLA/"+sample+"-ClassI.HLAgenotype4digits"] ALL_QC += [subject+"/"+TIME+"/"+sample+"/HLA/HLAminer/HLAminer_HPTASR.csv"] ALL_QC += [subject+"/"+TIME+"/"+sample+"/HLA/"+sample+".Calls.txt"] @@ -38,12 +39,12 @@ for subject in config['RNASeq'].keys(): ALL_QC += [subject+"/"+TIME+"/"+sample+"/qc/"+sample+".star.gt"] ALL_QC += [subject+"/"+TIME+"/"+sample+"/fusion/"+sample+".actionable.fusion.txt"] add_to_SUBJECT_ANNO(subject, "rnaseq", [subject+"/"+TIME+"/"+sample+"/calls/"+sample+".HC_RNASeq.annotated.txt"]) - EXPRESSION += [subject+"/"+TIME+"/"+sample+"/exonExp_UCSC/"+sample+".exonExpression.UCSC.txt"] - EXPRESSION += [subject+"/"+TIME+"/"+sample+"/exonExp_ENS/"+sample+".exonExpression.ENS.txt"] + #EXPRESSION += [subject+"/"+TIME+"/"+sample+"/exonExp_UCSC/"+sample+".exonExpression.UCSC.txt"] + #EXPRESSION += [subject+"/"+TIME+"/"+sample+"/exonExp_ENS/"+sample+".exonExpression.ENS.txt"] for gtf in config['GTF']: - EXPRESSION += [subject+"/"+TIME+"/"+sample+"/cufflinks_"+gtf+"/genes.fpkm_tracking_log2"] + #EXPRESSION += [subject+"/"+TIME+"/"+sample+"/cufflinks_"+gtf+"/genes.fpkm_tracking_log2"] EXPRESSION += [subject+"/"+TIME+"/"+sample+"/TPM_"+gtf+"/"+sample+"_counts.Gene.txt"] - ALL_QC += [subject+"/"+TIME+"/"+subject+"/db/matrixInput_"+sample+"_"+gtf] + #ALL_QC += [subject+"/"+TIME+"/"+subject+"/db/matrixInput_"+sample+"_"+gtf] RNA_CALLS += ["{subject}/{TIME}/{sample}/calls/{sample}.HC_RNASeq.raw.vcf".format(TIME=TIME, subject=SUB2RNA[s], sample=s) for s in config['RNASeq'][subject]] SUB_FUSION[subject] = ["{subject}/{TIME}/{sample}/fusion/{sample}.actionable.fusion.txt".format(TIME=TIME, subject=SUB2RNA[s], sample=s) for s in config['RNASeq'][subject]] SUB_QC[subject] = ["{subject}/{TIME}/{sample}/qc/{sample}.RnaSeqQC.txt".format(TIME=TIME, subject=SUB2RNA[s], sample=s) for s in config['RNASeq'][subject]] diff --git a/scripts/ProteinCodingRare.pl b/scripts/ProteinCodingRare.pl new file mode 100755 index 0000000..c4c694a --- /dev/null +++ b/scripts/ProteinCodingRare.pl @@ -0,0 +1,62 @@ +#!/usr/bin/perl +use strict; +use warnings; +use 5.010; +local $SIG{__WARN__} = sub {my $message =shift; die $message;}; + +# Author: Rajesh Patidar rajbtpatidar@gmail.com +# This scripts take 4 inputs (in the order listed) +# 1 a list of variants (chr\tstart\tend\ref\talt\totherInfo) which should be filtered out from the +# 2 a bed file of regions to be excluded from filtering out +# 3 file to work on, must be annotated using khanlab annotation-pipeline/ngs-pipeline +# 4 MAF filter (0.05) This remove any variant if present in any manoj/minor 1000g/ES/ExAC population. +# +# +open(BLACK, $ARGV[0]); +open(WHITE, $ARGV[1]); +open(IN, $ARGV[2]); +my $freq = $ARGV[3]; +my %REMOVE; +while(){ + chomp; + my @t =split("\t", $_); + $REMOVE{"$t[0]\t$t[1]\t$t[2]\t$t[3]\t$t[4]"} ="$t[5]"; +} +close BLACK; +my %KEEP; +while(){ + chomp; + my @t =split("\t", $_); + $KEEP{"$t[0]\t$t[1]\t$t[2]"} ="$t[3]"; +} +while(){ + chomp; + if($_ =~ /^Chr/){ + print "$_\n"; + next; + } + my @a = split ("\t", $_); + if (exists $REMOVE{"$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[4]"}){ + next; + } + foreach my $key (keys %KEEP){ + my @region= split("\t", $key); + if($a[0] =~ /^$region[0]$/ and $a[1] >=$region[1] and $a[1] <= $region[2]){ + print "$_\n"; + next; + } + } + #if(exists $KEEP{"$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[4]"}){ + # print "$_\n"; + # next; + #} + if($a[5] =~ /^exonic/ and ($a[8] =~ /nonsynonymous/ or $a[8] =~ /stop/ or $a[8] =~ /frameshift/) or $a[5] =~ /^splicing/){ + for (@a[12..28]){ + s/^\.$/-2/; + } + if( $a[12] <=$freq and $a[13] <=$freq and $a[14] <=$freq and $a[15] <=$freq and $a[16] <=$freq and $a[17] <=$freq and $a[18] <=$freq and $a[19] <=$freq and $a[20] <=$freq and $a[21] <=$freq and $a[22] <=$freq and $a[23] <=$freq and $a[24] <=$freq and $a[25] <=$freq and $a[26] <=$freq and $a[27] <=$freq and $a[28] <=$freq){ + print "$_\n"; + } + } +} +close IN; diff --git a/scripts/tsv2html.final.sh b/scripts/tsv2html.final.sh index a8e00fe..d4b92d1 100755 --- a/scripts/tsv2html.final.sh +++ b/scripts/tsv2html.final.sh @@ -13,30 +13,11 @@ Options: -d Specify delimiter to look for, instead of "\t". + --version Pipeline version. --head Treat first line as header, enclosing in and tags. - --foot Treat last line as footer, enclosing in and tags. - --Name To include in Mail - --Diagnosis To include in the mail body -Examples: - - 1. $(baselocation $0) --Name NCI00001 input.csv - - Above will parse file 'input.csv' with comma as the field separator and - output HTML tables to STDOUT. - - 2. $(baselocation $0) -d '|' < input.psv > output.htm - - Above will parse file "input.psv", looking for the pipe character as the - delimiter, then output results to "output.htm". - - 3. $(baselocation $0) -d '\t' --head --foot < input.tsv > output.htm - - Above will parse file "input.tsv", looking for tab as the delimiter, then - process first and last lines as header/footer (that contain data labels), then - write output to "output.htm". - + EOF } @@ -46,6 +27,10 @@ while true; do shift d="$1" ;; + --version) + shift + version="$1" + ;; --foot) foot="-v ftr=1" ;; @@ -121,7 +106,7 @@ echo "

Hello,

- ngs-pipeline finished successfully on $host

+ ngs-pipeline version $version finished successfully on $host

Subject(s) Processed: