Skip to content

Commit

Permalink
update provenance format (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
dfornika authored Jun 18, 2024
1 parent 36d8db1 commit 2f2f087
Show file tree
Hide file tree
Showing 12 changed files with 366 additions and 281 deletions.
112 changes: 63 additions & 49 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,65 +1,71 @@
#!/usr/bin/env nextflow

import java.time.LocalDateTime

nextflow.enable.dsl = 2

include { hash_files as hash_files_fastq } from './modules/hash_files.nf'
include { hash_files as hash_files_assemblies } from './modules/hash_files.nf'
include { trim_reads } from './modules/fastp.nf'
include { fastp_json_to_csv } from './modules/fastp.nf'
include { unicycler } from './modules/unicycler.nf'
include { mash_screen } from './modules/mash_screen.nf'
include { quast } from './modules/quast.nf'
include { parse_quast_report } from './modules/quast.nf'
include { mob_recon } from './modules/mob_recon.nf'
include { abricate } from './modules/abricate.nf'
include { join_mob_typer_and_abricate_reports } from './modules/join_reports.nf'
include { select_resistance_chromosomes } from './modules/join_reports.nf'
include { select_resistance_contigs } from './modules/select_resistance_contigs.nf'
include { select_resistance_reconstructions } from './modules/select_resistance_reconstructions.nf'
include { choose_reference_plasmids } from './modules/choose_reference_plasmids.nf'
include { get_reference_plasmid } from './modules/get_reference_plasmid.nf'
include { align_reads_to_reference_plasmid } from './modules/align_reads_to_reference_plasmid.nf'
include { calculate_coverage } from './modules/calculate_coverage.nf'
include { call_snps } from './modules/freebayes.nf'
include { hash_files as hash_files_fastq } from './modules/hash_files.nf'
include { hash_files as hash_files_assemblies } from './modules/hash_files.nf'
include { trim_reads } from './modules/fastp.nf'
include { fastp_json_to_csv } from './modules/fastp.nf'
include { unicycler } from './modules/unicycler.nf'
include { mash_screen } from './modules/mash_screen.nf'
include { quast } from './modules/quast.nf'
include { parse_quast_report } from './modules/quast.nf'
include { mob_recon } from './modules/mob_recon.nf'
include { abricate } from './modules/abricate.nf'
include { join_mob_typer_and_abricate_reports } from './modules/join_reports.nf'
include { select_resistance_chromosomes } from './modules/join_reports.nf'
include { select_resistance_contigs } from './modules/select_resistance_contigs.nf'
include { select_resistance_reconstructions } from './modules/select_resistance_reconstructions.nf'
include { choose_reference_plasmids } from './modules/choose_reference_plasmids.nf'
include { get_reference_plasmid } from './modules/get_reference_plasmid.nf'
include { align_reads_to_reference_plasmid } from './modules/align_reads_to_reference_plasmid.nf'
include { calculate_coverage } from './modules/calculate_coverage.nf'
include { call_snps } from './modules/freebayes.nf'
include { join_resistance_plasmid_and_snp_reports } from './modules/join_reports.nf'
include { concatenate_resistance_reports } from './modules/join_reports.nf'
include { collect_provenance } from './modules/provenance.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { concatenate_resistance_reports } from './modules/join_reports.nf'
include { collect_provenance } from './modules/provenance.nf'
include { pipeline_provenance } from './modules/provenance.nf'

workflow {
ch_start_time = Channel.of(LocalDateTime.now())
ch_pipeline_name = Channel.of(workflow.manifest.name)
ch_pipeline_version = Channel.of(workflow.manifest.version)

ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time))
ch_workflow_metadata = Channel.value([
workflow.sessionId,
workflow.runName,
workflow.manifest.name,
workflow.manifest.version,
workflow.start,
])

if (params.samplesheet_input != 'NO_FILE') {
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }
} else {
ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
}
ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)

if (params.samplesheet_input != 'NO_FILE') {
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }
} else {
ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
}

ch_mob_db = Channel.fromPath(params.mob_db)
ch_mob_db = Channel.fromPath(params.mob_db)

main:

ch_provenance = ch_fastq.map{ it -> it[0] }

main:
hash_files_fastq(ch_fastq.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq_input")))

trim_reads(ch_fastq)

fastp_json_to_csv(trim_reads.out.json)

if (params.pre_assembled) {
if (params.samplesheet_input != 'NO_FILE') {
ch_assemblies = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['ASSEMBLY']] }
} else {
ch_assemblies = Channel.fromPath( params.assembly_search_path ).map{ it -> [it.baseName.split('_')[0], it] }.unique{ it -> it[0] }
}
hash_files_assemblies(ch_assemblies.map{ it -> [it[0], [it[1]]] }.combine(Channel.of("assembly_input")))
if (params.samplesheet_input != 'NO_FILE') {
ch_assemblies = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['ASSEMBLY']] }
} else {
ch_assemblies = Channel.fromPath( params.assembly_search_path ).map{ it -> [it.baseName.split('_')[0], it] }.unique{ it -> it[0] }
}
hash_files_assemblies(ch_assemblies.map{ it -> [it[0], [it[1]]] }.combine(Channel.of("assembly_input")))
} else {
unicycler(trim_reads.out.reads)
ch_assemblies = unicycler.out.assembly
unicycler(trim_reads.out.reads)
ch_assemblies = unicycler.out.assembly
}

quast(ch_assemblies)
Expand Down Expand Up @@ -93,17 +99,25 @@ workflow {
concatenate_resistance_reports(select_resistance_chromosomes.out.join(join_resistance_plasmid_and_snp_reports.out, remainder: true).groupTuple().map{ it -> [it[0], (it[1] - null) + (it[2] - null)]}.map{ it -> [it[0]] + [it[1].unique{ path -> path.getFileName() }] })


ch_provenance = mob_recon.out.provenance
ch_provenance = ch_provenance.join(abricate.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] }
ch_provenance = ch_provenance.join(trim_reads.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
//
// Provenance collection processes
// The basic idea is to build up a channel with the following structure:
// [sample_id, [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...]]
// ...and then concatenate them all together in the 'collect_provenance' process.
ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map{ it -> [it[0], [it[1]]] }
ch_provenance = ch_provenance.join(hash_files_fastq.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
if (params.pre_assembled) {
ch_provenance = ch_provenance.join(hash_files_assemblies.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(hash_files_assemblies.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
} else {
ch_provenance = ch_provenance.join(unicycler.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
}
ch_provenance = ch_provenance.join(trim_reads.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(quast.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(mash_screen.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(mob_recon.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(abricate.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(align_reads_to_reference_plasmid.out.provenance, remainder: true).map{ it -> it.collect{ x -> x ? x : [] }}.map{ it -> [it[0], it[1] << it[2]] }.groupTuple().map{ it -> [it[0], it[1].flatten()] }
ch_provenance = ch_provenance.join(call_snps.out.provenance, remainder: true).map{ it -> it.collect{ x -> x ? x : [] }}.map{ it -> [it[0], it[1] << it[2]] }.groupTuple().map{ it -> [it[0], it[1].flatten()] }
ch_provenance = ch_provenance.join(quast.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.join(ch_fastq.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] }
ch_provenance = ch_provenance.map{ [it[0]] + [it[1].unique{ path -> path.getFileName() }] }
collect_provenance(ch_provenance)
}
33 changes: 20 additions & 13 deletions modules/abricate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,30 @@ process abricate {

tag { sample_id }

publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", pattern: "${sample_id}_abricate.tsv", mode: 'copy'

cpus 1
publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_abricate.tsv", mode: 'copy'

input:
tuple val(sample_id), path(assemblies)
tuple val(sample_id), path(assemblies)

output:
tuple val(sample_id), path("${sample_id}_abricate.tsv"), emit: report
tuple val(sample_id), path("${sample_id}*_provenance.yml"), emit: provenance
tuple val(sample_id), path("${sample_id}_abricate.tsv"), emit: report
tuple val(sample_id), path("${sample_id}*_provenance.yml"), emit: provenance

script:
"""
printf -- "- process_name: abricate\\n" > ${sample_id}_abricate_provenance.yml
printf -- " tool_name: abricate\\n tool_version: \$(abricate --version | cut -d ' ' -f 2)\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " - parameter: db\\n value: ncbi\\n" >> ${sample_id}_abricate_provenance.yml
abricate --db ncbi ${assemblies} > ${sample_id}_abricate.tsv
"""
db = "ncbi"
"""
printf -- "- process_name: abricate\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " tools:\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " - tool_name: abricate\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " tool_version: \$(abricate --version | cut -d ' ' -f 2)\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " - parameter: db\\n" >> ${sample_id}_abricate_provenance.yml
printf -- " value: ${db}\n" >> ${sample_id}_abricate_provenance.yml
abricate \
--threads ${task.cpus} \
--db ${db} \
--nopath \
${assemblies} > ${sample_id}_abricate.tsv
"""
}
74 changes: 39 additions & 35 deletions modules/align_reads_to_reference_plasmid.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,51 @@ process align_reads_to_reference_plasmid {

tag { sample_id + " / " + plasmid_id + " / " + resistance_gene }

publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", pattern: "${sample_id}_${plasmid_id}.sorted{.bam,.bam.bai}", mode: 'copy'
publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", pattern: "${reference_plasmid}", mode: 'copy'
publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_${plasmid_id}.sorted{.bam,.bam.bai}", mode: 'copy'
publishDir "${params.outdir}/${sample_id}", pattern: "${reference_plasmid}", mode: 'copy'

input:
tuple val(sample_id), path(reads_r1), path(reads_r2), val(plasmid_id), val(resistance_gene), path(reference_plasmid)
tuple val(sample_id), path(reads_r1), path(reads_r2), val(plasmid_id), val(resistance_gene), path(reference_plasmid)

output:
tuple val(sample_id), val(plasmid_id), val(resistance_gene), path(reference_plasmid), path("${sample_id}_${plasmid_id}.sorted.bam"), path("${sample_id}_${plasmid_id}.sorted.bam.bai"), emit: alignment
tuple val(sample_id), val(plasmid_id), path("${sample_id}_${plasmid_id}_coverage.csv"), emit: coverage
tuple val(sample_id), path("${sample_id}*_provenance.yml"), emit: provenance
tuple val(sample_id), val(plasmid_id), val(resistance_gene), path(reference_plasmid), path("${sample_id}_${plasmid_id}.sorted.bam"), path("${sample_id}_${plasmid_id}.sorted.bam.bai"), emit: alignment
tuple val(sample_id), val(plasmid_id), path("${sample_id}_${plasmid_id}_coverage.csv"), emit: coverage
tuple val(sample_id), path("${sample_id}*_provenance.yml"), emit: provenance

script:
// SAM flag 1540 = 'read unmapped' + 'read fails quality checks' + 'read is optical duplicate'
"""
printf -- "- process_name: align_reads_to_reference_plasmid\\n" > ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " process_tags:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " ref_plasmid_id: ${plasmid_id}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " resistance_gene: ${resistance_gene}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " tool_name: bwa\\n tool_version: \$(bwa 2>&1 | grep 'Version' | cut -d ' ' -f 2)\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " subcommand: mem\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: alignment_algorithm\\n value: mem\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: output_all_alignments\\n value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: use_soft_clipping_for_supplementary_alignments\\n value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: mark_shorter_split_hits_as_secondary\\n value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- "- process_name: align_reads_to_reference_plasmid\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " process_tags:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " ref_plasmid_id: ${plasmid_id}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " resistance_gene: ${resistance_gene}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " tool_name: samtools\\n tool_version: \$(samtools --version | grep 'samtools' | cut -d ' ' -f 2)\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " subcommand: view\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: exclude_flags\\n value: 1540\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
samtools faidx ${reference_plasmid}
bwa index ${reference_plasmid}
bwa mem -a -Y -M -t ${task.cpus} ${reference_plasmid} ${reads_r1} ${reads_r2} | \
samtools view -@ ${task.cpus} -h --exclude-flags 1540 | \
samtools sort -@ ${task.cpus} -n | \
samtools fixmate -@ ${task.cpus} -m - - | \
samtools sort -@ ${task.cpus} | \
// SAM flag 1540 = 'read unmapped' + 'read fails quality checks' + 'read is optical duplicate'
"""
printf -- "- process_name: align_reads_to_reference_plasmid\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " process_tags:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " ref_plasmid_id: ${plasmid_id}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " resistance_gene: ${resistance_gene}\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " tools:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - tool_name: bwa\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " tool_version: \$(bwa 2>&1 | grep 'Version' | cut -d ' ' -f 2)\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " subcommand: mem\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: output_all_alignments\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: use_soft_clipping_for_supplementary_alignments\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: mark_shorter_split_hits_as_secondary\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " value: true\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - tool_name: samtools\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " tool_version: \$(samtools --version | grep 'samtools' | cut -d ' ' -f 2)\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " subcommand: view\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " - parameter: exclude_flags\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
printf -- " value: 1540\\n" >> ${sample_id}_${plasmid_id}_${resistance_gene}_bwa_samtools_provenance.yml
samtools faidx ${reference_plasmid}
bwa index ${reference_plasmid}
bwa mem -a -Y -M -t ${task.cpus} ${reference_plasmid} ${reads_r1} ${reads_r2} | \
samtools view -h --exclude-flags 1540 | \
samtools sort -n | \
samtools fixmate -m - - | \
samtools sort | \
samtools markdup - - > ${sample_id}_${plasmid_id}.sorted.bam
samtools index ${sample_id}_${plasmid_id}.sorted.bam
Expand Down
Loading

0 comments on commit 2f2f087

Please sign in to comment.