-
Notifications
You must be signed in to change notification settings - Fork 3
/
vampirus.config
409 lines (355 loc) · 23.2 KB
/
vampirus.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
/*
=============================================================================================================================================================
Configuration File vAMPirus
=============================================================================================================================================================
vAMPirus
Author: Alex J. Veglia and Ramón Rivera-Vicéns
-------------------------------------------------------------------------------------------------------------------------------------------------------------
*/
params {
    // -------------------------- EDIT variables where needed -------------------------- //

    // Path to vAMPirus installation directory, will be filled automatically when startup script is run, otherwise, edit below
    vampdir="VAMPDIR"

    // Project specific information
    // Project name - Name that will be used as a prefix for naming files by vAMPirus
    projtag="vAMPirusAnalysis"
    // Path to metadata spreadsheet file to be used for plots
    metadata="/PATH/TO/vampirus_meta.csv"
    // Reads directory - must specify the path with "*R{1,2}*"; for single-end reads use /path/*.fastq so the sample name is properly read by Nextflow
    reads="/PATH/TO/reads/*_R{1,2}*"
    // Single-end data? Make single = true if this is the case.
    single = false
    // PATH to working directory of your choosing, will automatically be set to vAMPirus installation
    workingdir="VAMPDIR"
    // Name of directory created to store output of vAMPirus analyses (Nextflow will create this directory in the working directory)
    outdir="results"

    // Quality filter/trimming options
    // Average read quality - forward or reverse reads will be discarded if average base quality across the read is below the number set below (25 is a good start)
    avQ="25"
    // Maximum number of "N"s acceptable in the forward or reverse reads (default for fastp is 5)
    mN="5"
    // Minimum base quality to be trimmed
    trimq="15"

    // Primer Removal parameters
    // If not specifying primer sequences from paired-end data, forward and reverse reads will be trimmed by number of bases specified using "--gtrim #basesfromforward,#basesfromreverse". Use "trim" below for trimming single-end reads.
    gtrim=""
    // If not specifying primer sequences, single reads will be trimmed by number of bases specified below.
    trim=""
    // Specific primer sequence on forward reads to be removed. Also, if using single-end mode add the primer sequence here and leave rev="". NOTE - bbduk.sh, which is used to trim the primers, does not recognize Inosine (I) in the primer sequence; replace "I" with "N" in the sequence. It recognizes all other IUPAC degenerate base codes.
    fwd=""
    // Reverse primer sequence. Leave blank if using single-end mode. -- NOTE - bbduk.sh, which is used to trim the primers, does not recognize Inosine (I) in the primer sequence; replace "I" with "N" in the sequence. It recognizes all other IUPAC degenerate base codes.
    rev=""
    // Path to fasta file with primer sequences to remove (need to specify if using --multi option). You can use this with single-end as well. -- NOTE - bbduk.sh, which is used to trim the primers, does not recognize Inosine (I) in the primer sequence; replace "I" with "N" in the sequence. It recognizes all other IUPAC degenerate base codes.
    primers="/PATH/TO/PRIMERS.fasta"
    // Primer length (default 26) - If trimming primers with the --multi option or by specifying primer sequences above, change this to the length of the longer of the two primer sequences
    primerLength="26"
    // Maximum kmer length for primer removal (must be shorter than your primer length; default = 13)
    maxkmer="13"
    // Minimum kmer length for primer removal (default = 3)
    minkmer="3"
    // Minimum non-merged read length after adapter and primer removal.
    // NOTE(review): this comment previously stated "default = 200", but the value set below is 100 - confirm which is intended.
    minilen="100"

    // Merged read length filtering parameters
    // Minimum merged read length - reads with lengths greater than minLen and below the specified maximum read length will be used for counts only
    minLen="400"
    // Maximum merged read length - reads with length equal to the specified max read length will be used to generate uniques and ASVs (safe to set at expected amplicon size to start)
    maxLen="420"
    // Maximum expected error for vsearch merge command - vsearch discards sequences with more than the specified number of expected errors
    maxEE="3"
    // Maximum number of non-matching nucleotides allowed in overlap region
    diffs="10"
    // Maximum number of "N"s in a sequence - if above the specified value, sequence will be discarded (should be similar to what is set for "mN" above in fastp parameters)
    maxn="10"
    // Minimum length of overlap for sequence merging to occur for a pair
    minoverlap="10"

    // ASV generation parameters
    // Alpha value for denoising - the higher the alpha the higher the chance of false positives in ASV generation
    alpha="2"
    // Minimum size or representation in dataset for sequence to be considered in ASV generation (ex. If set to 4, any unique sequence that is not seen in the data more than 3 times is ignored). Default is 8 in USEARCH and VSEARCH; the lower the minimum size the higher the sensitivity.
    minSize="8"

    // ASV filtering parameters - You can set the filtering to run with the command --filter
    // Path to database containing sequences that, if ASVs match, are then removed prior to any analyses. Keep empty if only using a "keep" database.
    filtDB=""
    // Path to database containing sequences that, if ASVs match, are kept for the final ASV file to be used in subsequent analyses. Keep empty if only using a "filter" database.
    keepDB=""
    // Keep any sequences without hits - for yes, set keepnohit="true". All sequences without an alignment will be kept if no "keep" database is provided.
    keepnohit="true"
    // Parameters for diamond command for ASV filtering
    // Set minimum percent amino acid similarity for best hit to be counted during ASV filtering
    filtminID="80"
    // Set minimum amino acid alignment length for best hit to be counted during ASV filtering
    filtminaln="30"
    // Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
    filtsensitivity="ultra-sensitive"
    // Set the max e-value for best hit to be recorded
    filtevalue="0.001"

    // ASV clustering parameters
    // Percent similarity to cluster nucleotide ASV sequences (used when --ncASV is set)
    clusterNuclID="85"
    // List of percent similarities to cluster nucleotide ASV sequences - must be separated by a comma (ex. "95,96")
    clusterNuclIDlist=""
    // Default percent similarity to cluster aminoacid sequences
    clusterAAID="97"
    // List of percent similarities to cluster aminoacid sequences - must be separated by a comma (ex. "95,96")
    clusterAAIDlist=""
    // Minimum length of amino acid translation to be considered during protein clustered ASV (pcASV) generation. Recommended to put this at the expected amino acid sequence length based on your maximum read length (e.g. if maxLen="420", then minAA should be 420/3 so 140)
    minAA="140"

    // Counts table generation parameters
    // Use --search_exact algorithm in vsearch to generate ASV counts tables. Change to "true" below or use --exact in the launch command.
    exact = true
    // If not using --search_exact (exact = false above), you will use --usearch_global. Set the minimum percent ID (97% = ".97") to count as a hit in counts table generation. .97 is default for USEARCH and VSEARCH even when using ASVs to make up for potential PCR errors.
    id=".97"
    // Minimum length of query read to be used in ASV/ncASV counts table generation with vsearch
    minLencount="400"

    // Parameters for protein counts table generation
    // Minimum Bitscore for counts
    ProtCountsBit="50"
    // Minimum aminoacid sequence similarity for hit to count
    ProtCountID="85"
    // Minimum alignment length for hit to count -- Be mindful that this length should be around $minLencount/3, so if minLencount="400" then ProtCountsLength should equal no more than 133 (400/3)
    ProtCountsLength="50"

    // Minimum Entropy Decomposition (MED) parameters for clustering (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/)
    // If you plan to do MED on ASVs using the option "--asvMED" you can set here the number of entropy peak positions, or the specific positions, for oligotyping to take into consideration.
    // Decomposition of sequences based on specific positions in sequences -- either a single position (asvC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (asvC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for asvC, it will override asvc.
    asvC=""
    // Decomposition of sequences based on the top "x" amount of sequence positions with the highest entropy values. So if asvc = 10 it will decompose based on positions with the top ten highest entropy values.
    asvc=""
    // If you plan to do MED using the option "--aminoMED" you can set here the number of positions for oligotyping to take into consideration.
    // Decomposition of sequences based on specific positions in sequences -- either a single position (aminoC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (aminoC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for aminoC, it will override aminoc.
    aminoC=""
    // Decomposition of sequences based on the top "x" amount of sequence positions with the highest entropy values. So if aminoc = 10 it will decompose based on positions with the top ten highest entropy values.
    aminoc=""

    // Sequence alignment options -- Using musclev5 you can decide if you would like to perform single replicate alignment or Ensemble alignment methods (read more here: https://drive5.com/muscle)
    // NOTE: if srep and ensemble below are either both true or both false, vAMPirus will default to doing single rep with the default muscle parameters
    // Single replicate alignment options
    // Set this = to "true" for single replicate sequence alignment with musclev5 -- if < 300 sequences, muscle will use MPC algorithm; > 300 sequences muscle will use Super5 algorithm
    srep="false"
    // Set guide tree permutation for muscle (default for muscle is none; other options include "abc, acb, bca")
    perm="none"
    // Set the perturbation seed "0, 1, 2 ..." (default for muscle is 0 = don't perturb)
    pert="0"
    // Ensemble alignment options
    // Set this = to "true" for Ensemble sequence alignment approach
    ensemble="true"
    // Set "stratified" or "diversified" in ensemble alignment command -- When extracting best alignment from ensemble, diversified input is recommended
    fied="diversified"
    // Number of replicates for ensemble alignment -- Default for stratified is 4; for diversified is 100
    N="100"

    // Phylogeny-based ASV/AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)
    // Add the "--asvTClust" option to the launch command to run phylogeny-based clustering of ASVs; add "--aminoTClust" to the launch command for phylogeny-based clustering on AminoTypes
    // NOTE: you can't use "--skipPhylogeny" when doing this form of sequence clustering
    // TreeCluster command options for ASV clustering (--asvTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options (a good start is what is below)
    asvTCopp="-t 0.045 -m max_clade -s 50"
    // TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options
    aminoTCopp="-t 0.045 -m max_clade -s 50"

    // Taxonomy inference parameters
    // Parameters for diamond command
    // Set which measurement to use for a minimum threshold in taxonomy inference - must be either "evalue" or "bitscore"
    measurement="bitscore"
    // Set maximum e-value for hits to be counted
    evalue="0.001"
    // Set minimum bitscore for best hit in taxonomy assignment (default = 50)
    bitscore="50"
    // Set minimum percent amino acid similarity for best hit to be counted in taxonomy assignment
    minID="40"
    // Set minimum amino acid alignment length for best hit to be counted in taxonomy assignment
    minaln="100"
    // Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
    sensitivity="ultra-sensitive"

    // Database information
    // Specify name of database to use for analysis
    dbname="DATABASENAME"
    // Path to directory where database is being stored - vAMPirus will look here to make sure the database with the name provided above is present and built
    dbdir="DATABASEDIR"
    // Set database type (NCBI or RVDB). Lets vAMPirus know which sequence header format is being used and must be set to NCBI when using RefSeq or Non-Redundant databases. -> dbtype="NCBI" to toggle use of RefSeq header format; set to "RVDB" to signal the use of Reference Viral DataBase (RVDB) headers (see manual)
    dbtype="TYPE"

    // Classification settings - if planning on inferring LCA from RVDB annotation files OR using NCBI taxonomy files, confirm options below are accurate.
    // Path to directory RVDB hmm annotation .txt file - see manual for information on this. Leave as is if not planning on using RVDB LCA.
    dbanno="DATABASEANNOT"
    // Set lca="T" if you would like to add "Least Common Ancestor" classifications to taxonomy results using information provided by RVDB annotation files (works when using NCBI or RVDB databases) - example: "ASV1, Viruses::Duplodnaviria::Heunggongvirae::Peploviricota::Herviviricetes::Herpesvirales::Herpesviridae::Gammaherpesvirinae::Macavirus"
    lca="LCA"
    // DIAMOND taxonomy inference using NCBI taxmap files (can be downloaded using the startup script using the option -t); set to "true" for this to run (ONLY WORKS WITH dbtype="NCBI")
    ncbitax="false"

    // Phylogeny analysis parameters
    // Color nodes on phylogenetic tree in Analyze report based on sequence ID (nodeCol="empty"), MED Group information (nodeCol="MED"), taxonomy hit (nodeCol="TAX"), or TreeCluster Group information (nodeCol="TC"). If you would like nodes colored by sequence ID, leave nodeCol="empty" below.
    nodeCol="empty" //Do not leave this parameter without a value, will cause error.
    // Custom options for IQ-TREE (Example: "-option1 A -option2 B -option3 C -option4 D")
    iqCustomnt=""
    iqCustomaa=""
    // The options below can also be set at the command line; for example, to use the model from ModelTest-NG with parametric bootstrapping: --ModelTnt --ModelTaa --parametric
    // Signal for IQ-TREE to use model determined by ModelTest-NG (Default is IQ-TREE will do automatic model testing with ModelFinder Plus) -- This sometimes causes errors; some models determined by ModelTest-NG are not found by IQ-TREE.
    ModelTnt=false
    ModelTaa=false
    // Choose best model from ModelTest-NG -- BIC, AIC, or AICc?
    crit="AIC"
    // Set to have non-parametric, parametric, or transfer bootstrap expectation (TBE; https://www.nature.com/articles/s41586-018-0043-0#Sec6) bootstrapping approach to be used by IQ-TREE. If all "false", by default parametric bootstrapping is done. TBE is good for a large dataset.
    parametric=false
    nonparametric=false
    tbe=false
    // Number of bootstraps (recommended 1000 for parametric and 100 for non-parametric)
    boots="1000"

    // Stats options
    // Tell vAMPirus to perform statistical analyses by setting "stats = true" below or in the launch command by adding "--stats" to it
    stats = false
    // Minimum number of hit counts for a sample to have to be included in the downstream statistical analyses and report generation
    minimumCounts="1000"
    // Maximum number of iterations performed by metaMDS
    trymax="900"

    /*
    // ---------------------------------------------------------------------- STOP ---------------------------------------------------------------------- //
    // -------------------------------------------------------- Do not modify variables below this line. -------------------------------------------------------- //
    // ----------------------------------------------------------- Proceed to modify processes at end ----------------------------------------------------------- //
    // ----------------------------------------------------------------- If needed ----------------------------------------------------------------- //
    */

    // Pipeline options
    help = false
    fullHelp = false

    // Mandatory arguments
    Analyze=false
    DataCheck=false

    // Non-Mandatory options
    // Cluster nucleotide sequences (ncASVs)
    ncASV = false
    // Cluster by aminoacid translations and generate protein-based OTUs (pcASVs)
    pcASV = false
    // Generate virus types with MED of ASV sequences
    asvMED = false
    // Generate virus types with MED of amino acid sequences
    aminoMED = false
    // Filter ASVs
    filter = false
    // Phylogeny-based ASV clustering
    asvTClust = false
    // Phylogeny-based AminoType clustering
    aminoTClust = false

    // Skip options
    // Skip all Read Processing steps
    skipReadProcessing = false
    // Skip quality control processes only
    skipFastQC = false
    // Skip adapter removal process only
    skipAdapterRemoval = false
    // Skip primer removal process only
    skipPrimerRemoval = false
    // Skip AminoTyping
    skipAminoTyping = false
    // Skip Taxonomy
    skipTaxonomy = false
    // Skip phylogeny
    skipPhylogeny = false
    // Skip EMBOSS analyses
    skipEMBOSS = false
    // Skip Reports
    skipReport = false
    // Skip Merging steps -> will also skip all read Processing
    skipMerging = false

    // Data check parameters
    datacheckntIDlist=".55,.65,.75,.80,.81,.82,.83,.84,.85,.86,.87,.88,.89,.90,.91,.92,.93,.94,.95,.96,.97,.98,.99"
    datacheckaaIDlist=".55,.65,.75,.80,.81,.82,.83,.84,.85,.86,.87,.88,.89,.90,.91,.92,.93,.94,.95,.96,.97,.98,.99,1.0"
    // If not specifying primer sequences OR --gtrim, forward reads will be trimmed by number of bases specified below
    defaultFwdTrim="20"
    // If not specifying primer sequences OR --gtrim, reverse reads will be trimmed by number of bases specified below
    defaultRevTrim="26"
    // Option for multi-barcoding approach
    multi = false
    // Directory for pipeline info
    tracedir="PipelinePerformance"
    // Path to logo directory. Uses "example_data" so it matches the directory name the "test" profile
    // includes its config from ('example_data/conf/test.config'); the previous value "exampledata" did not.
    logo="${params.vampdir}/example_data/conf"
    // readTest
    readsTest = false

    // These options will change how the profiles work.
    // Run with conda installed by the precheck
    condaActivate = false
    // Cache directory for conda and singularity files. Leave blank if not sure
    envCacheDir = "${params.vampdir}/vAMPirusCondaEnvs"
    // Singularity
    // Use singularity image created after pulling from docker and not from Galaxy depot (singularity image ready to use).
    singularity_pull_docker_container = false
    sing = false
}
/*
// ------------------------- Process variables below ------------------------- //
Proceed to modify processes if needed. Choose the scheduler and options:
Executor = SLURM, PBS, etc.
Cluster Options = Partition, Nodes, Priority, Email, etc.
If running locally leave the comments (the "//") on "executor" and "clusterOptions".
For more info see the README and/or Nextflow documentation.
*/
process {
    // Generic setting: error strategy used by every process
    errorStrategy = 'finish'

    // Resources for processes labelled "low_cpus"
    withLabel: low_cpus {
        cpus   = '1'
        memory = '2 GB'
        //executor='slurm'
        //clusterOptions='--cluster=cm2 --partition=cm2_tiny --qos=cm2_tiny --nodes=1'
    }

    // Resources for processes labelled "norm_cpus"
    withLabel: norm_cpus {
        cpus   = '2'
        memory = '2 GB'
        //executor='slurm'
        //clusterOptions='--cluster=cm2 --partition=cm2_tiny --qos=cm2_tiny --nodes=1'
    }

    // Resources for processes labelled "high_cpus"
    withLabel: high_cpus {
        cpus   = '2'
        memory = '2 GB'
        //executor='slurm'
        //clusterOptions='--cluster=cm2 --partition=cm2_tiny --qos=cm2_tiny --nodes=1'
    }
}
// Environment variables (Nextflow only): location of the tools shipped with vAMPirus
env {
    tools = "${params.vampdir}/bin/"
}
// Shell command and flags used for every process
process.shell = ['/bin/bash', '-euo', 'pipefail']
// Timeline output, written under <workingdir>/<outdir>/<tracedir>
timeline {
    file      = "${params.workingdir}/${params.outdir}/${params.tracedir}/vampirus_timeline.html"
    enabled   = true
    overwrite = true
}
// Report output, written under <workingdir>/<outdir>/<tracedir>
report {
    file      = "${params.workingdir}/${params.outdir}/${params.tracedir}/vampirus_report.html"
    enabled   = true
    overwrite = true
}
// Trace output, written under <workingdir>/<outdir>/<tracedir>
trace {
    file      = "${params.workingdir}/${params.outdir}/${params.tracedir}/vampirus_trace.txt"
    enabled   = true
    overwrite = true
}
// DAG output, written under <workingdir>/<outdir>/<tracedir>
dag {
    file      = "${params.workingdir}/${params.outdir}/${params.tracedir}/vampirus_dag.html"
    enabled   = true
    overwrite = true
}
profiles {
    // Conda profile: enables conda and flags it through params.condaActivate
    conda {
        params.condaActivate = true
        conda.enabled = true
        // cache for condaEnv created individually
        conda.cacheDir = "${params.vampdir}/vAMPirusCondaEnvs/"
    }

    // Docker profile: enables docker and sets the run options below
    docker {
        docker.runOptions = '-u \$(id -u):\$(id -g)'
        docker.enabled = true
    }

    // Singularity profile: enables singularity and flags it through params.sing
    singularity {
        params.sing = true
        singularity.enabled = true
        singularity.autoMounts = true
        // cache for images from docker pull
        singularity.cacheDir = "${params.vampdir}/singularityCache/"
    }

    // Podman profile
    podman {
        podman.enabled = true
    }

    // Test profile: settings come from the included test config
    test {
        includeConfig 'example_data/conf/test.config'
    }
}
// Pipeline metadata
manifest {
    name            = 'vAMPirus'
    version         = '2.0.2'
    description     = 'Automated virus amplicon sequencing analysis program'
    author          = 'Alex J. Veglia,Ramón Rivera-Vicéns'
    mainScript      = 'vAMPirus.nf'
    nextflowVersion = '>=21.04.1'
}