-
Notifications
You must be signed in to change notification settings - Fork 1
/
setup.sh
executable file
·348 lines (267 loc) · 9.11 KB
/
setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#!/usr/bin/env bash

# Physical (symlink-resolved) directory containing this script. The lookup
# runs inside a command substitution, so the caller's working directory is
# not changed. The value is empty when the directory cannot be entered.
SCRIPT_PATH=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)

# Task names accepted by the -t option. Note that this is an array holding a
# single, space-separated string; the rest of the script relies on that shape.
readonly VALID_TASKS=(
  "all db_install db_setup tools_install tools_setup fusioncatcherdb ref example install_gatk install_gatk4 install_annovar"
)
# Join all arguments after the first into one string and print it.
# Arguments: $1 - separator (only its first character is used, because the
#                 shell joins "$*" with the first character of IFS)
#            $2.. - the items to join
# Outputs:   the joined string on stdout
function join_by {
  local IFS="$1"
  shift
  echo "$*"
}
# Print the command line help and terminate the script with status 1.
# Globals:  VALID_TASKS (read) - list of accepted task names
# Outputs:  help text on stdout
function usage() {
  local task_list
  # Word splitting of the task string is intended here: the individual names
  # are re-joined with single spaces.
  # shellcheck disable=SC2086,SC2128
  task_list=$(join_by ' ' ${VALID_TASKS})

  printf '%s\n' \
    "usage: setup -t task" \
    " -t task specify task: ${task_list}" \
    " -h show this help screen" \
    " -m file Path to including filename of MSigDB hallmarks gene-set h.all.vX.X.entrez.gmt file" \
    " -a file Annovar archive annovar.latest.tar.gz"
  exit 1
}
# Parse the command line options.
#   -d value  stored read-only in PARAM_DIR_PATIENT
#   -t task   stored in PARAM_TASK
#   -m file   stored read-only in HALLMARKS
#   -a file   stored in ANNOVARS
#   -h        show the help screen (usage terminates the script)
#   -p        accepted by the option string but has no handler; it is rejected
#             as unimplemented
#
# The leading ':' switches getopts to silent error reporting. Only in that
# mode does getopts report a missing argument as ':' and place the offending
# option letter in OPTARG, which the two error arms below depend on.
while getopts ':d:t:m:a:ph' option; do
  case "${option}" in
    d) readonly PARAM_DIR_PATIENT=$OPTARG ;;
    t) PARAM_TASK=$OPTARG ;;
    m) readonly HALLMARKS=$OPTARG ;;
    a) ANNOVARS=$OPTARG ;;
    h) usage ;;
    \?)
      echo "Unknown option: -${OPTARG}" >&2
      exit 1
      ;;
    :)
      echo "Missing option argument for -${OPTARG}" >&2
      exit 1
      ;;
    *)
      # OPTARG is not set for argument-less options, so report the option
      # letter itself.
      echo "Unimplemented option: -${option}" >&2
      exit 1
      ;;
  esac
done
# Fall back to the complete setup when no task was requested via -t.
if [[ -z "${PARAM_TASK}" ]]; then
  PARAM_TASK='all'
fi

# Accept the task only if it is exactly one of the known names. Values that
# contain whitespace are rejected up front; otherwise a multi-word value such
# as "all db_install" would match as a substring of the task list.
if [[ "${PARAM_TASK}" == *[[:space:]]* || " ${VALID_TASKS[*]} " != *" ${PARAM_TASK} "* ]]; then
  echo "unknown task: ${PARAM_TASK}" >&2
  # Word splitting is intended: the names are re-joined with single spaces.
  # shellcheck disable=SC2048,SC2086
  echo "use one of the following values: $(join_by ' ' ${VALID_TASKS[*]})" >&2
  exit 1
fi

# wget performs most downloads below; stop early, with an explanation, when
# it is not installed.
if ! command -v wget > /dev/null 2>&1; then
  echo "wget is required but was not found in PATH" >&2
  exit 1
fi
# Directory layout, all relative to the location of this script. The
# constants are declared in dependency order: arguments of one readonly
# command are expanded before any of its assignments take effect, so a path
# must not be declared in the same command as the path it is derived from.
readonly \
  DIR_TOOLS="${SCRIPT_PATH}/tools" \
  DIR_DATABASES="${SCRIPT_PATH}/databases" \
  DIR_ASSETS="${SCRIPT_PATH}/assets"
readonly \
  DIR_INPUT="${DIR_ASSETS}/input" \
  DIR_REF="${DIR_ASSETS}/references"
readonly DIR_SEQUENCING="${DIR_REF}/sequencing"
# direct download of any file from gdrive
# https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/49444877#49444877
#
# The download takes two requests: the first one only stores the cookies it
# receives in a cookie jar next to this script; the last field of the jar
# line(s) matching "download" is then sent as the "confirm" parameter of the
# second request, which writes the file.
#
# Globals:   SCRIPT_PATH (read) - directory used for the temporary cookie jar
# Arguments: $1 - Google Drive file id
#            $2 - path of the file to write
# Returns:   exit status of the second (actual) download request
function curlgdrive() {
  if ! command -v curl > /dev/null 2>&1; then
    echo "curl is required but was not found in PATH" >&2
    exit 1
  fi

  local fileid="${1}"
  local filename="${2}"
  local cookiefile="${SCRIPT_PATH}/cookie-${fileid}"
  local confirm
  local status=0

  # first request: collect the cookie information only
  curl -c "${cookiefile}" -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null

  # The jar path is quoted so that a SCRIPT_PATH containing whitespace works.
  confirm=$(awk '/download/ {print $NF}' "${cookiefile}")

  # second request: download file using cookie information
  curl -L -b "${cookiefile}" \
    "https://drive.google.com/uc?export=download&confirm=${confirm}&id=${fileid}" \
    -o "${filename}" || status=$?

  # remove cookie
  rm -f "${cookiefile}"
  return "${status}"
}
# example
######################################################################################
# Fetch the example archives into the current working directory and unpack
# them into the sequencing reference and input directories. An archive is
# deleted only after it was unpacked successfully.
# Globals: DIR_SEQUENCING (read), DIR_INPUT (read)
function setup_example() {
  local -a file_ids=("1gcCmsqJpbMsLSLmRfo3Afc_aTJVX7ziK" "1YQLyUtkZALZ5Bv-MTvEJJOXAOT_R59Z7")
  local -a archives=("Capture_Regions.tar.gz" "data.tar.gz")
  local -a targets=("${DIR_SEQUENCING}" "${DIR_INPUT}")
  local idx

  echo "setting up example data"

  # all downloads happen first ...
  for idx in "${!archives[@]}"; do
    curlgdrive "${file_ids[idx]}" "${archives[idx]}"
  done

  # ... then every archive is unpacked into its target directory
  for idx in "${!archives[@]}"; do
    if tar -xzf "${archives[idx]}" -C "${targets[idx]}"; then
      rm -f "${archives[idx]}"
    fi
  done

  echo "done"
}
# REF
######################################################################################
# Fetch the reference archives into the current working directory and unpack
# them below the reference directory. An archive is deleted only after it was
# unpacked successfully.
# Globals: DIR_REF (read)
function setup_references() {
  local -a file_ids=(
    "1QZSkniYbI1cWWj8CA6-FS93ViiAn8z_G"
    "1rSC-IuRYhdVvulo2yrSkHSBgVAo4iRt0"
    "1w8PL_J6k0X96W6IkXkjOOi_VnsDaaw8U"
  )
  local -a archives=("chromosomes.tar.gz" "genome.tar.gz" "mappability.tar.gz")
  local -a targets=("${DIR_REF}" "${DIR_REF}" "${DIR_REF}/mappability")
  local idx

  echo "setting up reference data"

  # all downloads happen first ...
  for idx in "${!archives[@]}"; do
    curlgdrive "${file_ids[idx]}" "${archives[idx]}"
  done

  # ... then every archive is unpacked into its target directory
  for idx in "${!archives[@]}"; do
    if tar -xzf "${archives[idx]}" -C "${targets[idx]}"; then
      rm -f "${archives[idx]}"
    fi
  done

  echo "done"
}
# TOOLS
######################################################################################
# GATK 3 release that install_tool_gatk downloads.
version_GATK="3.8-1-0-gf15c1c3ef"
########
# GATK #
########
# Download the GATK release named by version_GATK and place its files in
# ${DIR_TOOLS}/gatk, so the installed path carries no version information.
# Globals: DIR_TOOLS (read), version_GATK (read)
# Returns: 1 when the download or the unpacking fails. The script exits with
#          status 1 when DIR_TOOLS cannot be entered.
function install_tool_gatk() {
  local archive="gatk.tar.bz2"

  echo "installing tool gatk"
  cd "${DIR_TOOLS}" || exit 1
  echo "fetching gatk"
  # download new version
  if ! wget "https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-${version_GATK}.tar.bz2" \
    -O "${archive}"; then
    echo "download of gatk ${version_GATK} failed" >&2
    rm -f "${archive}"
    return 1
  fi
  # unpack
  if ! tar xjf "${archive}"; then
    echo "could not unpack ${archive}" >&2
    return 1
  fi
  rm -f "${archive}"
  # rename folder and file (neglect version information)
  # mv needs an existing target directory; create it on a fresh checkout.
  mkdir -p gatk
  mv GenomeAnalysisTK*/* gatk/
  rm -rf GenomeAnalysisTK*
  echo "done"
}
#########
# GATK4 #
#########
# Download GATK 4.2.1.0 and place its files in ${DIR_TOOLS}/gatk4, so the
# installed path carries no version information.
# Globals: DIR_TOOLS (read)
# Returns: 1 when the download or the unpacking fails. The script exits with
#          status 1 when DIR_TOOLS cannot be entered.
function install_tool_gatk4() {
  local version="4.2.1.0"
  local archive="gatk4.zip"

  echo "installing tool gatk4"
  cd "${DIR_TOOLS}" || exit 1
  echo "fetching gatk"
  # download new version
  if ! wget "https://github.com/broadinstitute/gatk/releases/download/${version}/gatk-${version}.zip" \
    -O "${archive}"; then
    echo "download of gatk ${version} failed" >&2
    rm -f "${archive}"
    return 1
  fi
  # unpack
  if ! unzip -o "${archive}"; then
    echo "could not unpack ${archive}" >&2
    return 1
  fi
  rm -f "${archive}"
  # rename folder and file (neglect version information)
  # mv needs an existing target directory; create it on a fresh checkout.
  mkdir -p gatk4
  mv gatk-4*/* gatk4/
  rm -rf gatk-4*
  echo "done"
}
###########
# annovar #
###########
# Unpack annovar below ${DIR_TOOLS}. When no archive was supplied via -a, the
# download link is read from stdin and the archive is downloaded first; only
# such a downloaded archive is deleted after unpacking, an archive supplied
# by the user is left in place.
# Globals: DIR_TOOLS (read), ANNOVARS (read; set to the downloaded archive
#          name when it was empty)
# Returns: 1 when the download or unpacking fails or the unpacked "annovar"
#          directory is missing. The script exits with status 1 when
#          DIR_TOOLS cannot be entered.
# Note:    on success the working directory is ${DIR_TOOLS}/annovar.
function install_tool_annovar() {
  # Remember whether the archive was fetched here. Testing ANNOVARS for
  # emptiness after it has been assigned can never succeed, so it cannot be
  # used to decide about the cleanup.
  local downloaded=0
  local url_annovar

  echo "installing tool annovar"
  cd "${DIR_TOOLS}" || exit 1
  if [[ -z "${ANNOVARS}" ]]; then
    echo "please visit http://download.openbioinformatics.org/annovar_download_form.php to get the download link for annovar via an email"
    echo "enter annovar download link:"
    read -r url_annovar
    echo "fetching annovar"
    if ! wget "${url_annovar}" \
      -O annovar.tar.gz; then
      echo "download of annovar failed" >&2
      rm -f annovar.tar.gz
      return 1
    fi
    ANNOVARS="annovar.tar.gz"
    downloaded=1
  fi
  # unpack
  if ! tar -xzf "${ANNOVARS}"; then
    echo "could not unpack ${ANNOVARS}" >&2
    return 1
  fi
  if ((downloaded)); then
    rm -f "${ANNOVARS}"
  fi
  if ! cd annovar; then
    echo "unpacking ${ANNOVARS} did not create an annovar directory" >&2
    return 1
  fi
  echo "done"
}
# Download the hg19 annotation databases into humandb/ of the annovar
# installation by calling annovar's own annotate_variation.pl once per
# database.
# Globals: DIR_TOOLS (read)
# The script exits with status 1 when the annovar directory cannot be entered.
function setup_tool_annovar() {
  # Download proposed databases directly from ANNOVAR
  local -a databases=(
    refGene
    dbnsfp42a
    gnomad211_genome # only take gnomAD_genome, version 2.1.1
    avsnp150
    clinvar_20210501
    intervar_20180118
  )
  local database

  echo "setup tool annovar"
  echo "download databases"
  cd "${DIR_TOOLS}/annovar" || exit 1

  for database in "${databases[@]}"; do
    ./annotate_variation.pl -buildver hg19 -downdb -webfrom annovar "${database}" humandb/
  done

  echo "done"
}
# Download the split fusioncatcher database archive (human_v102), verify it
# against the published MD5 list, unpack it into
# ${DIR_TOOLS}/fusioncatcher/data and point the "current" symlink at it.
# Globals: DIR_TOOLS (read)
# The script exits with status 1 when the data directory cannot be entered or
# the checksum verification fails.
function setup_tool_fusioncatcher() {
  local base_url="http://sourceforge.net/projects/fusioncatcher/files/data"
  local release="human_v102"
  local file

  echo "setup tool fusioncatcher"
  echo "download database"
  mkdir -p "${DIR_TOOLS}/fusioncatcher/data"
  cd "${DIR_TOOLS}/fusioncatcher/data" || exit 1

  # NOTE(review): certificate checks are disabled for these downloads;
  # integrity relies solely on the MD5 list fetched the same way - confirm
  # that this is acceptable.
  # Individual download failures are not checked here: a missing or broken
  # part makes the checksum verification below fail.
  for file in \
    "${release}.tar.gz.aa" \
    "${release}.tar.gz.ab" \
    "${release}.tar.gz.ac" \
    "${release}.tar.gz.ad" \
    "${release}.md5"; do
    wget --no-check-certificate "${base_url}/${file}" -O "${file}"
  done

  if ! md5sum -c "${release}.md5"; then
    echo -e "\n\n\n\033[33;7m ERROR: The downloaded files from above have errors! MD5 checksums do not match! Please, download them again or re-run this script again! \033[0m\n" >&2
    exit 1
  fi

  # reassemble the archive from its parts, then drop the parts
  cat "${release}".tar.gz.* > "${release}.tar.gz"
  rm -f "${release}".tar.gz.*
  tar -xzf "${release}.tar.gz"
  # -f/-n replace an existing "current" link. Without them a re-run would
  # follow the old link and create a stray link inside its target directory.
  ln -sfn "${release}" current
  rm -f "${release}.tar.gz" "${release}.md5"
  echo "done"
}
# databases
######################################################################################
# Download the external database files into ${DIR_DATABASES}.
# Every download names its output file explicitly, so running the function
# again overwrites the files instead of leaving numbered copies next to them.
# Globals: DIR_DATABASES (read)
# The script exits with status 1 when DIR_DATABASES cannot be entered.
function install_databases() {
  local dbsnp_url="https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/All_20170710.vcf.gz"

  echo "installing databases"
  cd "${DIR_DATABASES}" || exit 1

  # dbSNP (VCF and its index); the target directory has to exist for wget -O
  mkdir -p dbSNP
  wget "${dbsnp_url}" -O "dbSNP/snp150hg19.vcf.gz"
  wget "${dbsnp_url}.tbi" -O "dbSNP/snp150hg19.vcf.gz.tbi"

  # Cancer Hotspots
  wget http://www.cancerhotspots.org/files/hotspots_v2.xls -O hotspots_v2.xls

  # DGIdb
  wget https://www.dgidb.org/data/monthly_tsvs/2020-Oct/interactions.tsv -O DGIdb_interactions.tsv

  # MSIseq db
  wget http://steverozen.net/data/Hg19repeats.rda -O Hg19repeats.rda

  echo "done"
}
# Run geneset_generation.R from the databases directory on the MSigDB
# hallmarks gene-set file given via -m.
# Globals: HALLMARKS (read), DIR_DATABASES (read), BIN_RSCRIPT (written)
# Outputs: progress messages and the hallmarks path on stdout, errors on
#          stderr
# The script exits with status 1 when no hallmarks file was given, when
# DIR_DATABASES cannot be entered or when Rscript is not available.
function setup_databases() {
  echo "setup databases"
  if [[ -z "${HALLMARKS}" ]]; then
    echo "no hallmarks file provided! Please provide the h.all.vX.X.entrez.gmt file from MSigDB." >&2
    exit 1
  fi
  # show which hallmarks file is going to be processed
  echo "${HALLMARKS}"
  cd "${DIR_DATABASES}" || exit 1
  BIN_RSCRIPT=$(command -v Rscript)
  if [[ -z "${BIN_RSCRIPT}" ]]; then
    echo "Rscript needs to be available and in PATH in order to install the databases" >&2
    exit 1
  fi
  ## R Code for processing
  "${BIN_RSCRIPT}" --vanilla geneset_generation.R "${HALLMARKS}"
  echo "done"
}
# Translate the requested task into the list of functions to run, then run
# them in order. Every value without an arm of its own (in particular "all")
# selects the complete setup.
declare -a steps
case "${PARAM_TASK}" in
  tools_install) steps=(install_tool_gatk install_tool_gatk4 install_tool_annovar) ;;
  install_gatk) steps=(install_tool_gatk) ;;
  install_gatk4) steps=(install_tool_gatk4) ;;
  install_annovar) steps=(install_tool_annovar) ;;
  db_install) steps=(install_databases) ;;
  db_setup) steps=(setup_databases) ;;
  tools_setup) steps=(setup_tool_annovar) ;;
  fusioncatcherdb) steps=(setup_tool_fusioncatcher) ;;
  ref) steps=(setup_references) ;;
  example) steps=(setup_example) ;;
  *)
    steps=(
      install_tool_gatk
      install_tool_gatk4
      install_tool_annovar
      setup_tool_annovar
      install_databases
      setup_databases
      setup_references
      setup_example
    )
    ;;
esac

for step in "${steps[@]}"; do
  "${step}"
done