-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmotifAna
executable file
·289 lines (263 loc) · 14.9 KB
/
motifAna
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/bin/bash
#PBS -l nodes=1:ppn=4
GENOME="mm9";
MODE=1;
## v1
#TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe.motifs"
## v2
TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs"
## v3
#TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/hocomoco_mouse.motifs"
LIST="1,2"
PROCESSOR=1
LENGTH="8,10,12" ## recommended 7,8,9,10,11,12,13,14 (Andersson et al. 2014)
REDUCE_THRESHOLD="0.6" ## argument in compareMotifs.pl (-reduceThreshold) ## recommended 0.75 (Andersson et al. 2014)
#INFO="1.5" ## argument in compareMotifs.pl (-info)
#MIN_T=50 ## argument in compareMotifs.pl (-minT)
PVALUE="0.01" ## recommended 1e-15 (Andersson et al. 2014)
MIN_P_T=0 ## recommended 3 (Andersson et al. 2014)
B=100 ## recommended 30 (Andersson et al. 2014)
S=25 ## recommended 100 (Andersson et al. 2014)
SIZE=200
SIZE_HIST=1000
#### usage ####
usage() {
echo Program: "motifAna (annotate genomic regions for motifs corresponding to various TFBS)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: motifAna -i <file> -o <dir> [OPTIONS]"
echo "Options:"
echo " -i <file> [input file having genomic regions in BED format (or stdin)]"
echo " -o <dir> [output directory to store results]"
echo "[OPTIONS]"
echo " -m <int> [running mode (default: 1)]"
echo " [mode 1: Determine all TFBS motifs enriched in the input regions]"
echo " -s <file> [input file having PWM of TFBS motifs]"
echo " [default: /home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs]"
echo " -t <string> [motif length (default=8,10,12)]"
echo " -u <float> [similarity threshold used to remove similar motifs (default: 0.6; **OBSOLETE**)]"
echo " -v <float> [remove motifs with information content less than # (default: not used; **OBSOLETE**)]"
echo " -w <int> [remove motifs with less than # number of target instances (default: not used; **OBSOLETE**)]"
echo " -x <float> [p-value cutoff (default: 0.01)]"
echo " -y <float> [remove motifs with target percentage less than # (default: 0)]"
echo " -z <float> [remove motifs with background percentage greater than # (default: 100)]"
echo " -n <int> [number of motifs to optimize for each motif length (default: 25)]"
echo " -b <file> [custom background file in bed format]"
echo " [can also be a class identifier, where class is sixth column in input bed file (-i)]"
echo " [mode 2: analyze the enrichment of specific TFBS motifs in the input regions]"
echo " -l <file> [motif file for which enrichment will be analyzed]"
echo " [if multiple, separate them by a comma]"
echo " -j <string> [name of motifs that are of interest (id before a '/', eg. E2F1_MOUSE.H10MO.A in E2F1_MOUSE.H10MO.A/hocomoco)]"
echo " [if multiple separate them by a comma]"
echo " [If not provided analysis will be done using all motifs]"
echo " -S [soft match to motif name, regexp based (default: hard match)]"
echo " [id before a '/', eg. E2F1_MOUSE.H10MO.A in E2F1_MOUSE.H10MO.A/hocomoco]"
echo " -f [output histogram file only (default: histogram + annotation)]"
echo " -D <int> [size of region for histogram (default: 1000 bp)]"
echo " -g <string> [genome for which to perform the analysis (mm9, hg19, mm10, hg38, danRer7; default: mm9)]"
echo " -p <int> [number of processors to use (default: 1)]"
echo " -d <int> [size of region (default: 200 bp)]"
echo " [The size of the region used for motif finding is important. If analyzing ChIP-Seq peaks from a transcription factor,]"
echo " [Chuck would recommend 50 bp for establishing the primary motif bound by a given transcription factor and 200 bp for ]"
echo " [finding both primary and 'co-enriched' motifs for a transcription factor. When looking at histone marked regions, ]"
echo " [500-1000 bp is probably a good idea (i.e. H3K4me or H3/H4 acetylated regions). In theory, HOMER can work with very ]"
echo " [large regions (i.e. 10kb), but with the larger the regions comes more sequence and longer execution time. These ]"
echo " [regions will be based off the center of the peaks. If you prefer an offset, you can specify '-d -300,100' to ]"
echo " [search a region of size 400 that is centered 100 bp upstream of the peak center (useful if doing motif finding on ]"
echo " [putative TSS regions). If you have variable length regions, use the option '-d given' and HOMER will use the ]"
echo " [exact regions that were used as input.]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:o:m:s:t:u:v:w:x:y:z:n:b:l:j:SfD:g:p:d:h ARG; do
case "$ARG" in
i) BEDFILE=$OPTARG;;
o) OUTDIR=$OPTARG;;
m) MODE=$OPTARG;;
s) TFBS_FILE=$OPTARG;;
t) LENGTH=$OPTARG;;
u) REDUCE_THRESHOLD=$OPTARG;;
v) INFO=$OPTARG;;
w) MIN_T=$OPTARG;;
x) PVALUE=$OPTARG;;
y) MIN_P_T=$OPTARG;;
z) B=$OPTARG;;
n) S=$OPTARG;;
b) BKG_FILE=$OPTARG;;
l) LIST=$OPTARG;;
j) MOTIF_NAME=$OPTARG;;
S) SOFTMATCH=1;;
f) FORMAT_HIST=1;;
D) SIZE_HIST=$OPTARG;;
g) GENOME=$OPTARG;;
p) PROCESSOR=$OPTARG;;
d) SIZE=$OPTARG;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$BEDFILE" -o -z "$OUTDIR" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
<<"COMMENT1"
COMMENT1
echo -n "Create directory structure... "
if [ ! -d "$OUTDIR" ]; then
mkdir -p $OUTDIR
fi
echo "done"
echo -n "Populating files based on input genome, $GENOME (`date`).. "
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.simpleRepeat.gz"
GENOME_MOTIF="mm9r"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg19.simpleRepeat.gz"
GENOME_MOTIF="hg19r"
elif [ "$GENOME" == "mm10" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm10.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm10.simpleRepeat.gz"
GENOME_MOTIF="mm10r"
elif [ "$GENOME" == "hg38" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg38.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg38.simpleRepeat.gz"
GENOME_MOTIF="hg38r"
elif [ "$GENOME" == "danRer7" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/zebrafish.danRer7.genome"
REPEAT_FILE="/home/pundhir/project/genome_annotations/zebrafish.danRer7.simpleRepeat.gz"
GENOME_MOTIF="danRer7r"
else
echo "Presently the program only support analysis for mm9, hg19 or danRer7"
echo
usage
fi
echo done
## determine, if the input regions are from a file or stdin
echo -n "Create region file depending on if the input is from file or STDIN... "
if [ -f "$BEDFILE" ]; then
zless $BEDFILE > $OUTDIR/REGIONS_INTEREST.bed.tmp
elif [ "$BEDFILE" == "stdin" ]; then
while read LINE; do echo "${LINE}"; done > $OUTDIR/REGIONS_INTEREST.bed.tmp
else
usage
fi
ROW_COUNT=$(zless $OUTDIR/REGIONS_INTEREST.bed.tmp | wc -l)
ID_COUNT=$(zless $OUTDIR/REGIONS_INTEREST.bed.tmp | cut -f 4 | sort | uniq | wc -l)
KEEP_ID=0
if [ "$ROW_COUNT" -eq "$ID_COUNT" ]; then
KEEP_ID=1
fi
#echo -e "$ROW_COUNT\t$ID_COUNT\t$KEEP_ID\n"; exit
zless $OUTDIR/REGIONS_INTEREST.bed.tmp | perl -ane 'BEGIN { $counter=1; } if($F[5]!~/^\+$/ && $F[5]!~/^\-$/) { $F[5]="."; } if('$KEEP_ID') { print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t1\t$F[5]"; } else { print "$F[0]\t$F[1]\t$F[2]\tREGION$counter\t1\t$F[5]"; } print "\n"; $counter++;' > $OUTDIR/REGIONS_INTEREST.bed
rm $OUTDIR/REGIONS_INTEREST.bed.tmp
echo "done"
## determine motifs at input genomic regions
if [ "$MODE" -eq 1 ]; then
ADDITIONAL_ARGUMENT=""
echo -n "Determine all TFBS motifs enriched in the input regions... "
#if [ ! -z "$INFO" ]; then
# ADDITIONAL_ARGUMENT="$ADDITIONAL_ARGUMENT -info $INFO"
#fi
#if [ ! -z "$MIN_T" ]; then
# ADDITIONAL_ARGUMENT="$ADDITIONAL_ARGUMENT -minT $MIN_T"
#fi
<<"COMMENT"
COMMENT
if [ -z "$BKG_FILE" ]; then
#findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -reduceThresh $REDUCE_THRESHOLD -dumpFasta -S $S $ADDITIONAL_ARGUMENT -size $SIZE &> $OUTDIR/findMotifs.log
findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -dumpFasta -S $S $ADDITIONAL_ARGUMENT -size $SIZE -seqlogo &> $OUTDIR/findMotifs.log
else
if [ ! -f "$BKG_FILE" ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
zless $OUTDIR/REGIONS_INTEREST.bed | grep -vw $BKG_FILE > $OUTDIR/$TMP.target
zless $OUTDIR/REGIONS_INTEREST.bed | grep -w $BKG_FILE > $OUTDIR/$TMP.background
mv $OUTDIR/$TMP.target $OUTDIR/REGIONS_INTEREST.bed
BKG_FILE="$OUTDIR/$TMP.background"
fi
#findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -reduceThresh $REDUCE_THRESHOLD -dumpFasta -S $S $ADDITIONAL_ARGUMENT -bg $BKG_FILE -size $SIZE &> $OUTDIR/findMotifs.log
findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -dumpFasta -S $S $ADDITIONAL_ARGUMENT -bg $BKG_FILE -size $SIZE -seqlogo &> $OUTDIR/findMotifs.log
fi
echo "done"
echo -n "Create final output files (DENOVO_MOTIFS.TXT, KNOWN_MOTIFS.TXT)... "
REGIONS_INTEREST_COUNT=$(cat $OUTDIR/REGIONS_INTEREST.bed | wc -l)
if [ -f "$OUTDIR/homerResults.html" ]; then
grep png $OUTDIR/homerResults.html | grep -vE ">\*<" | perl -ane '$_=~s/<tr><td>//gi; $_=~s/<\/td><td>/\t/gi; $_=~s/<\/td><\/tr>//gi; $_=~s/^[0-9]*\s+//g; print $_;' | perl -an -F'/\t/' -e '$F[0]=~s/^.*\=\"//g; $F[0]=~s/\".*//g; $F[0]=~s/\.logo.*//g; $F[6]=~s/\<BR.*//g; print "$F[6]\t$F[1]\t$F[3]\t$F[4]\t'$OUTDIR/'$F[0].motif\t'$REGIONS_INTEREST_COUNT'\n";' | perl -ane '$pval=sprintf("%0.20f", $F[1]); $my_pval=sprintf("%0.20f", '$PVALUE'); $F[2]=~s/\%//g; $F[3]=~s/\%//g; if($pval < $my_pval && $F[2]>='$MIN_P_T' && $F[3]<='$B') { print "$_"; }' > $OUTDIR/DENOVO_MOTIFS.TXT
else
touch $OUTDIR/DENOVO_MOTIFS.TXT
fi
if [ -f "$OUTDIR/knownResults.html" ]; then
grep png $OUTDIR/knownResults.html | grep -vE ">\*<" | perl -ane '$_=~s/<tr><td>//gi; $_=~s/<\/td><td>/\t/gi; $_=~s/<\/td><\/tr>//gi; $_=~s/^[0-9]*\s+//g; print $_;' | perl -an -F'/\t/' -e '$F[0]=~s/^.*\=\"//g; $F[0]=~s/\".*//g; $F[0]=~s/\.logo.*//g; print "$F[1]\t$F[2]\t$F[6]\t$F[8]\t'$OUTDIR/'$F[0].motif\t'$REGIONS_INTEREST_COUNT'\n";' | perl -ane '$pval=sprintf("%0.20f", $F[1]); $my_pval=sprintf("%0.20f", '$PVALUE'); $F[2]=~s/\%//g; $F[3]=~s/\%//g; if($pval < $my_pval && $F[2]>='$MIN_P_T' && $F[3]<='$B') { print "$_"; }' > $OUTDIR/KNOWN_MOTIFS.TXT
else
touch $OUTDIR/KNOWN_MOTIFS.TXT
fi
if [ -f "$OUTDIR/DENOVO_MOTIFS.TXT" -a -f "$OUTDIR/KNOWN_MOTIFS.TXT" ]; then
cat $OUTDIR/DENOVO_MOTIFS.TXT $OUTDIR/KNOWN_MOTIFS.TXT | perl -ane '$F[0]=~s/\(.*//g; if(!$seen{$F[0]}) { print $_; $seen{$F[0]}=1; }' > $OUTDIR/UNIQ_MOTIFS.TXT
fi
echo "done"
elif [ ! -z "$LIST" ]; then
oIFS=$IFS
IFS=","
FILE=($LIST)
IFS=$oIFS
PWM=""
for (( i=0; i<${#FILE[@]}; i++ )); do
PWM="$PWM ${FILE[$i]}"
done
if [ ! -z "$MOTIF_NAME" ];
then
echo -n "Create motif file depending upon input motifs of interest.. "
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
if [ -z "$SOFTMATCH" ]; then
zless $PWM | extractMotifs -i stdin -j $MOTIF_NAME > $TMP
else
zless $PWM | extractMotifs -i stdin -j $MOTIF_NAME -S > $TMP
fi
echo "done"
PWM=$TMP
fi
echo -n "Analyze the enrichment of specific TFBS motifs in the input regions.. "
## parameter histNorm added on Nov 12, 2015 for Janus project
if [ "$SIZE_HIST" == "given" ]; then
annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -size $SIZE_HIST -hist 500 -histNorm 5 -matrix $OUTDIR/REGIONS_INTEREST.matrix | perl -an -F'/\t+/' -e 'print "$F[0]\t"; $line=(); for($i=1; $i<scalar(@F)-4; $i+=3) { $line.="$F[$i]\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/REGIONS_INTEREST.hist
else
annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -size $SIZE_HIST -hist 10 -histNorm 5 -matrix $OUTDIR/REGIONS_INTEREST.matrix | perl -an -F'/\t+/' -e 'print "$F[0]\t"; $line=(); for($i=1; $i<scalar(@F)-4; $i+=3) { $line.="$F[$i]\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/REGIONS_INTEREST.hist
fi
if [ -z "$FORMAT_HIST" ]; then
findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -find $PWM -size $SIZE -seqlogo > $OUTDIR/REGIONS_INTEREST.find
## filter only highest scoring motif postion for each region
cat <(grep -w Offset $OUTDIR/REGIONS_INTEREST.find) <(grep -wv Offset $OUTDIR/REGIONS_INTEREST.find | sort -k 4,4 -k 1,1 -k 6rg,6 | perl -ane 'if(!$seen{$F[3]}{$F[0]}) { print $_; $seen{$F[3]}{$F[0]}=1; }') > $OUTDIR/REGIONS_INTEREST.find.unique
#mv $OUTDIR/REGIONS_INTEREST.find.unique $OUTDIR/REGIONS_INTEREST.find
annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -annStats $OUTDIR/REGIONS_INTEREST.annoStats -size $SIZE -mbed $OUTDIR/REGIONS_INTEREST.annoBed > $OUTDIR/REGIONS_INTEREST.anno
TOTAL=$(cat $OUTDIR/REGIONS_INTEREST.bed | wc -l)
MAPPED=$(grep -v Offset $OUTDIR/REGIONS_INTEREST.find | wc -l)
PER=$(perl -e '$per=('$MAPPED'*100)/'$TOTAL'; printf("%0.2f", $per);')
echo
echo
echo "$MAPPED out of $TOTAL ($PER) input sequences contain the input motifs" >> $OUTDIR/REGIONS_INTEREST.annoStats
echo "$MAPPED out of $TOTAL ($PER) input sequences contain the input motifs"
fi
echo "done"
else
usage
fi
if [ ! -z "$TMP" ]; then
rm $TMP
fi
#wait
#wait_for_jobs_to_finish "Wait for jobs to finish... "
#echo "done"