motifAna

#!/bin/bash
#PBS -l nodes=1:ppn=4

GENOME="mm9";
MODE=1;
## v1
#TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe.motifs"
## v2
TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs"
## v3
#TFBS_FILE="/home/pundhir/software/homer/data/knownTFs/vertebrates/hocomoco_mouse.motifs"
LIST="1,2"
PROCESSOR=1
LENGTH="8,10,12" ## recommended 7,8,9,10,11,12,13,14 (Andersson et al. 2014)
REDUCE_THRESHOLD="0.6" ## argument in compareMotifs.pl (-reduceThreshold)  ## recommended 0.75 (Andersson et al. 2014)
#INFO="1.5" ## argument in compareMotifs.pl (-info)
#MIN_T=50 ## argument in compareMotifs.pl (-minT)
PVALUE="0.01" ## recommended 1e-15 (Andersson et al. 2014)
MIN_P_T=0 ## recommended 3 (Andersson et al. 2014)
B=100 ## recommended 30 (Andersson et al. 2014)
S=25 ## recommended 100 (Andersson et al. 2014)
SIZE=200
SIZE_HIST=1000

#### usage ####
usage() {
	echo Program: "motifAna (annotate genomic regions for motifs corresponding to various TFBS)"
	echo Author: BRIC, University of Copenhagen, Denmark
	echo Version: 1.0
	echo Contact: pundhir@binf.ku.dk
	echo "Usage: motifAna -i <file> -o <dir> [OPTIONS]"
	echo "Options:"
	echo " -i <file>   [input file having genomic regions in BED format (or stdin)]"
    echo " -o <dir>    [output directory to store results]"
    echo "[OPTIONS]"
    echo " -m <int>    [running mode (default: 1)]"
    echo "             [mode 1: Determine all TFBS motifs enriched in the input regions]"
    echo "              -s <file>   [input file having PWM of TFBS motifs]"
    echo "                          [default: /home/pundhir/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs]"
    echo "              -t <string> [motif length (default=8,10,12)]"
    echo "              -u <float>  [similarity threshold used to remove similar motifs (default: 0.6; **OBSOLETE**)]"
    echo "              -v <float>  [remove motifs with information content less than # (default: not used; **OBSOLETE**)]"
    echo "              -w <int>    [remove motifs with less than # number of target instances (default: not used; **OBSOLETE**)]"
    echo "              -x <float>  [p-value cutoff (default: 0.01)]"
    echo "              -y <float>  [remove motifs with target percentage less than # (default: 0)]"
    echo "              -z <float>  [remove motifs with background percentage greater than # (default: 100)]"
    echo "              -n <int>    [number of motifs to optimize for each motif length (default: 25)]"
    echo "              -b <file>   [custom background file in bed format]"
    echo "                          [can also be a class identifier, where class is sixth column in input bed file (-i)]"
    echo "             [mode 2: analyze the enrichment of specific TFBS motifs in the input regions]"
    echo "              -l <file>   [motif file for which enrichment will be analyzed]"
    echo "                          [if multiple, separate them by a comma]"
    echo "              -j <string> [name of motifs that are of interest (id before a '/', eg. E2F1_MOUSE.H10MO.A in E2F1_MOUSE.H10MO.A/hocomoco)]"
    echo "                          [if multiple separate them by a comma]"
    echo "                          [If not provided analysis will be done using all motifs]"
    echo "              -S          [soft match to motif name, regexp based (default: hard match)]"
    echo "                          [id before a '/', eg. E2F1_MOUSE.H10MO.A in E2F1_MOUSE.H10MO.A/hocomoco]"
    echo "              -f          [output histogram file only (default: histogram + annotation)]"
    echo "              -D <int>    [size of region for histogram (default: 1000 bp)]"
    echo " -g <string> [genome for which to perform the analysis (mm9, hg19, mm10, hg38, danRer7; default: mm9)]"
    echo " -p <int>    [number of processors to use (default: 1)]"
    echo " -d <int>    [size of region (default: 200 bp)]"
    echo "             [The size of the region used for motif finding is important.  If analyzing ChIP-Seq peaks from a transcription factor,]"
    echo "             [Chuck would recommend 50 bp for establishing the primary motif bound by a given transcription factor and 200 bp for  ]"
    echo "             [finding both primary and 'co-enriched' motifs for a transcription factor.  When looking at histone marked regions,   ]"
    echo "             [500-1000 bp is probably a good idea (i.e. H3K4me or H3/H4 acetylated regions).  In theory, HOMER can work with very  ]"
    echo "             [large regions (i.e. 10kb), but with the larger the regions comes more sequence and longer execution time.  These     ]"
    echo "             [regions will be based off the center of the peaks.  If you prefer an offset, you can specify '-d -300,100' to     ]"
    echo "             [search a region of size 400 that is centered 100 bp upstream of the peak center (useful if doing motif finding on    ]"
    echo "             [putative TSS regions).  If you have variable length regions, use the option '-d given' and HOMER will use the     ]"
    echo "             [exact regions that were used as input.]"
	echo " -h          [help]"
	echo
	exit 0
}

#### parse options ####
while getopts i:o:m:s:t:u:v:w:x:y:z:n:b:l:j:SfD:g:p:d:h ARG; do
	case "$ARG" in
		i) BEDFILE=$OPTARG;;
        o) OUTDIR=$OPTARG;;
        m) MODE=$OPTARG;;
        s) TFBS_FILE=$OPTARG;;
        t) LENGTH=$OPTARG;;
        u) REDUCE_THRESHOLD=$OPTARG;;
        v) INFO=$OPTARG;;
        w) MIN_T=$OPTARG;;
        x) PVALUE=$OPTARG;;
        y) MIN_P_T=$OPTARG;;
        z) B=$OPTARG;;
        n) S=$OPTARG;;
        b) BKG_FILE=$OPTARG;;
        l) LIST=$OPTARG;;
        j) MOTIF_NAME=$OPTARG;;
        S) SOFTMATCH=1;;
        f) FORMAT_HIST=1;;
        D) SIZE_HIST=$OPTARG;;
        g) GENOME=$OPTARG;;
        p) PROCESSOR=$OPTARG;;
        d) SIZE=$OPTARG;;
		h) HELP=1;;
	esac
done

## usage, if necessary file and directories are given/exist
if [ -z "$BEDFILE" -o -z "$OUTDIR" -o "$HELP" ]; then
	usage
fi

###################
#helperfunction
function wait_for_jobs_to_finish {
    for job in `jobs -p`
    do
        echo $job
        wait $job
    done
    echo $1
}
###############

<<"COMMENT1"
COMMENT1
echo -n "Create directory structure... "
if [ ! -d "$OUTDIR" ]; then
    mkdir -p $OUTDIR
fi
echo "done"

echo -n "Populating files based on input genome, $GENOME (`date`).. "
if [ "$GENOME" == "mm9" ]; then
    GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
    REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.simpleRepeat.gz"
    GENOME_MOTIF="mm9r"
elif [ "$GENOME" == "hg19" ]; then
    GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
    REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg19.simpleRepeat.gz"
    GENOME_MOTIF="hg19r"
elif [ "$GENOME" == "mm10" ]; then
    GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm10.genome"
    REPEAT_FILE="/home/pundhir/project/genome_annotations/mouse.mm10.simpleRepeat.gz"
    GENOME_MOTIF="mm10r"
elif [ "$GENOME" == "hg38" ]; then
    GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg38.genome"
    REPEAT_FILE="/home/pundhir/project/genome_annotations/human.hg38.simpleRepeat.gz"
    GENOME_MOTIF="hg38r"
elif [ "$GENOME" == "danRer7" ]; then
    GENOME_FILE="/home/pundhir/project/genome_annotations/zebrafish.danRer7.genome"
    REPEAT_FILE="/home/pundhir/project/genome_annotations/zebrafish.danRer7.simpleRepeat.gz"
    GENOME_MOTIF="danRer7r"
else
    echo "Presently the program only support analysis for mm9, hg19 or danRer7"
    echo
    usage
fi
echo done

## determine, if the input regions are from a file or stdin
echo -n "Create region file depending on if the input is from file or STDIN... "
if [ -f "$BEDFILE" ]; then
    zless $BEDFILE > $OUTDIR/REGIONS_INTEREST.bed.tmp 
elif [ "$BEDFILE" == "stdin" ]; then
    while read LINE; do echo "${LINE}"; done > $OUTDIR/REGIONS_INTEREST.bed.tmp
else
    usage
fi

ROW_COUNT=$(zless $OUTDIR/REGIONS_INTEREST.bed.tmp | wc -l)
ID_COUNT=$(zless $OUTDIR/REGIONS_INTEREST.bed.tmp | cut -f 4 | sort | uniq | wc -l)

KEEP_ID=0
if [ "$ROW_COUNT" -eq "$ID_COUNT" ]; then
    KEEP_ID=1
fi

#echo -e "$ROW_COUNT\t$ID_COUNT\t$KEEP_ID\n"; exit
zless $OUTDIR/REGIONS_INTEREST.bed.tmp | perl -ane 'BEGIN { $counter=1; } if($F[5]!~/^\+$/ && $F[5]!~/^\-$/) { $F[5]="."; } if('$KEEP_ID') { print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t1\t$F[5]"; } else { print "$F[0]\t$F[1]\t$F[2]\tREGION$counter\t1\t$F[5]"; } print "\n"; $counter++;' > $OUTDIR/REGIONS_INTEREST.bed
rm $OUTDIR/REGIONS_INTEREST.bed.tmp
echo "done"

## determine motifs at input genomic regions
if [ "$MODE" -eq 1 ]; then
    ADDITIONAL_ARGUMENT=""
    echo -n "Determine all TFBS motifs enriched in the input regions... "
    #if [ ! -z "$INFO" ]; then
    #    ADDITIONAL_ARGUMENT="$ADDITIONAL_ARGUMENT -info $INFO"
    #fi
    #if [ ! -z "$MIN_T" ]; then
    #    ADDITIONAL_ARGUMENT="$ADDITIONAL_ARGUMENT -minT $MIN_T"
    #fi

<<"COMMENT"
COMMENT
    if [ -z "$BKG_FILE" ]; then
        #findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -reduceThresh $REDUCE_THRESHOLD -dumpFasta -S $S $ADDITIONAL_ARGUMENT -size $SIZE &> $OUTDIR/findMotifs.log
        findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -dumpFasta -S $S $ADDITIONAL_ARGUMENT -size $SIZE -seqlogo &> $OUTDIR/findMotifs.log
    else
        if [ ! -f "$BKG_FILE" ]; then
            TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
            zless $OUTDIR/REGIONS_INTEREST.bed | grep -vw $BKG_FILE > $OUTDIR/$TMP.target
            zless $OUTDIR/REGIONS_INTEREST.bed | grep -w $BKG_FILE > $OUTDIR/$TMP.background
            mv $OUTDIR/$TMP.target $OUTDIR/REGIONS_INTEREST.bed
            BKG_FILE="$OUTDIR/$TMP.background"
        fi

        #findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -reduceThresh $REDUCE_THRESHOLD -dumpFasta -S $S $ADDITIONAL_ARGUMENT -bg $BKG_FILE -size $SIZE &> $OUTDIR/findMotifs.log
        findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -mcheck $TFBS_FILE -mknown $TFBS_FILE -p $PROCESSOR -len $LENGTH -dumpFasta -S $S $ADDITIONAL_ARGUMENT -bg $BKG_FILE -size $SIZE -seqlogo &> $OUTDIR/findMotifs.log
    fi
    echo "done"

    echo -n "Create final output files (DENOVO_MOTIFS.TXT, KNOWN_MOTIFS.TXT)... "
    REGIONS_INTEREST_COUNT=$(cat $OUTDIR/REGIONS_INTEREST.bed | wc -l)
    if [ -f "$OUTDIR/homerResults.html" ]; then
        grep png $OUTDIR/homerResults.html | grep -vE ">\*<" | perl -ane '$_=~s/<tr><td>//gi; $_=~s/<\/td><td>/\t/gi; $_=~s/<\/td><\/tr>//gi; $_=~s/^[0-9]*\s+//g; print $_;' | perl -an -F'/\t/' -e '$F[0]=~s/^.*\=\"//g; $F[0]=~s/\".*//g; $F[0]=~s/\.logo.*//g; $F[6]=~s/\<BR.*//g; print "$F[6]\t$F[1]\t$F[3]\t$F[4]\t'$OUTDIR/'$F[0].motif\t'$REGIONS_INTEREST_COUNT'\n";' | perl -ane '$pval=sprintf("%0.20f", $F[1]); $my_pval=sprintf("%0.20f", '$PVALUE'); $F[2]=~s/\%//g; $F[3]=~s/\%//g; if($pval < $my_pval && $F[2]>='$MIN_P_T' && $F[3]<='$B') { print "$_"; }' > $OUTDIR/DENOVO_MOTIFS.TXT
    else
        touch $OUTDIR/DENOVO_MOTIFS.TXT
    fi
    if [ -f "$OUTDIR/knownResults.html" ]; then
        grep png $OUTDIR/knownResults.html | grep -vE ">\*<" | perl -ane '$_=~s/<tr><td>//gi; $_=~s/<\/td><td>/\t/gi; $_=~s/<\/td><\/tr>//gi; $_=~s/^[0-9]*\s+//g; print $_;' | perl -an -F'/\t/' -e '$F[0]=~s/^.*\=\"//g; $F[0]=~s/\".*//g; $F[0]=~s/\.logo.*//g; print "$F[1]\t$F[2]\t$F[6]\t$F[8]\t'$OUTDIR/'$F[0].motif\t'$REGIONS_INTEREST_COUNT'\n";' | perl -ane '$pval=sprintf("%0.20f", $F[1]); $my_pval=sprintf("%0.20f", '$PVALUE'); $F[2]=~s/\%//g; $F[3]=~s/\%//g; if($pval < $my_pval && $F[2]>='$MIN_P_T' && $F[3]<='$B') { print "$_"; }' > $OUTDIR/KNOWN_MOTIFS.TXT
    else
        touch $OUTDIR/KNOWN_MOTIFS.TXT
    fi
    if [ -f "$OUTDIR/DENOVO_MOTIFS.TXT" -a -f "$OUTDIR/KNOWN_MOTIFS.TXT" ]; then
        cat $OUTDIR/DENOVO_MOTIFS.TXT $OUTDIR/KNOWN_MOTIFS.TXT | perl -ane '$F[0]=~s/\(.*//g; if(!$seen{$F[0]}) { print $_; $seen{$F[0]}=1; }' > $OUTDIR/UNIQ_MOTIFS.TXT
    fi
    echo "done"
elif [ ! -z "$LIST" ]; then
    oIFS=$IFS
    IFS=","
    FILE=($LIST)
    IFS=$oIFS

    PWM=""
    for (( i=0; i<${#FILE[@]}; i++ )); do
        PWM="$PWM ${FILE[$i]}"
    done

    if [ ! -z "$MOTIF_NAME" ];
    then
        echo -n "Create motif file depending upon input motifs of interest.. "
        TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
        if [ -z "$SOFTMATCH" ]; then
            zless $PWM | extractMotifs -i stdin -j $MOTIF_NAME > $TMP
        else
            zless $PWM | extractMotifs -i stdin -j $MOTIF_NAME -S > $TMP
        fi
        echo "done"
        PWM=$TMP
    fi

    echo -n "Analyze the enrichment of specific TFBS motifs in the input regions.. "
    ## parameter histNorm added on Nov 12, 2015 for Janus project
    if [ "$SIZE_HIST" == "given" ]; then
        annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -size $SIZE_HIST -hist 500 -histNorm 5 -matrix $OUTDIR/REGIONS_INTEREST.matrix | perl -an -F'/\t+/' -e 'print "$F[0]\t"; $line=(); for($i=1; $i<scalar(@F)-4; $i+=3) { $line.="$F[$i]\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/REGIONS_INTEREST.hist
    else
        annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -size $SIZE_HIST -hist 10 -histNorm 5 -matrix $OUTDIR/REGIONS_INTEREST.matrix | perl -an -F'/\t+/' -e 'print "$F[0]\t"; $line=(); for($i=1; $i<scalar(@F)-4; $i+=3) { $line.="$F[$i]\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/REGIONS_INTEREST.hist
    fi

    if [ -z "$FORMAT_HIST" ]; then
        findMotifsGenome.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF $OUTDIR -find $PWM -size $SIZE -seqlogo > $OUTDIR/REGIONS_INTEREST.find

        ## filter only highest scoring motif postion for each region
        cat <(grep -w Offset $OUTDIR/REGIONS_INTEREST.find) <(grep -wv Offset $OUTDIR/REGIONS_INTEREST.find | sort -k 4,4 -k 1,1 -k 6rg,6 | perl -ane 'if(!$seen{$F[3]}{$F[0]}) { print $_; $seen{$F[3]}{$F[0]}=1; }') > $OUTDIR/REGIONS_INTEREST.find.unique
        #mv $OUTDIR/REGIONS_INTEREST.find.unique $OUTDIR/REGIONS_INTEREST.find

        annotatePeaks.pl $OUTDIR/REGIONS_INTEREST.bed $GENOME_MOTIF -m $PWM -annStats $OUTDIR/REGIONS_INTEREST.annoStats -size $SIZE -mbed $OUTDIR/REGIONS_INTEREST.annoBed > $OUTDIR/REGIONS_INTEREST.anno

        TOTAL=$(cat $OUTDIR/REGIONS_INTEREST.bed | wc -l)
        MAPPED=$(grep -v Offset $OUTDIR/REGIONS_INTEREST.find | wc -l)
        PER=$(perl -e '$per=('$MAPPED'*100)/'$TOTAL'; printf("%0.2f", $per);')
        echo
        echo
        echo "$MAPPED out of $TOTAL ($PER) input sequences contain the input motifs" >> $OUTDIR/REGIONS_INTEREST.annoStats
        echo "$MAPPED out of $TOTAL ($PER) input sequences contain the input motifs"

    fi
    echo "done"
else
    usage
fi

if [ ! -z "$TMP" ]; then
    rm $TMP
fi

#wait
#wait_for_jobs_to_finish "Wait for jobs to finish... "
#echo "done"