-
Notifications
You must be signed in to change notification settings - Fork 104
/
bismark2bedGraph
executable file
·892 lines (721 loc) · 33.9 KB
/
bismark2bedGraph
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
#!/usr/bin/env perl
use warnings;
use strict;
$|++;
use Getopt::Long;
use Cwd;
use Carp;
## This program is Copyright (C) 2010-23, Felix Krueger (fkrueger@altoslabs.com)
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
my $bismark2bedGraph_version = 'v0.24.2';
my @bedfiles;
my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
my @sorting_files;
my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir,$ucsc) = process_commandline();
warn "Using these input files: @sorting_files\n";
warn "\nSummary of parameters for bismark2bedGraph conversion:\n";
warn '='x54,"\n";
warn "bedGraph output:\t\t$bedGraph_output\n";
warn "output directory:\t\t>$output_dir<\n";
if ($remove){
warn "remove whitespaces:\t\tyes\n";
}
else{
warn "remove whitespaces:\t\tno\n";
}
if ($CX_context){
warn "CX context:\t\t\tyes\n";
}
else{
warn "CX context:\t\t\tno (CpG context only, default)\n";
}
if ($no_header){
warn "No-header selected:\t\tyes\n";
}
else{
warn "No-header selected:\t\tno\n";
}
if ($ample_mem){
warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n";
}
else{
warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n";
}
unless($ample_mem){
warn "Sort buffer size:\t\t$sort_size\n";
}
warn "Coverage threshold:\t\t$coverage_threshold\n";
warn "="x77,"\n";
warn "Methylation information will now be written into a bedGraph and coverage file\n";
warn "="x77,"\n\n";
sleep (2);
### deciding which files to use for bedGraph conversion
foreach my $filename (@sorting_files){
### DETERMINING THE FULL PATH OF INPUT FILES
if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again
# warn "folder name: $1\nfilename: $2\n\n";
chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path
$input_dir = getcwd(); # this will always be the full path
$filename = $2;
# warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1);
chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back
}
else{
$input_dir = $parent_dir;
}
$input_dir .= '/';
if ($CX_context){
# push @bedfiles,$output_dir.$filename;
push @bedfiles,$input_dir.$filename;
}
else{ ## CpG context only (default)
if ($filename =~ /^CpG/){ # only testing the actual filename without the path information
push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename
}
else{
# skipping CHH or CHG files
}
}
}
if (@bedfiles){
warn "Using the following files as Input:\n";
print join ("\t",@bedfiles),"\n\n";
sleep (2);
}
else{
die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n";
}
open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n";
warn "Writing bedGraph to file: $bedGraph_output\n";
print OUT "track type=bedGraph\n";
my $coverage_output = $bedGraph_output;
unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){
$coverage_output =~ s/$/.bismark.cov.gz/;
}
open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n";
warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n";
if ($zero){
my $zero_coverage_output = $bedGraph_output;
unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){
$zero_coverage_output =~ s/$/.bismark.zero.cov/;
}
open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n";
warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n";
}
warn "\n";
my %temp_fhs;
my @temp_files; # writing all context files (default: CpG only) to these files prior to sorting
my %chr_lengths; # storing chromosome lengths in '--ample_memory' mode
### changing to the output directory
unless ($output_dir eq ''){ # default
chdir $output_dir or die "Failed to change directory to $output_dir\n";
warn "Changed directory to $output_dir\n";
}
if ($gazillion){
if (scalar @bedfiles == 1){
warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n";
}
else{
warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n";
my $merge = "$bedGraph_output.methylation_calls.merged";
open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n";
warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2);
push @temp_files, $merge;
}
}
foreach my $infile (@bedfiles) {
# warn "Processing $infile\n";
if ($remove) {
warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
if ($infile =~ /gz$/){
open (READ,"gunzip -c $infile |") or die $!;
}
else{
open (READ,$infile) or die $!;
}
my $removed_spaces_outfile = $infile;
$removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
$removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename
warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n";
open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
unless ($no_header){
$_ = <READ>; ### Bismark version header
print REM $_; ### Bismark version header
}
while (<READ>) {
chomp;
my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
$id =~ s/\s+/_/g;
print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
}
close READ or die $!;
close REM or die $!;
### changing the infile name to the new file without spaces
$infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly
}
# opening infile
if ($infile =~ /gz$/){
open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n";
}
else{
open (IN,$infile) or die "Couldn't find file '$infile': $!\n";
}
if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again
$infile =~ s/.*\///;# replacing everything up to the last slash in the filename
# warn "Renamed Infile: $infile\n";
}
### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes consist of up to half a million scaffolds instead of ~23 chromosomes. This
### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this
### limit (some even say there is a reason for the OS doing so...).
### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the
### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster).
if ($gazillion){
# using all infiles instead of sorting
if (scalar @bedfiles == 1){
push @temp_files, $infile;
}
else{
## always ignoring the version header
unless ($no_header){
$_ = <IN>; ### Bismark version header
}
while (<IN>) {
if ($_ =~ /^Bismark /){
warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n";
next;
}
print MERGE;
}
warn "Finished writing methylation calls from $infile to merged temp file\n";
}
}
else{
warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n";
## always ignoring the version header
unless ($no_header){
$_ = <IN>; ### Bismark version header
}
while (<IN>) {
if ($_ =~ /^Bismark /){
warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n";
next;
}
chomp;
my ($chr,$pos) = (split (/\t/))[2,3];
### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step
if ($ample_mem){
### setting the first position for this chromosome
unless (defined $chr_lengths{$chr} ){
$chr_lengths{$chr} = $pos;
}
# for all subsequent postions for this chromosome
if ($pos > $chr_lengths{$chr} ){
$chr_lengths{$chr} = $pos; # set the current position as the new highest position
}
}
# warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
$chr =~ s/\|/_/g; # replacing pipe ('|') characters in the chromosome names so that they don't interfere with the temp file creation
$chr =~ s/\//_/g; # replacing forward slash ('/') characters in the chromosome names
# warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time
my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp';
# warn "Using temp file name: $temp_file_name\n"; sleep(1);
open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!";
push @temp_files, $temp_file_name; # storing temp files as we open them instead
}
print {$temp_fhs{$chr}} "$_\n";
}
warn "Finished writing out individual chromosome files for $infile\n";
}
}
# closing temporary filehandles to force writing out buffered content
foreach my $temp_fh(keys %temp_fhs){
close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n";
}
### printing out the determined maximum position for each chromosome
if ($ample_mem){
foreach my $chr (sort keys %chr_lengths){
warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n";
}
warn "\n";
}
unless ($gazillion){
warn "\n";
warn "Collecting temporary chromosome file information... Processing the following input file(s):\n";
warn join ("\n",@temp_files),"\n\n";
sleep (1);
}
if ($gazillion){
if (scalar @bedfiles > 1){
close (MERGE) or die "Failed to close filehandle MERGE: $!\n";
}
}
foreach my $in (sort @temp_files) {
if ($sort_size){
warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem);
}
my $ifh;
my $name;
my $meth_state;
my $chr = "";
my $pos = 0;
my $meth_state2;
my $last_pos;
my $last_chr;
### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command
if ($ample_mem){
# warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n";
my @meth_count;
my @unmeth_count;
open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n";
while (my $line = <$ifh>){
next if ($line =~ /^Bismark/);
chomp $line;
($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
unless ($last_pos and $last_chr){
$last_chr = $chr;
$last_pos = $pos;
}
unless (@meth_count and @unmeth_count){
warn "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n";
@meth_count = (0) x $chr_lengths{$chr};
@unmeth_count = (0) x $chr_lengths{$chr};
# warn "length of array meth count: ",scalar @meth_count,"\n";
warn "Finished generating arrays\n";
# sleep(1);
}
# warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1);
# print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n";
# sleep(1);
# if ($last_chr ne $chr) {
# die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n";
# }
my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway
unless($validated){
warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
next;
}
if ($meth_state eq '+'){
# warn "increasing meth $pos by 1\n"; sleep(1);
$meth_count[$pos-1]++;
}
else{
$unmeth_count[$pos-1]++;
# warn "increasing unmeth $pos by 1\n"; sleep(1);
}
}
close $ifh or die $!;
warn "Now printing methylation information for this chromosome\n";
# warn "length of array meth count: ",scalar @meth_count,"\n";
# warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n";
foreach my $index (0..$#meth_count){
my $totalcount = $meth_count[$index] + $unmeth_count[$index];
if ($totalcount > 0){
# warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n";
# sleep(1);
my $bed_pos = $index; ### bedGraph coordinates are 0 based
my $one_based_pos = $bed_pos + 1;
my $meth_percentage;
($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef);
if (defined $meth_percentage){
# as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file
# this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?)
print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
# this is the coverage file. Coordinates are 1-based
print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
# this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
if ($zero){
print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
}
}
}
}
@meth_count = ();
@unmeth_count = ();
}
### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint
else{
my $sort_dir = './'; # there has been a cd into the output_directory already
# my $sort_dir = $output_dir;
# if ($sort_dir eq ''){
# $sort_dir = './';
# }
if ($gazillion){
if ($in =~ /gz$/){
open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n";
}
else{
open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n";
}
### Comment by Volker Brendel, Indiana University
### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros). Omit the V, and things go very wrong in the tallying of reads."
}
else{
### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway,
### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase
if ($in =~ /gz$/){
open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n";
}
else{
open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n";
}
}
while (my $line = <$ifh>) {
next if ($line =~ /^Bismark/);
chomp $line;
#warn "full line:\n$line\n";
$last_chr = $chr;
$last_pos = $pos;
($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
#warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1);
if (($last_pos ne $pos) || ($last_chr ne $chr)) {
generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
@methylcalls = qw (0 0 0);
}
my $validated = validate_methylation_call($meth_state, $meth_state2);
unless($validated){
warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
next;
}
if ($meth_state eq "+") {
$methylcalls[0]++;
$methylcalls[2]++;
} else {
$methylcalls[1]++;
$methylcalls[2]++;
}
}
$last_chr = $chr;
$last_pos = $pos;
if ($methylcalls[2] > 0) {
generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
}
close $ifh or die $!;
@methylcalls = qw (0 0 0); # resetting @methylcalls
}
### deleting temporary files (only needed if --gazillion hasn't been specified
if ($gazillion and scalar @bedfiles == 1){
# if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed
}
else{
my $delete = unlink $in;
if ($delete) {
warn "Successfully deleted the temporary input file $in\n\n";
}
else {
warn "The temporary inputfile $in could not be deleted $!\n\n";
}
}
}
close OUT or die $!;
close COVERAGE or die $!;
if ($zero){
close ZEROCOVERAGE or die $!;
}
### Optional conversion of the bedGraph file to be compatible with UCSC
if ($ucsc){
warn "Finally, creating a UCSC compatible version of the bedGraph file. This shouldn't take long...\n ============== ============== ============== ============== ==============\n";
if ($bedGraph_output =~ /\.gz$/){
open (UCSCTEMP,"gunzip -c $bedGraph_output | ") or die "Failed to read bedGraph file (.gz) >$bedGraph_output<: $!";
}
else{
open (UCSCTEMP,$bedGraph_output) or die "Failed to read bedGraph file >$bedGraph_output<: $!";
}
my $ucsc_outfile = $bedGraph_output;
$ucsc_outfile =~ s/\.gz$//;
$ucsc_outfile .= '_UCSC.bedGraph.gz';
warn "Writing a new version of the bedGraph file compatible to UCSC genomes to file >$ucsc_outfile<\n";
warn "Chromosomes names will start with 'chr'\n";
warn "Chromosome MT (Ensembl mitochondrial DNA http://www.ensembl.org/Homo_sapiens/Location/Chromosome?chr=MT;r=MT:1-16569) will be renamed to 'chrM'\n";
open (UCSCOUT, "| gzip -c - > ${output_dir}$ucsc_outfile") or die "Failed to write to >$ucsc_outfile<: $!";
my $firstline = <UCSCTEMP>;
print UCSCOUT $firstline;
while (<UCSCTEMP>){
my ($chr,$start,$end,$percentage) = (split /\t/);
if ($chr eq "MT"){
# warn "renaming >MT< to >chrM<\n";
$chr = "chrM";
}
unless ($chr =~ /^chr/){
# warn "renaming >$chr< to >chr$chr<\n";
$chr = "chr$chr";
}
print UCSCOUT join ("\t",$chr,$start,$end,$percentage);
}
close UCSCTEMP;
close UCSCOUT or warn "Had trouble closing filehandle UCSCOUT: $!\n";
warn "\nAll done. Finished UCSC conversion.\n\n";
}
exit 0;
sub validate_methylation_call{
my $meth_state = shift;
croak "Missing (+/-) methylation call" unless defined $meth_state;
my $meth_state2 = shift;
croak "Missing alphabetical methylation call" unless defined $meth_state2;
my $is_consistent;
($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
: ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
return 1 if $is_consistent;
return 0;
}
sub check_CpG_methylation_call{
my $meth1 = shift;
my $meth2 = shift;
return 1 if($meth1 eq "+" && $meth2 eq "Z");
return 1 if($meth1 eq "-" && $meth2 eq "z");
return 0;
}
sub check_nonCpG_methylation_call{
my $meth1 = shift;
my $meth2 = shift;
return 1 if($meth1 eq "+" && $meth2 eq "C");
return 1 if($meth1 eq "+" && $meth2 eq "X");
return 1 if($meth1 eq "+" && $meth2 eq "H");
return 1 if($meth1 eq "-" && $meth2 eq "c");
return 1 if($meth1 eq "-" && $meth2 eq "x");
return 1 if($meth1 eq "-" && $meth2 eq "h");
return 0;
}
sub generate_output{
my $methcount = $methylcalls[0];
my $nonmethcount = $methylcalls[1];
my $totalcount = $methylcalls[2];
my $last_chr = shift;
my $last_pos = shift;
croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0);
croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) );
my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
my $meth_percentage;
($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
# $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
if (defined $meth_percentage){
# this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?)
my $one_based_pos = $bed_pos + 1;
print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
# this is the coverage file. Coordinates are 1-based
print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
# this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
if ($zero){
print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
}
}
}
sub process_commandline{
my $help;
my $output_dir;
my $bedGraph_output;
my $no_header;
my $coverage_threshold; # Minimum number of reads covering before calling methylation status
my $remove;
my $counts;
my $CX_context;
my $sort_size;
my $version;
my $gazillion;
my $ample_mem;
my $zero;
my $input_dir;
my $ucsc;
my $command_line = GetOptions ('help|man' => \$help,
'dir=s' => \$output_dir,
'o|output=s' => \$bedGraph_output,
'no_header' => \$no_header,
"cutoff=i" => \$coverage_threshold,
"remove_spaces" => \$remove,
"counts" => \$counts,
"CX|CX_context" => \$CX_context,
"buffer_size=s" => \$sort_size,
'version' => \$version,
'gazillion|scaffolds' => \$gazillion,
'ample_memory' => \$ample_mem,
"zero_based" => \$zero,
'ucsc' => \$ucsc,
);
### EXIT ON ERROR if there were errors with any of the supplied options
unless ($command_line){
die "Please respecify command line options\n";
}
### HELPFILE
if ($help){
print_helpfile();
exit;
}
if ($version){
print << "VERSION";
Bismark Methylation Extractor Module -
bismark2bedGraph
Bismark Extractor Version: $bismark2bedGraph_version
Copyright 2010-22 Felix Krueger, Altos Bioinformatics
https://github.com/FelixKrueger/Bismark
VERSION
exit;
}
@sorting_files = @ARGV;
### no files provided
unless (@sorting_files){
warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n";
sleep(2);
print_helpfile();
exit;
}
### PARENT DIRECTORY
my $parent_dir = getcwd();
# warn "parent directory is: $parent_dir\n";
### OUTPUT DIR PATH
if (defined $output_dir){
unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
unless ($output_dir =~ /\/$/){
$output_dir =~ s/$/\//;
}
unless (-d $output_dir){
mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n";
warn "Created output directory $output_dir\n";
}
### want to get an absolute path for the output directory instead of a relative one
chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n";
$output_dir = getcwd();
unless ($output_dir =~ /\/$/){
$output_dir =~ s/$/\//;
}
# warn "output directory is: $output_dir\n";
# changing back to the parent directory
chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n";
}
}
else{
$output_dir = '';
}
unless (defined $bedGraph_output){
die "Please provide the name of the output file using the option -o/--output filename\n";
}
if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name
die "Please specify a file name without any path information (or use --dir if necessary)\n\n";
}
unless ($bedGraph_output =~ /\.gz$/){
$bedGraph_output = "${bedGraph_output}.gz"; ### 22 07 2015: Output will be gzip compressed
}
### NO HEADER
unless ($no_header){
$no_header = 0;
}
### remove white spaces in read ID (needed for sorting using the sort command
unless ($remove){
$remove = 0;
}
### COVERAGE THRESHOLD FOR bedGraph OUTPUT
if (defined $coverage_threshold){
unless ($coverage_threshold > 0){
die "Please select a coverage greater than 0 (positive integers only)\n";
}
}
else{
$coverage_threshold = 1;
}
### SORT buffer size
if (defined $sort_size){
### Using ample memory and the UNIX SORT are mutually exclusive
if ($ample_mem){
die "The options '--ample_mem' and using the UNIX sort function are mutually exclusive. Please make your pick!\n\n";
}
unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
}
}
else{
$sort_size = '2G';
}
unless ($CX_context){
$CX_context = 0;
}
unless ($counts){
$counts = 1;
}
if ($gazillion){
if ($ample_mem){
die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
}
}
if ($ucsc){
warn "Creating additional bedGraph file that has known Ensembl chromosome names replaced to work with the UCSC genome browser\n";
warn "This option:\n- prefixes chromosome names with 'chr'\n";
warn "- changes 'MT' to 'chrM'\n\n";
}
return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir,$ucsc);
}
sub print_helpfile{
print <<EOF
SYNOPSIS:
This script uses positional methylation data generated by the Bismark methylation extractor to generate
a bedGraph file as well as a coverage file which are both sorted by chromosomal position. The bedGraph
file uses 0-based genomic start and 1-based genomic end coordinates and should be UCSC compatible (if
UCSC genomes were used for the alignment step). In addition this module will write out a coverage file
which is similar to the bedGraph file, but uses 1-based genomic coordinates and also reports the count
of methylated and unmethylated cytosines for any covered position; this coverage file is required if you
wish to generate a genome-wide cytosine report with the module coverage2cytosine.
USAGE: bismark2bedGraph [options] -o <output> [methylation extractor input files]
Methylation extractor input files: These files are required to start with CpG... in order for the
script to correctly work out the sequence context when using CpG context only (default). If all cytosine
contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s).
-o/--output <filename> Name of the output file, mandatory.
--dir Output directory. Output is written to the current directory if not specified explicitly.
--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
before its methylation percentage is reported. Default: 1.
--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
in the experiment irrespective of its sequence context. This applies to both forward and
reverse strands. Please be aware that this option may generate large temporary and output files
and may take a long time to sort (up to many hours). Default: OFF.
(i.e. Default = CpG context only).
--buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
(e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
Defaults to 2G.
--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
individual chromosome files. These errors were caused by the operating system's limit
of the number of filehandle that can be written to at any one time (typically 1024; to
find out this limit on Linux, type: ulimit -a).
To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
methylation calls into individual chromosome files. Instead, all input files are
temporarily merged into a single file (unless there is only a single file), and this
file will then be sorted by both chromosome AND position using the Unix sort command.
Please be aware that this option might take a looooong time to complete, depending on
the size of the input files, and the memory you allocate to this process (see --buffer_size).
Nevertheless, it seems to be working.
--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
instead use two arrays to sort methylated and unmethylated calls, respectively. This may result
in a faster sorting process for very large files, but this comes at the cost of a larger memory
footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly
250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads
of creating and looping through arrays this option might in fact be *slower* for small-ish
files (up to a few million alignments). Note also that this option is not currently compatible
with options '--scaffolds/--gazillion'.
--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
instead of using 1-based coordinates throughout. Default: OFF.
--ucsc Writes out an additional bedGraph file (ending in '_UCSC.bedGraph.gz') that is compatible with the
UCSC genome browser. If alignments were carried out against an Ensembl version of the genome, this
step will prefix chromosome names with 'chr', and rename the mitochondrial chromosome from 'MT' to
'chrM'.
The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords):
==============================================================================================
track type=bedGraph (header line)
<chromosome> <start position> <end position> <methylation percentage>
The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'):
=======================================================================================================================================
<chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
Script last modified: 22 Sept 2020
EOF
;
exit 1;
}