From bc3d3d8d25f39b7c201f8644ef1a44d8e7e0ec1e Mon Sep 17 00:00:00 2001 From: borisevichdi Date: Wed, 11 Jul 2018 17:23:46 +0200 Subject: [PATCH] 1. BedAnnotate now preserves initial sort order. 2. Adjusted comments in bedFile.h to better represent actual binning model used for last several years. In before AnnotateBed used genomic binning. Being isolated tool, it never really benefited or used in any way binning. However, bare usage of binning forced AnnotateBed output to be re-sorted within chromosomes; first - from bigger to smaller bins, second - per start coordinate within a bin. It creates unexpected from documentation changes in output file. This commit alter annotateBed and functions, specific to it, so that new version of code will preserve lines order from input file. --- src/annotateBed/annotateBed.cpp | 154 ++++++++++++++------------------ src/utils/bedFile/bedFile.cpp | 103 +++++++++++---------- src/utils/bedFile/bedFile.h | 27 +++--- 3 files changed, 138 insertions(+), 146 deletions(-) diff --git a/src/annotateBed/annotateBed.cpp b/src/annotateBed/annotateBed.cpp index d4629449c..bfd0108cf 100644 --- a/src/annotateBed/annotateBed.cpp +++ b/src/annotateBed/annotateBed.cpp @@ -75,26 +75,17 @@ void BedAnnotate::PrintHeader() { void BedAnnotate::InitializeMainFile() { - // process each chromosome - masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::iterator binItr = chromItr->second.begin(); - binsToBedCovLists::iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { - // initialize BEDCOVLIST in this chrom/bin - vector::iterator bedItr = binItr->second.begin(); - vector::iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // initialize the depthMaps, counts, etc. for each anno file. - for (size_t i = 0; i < _annoFiles.size(); ++i) { - map dummy; - bedItr->depthMapList.push_back(dummy); - bedItr->counts.push_back(0); - bedItr->minOverlapStarts.push_back(INT_MAX); - } - } + // compute and report + // the observed coverage for each feature + vector::iterator bedItr = _bed->bedCovListFlat.begin(); + vector::iterator bedEnd = _bed->bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { + // initialize the depthMaps, counts, etc. for each anno file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + map dummy; + bedItr->depthMapList.push_back(dummy); + bedItr->counts.push_back(0); + bedItr->minOverlapStarts.push_back(INT_MAX); } } } @@ -104,7 +95,7 @@ void BedAnnotate::AnnotateBed() { // load the "main" bed file into a map so // that we can easily compare each annoFile to it for overlaps - _bed->loadBedCovListFileIntoMap(); + _bed->loadBedCovListFileIntoVector(); // open the annotations files for processing; OpenAnnoFiles(); // initialize counters, depths, etc. for the main file @@ -118,7 +109,7 @@ void BedAnnotate::AnnotateBed() { // process each entry in the current anno file while (anno->GetNextBed(a)) { if (anno->_status == BED_VALID) { - _bed->countListHits(a, annoIndex, _sameStrand, _diffStrand); + _bed->countListHitsWithoutBins(a, annoIndex, _sameStrand, _diffStrand); } } } @@ -136,73 +127,64 @@ void BedAnnotate::ReportAnnotations() { PrintHeader(); } - // process each chromosome - masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::const_iterator binItr = chromItr->second.begin(); - binsToBedCovLists::const_iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { - // for each chrom & bin, compute and report - // the observed coverage for each feature - vector::const_iterator bedItr = binItr->second.begin(); - vector::const_iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // print the main BED entry. - _bed->reportBedTab(*bedItr); - - // now report the coverage from each annotation file. - for (size_t i = 0; i < _annoFiles.size(); ++i) { - unsigned int totalLength = 0; - int zeroDepthCount = 0; // number of bases with zero depth - int depth = 0; // tracks the depth at the current base - - // the start is either the first base in the feature OR - // the leftmost position of an overlapping feature. e.g. (s = start): - // A ---------- - // B s ------------ - int start = min(bedItr->minOverlapStarts[i], bedItr->start); - - map::const_iterator depthItr; - map::const_iterator depthEnd; - - // compute the coverage observed at each base in the feature marching from start to end. - for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { - // map pointer grabbing the starts and ends observed at this position - depthItr = bedItr->depthMapList[i].find(pos); - depthEnd = bedItr->depthMapList[i].end(); - - // increment coverage if starts observed at this position. - if (depthItr != depthEnd) - depth += depthItr->second.starts; - // update zero depth - if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) - zeroDepthCount++; - // decrement coverage if ends observed at this position. - if (depthItr != depthEnd) - depth = depth - depthItr->second.ends; - } - // Summarize the coverage for the current interval, - CHRPOS length = bedItr->end - bedItr->start; - totalLength += length; - int nonZeroBases = (length - zeroDepthCount); - float fractCovered = (float) nonZeroBases / length; - if (_reportCounts == false && _reportBoth == false) - printf("%f", fractCovered); - else if (_reportCounts == true && _reportBoth == false) - printf("%d", bedItr->counts[i]); - else if (_reportCounts == false && _reportBoth == true) - printf("%d\t%f", bedItr->counts[i], fractCovered); - - if (i != _annoFiles.size() - 1) - printf("\t"); - } - // print newline for next feature. - printf("\n"); + // compute and report + // the observed coverage for each feature + vector::const_iterator bedItr = _bed->bedCovListFlat.begin(); + vector::const_iterator bedEnd = _bed->bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { + // print the main BED entry. + _bed->reportBedTab(*bedItr); + + // now report the coverage from each annotation file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + unsigned int totalLength = 0; + int zeroDepthCount = 0; // number of bases with zero depth + int depth = 0; // tracks the depth at the current base + + // the start is either the first base in the feature OR + // the leftmost position of an overlapping feature. e.g. (s = start): + // A ---------- + // B s ------------ + int start = min(bedItr->minOverlapStarts[i], bedItr->start); + + map::const_iterator depthItr; + map::const_iterator depthEnd; + + // compute the coverage observed at each base in the feature marching from start to end. + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { + // map pointer grabbing the starts and ends observed at this position + depthItr = bedItr->depthMapList[i].find(pos); + depthEnd = bedItr->depthMapList[i].end(); + + // increment coverage if starts observed at this position. + if (depthItr != depthEnd) + depth += depthItr->second.starts; + // update zero depth + if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) + zeroDepthCount++; + // decrement coverage if ends observed at this position. + if (depthItr != depthEnd) + depth = depth - depthItr->second.ends; } + // Summarize the coverage for the current interval, + CHRPOS length = bedItr->end - bedItr->start; + totalLength += length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; + if (_reportCounts == false && _reportBoth == false) + printf("%f", fractCovered); + else if (_reportCounts == true && _reportBoth == false) + printf("%d", bedItr->counts[i]); + else if (_reportCounts == false && _reportBoth == true) + printf("%d\t%f", bedItr->counts[i], fractCovered); + + if (i != _annoFiles.size() - 1) + printf("\t"); } + // print newline for next feature. + printf("\n"); } + } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 6c838dd38..1bb318829 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -587,60 +587,42 @@ void BedFile::countSplitHits(const vector &bedBlocks, bool sameStrand, bool } -void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) { +void BedFile::countListHitsWithoutBins(const BED &a, int index, bool sameStrand, bool diffStrand) { - BIN startBin, endBin; - startBin = (a.start >> _binFirstShift); - endBin = ((a.end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (BINLEVEL i = 0; i < _binLevels; ++i) { + // compute and report the observed coverage for each feature + vector::iterator bedItr = bedCovListFlat.begin(); + vector::iterator bedEnd = bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { - // loop through each bin at this level of the hierarchy - BIN offset = _binOffsetsExtended[i]; - for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it - // overlaps with the feature that was passed in. if so, - // add the feature tothe list of hits. - vector::iterator - bedItr = bedCovListMap[a.chrom][j].begin(); - vector::iterator - bedEnd = bedCovListMap[a.chrom][j].end(); - for (; bedItr != bedEnd; ++bedItr) { - - bool strands_are_same = (a.strand == bedItr->strand); - // skip the hit if not on the same strand (and we care) - if ((sameStrand == true && strands_are_same == false) || - (diffStrand == true && strands_are_same == true) - ) - { - continue; - } - else if (overlaps(bedItr->start, bedItr->end, - a.start, a.end) > 0) - { - bedItr->counts[index]++; - if (a.zeroLength == false) { - bedItr->depthMapList[index][a.start+1].starts++; - bedItr->depthMapList[index][a.end].ends++; - } - else { - // correct for the fact that we artificially expanded - // the zeroLength feature - bedItr->depthMapList[index][a.start+2].starts++; - bedItr->depthMapList[index][a.end-1].ends++; - } + bool strands_are_same = (a.strand == bedItr->strand); + // skip the hit if not on the same strand (and we care) + if ((sameStrand == true && strands_are_same == false) || + (diffStrand == true && strands_are_same == true) + ) + { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, + a.start, a.end) > 0) + { + bedItr->counts[index]++; + if (a.zeroLength == false) { + bedItr->depthMapList[index][a.start+1].starts++; + bedItr->depthMapList[index][a.end].ends++; + } + else { + // correct for the fact that we artificially expanded + // the zeroLength feature + bedItr->depthMapList[index][a.start+2].starts++; + bedItr->depthMapList[index][a.end-1].ends++; + } - if (a.start < bedItr->minOverlapStarts[index]) { - bedItr->minOverlapStarts[index] = a.start; - } - } + if (a.start < bedItr->minOverlapStarts[index]) { + bedItr->minOverlapStarts[index] = a.start; } } - startBin >>= _binNextShift; - endBin >>= _binNextShift; } + } void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; } @@ -743,6 +725,31 @@ void BedFile::loadBedCovListFileIntoMap() { Close(); } +void BedFile::loadBedCovListFileIntoVector() { + + BED bedEntry; + Open(); + while (GetNextBed(bedEntry)) { + if (_status == BED_VALID) { + + BEDCOVLIST bedCovList; + bedCovList.chrom = bedEntry.chrom; + bedCovList.start = bedEntry.start; + bedCovList.end = bedEntry.end; + bedCovList.name = bedEntry.name; + bedCovList.score = bedEntry.score; + bedCovList.strand = bedEntry.strand; + bedCovList.fields = bedEntry.fields; + bedCovList.other_idxs = bedEntry.other_idxs; + bedCovList.zeroLength = bedEntry.zeroLength; + + bedCovListFlat.push_back(bedCovList); + } + } + Close(); + +} + void BedFile::loadBedFileIntoMapNoBin() { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 8e7e4c969..a03c779c7 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -46,18 +46,17 @@ typedef uint32_t UINT; //************************************************* // Genome binning constants //************************************************* -const BIN _numBins = 37450; const BINLEVEL _binLevels = 7; -// bins range in size from 16kb to 512Mb -// Bin 0 spans 512Mbp, # Level 1 -// Bins 1-8 span 64Mbp, # Level 2 -// Bins 9-72 span 8Mbp, # Level 3 -// Bins 73-584 span 1Mbp # Level 4 -// Bins 585-4680 span 128Kbp # Level 5 -// Bins 4681-37449 span 16Kbp # Level 6 +// bins range in size from 2kb to 512Mb +// Bin 0 spans 512Mbp, # Level 0 +// Bins 1-8 span 64Mbp, # Level 1 +// Bins 9-72 span 8Mbp, # Level 2 +// Bins 73-584 span 1Mbp # Level 3 +// Bins 585-4680 span 128Kbp # Level 4 +// Bins 4681-37448 span 16Kbp # Level 5 +// Bins 37449-299592 span 2Kbp # Level 6 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ @@ -434,7 +433,10 @@ class BedFile { // vector of BEDCOVLISTs void loadBedCovListFileIntoMap(); - // load a BED file into a map keyed by chrom. value is vector of BEDs + // load a BED file into a vector of BEDCOVLISTs + void loadBedCovListFileIntoVector(); + + // load and sort a BED file into a map keyed by chrom. value is vector of BEDs void loadBedFileIntoMapNoBin(); // load a BED file into a vector of BEDs @@ -475,8 +477,8 @@ class BedFile { // Given a chrom, start, end and strand for a single feature, // increment a the number of hits for each feature in B file // that the feature overlaps - void countListHits(const BED &a, int index, - bool sameStrand, bool diffStrand); + void countListHitsWithoutBins(const BED &a, int index, + bool sameStrand, bool diffStrand); // return the total length of all the intervals in the file. @@ -501,6 +503,7 @@ class BedFile { masterBedMap bedMap; bedVector bedList; masterBedMapNoBin bedMapNoBin; + bedCovListVector bedCovListFlat; BedLineStatus _status; int _lineNum;