diff --git a/src/annotateBed/annotateBed.cpp b/src/annotateBed/annotateBed.cpp index d4629449c..bfd0108cf 100644 --- a/src/annotateBed/annotateBed.cpp +++ b/src/annotateBed/annotateBed.cpp @@ -75,26 +75,17 @@ void BedAnnotate::PrintHeader() { void BedAnnotate::InitializeMainFile() { - // process each chromosome - masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::iterator binItr = chromItr->second.begin(); - binsToBedCovLists::iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { - // initialize BEDCOVLIST in this chrom/bin - vector::iterator bedItr = binItr->second.begin(); - vector::iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // initialize the depthMaps, counts, etc. for each anno file. - for (size_t i = 0; i < _annoFiles.size(); ++i) { - map dummy; - bedItr->depthMapList.push_back(dummy); - bedItr->counts.push_back(0); - bedItr->minOverlapStarts.push_back(INT_MAX); - } - } + // compute and report + // the observed coverage for each feature + vector::iterator bedItr = _bed->bedCovListFlat.begin(); + vector::iterator bedEnd = _bed->bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { + // initialize the depthMaps, counts, etc. for each anno file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + map dummy; + bedItr->depthMapList.push_back(dummy); + bedItr->counts.push_back(0); + bedItr->minOverlapStarts.push_back(INT_MAX); } } } @@ -104,7 +95,7 @@ void BedAnnotate::AnnotateBed() { // load the "main" bed file into a map so // that we can easily compare each annoFile to it for overlaps - _bed->loadBedCovListFileIntoMap(); + _bed->loadBedCovListFileIntoVector(); // open the annotations files for processing; OpenAnnoFiles(); // initialize counters, depths, etc. for the main file @@ -118,7 +109,7 @@ void BedAnnotate::AnnotateBed() { // process each entry in the current anno file while (anno->GetNextBed(a)) { if (anno->_status == BED_VALID) { - _bed->countListHits(a, annoIndex, _sameStrand, _diffStrand); + _bed->countListHitsWithoutBins(a, annoIndex, _sameStrand, _diffStrand); } } } @@ -136,73 +127,64 @@ void BedAnnotate::ReportAnnotations() { PrintHeader(); } - // process each chromosome - masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::const_iterator binItr = chromItr->second.begin(); - binsToBedCovLists::const_iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { - // for each chrom & bin, compute and report - // the observed coverage for each feature - vector::const_iterator bedItr = binItr->second.begin(); - vector::const_iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // print the main BED entry. - _bed->reportBedTab(*bedItr); - - // now report the coverage from each annotation file. - for (size_t i = 0; i < _annoFiles.size(); ++i) { - unsigned int totalLength = 0; - int zeroDepthCount = 0; // number of bases with zero depth - int depth = 0; // tracks the depth at the current base - - // the start is either the first base in the feature OR - // the leftmost position of an overlapping feature. e.g. (s = start): - // A ---------- - // B s ------------ - int start = min(bedItr->minOverlapStarts[i], bedItr->start); - - map::const_iterator depthItr; - map::const_iterator depthEnd; - - // compute the coverage observed at each base in the feature marching from start to end. - for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { - // map pointer grabbing the starts and ends observed at this position - depthItr = bedItr->depthMapList[i].find(pos); - depthEnd = bedItr->depthMapList[i].end(); - - // increment coverage if starts observed at this position. - if (depthItr != depthEnd) - depth += depthItr->second.starts; - // update zero depth - if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) - zeroDepthCount++; - // decrement coverage if ends observed at this position. - if (depthItr != depthEnd) - depth = depth - depthItr->second.ends; - } - // Summarize the coverage for the current interval, - CHRPOS length = bedItr->end - bedItr->start; - totalLength += length; - int nonZeroBases = (length - zeroDepthCount); - float fractCovered = (float) nonZeroBases / length; - if (_reportCounts == false && _reportBoth == false) - printf("%f", fractCovered); - else if (_reportCounts == true && _reportBoth == false) - printf("%d", bedItr->counts[i]); - else if (_reportCounts == false && _reportBoth == true) - printf("%d\t%f", bedItr->counts[i], fractCovered); - - if (i != _annoFiles.size() - 1) - printf("\t"); - } - // print newline for next feature. - printf("\n"); + // compute and report + // the observed coverage for each feature + vector::const_iterator bedItr = _bed->bedCovListFlat.begin(); + vector::const_iterator bedEnd = _bed->bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { + // print the main BED entry. + _bed->reportBedTab(*bedItr); + + // now report the coverage from each annotation file. + for (size_t i = 0; i < _annoFiles.size(); ++i) { + unsigned int totalLength = 0; + int zeroDepthCount = 0; // number of bases with zero depth + int depth = 0; // tracks the depth at the current base + + // the start is either the first base in the feature OR + // the leftmost position of an overlapping feature. e.g. (s = start): + // A ---------- + // B s ------------ + int start = min(bedItr->minOverlapStarts[i], bedItr->start); + + map::const_iterator depthItr; + map::const_iterator depthEnd; + + // compute the coverage observed at each base in the feature marching from start to end. + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { + // map pointer grabbing the starts and ends observed at this position + depthItr = bedItr->depthMapList[i].find(pos); + depthEnd = bedItr->depthMapList[i].end(); + + // increment coverage if starts observed at this position. + if (depthItr != depthEnd) + depth += depthItr->second.starts; + // update zero depth + if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) + zeroDepthCount++; + // decrement coverage if ends observed at this position. + if (depthItr != depthEnd) + depth = depth - depthItr->second.ends; } + // Summarize the coverage for the current interval, + CHRPOS length = bedItr->end - bedItr->start; + totalLength += length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; + if (_reportCounts == false && _reportBoth == false) + printf("%f", fractCovered); + else if (_reportCounts == true && _reportBoth == false) + printf("%d", bedItr->counts[i]); + else if (_reportCounts == false && _reportBoth == true) + printf("%d\t%f", bedItr->counts[i], fractCovered); + + if (i != _annoFiles.size() - 1) + printf("\t"); } + // print newline for next feature. + printf("\n"); } + } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 6c838dd38..1bb318829 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -587,60 +587,42 @@ void BedFile::countSplitHits(const vector &bedBlocks, bool sameStrand, bool } -void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) { +void BedFile::countListHitsWithoutBins(const BED &a, int index, bool sameStrand, bool diffStrand) { - BIN startBin, endBin; - startBin = (a.start >> _binFirstShift); - endBin = ((a.end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (BINLEVEL i = 0; i < _binLevels; ++i) { + // compute and report the observed coverage for each feature + vector::iterator bedItr = bedCovListFlat.begin(); + vector::iterator bedEnd = bedCovListFlat.end(); + for (; bedItr != bedEnd; ++bedItr) { - // loop through each bin at this level of the hierarchy - BIN offset = _binOffsetsExtended[i]; - for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it - // overlaps with the feature that was passed in. if so, - // add the feature tothe list of hits. - vector::iterator - bedItr = bedCovListMap[a.chrom][j].begin(); - vector::iterator - bedEnd = bedCovListMap[a.chrom][j].end(); - for (; bedItr != bedEnd; ++bedItr) { - - bool strands_are_same = (a.strand == bedItr->strand); - // skip the hit if not on the same strand (and we care) - if ((sameStrand == true && strands_are_same == false) || - (diffStrand == true && strands_are_same == true) - ) - { - continue; - } - else if (overlaps(bedItr->start, bedItr->end, - a.start, a.end) > 0) - { - bedItr->counts[index]++; - if (a.zeroLength == false) { - bedItr->depthMapList[index][a.start+1].starts++; - bedItr->depthMapList[index][a.end].ends++; - } - else { - // correct for the fact that we artificially expanded - // the zeroLength feature - bedItr->depthMapList[index][a.start+2].starts++; - bedItr->depthMapList[index][a.end-1].ends++; - } + bool strands_are_same = (a.strand == bedItr->strand); + // skip the hit if not on the same strand (and we care) + if ((sameStrand == true && strands_are_same == false) || + (diffStrand == true && strands_are_same == true) + ) + { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, + a.start, a.end) > 0) + { + bedItr->counts[index]++; + if (a.zeroLength == false) { + bedItr->depthMapList[index][a.start+1].starts++; + bedItr->depthMapList[index][a.end].ends++; + } + else { + // correct for the fact that we artificially expanded + // the zeroLength feature + bedItr->depthMapList[index][a.start+2].starts++; + bedItr->depthMapList[index][a.end-1].ends++; + } - if (a.start < bedItr->minOverlapStarts[index]) { - bedItr->minOverlapStarts[index] = a.start; - } - } + if (a.start < bedItr->minOverlapStarts[index]) { + bedItr->minOverlapStarts[index] = a.start; } } - startBin >>= _binNextShift; - endBin >>= _binNextShift; } + } void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; } @@ -743,6 +725,31 @@ void BedFile::loadBedCovListFileIntoMap() { Close(); } +void BedFile::loadBedCovListFileIntoVector() { + + BED bedEntry; + Open(); + while (GetNextBed(bedEntry)) { + if (_status == BED_VALID) { + + BEDCOVLIST bedCovList; + bedCovList.chrom = bedEntry.chrom; + bedCovList.start = bedEntry.start; + bedCovList.end = bedEntry.end; + bedCovList.name = bedEntry.name; + bedCovList.score = bedEntry.score; + bedCovList.strand = bedEntry.strand; + bedCovList.fields = bedEntry.fields; + bedCovList.other_idxs = bedEntry.other_idxs; + bedCovList.zeroLength = bedEntry.zeroLength; + + bedCovListFlat.push_back(bedCovList); + } + } + Close(); + +} + void BedFile::loadBedFileIntoMapNoBin() { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 8e7e4c969..a03c779c7 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -46,18 +46,17 @@ typedef uint32_t UINT; //************************************************* // Genome binning constants //************************************************* -const BIN _numBins = 37450; const BINLEVEL _binLevels = 7; -// bins range in size from 16kb to 512Mb -// Bin 0 spans 512Mbp, # Level 1 -// Bins 1-8 span 64Mbp, # Level 2 -// Bins 9-72 span 8Mbp, # Level 3 -// Bins 73-584 span 1Mbp # Level 4 -// Bins 585-4680 span 128Kbp # Level 5 -// Bins 4681-37449 span 16Kbp # Level 6 +// bins range in size from 2kb to 512Mb +// Bin 0 spans 512Mbp, # Level 0 +// Bins 1-8 span 64Mbp, # Level 1 +// Bins 9-72 span 8Mbp, # Level 2 +// Bins 73-584 span 1Mbp # Level 3 +// Bins 585-4680 span 128Kbp # Level 4 +// Bins 4681-37448 span 16Kbp # Level 5 +// Bins 37449-299592 span 2Kbp # Level 6 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ @@ -434,7 +433,10 @@ class BedFile { // vector of BEDCOVLISTs void loadBedCovListFileIntoMap(); - // load a BED file into a map keyed by chrom. value is vector of BEDs + // load a BED file into a vector of BEDCOVLISTs + void loadBedCovListFileIntoVector(); + + // load and sort a BED file into a map keyed by chrom. value is vector of BEDs void loadBedFileIntoMapNoBin(); // load a BED file into a vector of BEDs @@ -475,8 +477,8 @@ class BedFile { // Given a chrom, start, end and strand for a single feature, // increment a the number of hits for each feature in B file // that the feature overlaps - void countListHits(const BED &a, int index, - bool sameStrand, bool diffStrand); + void countListHitsWithoutBins(const BED &a, int index, + bool sameStrand, bool diffStrand); // return the total length of all the intervals in the file. @@ -501,6 +503,7 @@ class BedFile { masterBedMap bedMap; bedVector bedList; masterBedMapNoBin bedMapNoBin; + bedCovListVector bedCovListFlat; BedLineStatus _status; int _lineNum;