Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing default behavior of BedAnnotate to preserving input lines order #644

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 68 additions & 86 deletions src/annotateBed/annotateBed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,26 +75,17 @@ void BedAnnotate::PrintHeader() {


void BedAnnotate::InitializeMainFile() {
// process each chromosome
masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin();
masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end();
for (; chromItr != chromEnd; ++chromItr) {
// for each chrom, process each bin
binsToBedCovLists::iterator binItr = chromItr->second.begin();
binsToBedCovLists::iterator binEnd = chromItr->second.end();
for (; binItr != binEnd; ++binItr) {
// initialize BEDCOVLIST in this chrom/bin
vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin();
vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end();
for (; bedItr != bedEnd; ++bedItr) {
// initialize the depthMaps, counts, etc. for each anno file.
for (size_t i = 0; i < _annoFiles.size(); ++i) {
map<unsigned int, DEPTH> dummy;
bedItr->depthMapList.push_back(dummy);
bedItr->counts.push_back(0);
bedItr->minOverlapStarts.push_back(INT_MAX);
}
}
// compute and report
// the observed coverage for each feature
vector<BEDCOVLIST>::iterator bedItr = _bed->bedCovListFlat.begin();
vector<BEDCOVLIST>::iterator bedEnd = _bed->bedCovListFlat.end();
for (; bedItr != bedEnd; ++bedItr) {
// initialize the depthMaps, counts, etc. for each anno file.
for (size_t i = 0; i < _annoFiles.size(); ++i) {
map<unsigned int, DEPTH> dummy;
bedItr->depthMapList.push_back(dummy);
bedItr->counts.push_back(0);
bedItr->minOverlapStarts.push_back(INT_MAX);
}
}
}
Expand All @@ -104,7 +95,7 @@ void BedAnnotate::AnnotateBed() {

// load the "main" bed file into a map so
// that we can easily compare each annoFile to it for overlaps
_bed->loadBedCovListFileIntoMap();
_bed->loadBedCovListFileIntoVector();
// open the annotations files for processing;
OpenAnnoFiles();
// initialize counters, depths, etc. for the main file
Expand All @@ -118,7 +109,7 @@ void BedAnnotate::AnnotateBed() {
// process each entry in the current anno file
while (anno->GetNextBed(a)) {
if (anno->_status == BED_VALID) {
_bed->countListHits(a, annoIndex, _sameStrand, _diffStrand);
_bed->countListHitsWithoutBins(a, annoIndex, _sameStrand, _diffStrand);
}
}
}
Expand All @@ -136,73 +127,64 @@ void BedAnnotate::ReportAnnotations() {
PrintHeader();
}

// process each chromosome
masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin();
masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end();
for (; chromItr != chromEnd; ++chromItr) {
// for each chrom, process each bin
binsToBedCovLists::const_iterator binItr = chromItr->second.begin();
binsToBedCovLists::const_iterator binEnd = chromItr->second.end();
for (; binItr != binEnd; ++binItr) {
// for each chrom & bin, compute and report
// the observed coverage for each feature
vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin();
vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end();
for (; bedItr != bedEnd; ++bedItr) {
// print the main BED entry.
_bed->reportBedTab(*bedItr);

// now report the coverage from each annotation file.
for (size_t i = 0; i < _annoFiles.size(); ++i) {
unsigned int totalLength = 0;
int zeroDepthCount = 0; // number of bases with zero depth
int depth = 0; // tracks the depth at the current base

// the start is either the first base in the feature OR
// the leftmost position of an overlapping feature. e.g. (s = start):
// A ----------
// B s ------------
int start = min(bedItr->minOverlapStarts[i], bedItr->start);

map<unsigned int, DEPTH>::const_iterator depthItr;
map<unsigned int, DEPTH>::const_iterator depthEnd;

// compute the coverage observed at each base in the feature marching from start to end.
for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) {
// map pointer grabbing the starts and ends observed at this position
depthItr = bedItr->depthMapList[i].find(pos);
depthEnd = bedItr->depthMapList[i].end();

// increment coverage if starts observed at this position.
if (depthItr != depthEnd)
depth += depthItr->second.starts;
// update zero depth
if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0))
zeroDepthCount++;
// decrement coverage if ends observed at this position.
if (depthItr != depthEnd)
depth = depth - depthItr->second.ends;
}
// Summarize the coverage for the current interval,
CHRPOS length = bedItr->end - bedItr->start;
totalLength += length;
int nonZeroBases = (length - zeroDepthCount);
float fractCovered = (float) nonZeroBases / length;
if (_reportCounts == false && _reportBoth == false)
printf("%f", fractCovered);
else if (_reportCounts == true && _reportBoth == false)
printf("%d", bedItr->counts[i]);
else if (_reportCounts == false && _reportBoth == true)
printf("%d\t%f", bedItr->counts[i], fractCovered);

if (i != _annoFiles.size() - 1)
printf("\t");
}
// print newline for next feature.
printf("\n");
// compute and report
// the observed coverage for each feature
vector<BEDCOVLIST>::const_iterator bedItr = _bed->bedCovListFlat.begin();
vector<BEDCOVLIST>::const_iterator bedEnd = _bed->bedCovListFlat.end();
for (; bedItr != bedEnd; ++bedItr) {
// print the main BED entry.
_bed->reportBedTab(*bedItr);

// now report the coverage from each annotation file.
for (size_t i = 0; i < _annoFiles.size(); ++i) {
unsigned int totalLength = 0;
int zeroDepthCount = 0; // number of bases with zero depth
int depth = 0; // tracks the depth at the current base

// the start is either the first base in the feature OR
// the leftmost position of an overlapping feature. e.g. (s = start):
// A ----------
// B s ------------
int start = min(bedItr->minOverlapStarts[i], bedItr->start);

map<unsigned int, DEPTH>::const_iterator depthItr;
map<unsigned int, DEPTH>::const_iterator depthEnd;

// compute the coverage observed at each base in the feature marching from start to end.
for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) {
// map pointer grabbing the starts and ends observed at this position
depthItr = bedItr->depthMapList[i].find(pos);
depthEnd = bedItr->depthMapList[i].end();

// increment coverage if starts observed at this position.
if (depthItr != depthEnd)
depth += depthItr->second.starts;
// update zero depth
if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0))
zeroDepthCount++;
// decrement coverage if ends observed at this position.
if (depthItr != depthEnd)
depth = depth - depthItr->second.ends;
}
// Summarize the coverage for the current interval,
CHRPOS length = bedItr->end - bedItr->start;
totalLength += length;
int nonZeroBases = (length - zeroDepthCount);
float fractCovered = (float) nonZeroBases / length;
if (_reportCounts == false && _reportBoth == false)
printf("%f", fractCovered);
else if (_reportCounts == true && _reportBoth == false)
printf("%d", bedItr->counts[i]);
else if (_reportCounts == false && _reportBoth == true)
printf("%d\t%f", bedItr->counts[i], fractCovered);

if (i != _annoFiles.size() - 1)
printf("\t");
}
// print newline for next feature.
printf("\n");
}

}


103 changes: 55 additions & 48 deletions src/utils/bedFile/bedFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,60 +587,42 @@ void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool
}


void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) {
void BedFile::countListHitsWithoutBins(const BED &a, int index, bool sameStrand, bool diffStrand) {

BIN startBin, endBin;
startBin = (a.start >> _binFirstShift);
endBin = ((a.end-1) >> _binFirstShift);

// loop through each bin "level" in the binning hierarchy
for (BINLEVEL i = 0; i < _binLevels; ++i) {
// compute and report the observed coverage for each feature
vector<BEDCOVLIST>::iterator bedItr = bedCovListFlat.begin();
vector<BEDCOVLIST>::iterator bedEnd = bedCovListFlat.end();
for (; bedItr != bedEnd; ++bedItr) {

// loop through each bin at this level of the hierarchy
BIN offset = _binOffsetsExtended[i];
for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {

// loop through each feature in this chrom/bin and see if it
// overlaps with the feature that was passed in. if so,
// add the feature tothe list of hits.
vector<BEDCOVLIST>::iterator
bedItr = bedCovListMap[a.chrom][j].begin();
vector<BEDCOVLIST>::iterator
bedEnd = bedCovListMap[a.chrom][j].end();
for (; bedItr != bedEnd; ++bedItr) {

bool strands_are_same = (a.strand == bedItr->strand);
// skip the hit if not on the same strand (and we care)
if ((sameStrand == true && strands_are_same == false) ||
(diffStrand == true && strands_are_same == true)
)
{
continue;
}
else if (overlaps(bedItr->start, bedItr->end,
a.start, a.end) > 0)
{
bedItr->counts[index]++;
if (a.zeroLength == false) {
bedItr->depthMapList[index][a.start+1].starts++;
bedItr->depthMapList[index][a.end].ends++;
}
else {
// correct for the fact that we artificially expanded
// the zeroLength feature
bedItr->depthMapList[index][a.start+2].starts++;
bedItr->depthMapList[index][a.end-1].ends++;
}
bool strands_are_same = (a.strand == bedItr->strand);
// skip the hit if not on the same strand (and we care)
if ((sameStrand == true && strands_are_same == false) ||
(diffStrand == true && strands_are_same == true)
)
{
continue;
}
else if (overlaps(bedItr->start, bedItr->end,
a.start, a.end) > 0)
{
bedItr->counts[index]++;
if (a.zeroLength == false) {
bedItr->depthMapList[index][a.start+1].starts++;
bedItr->depthMapList[index][a.end].ends++;
}
else {
// correct for the fact that we artificially expanded
// the zeroLength feature
bedItr->depthMapList[index][a.start+2].starts++;
bedItr->depthMapList[index][a.end-1].ends++;
}

if (a.start < bedItr->minOverlapStarts[index]) {
bedItr->minOverlapStarts[index] = a.start;
}
}
if (a.start < bedItr->minOverlapStarts[index]) {
bedItr->minOverlapStarts[index] = a.start;
}
}
startBin >>= _binNextShift;
endBin >>= _binNextShift;
}

}

void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
Expand Down Expand Up @@ -743,6 +725,31 @@ void BedFile::loadBedCovListFileIntoMap() {
Close();
}

void BedFile::loadBedCovListFileIntoVector() {

BED bedEntry;
Open();
while (GetNextBed(bedEntry)) {
if (_status == BED_VALID) {

BEDCOVLIST bedCovList;
bedCovList.chrom = bedEntry.chrom;
bedCovList.start = bedEntry.start;
bedCovList.end = bedEntry.end;
bedCovList.name = bedEntry.name;
bedCovList.score = bedEntry.score;
bedCovList.strand = bedEntry.strand;
bedCovList.fields = bedEntry.fields;
bedCovList.other_idxs = bedEntry.other_idxs;
bedCovList.zeroLength = bedEntry.zeroLength;

bedCovListFlat.push_back(bedCovList);
}
}
Close();

}


void BedFile::loadBedFileIntoMapNoBin() {

Expand Down
27 changes: 15 additions & 12 deletions src/utils/bedFile/bedFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,17 @@ typedef uint32_t UINT;
//*************************************************
// Genome binning constants
//*************************************************
const BIN _numBins = 37450;
const BINLEVEL _binLevels = 7;

// bins range in size from 16kb to 512Mb
// Bin 0 spans 512Mbp, # Level 1
// Bins 1-8 span 64Mbp, # Level 2
// Bins 9-72 span 8Mbp, # Level 3
// Bins 73-584 span 1Mbp # Level 4
// Bins 585-4680 span 128Kbp # Level 5
// Bins 4681-37449 span 16Kbp # Level 6
// bins range in size from 2kb to 512Mb
// Bin 0 spans 512Mbp, # Level 0
// Bins 1-8 span 64Mbp, # Level 1
// Bins 9-72 span 8Mbp, # Level 2
// Bins 73-584 span 1Mbp # Level 3
// Bins 585-4680 span 128Kbp # Level 4
// Bins 4681-37448 span 16Kbp # Level 5
// Bins 37449-299592 span 2Kbp # Level 6
const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};

const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */
const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */
Expand Down Expand Up @@ -434,7 +433,10 @@ class BedFile {
// vector of BEDCOVLISTs
void loadBedCovListFileIntoMap();

// load a BED file into a map keyed by chrom. value is vector of BEDs
// load a BED file into a vector of BEDCOVLISTs
void loadBedCovListFileIntoVector();

// load and sort a BED file into a map keyed by chrom. value is vector of BEDs
void loadBedFileIntoMapNoBin();

// load a BED file into a vector of BEDs
Expand Down Expand Up @@ -475,8 +477,8 @@ class BedFile {
// Given a chrom, start, end and strand for a single feature,
// increment a the number of hits for each feature in B file
// that the feature overlaps
void countListHits(const BED &a, int index,
bool sameStrand, bool diffStrand);
void countListHitsWithoutBins(const BED &a, int index,
bool sameStrand, bool diffStrand);


// return the total length of all the intervals in the file.
Expand All @@ -501,6 +503,7 @@ class BedFile {
masterBedMap bedMap;
bedVector bedList;
masterBedMapNoBin bedMapNoBin;
bedCovListVector bedCovListFlat;

BedLineStatus _status;
int _lineNum;
Expand Down