Skip to content

Commit

Permalink
Modify CreateVariantIngestFiles to write missing ref intervals with t…
Browse files Browse the repository at this point in the history
…he ZERO state, unless we are dropping that (ZERO) state and none other.
  • Loading branch information
gbggrant committed Oct 23, 2023
1 parent 5addbba commit c1e334a
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ public enum GQStateEnum {
FORTY("4", 40, 4),
FIFTY("5", 50, 5),
SIXTY("6", 60, 6),
// NOTE: MISSING is no longer used (missing intervals are now written as ZERO, *unless* we are dropping ref_blocks with that state).
// However, we keep this enum value around in case the code needs to access older data sets written with MISSING values.
MISSING("m", null, 9),
UNKNOWN("u", null, 10),
NONE("");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private boolean shouldWriteLoadStatusStarted = true;

private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();

// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
public static boolean isNoCall(VariantContext variant) {
Expand Down Expand Up @@ -288,8 +290,16 @@ public void onTraversalStart() {
final GenomeLocParser genomeLocParser = new GenomeLocParser(seqDictionary);
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocParser, IntervalUtils.genomeLocsFromLocatables(genomeLocParser, intervalArgumentCollection.getIntervals(seqDictionary)));

if (gqStateToIgnore != null) {
gqStatesToIgnore.add(gqStateToIgnore);
if (dropAboveGqThreshold) {
// TODO: Consider removing the "dropAboveGqThreshold" option - it is never used and confuses things.
gqStatesToIgnore.addAll(RefCreator.getGQStateEnumGreaterThan(gqStateToIgnore));
}
}

if (enableReferenceRanges && !refRangesRowsExist) {
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
}

if (enableVet && !vetRowsExist) {
Expand Down Expand Up @@ -360,10 +370,14 @@ public Object onTraversalSuccess() {
}

if (refCreator != null) {
try {
refCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
} catch (IOException ioe) {
throw new GATKException("Error writing missing intervals", ioe);
if ((gqStatesToIgnore.size() != 1) || (!gqStatesToIgnore.contains(GQStateEnum.ZERO))) {
// Missing intervals are written as ZERO ('GQ0') unless ZERO is the ONLY GQ state being dropped,
// in which case they are not written at all.
// NOTE(review): if ZERO is dropped together with other states, the missing intervals are still written - confirm this is intended.
try {
refCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
} catch (IOException ioe) {
throw new GATKException("Error writing missing intervals", ioe);

Check warning on line 379 in src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java#L378-L379

Added lines #L378 - L379 were not covered by tests
}
}
// Wait until all data has been submitted and in pending state to commit
refCreator.commitData();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public final class RefCreator {
private final boolean writeReferenceRanges;
private final Long sampleId;
private SimpleInterval previousInterval;
private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();
private final Set<GQStateEnum> gqStatesToIgnore;
private final GenomeLocSortedSet coverageLocSortedSet;
private final boolean storeCompressedReferences;
private static final String PREFIX_SEPARATOR = "_";
Expand All @@ -43,11 +43,12 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}

public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final boolean dropAboveGqThreshold, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName, final boolean storeCompressedReferences) {
public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, Set<GQStateEnum> gqStatesToIgnore, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName, final boolean storeCompressedReferences) {
this.sampleId = sampleId;
this.outputType = outputType;
this.writeReferenceRanges = writeReferenceRanges;
this.storeCompressedReferences = storeCompressedReferences;
this.gqStatesToIgnore = gqStatesToIgnore;

coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));

Expand All @@ -72,11 +73,6 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
} catch (final IOException ioex) {
throw new UserException("Could not create reference range outputs", ioex);
}

this.gqStatesToIgnore.add(gqStateToIgnore);
if (dropAboveGqThreshold) {
this.gqStatesToIgnore.addAll(getGQStateEnumGreaterThan(gqStateToIgnore));
}
}

public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
Expand Down Expand Up @@ -189,14 +185,14 @@ public void writeMissingPositions(long start, long end) throws IOException {
int position = SchemaUtils.decodePosition(localStart);
refRangesWriter.writeCompressed(
SchemaUtils.encodeCompressedRefBlock(chromosome, position, length,
GQStateEnum.MISSING.getCompressedValue()),
GQStateEnum.ZERO.getCompressedValue()),
sampleId

Check warning on line 189 in src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java#L184-L189

Added lines #L184 - L189 were not covered by tests
);
} else {
refRangesWriter.write(localStart,
sampleId,

Check warning on line 193 in src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java#L191-L193

Added lines #L191 - L193 were not covered by tests
length,
GQStateEnum.MISSING.getValue()
GQStateEnum.ZERO.getValue()

Check warning on line 195 in src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java#L195

Added line #L195 was not covered by tests
);
}
localStart = localStart + length ;
Expand Down

0 comments on commit c1e334a

Please sign in to comment.