diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/GQStateEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/GQStateEnum.java index 1dbf39f961b..f00c5657382 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/GQStateEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/GQStateEnum.java @@ -10,6 +10,8 @@ public enum GQStateEnum { FORTY("4", 40, 4), FIFTY("5", 50, 5), SIXTY("6", 60, 6), + // NOTE: MISSING is no longer used (it is now being written as ZERO, *unless* we are dropping ref_blocks with that state). + // However, we will keep this enum value around in case the code needs to access older data sets written with MISSING values. MISSING("m", null, 9), UNKNOWN("u", null, 10), NONE(""); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java index 8749d9ac919..339bbc8c733 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java @@ -169,6 +169,8 @@ public final class CreateVariantIngestFiles extends VariantWalker { private boolean shouldWriteLoadStatusStarted = true; + private final Set gqStatesToIgnore = new HashSet<>(); + // getGenotypes() returns list of lists for all samples at variant // assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0 public static boolean isNoCall(VariantContext variant) { @@ -288,8 +290,16 @@ public void onTraversalStart() { final GenomeLocParser genomeLocParser = new GenomeLocParser(seqDictionary); intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocParser, IntervalUtils.genomeLocsFromLocatables(genomeLocParser, intervalArgumentCollection.getIntervals(seqDictionary))); + if (gqStateToIgnore != null) { + 
gqStatesToIgnore.add(gqStateToIgnore); + if (dropAboveGqThreshold) { + // TODO - Do we want to get rid of this option ("dropAboveGqThreshold") - never used and confuses things. + gqStatesToIgnore.addAll(RefCreator.getGQStateEnumGreaterThan(gqStateToIgnore)); + } + } + if (enableReferenceRanges && !refRangesRowsExist) { - refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences); + refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences); } if (enableVet && !vetRowsExist) { @@ -360,10 +370,14 @@ public Object onTraversalSuccess() { } if (refCreator != null) { - try { - refCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet); - } catch (IOException ioe) { - throw new GATKException("Error writing missing intervals", ioe); + if ((gqStatesToIgnore.size() != 1) || (!gqStatesToIgnore.contains(GQStateEnum.ZERO))) { + // We will write missing intervals as ZERO ('GQ0') unless ZERO is the ONLY GQ state that we are dropping (TODO: confirm that 'only' is intended when ZERO is dropped along with other states). + // If ZERO/GQ0 is the ONLY state that we are dropping then we do not write those intervals. 
+ try { + refCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet); + } catch (IOException ioe) { + throw new GATKException("Error writing missing intervals", ioe); + } } // Wait until all data has been submitted and in pending state to commit refCreator.commitData(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java index ca6b7593641..0876d860f5b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java @@ -32,7 +32,7 @@ public final class RefCreator { private final boolean writeReferenceRanges; private final Long sampleId; private SimpleInterval previousInterval; - private final Set gqStatesToIgnore = new HashSet<>(); + private final Set gqStatesToIgnore; private final GenomeLocSortedSet coverageLocSortedSet; private final boolean storeCompressedReferences; private static final String PREFIX_SEPARATOR = "_"; @@ -43,11 +43,12 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId); } - public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final boolean dropAboveGqThreshold, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, final String datasetName, final boolean storeCompressedReferences) { + public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, SAMSequenceDictionary seqDictionary, Set gqStatesToIgnore, final File outputDirectory, final CommonCode.OutputType outputType, final boolean writeReferenceRanges, final String projectId, 
final String datasetName, final boolean storeCompressedReferences) { this.sampleId = sampleId; this.outputType = outputType; this.writeReferenceRanges = writeReferenceRanges; this.storeCompressedReferences = storeCompressedReferences; + this.gqStatesToIgnore = gqStatesToIgnore; coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary)); @@ -72,11 +73,6 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin } catch (final IOException ioex) { throw new UserException("Could not create reference range outputs", ioex); } - - this.gqStatesToIgnore.add(gqStateToIgnore); - if (dropAboveGqThreshold) { - this.gqStatesToIgnore.addAll(getGQStateEnumGreaterThan(gqStateToIgnore)); - } } public void apply(VariantContext variant, List intervalsToWrite) throws IOException { @@ -189,14 +185,14 @@ public void writeMissingPositions(long start, long end) throws IOException { int position = SchemaUtils.decodePosition(localStart); refRangesWriter.writeCompressed( SchemaUtils.encodeCompressedRefBlock(chromosome, position, length, - GQStateEnum.MISSING.getCompressedValue()), + GQStateEnum.ZERO.getCompressedValue()), sampleId ); } else { refRangesWriter.write(localStart, sampleId, length, - GQStateEnum.MISSING.getValue() + GQStateEnum.ZERO.getValue() ); } localStart = localStart + length ;