Commit
locally working first pass at adding in the ploidy info. Still needs to have the arguments passed through so it works in the WDLs
koncheto-broad committed Jun 3, 2024
1 parent f73c0b4 commit 2fb59b7
Showing 2 changed files with 48 additions and 11 deletions.
ExtractCohortEngine.java
@@ -6,6 +6,8 @@
import htsjdk.variant.vcf.VCFHeader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.LongRange;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureContext;
@@ -34,6 +36,12 @@
public class ExtractCohortEngine {
private final Logger logger;

private final List<LongRange> PARRegions = List.of(
new LongRange(23000000010001L, 23000002781479L), // PAR1, X
new LongRange(24000000010001L, 24000002781479L), // PAR1, Y
new LongRange(23000155701383L, 23000156030895L), // PAR2, X
new LongRange(24000056887903L, 24000057217415L)); // PAR2, Y

private final boolean printDebugInformation;
private final int localSortMaxRecordsInRam;
private final List<SimpleInterval> traversalIntervals;
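
For orientation, a minimal sketch, not part of this commit, of how the encoded locations used by the PARRegions list above appear to work. It assumes a location is packed as contigIndex * 10^12 + position, inferred from the literals (e.g. 23000000010001L would be chrX (23) at position 10,001, the GRCh38 PAR1 start); the class and helper names here are hypothetical.

// Hypothetical sketch of the encoded-location scheme the PAR ranges above rely on.
// Assumption: location = contigIndex * 1_000_000_000_000L + 1-based position.
import org.apache.commons.lang.math.LongRange;
import java.util.List;

public class ParLookupSketch {
    private static final long CHROM_MULTIPLIER = 1_000_000_000_000L; // assumed encoding factor

    static long encode(int contigIndex, long position) {
        return contigIndex * CHROM_MULTIPLIER + position;
    }

    static boolean isInPAR(long location, List<LongRange> parRegions) {
        return parRegions.stream().anyMatch(r -> r.containsLong(location));
    }

    public static void main(String[] args) {
        List<LongRange> par = List.of(new LongRange(23000000010001L, 23000002781479L)); // PAR1, X
        System.out.println(isInPAR(encode(23, 20_000L), par));    // true: inside PAR1
        System.out.println(isInPAR(encode(23, 5_000_000L), par)); // false: past PAR1
    }
}
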
@@ -303,7 +311,6 @@ public void traverse() {
String sampleId = queryRow.get(SchemaUtils.SAMPLE_ID).toString();
Integer ploidy = Integer.parseInt(queryRow.get(SchemaUtils.PLOIDY).toString());
// Flatten the compound (chromosome, sampleId) key into a single string; otherwise we'd need a map to another data structure just to look up the ploidy
String key = chromosome+"-"+sampleId;
samplePloidyMap.put(chromosome+"-"+sampleId, ploidy);
}
processBytesScanned(reader);
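
A small sketch, not from this commit, of the flattened compound key being built above: a single "<chromosome>-<sampleId>" string avoids nesting one map inside another just to reach a ploidy value. The names and values below are made up.

// Hypothetical illustration of the flattened (chromosome, sampleId) -> ploidy map.
import java.util.HashMap;
import java.util.Map;

public class PloidyKeySketch {
    public static void main(String[] args) {
        Map<String, Integer> samplePloidyMap = new HashMap<>();

        // Populate while scanning the ploidy query rows (values made up):
        long chromosomeAsPosition = 24_000_000_000_000L; // chrY under the assumed encoding
        String sampleId = "17";
        samplePloidyMap.put(chromosomeAsPosition + "-" + sampleId, 1);

        // Later, look up with the same flattened key; missing entries mean diploid:
        int ploidy = samplePloidyMap.getOrDefault(chromosomeAsPosition + "-" + sampleId, 2);
        System.out.println(ploidy); // 1
    }
}
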
@@ -511,22 +518,22 @@ protected VariantContext processSampleRecordsForLocation(final long location,
// Nothing to do here -- just needed to mark the sample as seen so it doesn't get put in the high confidence ref band
break;
case "1": // Non-Variant Block with 10 <= GQ < 20
refCalls.add(new ReferenceGenotypeInfo(sampleName, 10));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 10, sampleRecord.getSampleId().intValue()));
break;
case "2": // Non-Variant Block with 20 <= GQ < 30
refCalls.add(new ReferenceGenotypeInfo(sampleName, 20));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 20, sampleRecord.getSampleId().intValue()));
break;
case "3": // Non-Variant Block with 30 <= GQ < 40
refCalls.add(new ReferenceGenotypeInfo(sampleName, 30));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 30, sampleRecord.getSampleId().intValue()));
break;
case "4": // Non-Variant Block with 40 <= GQ < 50
refCalls.add(new ReferenceGenotypeInfo(sampleName, 40));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 40, sampleRecord.getSampleId().intValue()));
break;
case "5": // Non-Variant Block with 50 <= GQ < 60
refCalls.add(new ReferenceGenotypeInfo(sampleName, 50));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 50, sampleRecord.getSampleId().intValue()));
break;
case "6": // Non-Variant Block with 60 <= GQ (usually omitted from tables)
refCalls.add(new ReferenceGenotypeInfo(sampleName, 60));
refCalls.add(new ReferenceGenotypeInfo(sampleName, 60, sampleRecord.getSampleId().intValue()));
break;
case "*": // Spanning Deletion - do nothing. just mark the sample as seen
break;
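
The numbered cases above follow one pattern: reference-block state code n corresponds to a GQ lower bound of n*10. A compact sketch of that mapping, assuming the codes stay in the "1"–"6" range the switch handles; the class and method names are hypothetical.

// Hypothetical compact form of the band-code -> GQ mapping spelled out case by case above.
public final class RefBandSketch {
    static int gqForBandCode(String state) {
        int band = Integer.parseInt(state);  // "1".."6"
        if (band < 1 || band > 6) {
            throw new IllegalArgumentException("Unexpected reference band code: " + state);
        }
        return band * 10;                    // lower bound of the GQ band, e.g. "4" -> 40
    }
}
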
@@ -587,9 +594,23 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
for (final ReferenceGenotypeInfo info : referenceCalls) {
genotypeBuilder.reset(false);
genotypeBuilder.name(info.getSampleName());
// TODO: This is where we expand out "this is ref" to a "0/0" call. Reference ploidy data here
String key = "asdf";
genotypeBuilder.alleles(gtAlleles);

long chromAsPosition = location - SchemaUtils.decodePosition(location);
String key = chromAsPosition+"-"+info.getSampleId();

// Logic for determining the correct ploidy for reference data:
// If we have no entry in the table, the table explicitly says 2, or we are in a PAR, use a diploid reference.
// If the table says the ploidy is 1, use a haploid reference.
// Otherwise the ploidy is neither 1 nor 2, so throw a user exception because we haven't coded for that case.
int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
if (ploidy == 2 || isInPAR(location)) {
genotypeBuilder.alleles(gtAlleles);
} else if (ploidy == 1) {
genotypeBuilder.alleles(gtHaploidAlleles);
} else {
throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+".");
}

genotypeBuilder.GQ(info.getGQ());
genotypes.add(genotypeBuilder.make());
}
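
For context, a minimal sketch, separate from this commit, of how htsjdk's GenotypeBuilder emits a diploid versus haploid reference genotype, which is what the branch above selects between via gtAlleles and gtHaploidAlleles. The sample names and allele below are made up.

// Hypothetical illustration of building "0/0" (diploid) vs "0" (haploid) reference genotypes.
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import java.util.Collections;
import java.util.List;

public class RefGenotypeSketch {
    public static void main(String[] args) {
        Allele ref = Allele.create("A", true);                     // reference allele
        List<Allele> diploidRef = Collections.nCopies(2, ref);     // renders as 0/0
        List<Allele> haploidRef = Collections.singletonList(ref);  // renders as 0

        Genotype diploid = new GenotypeBuilder().name("sampleA").alleles(diploidRef).GQ(60).make();
        Genotype haploid = new GenotypeBuilder().name("sampleB").alleles(haploidRef).GQ(60).make();

        System.out.println(diploid.getPloidy()); // 2
        System.out.println(haploid.getPloidy()); // 1
    }
}
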
@@ -649,6 +670,16 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
return removeAnnotations(filteredVC);
}

boolean isInPAR(final long location) {
for (LongRange region : PARRegions) {
if (region.containsLong(location)) {
return true;
}
}
return false;
}


private VariantContext filterSiteByAlleleSpecificVQScore(VariantContext mergedVC,
Map<Allele, Map<Allele, Double>> scoreMap,
Map<Allele, Map<Allele, Double>> vqScoreMap,
ReferenceGenotypeInfo.java
@@ -2,11 +2,13 @@

public class ReferenceGenotypeInfo {
private final String sampleName;
private final int sampleId;
private final int GQ;

public ReferenceGenotypeInfo(String sampleName, int GQ) {
public ReferenceGenotypeInfo(String sampleName, int GQ, int sampleId) {
this.sampleName = sampleName;
this.GQ = GQ;
this.sampleId = sampleId;
}

public String getSampleName() {
@@ -16,4 +18,8 @@ public String getSampleName() {
public int getGQ() {
return GQ;
}

public int getSampleId() {
return sampleId;
}
}
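
A short usage sketch, not part of the diff, of the widened constructor: the numeric sample id now travels alongside the sample name and GQ. The name and id below are made up.

// Hypothetical usage of the updated ReferenceGenotypeInfo.
public class ReferenceGenotypeInfoUsage {
    public static void main(String[] args) {
        ReferenceGenotypeInfo info = new ReferenceGenotypeInfo("NA12878", 40, 17);
        System.out.println(info.getSampleName()); // NA12878
        System.out.println(info.getGQ());         // 40
        System.out.println(info.getSampleId());   // 17, via the new accessor
    }
}
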
