Skip to content

Commit

Permalink
Mutect2 germline resource can have split multiallelic format (#8837)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored Jun 3, 2024
1 parent d4744f7 commit 0be44f2
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,7 @@ private <E> Optional<E> getForNormal(final Supplier<E> supplier) {
private static Map<String, Object> getNegativeLogPopulationAFAnnotation(List<VariantContext> germlineResourceVariants,
final List<Allele> allAlleles,
final double afOfAllelesNotInGermlineResource) {
final Optional<VariantContext> germlineVC = germlineResourceVariants.isEmpty() ? Optional.empty()
: Optional.of(germlineResourceVariants.get(0)); // assume only one VC per site
final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineVC, afOfAllelesNotInGermlineResource);
final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineResourceVariants, afOfAllelesNotInGermlineResource);

return ImmutableMap.of(GATKVCFConstants.POPULATION_AF_KEY, MathUtils.applyToArray(populationAlleleFrequencies, x -> - Math.log10(x)));
}
Expand All @@ -416,27 +414,35 @@ private static Map<String, Object> getNegativeLogPopulationAFAnnotation(List<Var
* @param allAlleles Every emitted allele, with the reference allele first. Only alt alleles are annotated, but we
* * need the ref allele in case the germline resource has a more or less parsimonious representation
* * For example, eg ref = A, alt = C; germline ref = AT, germline alt = CT
* @param germlineVC Germline resource variant context from which AF INFO field is drawn
* @param germlineVCs Germline resource variant contexts from which AF INFO field is drawn
* @param afOfAllelesNotInGermlineResource Default value of population AF annotation
* @return
*/
@VisibleForTesting
static double[] getGermlineAltAlleleFrequencies(final List<Allele> allAlleles, final Optional<VariantContext> germlineVC, final double afOfAllelesNotInGermlineResource) {
static double[] getGermlineAltAlleleFrequencies(final List<Allele> allAlleles, final List<VariantContext> germlineVCs, final double afOfAllelesNotInGermlineResource) {
Utils.validateArg(!allAlleles.isEmpty(), "allAlleles are empty -- there is not even a reference allele.");
if (germlineVC.isPresent()) {
if (! germlineVC.get().hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
logger.warn("Germline resource variant at " + germlineVC.get().getContig() + ":" + germlineVC.get().getStart() +" missing AF attribute");
return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource));
final Map<Allele, Double> alleleFrequencies = new HashMap<>();
allAlleles.forEach(a -> alleleFrequencies.put(a, afOfAllelesNotInGermlineResource)); // initialize everything to the default

// look through every germline resource variant context at this locus and fill in the AFs
for (final VariantContext germlineVC : germlineVCs) {
if (! germlineVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
logger.warn("Germline resource variant at " + germlineVC.getContig() + ":" + germlineVC.getStart() +" missing AF attribute");
}

List<OptionalInt> germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.getAlleles());
final List<Double> germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC, VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource);

if (germlineAltAFs.size() == (germlineVC.getNAlleles() - 1)) { // skip VCs with a bad AF field that got parsed as a wrong-length list
for (int alleleIndex = 1; alleleIndex < allAlleles.size(); alleleIndex++) { // start at 1 to skip the reference, which doesn't have an AF annotation
final Allele allele = allAlleles.get(alleleIndex);
// note the -1 since germlineAltAFs do not include ref
germlineIndices.get(alleleIndex).ifPresent(germlineIndex -> alleleFrequencies.put(allele, germlineAltAFs.get(germlineIndex - 1)));
}
}
List<OptionalInt> germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.get().getAlleles());
final List<Double> germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC.get(), VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource);

return germlineIndices.stream().skip(1) // skip the reference allele
.mapToDouble(idx -> idx.isPresent() ? germlineAltAFs.get(idx.getAsInt() - 1) : afOfAllelesNotInGermlineResource) // note the -1 since germlineAltAFs do not include ref
.toArray();
} else {
return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource));
}

return allAlleles.stream().skip(1).mapToDouble(alleleFrequencies::get).toArray(); // skip the reference allele
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import javax.ws.rs.core.Variant;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
Expand Down Expand Up @@ -53,8 +55,23 @@ public void testGetGermlineAltAlleleFrequencies(final List<Allele> calledAlleles
final int stop = start + length - 1;
final VariantContext vc1 = new VariantContextBuilder("SOURCE", "1", start, stop, germlineAlleles)
.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, germlineAltAFs).make();
final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, Optional.of(vc1), DEFAULT_AF);
final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, List.of(vc1), DEFAULT_AF);
Assert.assertEquals(result, expected, 1.0e-10);

// multiallelic -- test splitting into multiple VCs
if (germlineAlleles.size() > 2) {
final Allele ref = germlineAlleles.get(0);
final List<VariantContext> germlineVCs = new ArrayList<>();
for (int n = 1; n < germlineAlleles.size(); n++) {
final Allele alt = germlineAlleles.get(n);
final VariantContext splitVC = new VariantContextBuilder("SOURCE", "1", start, stop, List.of(ref, alt))
.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, new double[]{germlineAltAFs[n-1]}).make();
germlineVCs.add(splitVC);
}

final double[] splitResult = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(calledAlleles, germlineVCs, DEFAULT_AF);
Assert.assertEquals(splitResult, expected, 1.0e-10);
}
}

@DataProvider(name = "missingAFData")
Expand All @@ -69,7 +86,21 @@ Object[][] missingAFData() {

@Test(dataProvider = "missingAFData")
public void testGetGermlineAltAlleleFrequenciesWithMissingAF(final VariantContext vc) {
final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(vc.getAlleles(), Optional.of(vc), DEFAULT_AF);
final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(vc.getAlleles(), List.of(vc), DEFAULT_AF);
Assert.assertEquals(result, new double[] {DEFAULT_AF}, 1.0e-10);
}

// test getting alt allele frequencies when each alt has its own VCF line and its own VariantContext
@Test
public void testGetGermlineAltAlleleFrequenciesFromSplitAllelesFormat() {
final double af1 = 0.1;
final double af2 = 0.2;
final VariantContext vc1 = new VariantContextBuilder("SOURCE", "1", 1, 1, Arrays.asList(Allele.REF_A, Allele.ALT_C))
.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, af1).make();
final VariantContext vc2 = new VariantContextBuilder("SOURCE", "1", 1, 1, Arrays.asList(Allele.REF_A, Allele.ALT_T))
.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, af2).make();
final List<Allele> alleles = List.of(Allele.REF_A, Allele.ALT_C, Allele.ALT_T);
final double[] result = SomaticGenotypingEngine.getGermlineAltAlleleFrequencies(alleles, List.of(vc1, vc2), DEFAULT_AF);
Assert.assertEquals(result, new double[] {af1, af2}, 1.0e-10);
}
}

0 comments on commit 0be44f2

Please sign in to comment.