Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bigger Permutect tensors and Permutect test datasets can be annotated with truth VCF #8836

Merged
merged 3 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand All @@ -20,6 +22,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* For each sample and for each allele a list feature vectors of supporting reads
Expand All @@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
private static final List<Integer> RANGES = List.of(5, 10, 25, 50);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;

private FeaturizedReadSets() { }

Expand Down Expand Up @@ -92,9 +100,9 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
result.add(read.isReverseStrand() ? 1 : 0);

// distances from ends of read
final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPosition);
result.add(read.getLength() - readPosition);
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
result.add(read.getLength() - readPositionOfVariantStart);


result.add(Math.abs(read.getFragmentLength()));
Expand Down Expand Up @@ -123,15 +131,64 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);

for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);

final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
asm.stepForwardOnGenome();
final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();

while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);

// pick which array's features we are accounting. If the ranges are 5, 10, 25, 50 and the distance is, say 8, then the '<= 10' range is relevant
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isAfterInsertion()) {
featuresToAddTo[0]++;
}

if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];

if (base != haplotypeBase) {
featuresToAddTo[2]++;
}

if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}

for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ public double getDefaultAlleleFrequency() {
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
ULTIMA(11);
ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES),
ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES);

final private int numReadFeatures;

Expand All @@ -229,6 +229,10 @@ public int getNumReadFeatures() {
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
 * If this VCF is not given, the dataset is generated with a weak-labelling strategy based on allele fractions.
*
* Although the normal use of this input is in generating training data, it can also be used when generating test data
* for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as
* usual, and Permutect uses the labels to analyze the quality of its results.
*/
@Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
public FeatureInput<VariantContext> mutect3TrainingTruth;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int diff = altAlleleString.length() - refAllele.length();
final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);

if (trainingMode) {
if (trainingMode) { // training mode -- collecting tensors to train the Permutect artifact model
final ArrayBlockingQueue<Integer> unmatchedQueue = unmatchedArtifactAltCounts.get(type);
final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;

Expand Down Expand Up @@ -182,8 +182,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
} else {
labels.add(Label.IGNORE);
}
} else {
labels.add(Label.UNLABELED);
} else { // not training mode -- we are generating tensors in order to apply the Permutect artifact model to a callset
if (truthVCs.isPresent()) {
// here, for the purposes of test data, both sequencing errors and technical artifacts get the "ARTIFACT" label
labels.add(truthAlleles.contains(remappedAltAlelle) ? Label.VARIANT : Label.ARTIFACT);
} else {
labels.add(Label.UNLABELED);
}
}
}

Expand Down
Loading