Merge branch 'gtr_substitution' into main

broadinstitute · May 14, 2021 · 465e8be · 465e8be
2 parents 7c8b01b + a5b226b
commit 465e8be
Show file tree

Hide file tree

Showing 9 changed files with 514 additions and 35 deletions.
diff --git a/adapt/alignment.py b/adapt/alignment.py
@@ -732,7 +732,7 @@ def determine_representative_guides(self, start, guide_length,
             representatives.add(gd)
         return representatives
 
-    def compute_activity(self, start, gd_sequence, predictor):
+    def compute_activity(self, start, gd_sequence, predictor, mutator=None):
         """Compute activity between a guide sequence and every target sequence
         in the alignment.
 
@@ -743,15 +743,45 @@ def compute_activity(self, start, gd_sequence, predictor):
             start: start position in alignment at which to target
             gd_sequence: str representing guide sequence
             predictor: a adapt.utils.predict_activity.Predictor object
+            mutator: a adapt.utils.mutate.Mutator object
 
         Returns:
             numpy array x where x[i] gives the predicted activity between
-            gd_sequence and the sequence in the alignment at index i
+            gd_sequence and the sequence in the alignment at index i. If
+            mutator is not None, x[i] gives the predicted activity after the
+            mutations specified in the mutator.
         """
         guide_length = len(gd_sequence)
         assert start + guide_length <= self.seq_length
 
         if isinstance(predictor, predict_activity.SimpleBinaryPredictor):
+            if mutator:
+                # Extract the target sequences, including context to use with
+                # prediction (i.e. the flanking sequences, if they exist)
+                left_context = 0
+                right_context = 0
+                if predictor:
+                    if predictor.required_flanking_seqs[0]:
+                        left_context = len(predictor.required_flanking_seqs[0])
+                    if predictor.required_flanking_seqs[1]:
+                        right_context = len(predictor.required_flanking_seqs[1])
+
+                aln_for_guide_with_context = self.extract_range(
+                        start - left_context,
+                        start + guide_length + right_context)
+                seq_rows_with_context = aln_for_guide_with_context.make_list_of_seqs()
+
+                # Start array of predicted activities; make this all 0s so that
+                # sequences for which activities are not computed (e.g., gap)
+                # have activity=0
+                activities = np.zeros(self.num_sequences)
+                for i, seq_with_context in enumerate(seq_rows_with_context):
+                    activity = mutator.compute_mutated_activity(predictor,
+                                                                seq_with_context,
+                                                                gd_sequence,
+                                                                start=start)
+                    activities[i] = activity
+                return activities
             # Do not use a model; just predict binary activity (1 or 0)
             # based on distance between guide and targets
             return predictor.compute_activity(start, gd_sequence, self)
@@ -785,12 +815,24 @@ def compute_activity(self, start, gd_sequence, predictor):
         # is best to batch these
         pairs_to_eval = []
         pairs_to_eval_seq_idx = []
-        for seq_with_context, seq_idx in seq_rows_with_context:
-            pair = (seq_with_context, gd_sequence)
-            pairs_to_eval += [pair]
-            pairs_to_eval_seq_idx += [seq_idx]
-        # Evaluate activity
-        evals = predictor.compute_activity(start, pairs_to_eval)
+        evals = []
+
+        if mutator:
+            for seq_with_context, seq_idx in seq_rows_with_context:
+                activity = mutator.compute_mutated_activity(predictor,
+                                                            seq_with_context,
+                                                            gd_sequence,
+                                                            start=start)
+                evals.append(activity)
+                pairs_to_eval_seq_idx.append(seq_idx)
+        else:
+            for seq_with_context, seq_idx in seq_rows_with_context:
+                pair = (seq_with_context, gd_sequence)
+                pairs_to_eval.append(pair)
+                pairs_to_eval_seq_idx.append(seq_idx)
+            # Evaluate activity
+            evals = predictor.compute_activity(start, pairs_to_eval)
+
         for activity, seq_idx in zip(evals, pairs_to_eval_seq_idx):
             # Fill in the activity for seq_idx
             activities[seq_idx] = activity
@@ -860,6 +902,27 @@ def position_entropy(self):
 
         return position_entropy
 
+    def base_percentages(self):
+        """Determines the percentage of each base pair in the alignment.
+
+        Returns:
+            dictionary of base pair to its percentage in the alignment
+        """
+        counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
+        for i in range(self.seq_length):
+            for b in [self.seqs[i][j] for j in range(self.num_sequences)]:
+                if b in counts:
+                    counts[b] += 1
+                elif b in guide.FASTA_CODES:
+                    for c in guide.FASTA_CODES[b]:
+                        counts[c] += 1.0 / len(guide.FASTA_CODES[b])
+                elif b != '-':
+                    raise ValueError("Unknown base call %s" % b)
+        total = sum(counts.values())
+        for base in counts:
+            counts[base] /= total
+        return counts
+
     @staticmethod
     def from_list_of_seqs(seqs):
         """Construct a Alignment from aligned list of sequences.

diff --git a/adapt/guide_search.py b/adapt/guide_search.py
@@ -349,7 +349,7 @@ def guide_set_activities_percentile(self, window_start, window_end,
         """Compute percentiles of activity across target sequences for
         a guide set in a window.
 
-        For example, when percentiles is 0.5, this returns the median
+        For example, when percentiles is 50, this returns the median
         activity across the target sequences that the guide set provides.
 
         Args:

diff --git a/adapt/target_search.py b/adapt/target_search.py
@@ -12,6 +12,7 @@
 import logging
 import math
 import os
+import numpy as np
 
 from adapt import guide_search
 from adapt import primer_search
@@ -28,7 +29,8 @@ def __init__(self, ps, gs, obj_type='min',
             max_primers_at_site=None,
             max_target_length=None, obj_weights=None,
             only_account_for_amplified_seqs=False,
-            halt_early=False, obj_value_shift=None):
+            halt_early=False, obj_value_shift=None,
+            mutator=None):
         """
         Args:
             ps: PrimerSearcher object
@@ -62,6 +64,8 @@ def __init__(self, ps, gs, obj_type='min',
                 positive so that there is not confusion about differences
                 between positive/negative. If None (default), defaults are
                 set below based on obj_type.
+            mutator: a adapt.utils.mutate Mutator object. If None (default),
+                do not predict activity after mutations.
         """
         self.ps = ps
         self.gs = gs
@@ -99,6 +103,8 @@ def __init__(self, ps, gs, obj_type='min',
                 # negative, so shift all up so that most are positive
                 self.obj_value_shift = 4.0
 
+        self.mutator = mutator
+
     def _find_primer_pairs(self):
         """Find suitable primer pairs using self.ps.
 
@@ -115,6 +121,33 @@ def _find_primer_pairs(self):
                 p2 = primers[j]
                 yield (p1, p2)
 
+    def _find_mutated_activity(self, targets):
+        """Find the activities of the guides after mutation
+
+        For each target, find the activity of the best of the guide
+        set across each potential starting location after mutation,
+        averaged across the sequences.
+
+        Args:
+            targets: list of tuples (obj_value, target) where target is a tuple
+                ((p1, p2), (guides_stats, guides))
+        Returns:
+            list of mutated activities, ordered by target's ordering.
+
+        """
+        mutated_activities = [None] * len(targets)
+        for i, (obj_value, target) in enumerate(targets):
+            ((p1, p2), (guides_stats, guides)) = target
+            mutated_activities[i] = max(max(np.average(
+                        self.gs.aln.compute_activity(
+                            start_pos,
+                            guide,
+                            self.gs.predictor,
+                            self.mutator))
+                    for start_pos in self.gs._selected_guide_positions[guide])
+                for guide in guides)
+        return mutated_activities
+
     def find_targets(self, best_n=10, no_overlap='amplicon'):
         """Find targets across an alignment.
 
@@ -520,10 +553,12 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                 When 'none', many targets in the best_n may be very similar
         """
         targets = self.find_targets(best_n=best_n, no_overlap=no_overlap)
+        if self.mutator:
+            mutated_activity = self._find_mutated_activity(targets)
 
         with open(out_fn, 'w') as outf:
             # Write a header
-            outf.write('\t'.join(['objective-value', 'target-start', 'target-end',
+            headers = ['objective-value', 'target-start', 'target-end',
                 'target-length',
                 'left-primer-start', 'left-primer-num-primers',
                 'left-primer-frac-bound', 'left-primer-target-sequences',
@@ -533,10 +568,13 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                 'guide-set-expected-activity',
                 'guide-set-median-activity', 'guide-set-5th-pctile-activity',
                 'guide-expected-activities',
-                'guide-target-sequences', 'guide-target-sequence-positions']) +
-                '\n')
+                'guide-target-sequences', 'guide-target-sequence-positions']
+            if self.mutator:
+                headers.append('guide-set-5th-pctile-mutated-activity')
+
+            outf.write('\t'.join(headers) + '\n')
 
-            for (obj_value, target) in targets:
+            for i, (obj_value, target) in enumerate(targets):
                 ((p1, p2), (guides_stats, guides)) = target
 
                 # Break out guides_stats
@@ -585,6 +623,8 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                     guides_activity_median, guides_activity_5thpctile,
                     expected_activities_per_guide_str,
                     guides_seqs_str, guides_positions_str]
+                if self.mutator:
+                    line.append(mutated_activity[i])
 
                 outf.write('\t'.join([str(x) for x in line]) + '\n')
 

diff --git a/adapt/tests/test_alignment.py b/adapt/tests/test_alignment.py
@@ -572,33 +572,59 @@ def test_position_entropy_simple(self):
                 'AAAAA']
         aln = alignment.Alignment.from_list_of_seqs(seqs)
         all_ps = [[1],
-                  [0.25, 0.75], 
-                  [0.25, 0.25, .5], 
+                  [0.25, 0.75],
+                  [0.25, 0.25, .5],
                   [0.25, 0.25, 0.25, 0.25],
                   [0.5, 0.5]]
 
         entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
-        self.assertEqual(aln.position_entropy(),
-                         entropy)
+        self.assertEqual(aln.position_entropy(), entropy)
 
     def test_position_entropy_with_ambiguity(self):
-        seqs = ['MRWSYKVHDBN']
+        seqs = ['MRWSYKVHDBN-']
         aln = alignment.Alignment.from_list_of_seqs(seqs)
         all_ps = [[0.5, 0.5],
                   [0.5, 0.5],
                   [0.5, 0.5],
                   [0.5, 0.5],
                   [0.5, 0.5],
                   [0.5, 0.5],
-                  [1/3.0, 1/3.0, 1/3.0], 
-                  [1/3.0, 1/3.0, 1/3.0], 
-                  [1/3.0, 1/3.0, 1/3.0], 
-                  [1/3.0, 1/3.0, 1/3.0], 
-                  [0.25, 0.25, 0.25, 0.25]]
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [0.25, 0.25, 0.25, 0.25],
+                  [1]]
 
         entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
-        self.assertEqual(aln.position_entropy(),
-                         entropy)
+        self.assertEqual(aln.position_entropy(), entropy)
+
+    def test_base_percentages_simple(self):
+        seqs = ['ACCCC',
+                'AAGGC',
+                'AAATA',
+                'AAAAA']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        base_p = {
+            'A': 0.6,
+            'C': 0.25,
+            'G': 0.1,
+            'T': 0.05
+        }
+
+        self.assertEqual(aln.base_percentages(), base_p)
+
+    def test_base_percentages_with_ambiguity(self):
+        seqs = ['MRWSYKVHDBN-']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        base_p = {
+            'A': 0.25,
+            'C': 0.25,
+            'G': 0.25,
+            'T': 0.25
+        }
+
+        self.assertEqual(aln.base_percentages(), base_p)
 
     def test_construct_from_0_seqs(self):
         with self.assertRaises(Exception):

diff --git a/adapt/tests/test_target_search.py b/adapt/tests/test_target_search.py
@@ -9,7 +9,7 @@
 from adapt import guide_search
 from adapt import primer_search
 from adapt import target_search
-from adapt.utils import lsh
+from adapt.utils import lsh, mutate, predict_activity
 
 __author__ = 'Hayden Metsky <hayden@mit.edu>'
 
@@ -339,6 +339,24 @@ def test_find_targets_allowing_overlap_maximize_random_greedy(self):
                 'random-greedy', 1, 3, 100, check_for_one_guide=True,
                 check_for_no_targets=True)
 
+    def test_find_mutated_activity(self):
+        mutator = mutate.GTRSubstitutionMutator(self.a_aln, 1, 1, 1, 1, 1, 1,
+                                                1, 1, 1)
+        predictor = predict_activity.SimpleBinaryPredictor(
+                    6, False, required_flanking_seqs=('A', None))
+        c_ps = primer_search.PrimerSearcher(self.a_aln, 4, 0, 1.0, (1, 1, 100))
+        c_gs = guide_search.GuideSearcherMaximizeActivity(
+                    self.a_aln, 6, 1, 5, 0.05, (1, 1, 100), algorithm='greedy',
+                    predictor=predictor)
+        c = target_search.TargetSearcher(c_ps, c_gs, max_primers_at_site=2,
+                                         obj_type='max', mutator=mutator)
+        for best_n in [1, 2, 3, 4, 5, 6]:
+            targets = c.find_targets(best_n=best_n, no_overlap='none')
+            mut_activities = c._find_mutated_activity(targets)
+            self.assertEqual(len(mut_activities), best_n)
+            for mut_activity in mut_activities:
+                self.assertIsInstance(mut_activity, float)
+
     def tearDown(self):
         # Re-enable logging
         logging.disable(logging.NOTSET)