Merge pull request #43 from broadinstitute/write-mean-activity-in-ana…

…lysis Add option to write mean activity of guides in coverage analysis
broadinstitute · Dec 15, 2020 · dff7383 · dff7383
2 parents cf1c007 + 35c81df
commit dff7383
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 4 deletions.
diff --git a/adapt/coverage_analysis.py b/adapt/coverage_analysis.py
@@ -4,6 +4,8 @@
 from collections import defaultdict
 import logging
 
+import numpy as np
+
 from adapt.utils import guide
 
 __author__ = 'Hayden Metsky <hayden@mit.edu>'
@@ -117,7 +119,8 @@ def guide_bind_fn(self, seq, target_seq, pos):
         """
         raise NotImplementedError()
 
-    def find_binding_pos(self, target_seq_name, seq, bind_fn):
+    def find_binding_pos(self, target_seq_name, seq, bind_fn,
+            save_activities=None):
         """Find if and where a sequence binds to in a target sequence.
 
         This takes a naive (but fully sensitive) approach: it simply
@@ -136,6 +139,8 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
             seq: guide or primer sequence to lookup
             bind_fn: function accepting (seq, target_seq, pos) and outputs
                 positions in pos to which seq binds to target_seq
+            save_activities: if set, pass along to guide_fn to save computed
+                activities
 
         Returns:
             collection of start positions in target_seq to which seq binds;
@@ -156,6 +161,13 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
         if not self.seqs_are_indexed:
             self._index_seqs()
 
+        def bind_fn_with_save(seq, target_seq, pos):
+            if save_activities is None:
+                return bind_fn(seq, target_seq, pos)
+            else:
+                return bind_fn(seq, target_seq, pos,
+                        save_activities=save_activities)
+
         target_seq = self.seqs[target_seq_name]
 
         bind_pos = set()
@@ -177,7 +189,7 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
                     if (kmer_start - j >= 0 and
                             kmer_start - j + len(seq) <= len(target_seq)):
                         pos_to_evaluate = [kmer_start - j]
-                        bind_pos.update(bind_fn(seq, target_seq,
+                        bind_pos.update(bind_fn_with_save(seq, target_seq,
                             pos_to_evaluate))
             if len(bind_pos) > 0:
                 # We have found at least 1 binding position; this may
@@ -191,7 +203,7 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
             # sliding approach
 
         pos_to_evaluate = list(range(len(target_seq) - len(seq) + 1))
-        bind_pos = bind_fn(seq, target_seq, pos_to_evaluate)
+        bind_pos = bind_fn_with_save(seq, target_seq, pos_to_evaluate)
         return bind_pos
 
     def seqs_where_guides_bind(self, guide_seqs):
@@ -320,6 +332,21 @@ def frac_of_seqs_bound(self):
             frac_bound[design_id] = float(len(seqs_bound)) / len(self.seqs)
         return frac_bound
 
+    def mean_activity_of_guides(self):
+        """Determine the mean activity of guides from each design.
+
+        Returns:
+            dict mapping design identifier (self.designs.keys()) to the
+            mean activity, across the target sequences, of its guide set
+        """
+        mean_activities = {}
+        for design_id, design in self.designs.items():
+            logger.info(("Computing mean activities of guides in design "
+                "'%s'"), str(design_id))
+            activities_across_targets = self.activities_where_guide_binds(design.guides)
+            mean_activities[design_id] = np.mean(list(activities_across_targets.values()))
+        return mean_activities
+
 
 class CoverageAnalyzerWithMismatchModel(CoverageAnalyzer):
     """Methods to analyze coverage of a design using model with fixed number
@@ -348,6 +375,9 @@ def __init__(self, seqs, designs, guide_mismatches, primer_mismatches=None,
         self.guide_mismatches = guide_mismatches
         self.allow_gu_pairs = allow_gu_pairs
 
+    def activities_where_guide_binds(self, guide_seqs):
+        raise NotImplementedError()
+
     def guide_bind_fn(self, seq, target_seq, pos):
         """Evaluate binding with a mismatch model.
 
@@ -393,7 +423,34 @@ def __init__(self, seqs, designs, predictor, primer_mismatches=None,
         self.predictor = predictor
         self.highly_active = highly_active
 
-    def guide_bind_fn(self, seq, target_seq, pos):
+    def activities_where_guide_binds(self, guide_seqs):
+        """Determine activities across the target sequences.
+
+        Args:
+            guide_seqs: guide sequences to lookup (treat as a guide set)
+
+        Returns:
+            dict {target seq in self.seqs: predicted activity}
+        """
+        target_act = {}
+        for seq_name, target_seq in self.seqs.items():
+            activities_for_target = []
+            for guide_seq in guide_seqs:
+                activities = {}
+                bind_pos = self.find_binding_pos(seq_name, guide_seq,
+                        self.guide_bind_fn, save_activities=activities)
+                if len(bind_pos) > 0:
+                    # guide_seq binds somewhere in target_seq
+                    # As its activity, take the maximum over multiple
+                    # positions (if there is more than one where it may bind)
+                    activities_for_target += [max(activities.values())]
+                else:
+                    activities_for_target += [0]
+            # If there are multiple guides, take the max across them
+            target_act[target_seq] = max(activities_for_target)
+        return target_act
+
+    def guide_bind_fn(self, seq, target_seq, pos, save_activities=None):
         """Evaluate binding with a predictor -- i.e., based on what is
         predicted to be highly active.
 
@@ -402,6 +459,9 @@ def guide_bind_fn(self, seq, target_seq, pos):
             target_seq: target sequence against which to check
             pos: list of positions indicating subsequence in target_seq
                 to check for binding of seq
+            save_activities: if set, this saves computed activities as
+                save_activites[p]=activity for each p in pos for which
+                this can compute activity; self.highly_active must be False
 
         Returns:
             set of positions in pos where seq binds to target_seq
@@ -427,9 +487,15 @@ def guide_bind_fn(self, seq, target_seq, pos):
         if self.highly_active:
             predictions = self.predictor.determine_highly_active(-1,
                     pairs_to_evaluate)
+            if save_activities is not None:
+                raise Exception(("Cannot use save_activities when using "
+                    "'highly active' as a criterion"))
         else:
             predictions = self.predictor.compute_activity(-1,
                     pairs_to_evaluate)
+            if save_activities is not None:
+                for i, p in zip(pos_evaluating, predictions):
+                    save_activities[i] = p
             predictions = [bool(p > 0) for p in predictions]
         bind_pos = set()
         for i, p in zip(pos_evaluating, predictions):

diff --git a/bin/analyze_coverage.py b/bin/analyze_coverage.py
@@ -112,6 +112,30 @@ def write_frac_bound(designs, frac_bound, out_fn):
             fw.write('\t'.join([str(x) for x in row]) + '\n')
 
 
+def write_mean_activity_of_guides(designs, mean_activities, out_fn):
+    """Write table giving the mean activity of guide sets.
+
+    Args:
+        designs: dict {design_id: design} where design_id is an
+            identifier for design and design is a coverage_analysis.
+            Design object
+        mean_activities: dict {design_id: activity} where design_id is
+            an identifier for a design and activity is the mean activity,
+            across the target sequences, of its guide set
+        out_fn: path to TSV file at which to write table
+    """
+    header = ['design_id',
+              'guide-target-sequences',
+              'mean-activity']
+    with open(out_fn, 'w') as fw:
+        fw.write('\t'.join(header) + '\n')
+        for design_id in sorted(list(designs.keys())):
+            guides = designs[design_id].guides
+            guides = ' '.join(sorted(guides))
+            row = [design_id, guides, mean_activities[design_id]]
+            fw.write('\t'.join([str(x) for x in row]) + '\n')
+
+
 def main(args):
     # Allow G-U base pairing, unless it is explicitly disallowed
     allow_gu_pairs = not args.do_not_allow_gu_pairing
@@ -169,6 +193,17 @@ def main(args):
         frac_bound = analyzer.frac_of_seqs_bound()
         write_frac_bound(designs, frac_bound, args.write_frac_bound)
         performed_analysis = True
+    if args.write_mean_activity_of_guides:
+        if (not args.predict_activity_model_path or
+                args.predict_activity_require_highly_active):
+            raise Exception(("To use --write-mean-activity-of-guides, "
+                    "a predictive model must be set and "
+                    "--predict-activity-require-highly-active must *not* "
+                    "be set"))
+        mean_activity = analyzer.mean_activity_of_guides()
+        write_mean_activity_of_guides(designs, mean_activity,
+                args.write_mean_activity_of_guides)
+        performed_analysis = True
 
     if not performed_analysis:
         logger.warning(("No analysis was requested"))
@@ -198,6 +233,15 @@ def main(args):
               "the row number of the design in the designs input (1 for "
               "the first design). The provided argument is a path to "
               "a TSV file at which to the write the table."))
+    parser.add_argument('--write-mean-activity-of-guides',
+        help=("If set, write a table in which each row represents an "
+              "input design and gives the mean activity across the target "
+              "sequences of the guide set. The 'design_id' column gives "
+              "the row number of the design in the designs input (1 for "
+              "the first design). The provided argument is a path to "
+              "a TSV file at which to write the table. If set, a predictive "
+              "model must be set without "
+              "--predict-activity-require-highly-active"))
 
     # Parameter determining whether a primer binds to target
     parser.add_argument('-pm', '--primer-mismatches',