Merge pull request #30 from broadinstitute/diversity_guides

Diversity metric and best n guides in design_naively
broadinstitute · Sep 17, 2020 · 31aaa49 · 31aaa49
2 parents f202977 + 0051d4b
commit 31aaa49
Show file tree

Hide file tree

Showing 9 changed files with 420 additions and 81 deletions.
diff --git a/adapt/alignment.py b/adapt/alignment.py
@@ -6,6 +6,7 @@
 import statistics
 
 import numpy as np
+from math import log2
 
 from adapt.utils import guide
 from adapt.utils import lsh
@@ -801,7 +802,7 @@ def sequences_bound_by_guide(self, gd_seq, gd_start, mismatches,
         """Determine the sequences to which a guide hybridizes.
 
         Args:
-            gd_seq: seequence of the guide
+            gd_seq: sequence of the guide
             gd_start: start position of the guide in the alignment
             mismatches: threshold on number of mismatches for determining whether
                 a guide would hybridize to a target sequence
@@ -832,6 +833,33 @@ def sequences_bound_by_guide(self, gd_seq, gd_start, mismatches,
                 binding_seqs += [seq_idx]
         return binding_seqs
 
+    def position_entropy(self):
+        """Determine the entropy at each position in the alignment.
+
+        Returns:
+            list with the entropy of position i listed at index i
+        """
+
+        position_entropy = []
+        for i in range(self.seq_length):
+            counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0, '-': 0}
+            for b in [self.seqs[i][j] for j in range(self.num_sequences)]:
+                if b in counts:
+                    counts[b] += 1
+                elif b in guide.FASTA_CODES:
+                    for c in guide.FASTA_CODES[b]:
+                        counts[c] += 1.0 / len(guide.FASTA_CODES[b])
+                else:
+                    raise ValueError("Unknown base call %s" % b)
+
+            # Calculate entropy
+            probabilities = [counts[base]/self.num_sequences for base in counts]
+            this_position_entropy = sum([-p*log2(p) for p in probabilities if p > 0])
+
+            position_entropy.append(this_position_entropy)
+
+        return position_entropy
+
     @staticmethod
     def from_list_of_seqs(seqs):
         """Construct a Alignment from aligned list of sequences.

diff --git a/adapt/guide_search.py b/adapt/guide_search.py
@@ -1504,7 +1504,7 @@ def _analyze_guides(self, start, curr_activities):
 
     def _analyze_guides_memoized(self, start, curr_activities,
             use_last=False):
-        """Make a memoized call to self._analyze_gudes().
+        """Make a memoized call to self._analyze_guides().
 
         Args:
             start: start position in alignment at which to target

diff --git a/adapt/specificity/alignment_query.py b/adapt/specificity/alignment_query.py
@@ -325,7 +325,7 @@ def setup(self):
         (up to ambiguity) to be non-specific. However, this alternative option
         does not make sense in the extreme case: for example, if a k-mer is all
         Ns, then every sequence would be deemed non-specific. Nevertheless,
-        ignoring k-mers with amguity entirely might not be the best choice so
+        ignoring k-mers with ambiguity entirely might not be the best choice so
         this should be revisited (TODO).
         """
         unambig_chars = set(['a', 'c', 'g', 't'])

diff --git a/adapt/tests/test_alignment.py b/adapt/tests/test_alignment.py
@@ -3,6 +3,7 @@
 
 import random
 import unittest
+from math import log2
 
 from adapt import alignment
 from adapt.utils import lsh
@@ -564,6 +565,40 @@ def test_most_common_sequence_with_ambiguity(self):
         self.assertEqual(aln.determine_most_common_sequence(skip_ambiguity=True),
                          'GGGCCC')
 
+    def test_position_entropy_simple(self):
+        seqs = ['ACCCC',
+                'AAGGC',
+                'AAATA',
+                'AAAAA']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        all_ps = [[1],
+                  [0.25, 0.75], 
+                  [0.25, 0.25, .5], 
+                  [0.25, 0.25, 0.25, 0.25],
+                  [0.5, 0.5]]
+
+        entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
+        self.assertEqual(aln.position_entropy(),
+                         entropy)
+
+    def test_position_entropy_with_ambiguity(self):
+        seqs = ['MRWSYKVHDBN']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        all_ps = [[0.5, 0.5],
+                  [0.5, 0.5],
+                  [0.5, 0.5],
+                  [0.5, 0.5],
+                  [0.5, 0.5],
+                  [0.5, 0.5],
+                  [1/3.0, 1/3.0, 1/3.0], 
+                  [1/3.0, 1/3.0, 1/3.0], 
+                  [1/3.0, 1/3.0, 1/3.0], 
+                  [1/3.0, 1/3.0, 1/3.0], 
+                  [0.25, 0.25, 0.25, 0.25]]
+        entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
+        self.assertEqual(aln.position_entropy(),
+                         entropy)
+
     def test_construct_from_0_seqs(self):
         with self.assertRaises(Exception):
             seqs = []

diff --git a/bin/__init__.py b/bin/__init__.py
diff --git a/bin/design.py b/bin/design.py
@@ -1179,7 +1179,7 @@ def __call__(self, parser, namespace, values, option_string=None):
     input_auto_common_subparser.add_argument('--ncbi-api-key',
         help=("API key to use for NCBI e-utils. Using this increases the "
               "limit on requests/second and may prevent an IP address "
-              "from being block due to too many requests"))
+              "from being blocked due to too many requests"))
     input_auto_common_subparser.add_argument('--aws-access-key-id',
         help=("User Account Access Key for AWS. This is only necessary "
             "if using S3 for memoization via PREP_MEMOIZE_DIR and AWS CLI "