Skip to content

Commit

Permalink
Merge pull request #30 from broadinstitute/diversity_guides
Browse files Browse the repository at this point in the history
Diversity metric and best n guides in design_naively
  • Loading branch information
priyappillai authored Sep 17, 2020
2 parents f202977 + 0051d4b commit 31aaa49
Show file tree
Hide file tree
Showing 9 changed files with 420 additions and 81 deletions.
30 changes: 29 additions & 1 deletion adapt/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import statistics

import numpy as np
from math import log2

from adapt.utils import guide
from adapt.utils import lsh
Expand Down Expand Up @@ -801,7 +802,7 @@ def sequences_bound_by_guide(self, gd_seq, gd_start, mismatches,
"""Determine the sequences to which a guide hybridizes.
Args:
gd_seq: seequence of the guide
gd_seq: sequence of the guide
gd_start: start position of the guide in the alignment
mismatches: threshold on number of mismatches for determining whether
a guide would hybridize to a target sequence
Expand Down Expand Up @@ -832,6 +833,33 @@ def sequences_bound_by_guide(self, gd_seq, gd_start, mismatches,
binding_seqs += [seq_idx]
return binding_seqs

def position_entropy(self):
"""Determine the entropy at each position in the alignment.
Returns:
list with the entropy of position i listed at index i
"""

position_entropy = []
for i in range(self.seq_length):
counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0, '-': 0}
for b in [self.seqs[i][j] for j in range(self.num_sequences)]:
if b in counts:
counts[b] += 1
elif b in guide.FASTA_CODES:
for c in guide.FASTA_CODES[b]:
counts[c] += 1.0 / len(guide.FASTA_CODES[b])
else:
raise ValueError("Unknown base call %s" % b)

# Calculate entropy
probabilities = [counts[base]/self.num_sequences for base in counts]
this_position_entropy = sum([-p*log2(p) for p in probabilities if p > 0])

position_entropy.append(this_position_entropy)

return position_entropy

@staticmethod
def from_list_of_seqs(seqs):
"""Construct a Alignment from aligned list of sequences.
Expand Down
2 changes: 1 addition & 1 deletion adapt/guide_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1504,7 +1504,7 @@ def _analyze_guides(self, start, curr_activities):

def _analyze_guides_memoized(self, start, curr_activities,
use_last=False):
"""Make a memoized call to self._analyze_gudes().
"""Make a memoized call to self._analyze_guides().
Args:
start: start position in alignment at which to target
Expand Down
2 changes: 1 addition & 1 deletion adapt/specificity/alignment_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ def setup(self):
(up to ambiguity) to be non-specific. However, this alternative option
does not make sense in the extreme case: for example, if a k-mer is all
Ns, then every sequence would be deemed non-specific. Nevertheless,
ignoring k-mers with amguity entirely might not be the best choice so
ignoring k-mers with ambiguity entirely might not be the best choice so
this should be revisited (TODO).
"""
unambig_chars = set(['a', 'c', 'g', 't'])
Expand Down
35 changes: 35 additions & 0 deletions adapt/tests/test_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import random
import unittest
from math import log2

from adapt import alignment
from adapt.utils import lsh
Expand Down Expand Up @@ -564,6 +565,40 @@ def test_most_common_sequence_with_ambiguity(self):
self.assertEqual(aln.determine_most_common_sequence(skip_ambiguity=True),
'GGGCCC')

def test_position_entropy_simple(self):
seqs = ['ACCCC',
'AAGGC',
'AAATA',
'AAAAA']
aln = alignment.Alignment.from_list_of_seqs(seqs)
all_ps = [[1],
[0.25, 0.75],
[0.25, 0.25, .5],
[0.25, 0.25, 0.25, 0.25],
[0.5, 0.5]]

entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
self.assertEqual(aln.position_entropy(),
entropy)

def test_position_entropy_with_ambiguity(self):
seqs = ['MRWSYKVHDBN']
aln = alignment.Alignment.from_list_of_seqs(seqs)
all_ps = [[0.5, 0.5],
[0.5, 0.5],
[0.5, 0.5],
[0.5, 0.5],
[0.5, 0.5],
[0.5, 0.5],
[1/3.0, 1/3.0, 1/3.0],
[1/3.0, 1/3.0, 1/3.0],
[1/3.0, 1/3.0, 1/3.0],
[1/3.0, 1/3.0, 1/3.0],
[0.25, 0.25, 0.25, 0.25]]
entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
self.assertEqual(aln.position_entropy(),
entropy)

def test_construct_from_0_seqs(self):
with self.assertRaises(Exception):
seqs = []
Expand Down
Empty file added bin/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion bin/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -1179,7 +1179,7 @@ def __call__(self, parser, namespace, values, option_string=None):
input_auto_common_subparser.add_argument('--ncbi-api-key',
help=("API key to use for NCBI e-utils. Using this increases the "
"limit on requests/second and may prevent an IP address "
"from being block due to too many requests"))
"from being blocked due to too many requests"))
input_auto_common_subparser.add_argument('--aws-access-key-id',
help=("User Account Access Key for AWS. This is only necessary "
"if using S3 for memoization via PREP_MEMOIZE_DIR and AWS CLI "
Expand Down
Loading

0 comments on commit 31aaa49

Please sign in to comment.