diff --git a/adapt/alignment.py b/adapt/alignment.py
index 1c541b0..5481e13 100644
--- a/adapt/alignment.py
+++ b/adapt/alignment.py
@@ -732,7 +732,7 @@ def determine_representative_guides(self, start, guide_length,
             representatives.add(gd)
         return representatives
 
-    def compute_activity(self, start, gd_sequence, predictor):
+    def compute_activity(self, start, gd_sequence, predictor, mutator=None):
         """Compute activity between a guide sequence and every target
         sequence in the alignment.
 
@@ -743,15 +743,45 @@ def compute_activity(self, start, gd_sequence, predictor):
             start: start position in alignment at which to target
             gd_sequence: str representing guide sequence
             predictor: a adapt.utils.predict_activity.Predictor object
+            mutator: an adapt.utils.mutate.Mutator object
 
         Returns:
             numpy array x where x[i] gives the predicted activity between
-            gd_sequence and the sequence in the alignment at index i
+            gd_sequence and the sequence in the alignment at index i. If
+            mutator is not None, x[i] gives the predicted activity after the
+            mutations specified by the mutator.
         """
         guide_length = len(gd_sequence)
         assert start + guide_length <= self.seq_length
 
         if isinstance(predictor, predict_activity.SimpleBinaryPredictor):
+            if mutator:
+                # Extract the target sequences, including context to use with
+                # prediction (i.e. the flanking sequences, if they exist)
+                left_context = 0
+                right_context = 0
+                if predictor:
+                    if predictor.required_flanking_seqs[0]:
+                        left_context = len(predictor.required_flanking_seqs[0])
+                    if predictor.required_flanking_seqs[1]:
+                        right_context = len(predictor.required_flanking_seqs[1])
+
+                aln_for_guide_with_context = self.extract_range(
+                    start - left_context,
+                    start + guide_length + right_context)
+                seq_rows_with_context = aln_for_guide_with_context.make_list_of_seqs()
+
+                # Start array of predicted activities; make this all 0s so that
+                # sequences for which activities are not computed (e.g., gap)
+                # have activity=0
+                activities = np.zeros(self.num_sequences)
+                for i, seq_with_context in enumerate(seq_rows_with_context):
+                    activity = mutator.compute_mutated_activity(predictor,
+                                                                seq_with_context,
+                                                                gd_sequence,
+                                                                start=start)
+                    activities[i] = activity
+                return activities
             # Do not use a model; just predict binary activity (1 or 0)
             # based on distance between guide and targets
             return predictor.compute_activity(start, gd_sequence, self)
@@ -785,12 +815,24 @@ def compute_activity(self, start, gd_sequence, predictor):
         # is best to batch these
         pairs_to_eval = []
         pairs_to_eval_seq_idx = []
-        for seq_with_context, seq_idx in seq_rows_with_context:
-            pair = (seq_with_context, gd_sequence)
-            pairs_to_eval += [pair]
-            pairs_to_eval_seq_idx += [seq_idx]
-        # Evaluate activity
-        evals = predictor.compute_activity(start, pairs_to_eval)
+        evals = []
+
+        if mutator:
+            for seq_with_context, seq_idx in seq_rows_with_context:
+                activity = mutator.compute_mutated_activity(predictor,
+                                                            seq_with_context,
+                                                            gd_sequence,
+                                                            start=start)
+                evals.append(activity)
+                pairs_to_eval_seq_idx.append(seq_idx)
+        else:
+            for seq_with_context, seq_idx in seq_rows_with_context:
+                pair = (seq_with_context, gd_sequence)
+                pairs_to_eval.append(pair)
+                pairs_to_eval_seq_idx.append(seq_idx)
+            # Evaluate activity
+            evals = predictor.compute_activity(start, pairs_to_eval)
+
         for activity, seq_idx in zip(evals, pairs_to_eval_seq_idx):
             # Fill in the activity for seq_idx
             activities[seq_idx] = activity
@@ -860,6 +902,27 @@ def position_entropy(self):
 
         return position_entropy
 
+    def base_percentages(self):
+        """Determines the percentage of each base in the alignment.
+
+        Returns:
+            dictionary of base to its percentage in the alignment
+        """
+        counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
+        for i in range(self.seq_length):
+            for b in [self.seqs[i][j] for j in range(self.num_sequences)]:
+                if b in counts:
+                    counts[b] += 1
+                elif b in guide.FASTA_CODES:
+                    for c in guide.FASTA_CODES[b]:
+                        counts[c] += 1.0 / len(guide.FASTA_CODES[b])
+                elif b != '-':
+                    raise ValueError("Unknown base call %s" % b)
+        total = sum(counts.values())
+        for base in counts:
+            counts[base] /= total
+        return counts
+
     @staticmethod
     def from_list_of_seqs(seqs):
         """Construct a Alignment from aligned list of sequences.
diff --git a/adapt/guide_search.py b/adapt/guide_search.py
index c3a8971..76376cb 100644
--- a/adapt/guide_search.py
+++ b/adapt/guide_search.py
@@ -349,7 +349,7 @@ def guide_set_activities_percentile(self, window_start, window_end,
         """Compute percentiles of activity across target sequences for
         a guide set in a window.
 
-        For example, when percentiles is 0.5, this returns the median
+        For example, when percentiles is 50, this returns the median
         activity across the target sequences that the guide set provides.
 
         Args:
diff --git a/adapt/target_search.py b/adapt/target_search.py
index cd8d280..4a326af 100644
--- a/adapt/target_search.py
+++ b/adapt/target_search.py
@@ -12,6 +12,7 @@
 import logging
 import math
 import os
+import numpy as np
 
 from adapt import guide_search
 from adapt import primer_search
@@ -28,7 +29,8 @@ def __init__(self, ps, gs, obj_type='min',
             max_primers_at_site=None, max_target_length=None,
             obj_weights=None, only_account_for_amplified_seqs=False,
-            halt_early=False, obj_value_shift=None):
+            halt_early=False, obj_value_shift=None,
+            mutator=None):
         """
         Args:
             ps: PrimerSearcher object
@@ -62,6 +64,8 @@ def __init__(self, ps, gs, obj_type='min',
                 positive so that there is not confusion about differences
                 between positive/negative. If None (default), defaults are
                 set below based on obj_type.
+            mutator: an adapt.utils.mutate Mutator object. If None (default),
+                do not predict activity after mutations.
         """
         self.ps = ps
         self.gs = gs
@@ -99,6 +103,8 @@
             # negative, so shift all up so that most are positive
             self.obj_value_shift = 4.0
 
+        self.mutator = mutator
+
     def _find_primer_pairs(self):
         """Find suitable primer pairs using self.ps.
 
@@ -115,6 +121,33 @@ def _find_primer_pairs(self):
                 p2 = primers[j]
                 yield (p1, p2)
 
+    def _find_mutated_activity(self, targets):
+        """Find the activity of each target's guide set after mutation.
+
+        For each target, compute the average mutated activity across the
+        target sequences for every guide at each of its potential start
+        positions, and take the maximum over start positions and guides.
+
+        Args:
+            targets: list of tuples (obj_value, target) where target is a tuple
+                ((p1, p2), (guides_stats, guides))
+
+        Returns:
+            list of mutated activities, in the same order as targets
+        """
+        mutated_activities = [None] * len(targets)
+        for i, (obj_value, target) in enumerate(targets):
+            ((p1, p2), (guides_stats, guides)) = target
+            mutated_activities[i] = max(max(np.average(
+                self.gs.aln.compute_activity(
+                    start_pos,
+                    guide,
+                    self.gs.predictor,
+                    self.mutator))
+                for start_pos in self.gs._selected_guide_positions[guide])
+                for guide in guides)
+        return mutated_activities
+
     def find_targets(self, best_n=10, no_overlap='amplicon'):
         """Find targets across an alignment.
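[Note on the reduction in _find_mutated_activity above: it averages the per-sequence mutated activities for one guide at one start position, then takes the maximum over that guide's start positions and over the guides in the set. A minimal standalone sketch of just that reduction; activities_for is a hypothetical stand-in for Alignment.compute_activity(start, guide, predictor, mutator), and the guide set and positions are illustrative:

    import numpy as np

    def activities_for(guide, start):
        # Hypothetical stand-in: one mutated-activity value per aligned
        # sequence (e.g., a 5th-percentile activity over simulated
        # mutations of that sequence)
        rng = np.random.default_rng(abs(hash((guide, start))) % (2**32))
        return rng.random(5)

    # guide -> candidate start positions (cf. gs._selected_guide_positions)
    guide_positions = {'ACGT': [10, 12], 'TTGA': [40]}

    # Mean over sequences, then max over start positions and guides
    best = max(
        max(np.average(activities_for(guide, start)) for start in starts)
        for guide, starts in guide_positions.items())
]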
@@ -520,10 +553,12 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                 When 'none', many targets in the best_n may be very similar
         """
         targets = self.find_targets(best_n=best_n, no_overlap=no_overlap)
+        if self.mutator:
+            mutated_activity = self._find_mutated_activity(targets)
 
         with open(out_fn, 'w') as outf:
             # Write a header
-            outf.write('\t'.join(['objective-value', 'target-start', 'target-end',
+            headers = ['objective-value', 'target-start', 'target-end',
                 'target-length', 'left-primer-start', 'left-primer-num-primers',
                 'left-primer-frac-bound', 'left-primer-target-sequences',
@@ -533,10 +568,13 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                 'guide-set-expected-activity',
                 'guide-set-median-activity', 'guide-set-5th-pctile-activity',
                 'guide-expected-activities',
-                'guide-target-sequences', 'guide-target-sequence-positions']) +
-                '\n')
+                'guide-target-sequences', 'guide-target-sequence-positions']
+            if self.mutator:
+                headers.append('guide-set-5th-pctile-mutated-activity')
+
+            outf.write('\t'.join(headers) + '\n')
 
-            for (obj_value, target) in targets:
+            for i, (obj_value, target) in enumerate(targets):
                 ((p1, p2), (guides_stats, guides)) = target
 
                 # Break out guides_stats
@@ -585,6 +623,8 @@ def find_and_write_targets(self, out_fn, best_n=10, no_overlap='amplicon'):
                     guides_activity_median, guides_activity_5thpctile,
                     expected_activities_per_guide_str,
                     guides_seqs_str, guides_positions_str]
+                if self.mutator:
+                    line.append(mutated_activity[i])
                 outf.write('\t'.join([str(x) for x in line]) + '\n')
 
diff --git a/adapt/tests/test_alignment.py b/adapt/tests/test_alignment.py
index 7628826..6bd4aa5 100644
--- a/adapt/tests/test_alignment.py
+++ b/adapt/tests/test_alignment.py
@@ -572,17 +572,16 @@ def test_position_entropy_simple(self):
                 'AAAAA']
         aln = alignment.Alignment.from_list_of_seqs(seqs)
         all_ps = [[1],
-            [0.25, 0.75],
-            [0.25, 0.25, .5],
+                  [0.25, 0.75],
+                  [0.25, 0.25, .5],
                   [0.25, 0.25, 0.25, 0.25],
                   [0.5, 0.5]]
         entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
 
-        self.assertEqual(aln.position_entropy(),
-            entropy)
+        self.assertEqual(aln.position_entropy(), entropy)
 
     def test_position_entropy_with_ambiguity(self):
-        seqs = ['MRWSYKVHDBN']
+        seqs = ['MRWSYKVHDBN-']
         aln = alignment.Alignment.from_list_of_seqs(seqs)
         all_ps = [[0.5, 0.5],
                   [0.5, 0.5],
@@ -590,15 +589,42 @@ def test_position_entropy_with_ambiguity(self):
                   [0.5, 0.5],
                   [0.5, 0.5],
                   [0.5, 0.5],
-            [1/3.0, 1/3.0, 1/3.0],
-            [1/3.0, 1/3.0, 1/3.0],
-            [1/3.0, 1/3.0, 1/3.0],
-            [1/3.0, 1/3.0, 1/3.0],
-            [0.25, 0.25, 0.25, 0.25]]
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [1/3.0, 1/3.0, 1/3.0],
+                  [0.25, 0.25, 0.25, 0.25],
+                  [1]]
         entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps]
 
-        self.assertEqual(aln.position_entropy(),
-            entropy)
+        self.assertEqual(aln.position_entropy(), entropy)
+
+    def test_base_percentages_simple(self):
+        seqs = ['ACCCC',
+                'AAGGC',
+                'AAATA',
+                'AAAAA']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        base_p = {
+            'A': 0.6,
+            'C': 0.25,
+            'G': 0.1,
+            'T': 0.05
+        }
+
+        self.assertEqual(aln.base_percentages(), base_p)
+
+    def test_base_percentages_with_ambiguity(self):
+        seqs = ['MRWSYKVHDBN-']
+        aln = alignment.Alignment.from_list_of_seqs(seqs)
+        base_p = {
+            'A': 0.25,
+            'C': 0.25,
+            'G': 0.25,
+            'T': 0.25
+        }
+
+        self.assertEqual(aln.base_percentages(), base_p)
 
     def test_construct_from_0_seqs(self):
         with self.assertRaises(Exception):
diff --git a/adapt/tests/test_target_search.py b/adapt/tests/test_target_search.py
index 1462991..750d161 100644
--- a/adapt/tests/test_target_search.py
+++ b/adapt/tests/test_target_search.py
@@ -9,7 +9,7 @@
 from adapt import guide_search
 from adapt import primer_search
 from adapt import target_search
-from adapt.utils import lsh
+from adapt.utils import lsh, mutate, predict_activity
 
 __author__ = 'Hayden Metsky '
 
@@ -339,6 +339,24 @@ def test_find_targets_allowing_overlap_maximize_random_greedy(self):
             'random-greedy', 1, 3, 100, check_for_one_guide=True,
             check_for_no_targets=True)
 
+    def test_find_mutated_activity(self):
+        mutator = mutate.GTRSubstitutionMutator(self.a_aln, 1, 1, 1, 1, 1, 1,
+                                                1, 1, 1)
+        predictor = predict_activity.SimpleBinaryPredictor(
+            6, False, required_flanking_seqs=('A', None))
+        c_ps = primer_search.PrimerSearcher(self.a_aln, 4, 0, 1.0, (1, 1, 100))
+        c_gs = guide_search.GuideSearcherMaximizeActivity(
+            self.a_aln, 6, 1, 5, 0.05, (1, 1, 100), algorithm='greedy',
+            predictor=predictor)
+        c = target_search.TargetSearcher(c_ps, c_gs, max_primers_at_site=2,
+                                         obj_type='max', mutator=mutator)
+        for best_n in [1, 2, 3, 4, 5, 6]:
+            targets = c.find_targets(best_n=best_n, no_overlap='none')
+            mut_activities = c._find_mutated_activity(targets)
+            self.assertEqual(len(mut_activities), best_n)
+            for mut_activity in mut_activities:
+                self.assertIsInstance(mut_activity, float)
+
     def tearDown(self):
         # Re-enable logging
         logging.disable(logging.NOTSET)
diff --git a/adapt/utils/mutate.py b/adapt/utils/mutate.py
new file mode 100644
index 0000000..cf7cef5
--- /dev/null
+++ b/adapt/utils/mutate.py
@@ -0,0 +1,201 @@
+"""Use GTR to model the distribution of future viral sequences.
+"""
+
+import numpy as np
+from adapt import alignment
+from adapt.utils import predict_activity
+from scipy.linalg import expm
+
+__author__ = 'David K. Yang , Priya P. Pillai '
+
+
+class GTRSubstitutionMutator:
+    """Use the GTR substitution model to mutate viral sequences.
+    """
+    def __init__(self, aln,
+                 rAC, rAG, rAT, rCG, rCT, rGT,
+                 mu, t, n):
+        """
+        Args:
+            aln: An alignment of sequences to use to determine base percentages
+            rAC: Relative rate of substitution between A and C
+            rAG: Relative rate of substitution between A and G
+            rAT: Relative rate of substitution between A and T
+            rCG: Relative rate of substitution between C and G
+            rCT: Relative rate of substitution between C and T
+            rGT: Relative rate of substitution between G and T
+            mu: Overall rate of substitutions per site per year
+            t: Years to simulate substitutions over
+            n: Number of sequences to simulate mutations over
+        """
+        base_percentages = aln.base_percentages()
+        self.piA = base_percentages['A']
+        self.piC = base_percentages['C']
+        self.piG = base_percentages['G']
+        self.piT = base_percentages['T']
+        self.rAC = rAC
+        self.rAG = rAG
+        self.rAT = rAT
+        self.rCG = rCG
+        self.rCT = rCT
+        self.rGT = rGT
+        self.mu = mu
+        self.t = t
+        self.n = n
+
+        self.Q = self._construct_rateQ()
+        self.P = self._construct_P()
+
+    def _construct_rateQ(self):
+        """Compute the transition rate matrix.
+
+        Computes the transition rate matrix Q, given base frequencies and
+        substitution rates under the GTR model (Tavaré 1986).
+
+        The transition rate matrix gives the rate at which each base mutates
+        to each other base. The rows indicate the starting base; the columns
+        indicate the final base. Bases are ordered A, C, G, T. Diagonal
+        elements are set such that each row sums to 0.
+ """ + + beta = 1 / (2 * ( + self.piA * (self.rAC*self.piC + + self.rAG*self.piG + + self.rAT*self.piT) + + self.piC * (self.rCG*self.piG + + self.rCT*self.piT) + + self.piG * (self.rGT*self.piT) + )) + Q = np.array([ + [-(self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT), + self.rAC*self.piC, + self.rAG*self.piG, + self.rAT*self.piT], + [self.rAC*self.piA, + -(self.rAC*self.piA + self.rCG*self.piG + self.rCT*self.piT), + self.rCG*self.piG, + self.rCT*self.piT], + [self.rAG*self.piA, + self.rCG*self.piC, + -(self.rAG*self.piA + self.rCG*self.piC + self.rGT*self.piT), + self.rGT*self.piT], + [self.rAT*self.piA, + self.rCT*self.piC, + self.rGT*self.piG, + -(self.rAT*self.piA + self.rCT*self.piC + self.rGT*self.piG)] + ]) + + return beta * Q + + def _construct_P(self): + """Compute transition probability matrix + + Computes transition probability matrix P from rate matrix Q, + substitution rate m, and time t + + The transition probablility matrix defines the likelihood each base is + mutated to each other base. The rows indicate the starting base; the + columns indicate the final base. Bases are ordered A, C, G, T. + """ + + P = expm(self.Q * self.mu * self.t) + # Normalize so each row sums to 1. + # Matrix exponentiation should already do this in principle + row_sums = P.sum(axis=1) + normalized_matrix = P / row_sums[:, np.newaxis] + return normalized_matrix + + def _seq_to_encoding(self, seq): + """Encode string sequence into a nucleotide index list + + Map 'A' to 0, 'C' to 1, 'G' to 2, and 'T' to 3 + (e.g. "ACGT" returns [0, 1, 2, 3]) + + Args: + str: nucleotide sequence + + Returns: + list(int): list of nucleotide indexs. + """ + + base_key = "ACGT" + seq_as_list = list(seq) + return [base_key.index(base) for base in seq_as_list] + + def compute_sequence_probability(self, wild_seq, mut_seq): + """Compute probability of wild_seq mutating into mut_seq + + Under transition probability matrix P, finds probability of wild_seq + mutating into mut_seq + + Args: + wild_seq: reference sequence + mut_seq: mutated sequence to compute probability for + + Returns: + float: probability of mutated sequence + """ + + wild_seq_encoded = self._seq_to_encoding(wild_seq) + mut_seq_encoded = self._seq_to_encoding(mut_seq) + + prob = 1.0 + for a, b in zip(wild_seq_encoded, mut_seq_encoded): + prob = prob * self.P[a][b] + + return prob + + def mutate(self, wild_seq): + """Sample future sequences under transition probability matrix P + + Args: + wild_seq: reference sequence + + Returns: + list(str): list of samples from the distribution of future + sequences + """ + + wild_seq_encoded = self._seq_to_encoding(wild_seq) + sampled_seq_matrix = [] + + for res in wild_seq_encoded: + samples = np.random.choice(["A", "C", "G", "T"], self.n, + p=self.P[res]) + sampled_seq_matrix.append(samples) + + sampled_seqs_list = np.array(sampled_seq_matrix).transpose() + return ["".join(seq) for seq in sampled_seqs_list] + + def compute_mutated_activity(self, predictor, target_seq, + guide_seq, start=0): + """Calculate the activity of the guide after mutating the target + + Args: + predictor: an adapt.utils.predict_activity Predictor object + target_seq: string of what sequence the guide targets. Includes + context if predictor requires it + guide_seq: string of what the guide is + start: int, start position for the guide sequence in the alignment. 
+                Required to use predictor memoization, if it exists
+
+        Returns:
+            The 5th percentile of activity values from simulated mutations
+        """
+        mutated_target_seqs = self.mutate(target_seq)
+        if isinstance(predictor, predict_activity.SimpleBinaryPredictor):
+            mutated_aln = alignment.Alignment.from_list_of_seqs(
+                mutated_target_seqs)
+            left_context = 0
+            if predictor.required_flanking_seqs[0]:
+                left_context = len(predictor.required_flanking_seqs[0])
+            _, activity = predictor.compute_activity(left_context, guide_seq,
+                                                     mutated_aln, percentiles=5)
+        else:
+            pairs_to_eval = []
+            for mutated_target_seq in mutated_target_seqs:
+                pair = (mutated_target_seq, guide_seq)
+                pairs_to_eval.append(pair)
+            _, activity = predictor.compute_activity(start, pairs_to_eval,
+                                                     percentiles=5)
+        return activity
diff --git a/adapt/utils/predict_activity.py b/adapt/utils/predict_activity.py
index 2952e2a..5eef5d4 100644
--- a/adapt/utils/predict_activity.py
+++ b/adapt/utils/predict_activity.py
@@ -377,16 +377,20 @@ def determine_highly_active(self, start_pos, pairs):
         mem = self._memoized_evaluations[start_pos]
         return [mem[pair][1] for pair in pairs]
 
-    def compute_activity(self, start_pos, pairs):
+    def compute_activity(self, start_pos, pairs, percentiles=None):
         """Compute a single activity measurement for pairs.
 
         Args:
             start_pos: start position of all guides in pairs; used for
                 memoizations
             pairs: list of tuples (target with context, guide)
+            percentiles: single percentile or list of percentiles to compute,
+                each in [0,100] (0 is minimum, 100 is maximum)
 
         Returns:
-            activity value for each pair
+            If percentiles is None, a list of the activity value for each
+            pair. Otherwise, a tuple of (that list, the given percentile(s)
+            of those activities)
         """
         # Determine which pairs do not have memoized results, and call
         # these
@@ -397,7 +401,11 @@ def compute_activity(self, start_pos, pairs):
                 if pair not in mem]
         self._run_models_and_memoize(start_pos, unique_pairs_to_evaluate)
 
-        return [mem[pair][0] for pair in pairs]
+        activities = [mem[pair][0] for pair in pairs]
+        if percentiles is not None:
+            percentile_activity = np.percentile(activities, percentiles)
+            return (activities, percentile_activity)
+        return activities
 
     def cleanup_memoized(self, start_pos):
         """Cleanup memoizations no longer needed at a start position.
@@ -438,7 +446,7 @@ def __init__(self, mismatches, allow_gu_pairs,
 
         self.rough_max_activity = 1.0
 
-    def compute_activity(self, start_pos, gd_sequence, aln):
+    def compute_activity(self, start_pos, gd_sequence, aln, percentiles=None):
         """Compute activity by checking hybridization across an alignment.
 
         This says the activity is 1.0 if a guide is deemed to bind to
@@ -448,10 +456,15 @@ def compute_activity(self, start_pos, gd_sequence, aln):
             start_pos: start position of guide sequence in aln
             gd_sequence: str representing guide sequence
             aln: alignment.Alignment object
+            percentiles: single percentile or list of percentiles to compute,
+                each in [0,100] (0 is minimum, 100 is maximum)
 
         Returns:
-            numpy array x where x[i] gives the activity (1 or 0) between
-            gd_sequence and the sequence in the alignment at index i
+            If percentiles is None, numpy array x where x[i] gives the
+            activity (1 or 0) between gd_sequence and the sequence in the
+            alignment at index i. Otherwise, a tuple of (the above, the
+            given percentile(s) of the activities)
         """
         # Start with all 0s
         activities = np.zeros(aln.num_sequences)
@@ -465,6 +478,10 @@ def compute_activity(self, start_pos, gd_sequence, aln):
         for seq_idx in seqs_bound:
             activities[seq_idx] = 1.0
 
+        if percentiles is not None:
+            percentile_activity = np.percentile(activities, percentiles)
+            return (activities, percentile_activity)
+
         return activities
 
     def cleanup_memoized(self, start_pos):
diff --git a/adapt/utils/tests/test_mutate.py b/adapt/utils/tests/test_mutate.py
new file mode 100644
index 0000000..c4b9cdf
--- /dev/null
+++ b/adapt/utils/tests/test_mutate.py
@@ -0,0 +1,77 @@
+"""Tests the mutate module.
+"""
+
+import unittest
+
+import numpy as np
+
+from adapt import alignment
+from adapt.utils import mutate, predict_activity
+
+__author__ = 'Priya Pillai '
+
+
+class TestGTRSubstitutionMutator(unittest.TestCase):
+    """Tests methods in the GTRSubstitutionMutator class.
+    """
+    def setUp(self):
+        self.seqs = ['AC',
+                     'GT']
+        aln_a = alignment.Alignment.from_list_of_seqs(self.seqs)
+        self.mutator_a = mutate.GTRSubstitutionMutator(aln_a,
+                                                       1, 1, 1, 1, 1, 1,
+                                                       1, 1000, 10)
+        self.wild_seq = 'AAAAAAAA'
+        self.mutated_seqs = self.mutator_a.mutate(self.wild_seq)
+
+        aln_b = alignment.Alignment.from_list_of_seqs(self.mutated_seqs)
+        rates = np.random.rand(6)
+        self.mutator_b = mutate.GTRSubstitutionMutator(aln_b,
+                                                       *rates,
+                                                       1, 1, 5)
+
+    def test_construct_rateQ(self):
+        expected_a_Q = [[-1, 1/3, 1/3, 1/3],
+                        [1/3, -1, 1/3, 1/3],
+                        [1/3, 1/3, -1, 1/3],
+                        [1/3, 1/3, 1/3, -1]]
+        np.testing.assert_allclose(self.mutator_a.Q, expected_a_Q)
+
+        Q = self.mutator_b.Q
+        for row in Q:
+            self.assertAlmostEqual(sum(row), 0)
+
+    def test_construct_P(self):
+        expected_a_P = [[0.25, 0.25, 0.25, 0.25],
+                        [0.25, 0.25, 0.25, 0.25],
+                        [0.25, 0.25, 0.25, 0.25],
+                        [0.25, 0.25, 0.25, 0.25]]
+        np.testing.assert_allclose(self.mutator_a.P, expected_a_P)
+
+        P = self.mutator_b.P
+        for row in P:
+            self.assertAlmostEqual(sum(row), 1)
+
+    def test_mutate(self):
+        self.assertEqual(len(self.mutated_seqs), self.mutator_a.n)
+        for mutated_seq in self.mutated_seqs:
+            self.assertEqual(len(mutated_seq), len(self.wild_seq))
+
+    def test_compute_sequence_probability(self):
+        for mutated_seq in self.mutated_seqs:
+            self.assertAlmostEqual(self.mutator_a.compute_sequence_probability(
+                self.wild_seq, mutated_seq), (0.25**8))
+
+    def test_compute_mutated_activity(self):
+        predictor_a = predict_activity.SimpleBinaryPredictor(8, False)
+        activity = self.mutator_a.compute_mutated_activity(predictor_a,
+                                                           self.wild_seq,
+                                                           self.wild_seq)
+        self.assertEqual(activity, 1)
+
+        predictor_a.mismatches = 0
+        predictor_a.required_flanking_seqs = ('A', None)
+        activity = self.mutator_a.compute_mutated_activity(predictor_a,
+                                                           'A'+self.wild_seq,
+                                                           self.wild_seq)
+        self.assertEqual(activity, 0)
diff --git a/bin/design.py b/bin/design.py
b/bin/design.py index e7b5bd3..b3e75aa 100644 --- a/bin/design.py +++ b/bin/design.py @@ -25,6 +25,7 @@ from adapt.utils import predict_activity from adapt.utils import seq_io from adapt.utils import year_cover +from adapt.utils import mutate from adapt.utils.version import get_project_path, get_latest_model_version try: @@ -678,9 +679,22 @@ def guide_is_suitable(guide): "--predict-cas13a-activity-model must be specified if " "--obj is 'maximize-activity' (unless " "--use-simple-binary-activity-prediction is set)")) + if args.predict_activity_degradation: + raise Exception(("--predict-activity-model-path must be " + "specified if --predict-activity-degradation is set " + "(unless --use-simple-binary-activity-prediction is set)")) # Do not predict activity predictor = None + # Construct mutator, if necessary + if args.predict_activity_degradation: + mutator = mutate.GTRSubstitutionMutator(aln, *args.predict_activity_degradation, + args.predict_activity_degradation_mu, + args.predict_activity_degradation_t, + args.predict_activity_degradation_n) + else: + mutator = None + # Find an optimal set of guides for each window in the genome, # and write them to a file; ensure that the selected guides are # specific to this alignment @@ -723,7 +737,7 @@ def guide_is_suitable(guide): args.out_tsv[i], window_step=args.window_step, sort=args.sort_out, - print_analysis=(args.log_level == logging.INFO)) + print_analysis=(args.log_level<=logging.INFO)) elif args.search_cmd == 'complete-targets': # Find optimal targets (primer and guide set combinations), # and write them to a file @@ -748,7 +762,7 @@ def guide_is_suitable(guide): max_target_length=args.max_target_length, obj_weights=args.obj_fn_weights, only_account_for_amplified_seqs=args.only_account_for_amplified_seqs, - halt_early=args.halt_search_early) + halt_early=args.halt_search_early, mutator=mutator) ts.find_and_write_targets(args.out_tsv[i], best_n=args.best_n_targets, no_overlap=args.do_not_overlap) else: @@ -1076,6 +1090,29 @@ def __call__(self, parser, namespace, values, option_string=None): "neither --predict-activity-model-path nor " "--predict-cas13a-activity-model should be set when this " "is set.")) + # Predict activity degradation + base_subparser.add_argument('--predict-activity-degradation', + nargs=6, type=float, + help=("If set, predict the degradation in activity over time due to " + "substitution using the General Time-Reversible model. Six " + "arguments should be set: the relative rates of substitutions " + "per year between (1) A and C, (2) A and G, (3) A and T, " + "(4) C and G, (5) C and T, & (6) G and T. Each of these should " + "be between 0 and 1. Base pair frequencies will be calculated " + "from input sequences. The 5th percentile of simulated activities " + "will be reported.")) + base_subparser.add_argument('--predict-activity-degradation-t', + type=int, default=5, + help=("Amount of time to simulate substitutions over, in years " + "(defaults to 5)")) + base_subparser.add_argument('--predict-activity-degradation-mu', + type=float, default=0.001, + help=("Overall rate of substitutions per site per year (defaults to 0.001)")) + base_subparser.add_argument('--predict-activity-degradation-n', + type=int, default=500, + help=("Number of sequences to simulate mutations over (defaults " + "to 500). Higher values increase accuracy, but also increase" + "runtime.")) # Technical options base_subparser.add_argument('--do-not-memoize-guide-computations',