From 97920db0fc9e4a4ecd85aea12b9bd597a612b38d Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Mon, 4 Jan 2021 12:56:25 -0600 Subject: [PATCH 1/7] Moving David's GTR model code to this repo Moving David's model code from adapt-seq-design to adapt --- adapt/gtr_substitution_model.py | 118 ++++++++++++++++++++++++++++++++ bin/design.py | 20 +++--- 2 files changed, 128 insertions(+), 10 deletions(-) create mode 100644 adapt/gtr_substitution_model.py diff --git a/adapt/gtr_substitution_model.py b/adapt/gtr_substitution_model.py new file mode 100644 index 0000000..8109ede --- /dev/null +++ b/adapt/gtr_substitution_model.py @@ -0,0 +1,118 @@ +"""Use GTR to model distribution of future viral sequences +""" + +import numpy as np +from scipy.linalg import expm + +__author__ = 'David K. Yang ' + + +class GTRSubstitutionModel: + """GTR Substitution model to model distribution of future viral sequences + """ + def __init__(self, piA, piC, piG, piT, + rAC, rAG, rAT, rCG, rCT, rGT, + mu, t, guide_length): + self.piA = piA + self.piC = piC + self.piG = piG + self.piT = piT + self.rAC = rAC + self.rAG = rAG + self.rAT = rAT + self.rCG = rCG + self.rCT = rCT + self.rGT = rGT + self.mu = mu + self.t = t + + self.Q = self._construct_rateQ() + self.P = self._construct_P() + + def _construct_rateQ(self): + """Computes transition rate matrix Q, given base frequencies and transition rates under GTR model (Tavaré 1986). + """ + + beta = 1 / (2*(self.piA * (self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT) + self.piC * (self.rCG*self.piG + self.rCT*self.piT) + self.piG*self.piT)) + Q = np.array([ + [-(self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT), self.rAC*self.piC, self.rAG*self.piG, self.rAT*self.piT], + [self.rAC*self.piA, -(self.rAC*self.piA + self.rCG*self.piG + self.rCT*self.piT), self.rCG*self.piG, self.rCT*self.piT], + [self.rAG*self.piA, self.rCG*self.piC, -(self.rAG*self.piA + self.rCG*self.piC + self.rGT*self.piT), self.rGT*self.piT], + [self.rAT*self.piA, self.rCT*self.piC, self.rGT*self.piG, -(self.rAT*self.piA + self.rCT*self.piC + self.rGT*self.piG)] + ]) + + return beta * Q + + def _construct_P(self): + """Computes transition probability matrix P from rate matrix Q, substitution rate m, and time t + """ + + P = expm(self.Q * self.mu * self.t) + row_sums = P.sum(axis=1) + normalized_matrix = P / row_sums[:, np.newaxis] # normalize so each row sums to 1. Matrix exponentiation should already do this in principle + return normalized_matrix + + def _seq_to_encoding(self, seq): + """Encodes string sequence into an AA index list. e.g. "ACGT" returns [0, 1, 2, 3] + + Args: + str: NT sequence + + Returns: + list(int): list of AA idxs. + """ + + base_key = "ACGT" + seq_as_list = list(seq) + return [base_key.index(base) for base in seq_as_list] + + def compute_sequence_probability(self, wild_seq, mut_seq): + """Computes probability of wild_seq mutating into mut_seq, under transition probability matrix P + + Args: + str: reference sequence + str: mutated sequence to compute probability for + list(list(float)): transition probability matrix P + + Returns: + float: probability of mutated sequence + """ + + wild_seq_encoded = self._seq_to_encoding(wild_seq) + mut_seq_encoded = self._seq_to_encoding(mut_seq) + + prob = 1.0 + for a, b in zip(wild_seq_encoded, mut_seq_encoded): + prob = prob * self.P[a][b] + + return prob + + def naive_sampler(self, wild_seq, n): + """Sample future sequences under transition probability matrix P + + Args: + str: reference sequence + list(list(float)): transition probability matrix P + int: number of samples + + Returns: + list(str): list of samples from the distribution of future sequences + """ + + wild_seq_encoded = self._seq_to_encoding(wild_seq) + sampled_seq_matrix = [] + + for res in wild_seq_encoded: + samples = np.random.choice(["A", "C", "G", "T"], n, p=self.P[res]) + sampled_seq_matrix.append(samples) + + sampled_seqs_list = np.array(sampled_seq_matrix).transpose() + return ["".join(seq) for seq in sampled_seqs_list] + + +class MutationEffectPredictor: + """Methods to predict the effect of mutations on guide activity""" + def __init__(self, alignment, gsm, predictor): + self.alignment = alignment + self.gsm = gsm + self.predictor = predictor diff --git a/bin/design.py b/bin/design.py index 0874567..08fb600 100644 --- a/bin/design.py +++ b/bin/design.py @@ -262,10 +262,10 @@ def prepare_alignments(args): if not os.path.exists(align_stat_memoize_dir): os.makedirs(align_stat_memoize_dir) - am = align.AlignmentMemoizer(align_memoize_dir, + am = align.AlignmentMemoizer(align_memoize_dir, aws_access_key_id = args.aws_access_key_id, aws_secret_access_key = args.aws_secret_access_key) - asm = align.AlignmentStatMemoizer(align_stat_memoize_dir, + asm = align.AlignmentStatMemoizer(align_stat_memoize_dir, aws_access_key_id = args.aws_access_key_id, aws_secret_access_key = args.aws_secret_access_key) else: @@ -351,13 +351,13 @@ def prepare_alignments(args): nc, specific_against_metadata_acc = prepare_alignment.prepare_for( tax_id, segment, ref_accs, aln_file_dir.name, aln_memoizer=am, aln_stat_memoizer=asm, - sample_seqs=args.sample_seqs, + sample_seqs=args.sample_seqs, prep_influenza=args.prep_influenza, years_tsv=years_tsv_tmp_name, cluster_threshold=args.cluster_threshold, accessions_to_use=accessions_to_use_for_tax, sequences_to_use=sequences_to_use_for_tax, - meta_filt=meta_filt, + meta_filt=meta_filt, meta_filt_against=meta_filt_against) for i in range(nc): @@ -478,7 +478,7 @@ def design_for_id(args): specific_against_metadata_start = len(alns) for i in range(num_aln_for_design): if len(args.specific_against_metadata_accs[i]) > 0: - logger.info(("Fetching %d sequences within taxon %d to be specific against"), + logger.info(("Fetching %d sequences within taxon %d to be specific against"), len(args.specific_against_metadata_accs[i]), args.taxid_for_fasta[i]) seqs = prepare_alignment.fetch_sequences_for_acc_list(list( args.specific_against_metadata_accs[i])) @@ -505,7 +505,7 @@ def design_for_id(args): alns += [seqs_list] specific_against_exists = ((len(args.specific_against_fastas) > 0 or - args.specific_against_taxa is not None) or + args.specific_against_taxa is not None) or (specific_against_metadata_end - specific_against_metadata_start) > 0) required_guides, blacklisted_ranges, blacklisted_kmers = \ @@ -570,7 +570,7 @@ def design_for_id(args): required_guides_for_aln = required_guides[i] blacklisted_ranges_for_aln = blacklisted_ranges[i] alns_in_same_taxon = aln_with_taxid[taxid] - # For metadata filtering, we only want to be specific against the + # For metadata filtering, we only want to be specific against the # accessions in alns[specific_against_metadata_index] specific_against_metadata_index = specific_against_metadata_indices[i] \ if i in specific_against_metadata_indices else None @@ -603,9 +603,9 @@ def guide_is_suitable(guide): for j in range(num_aln_for_design): if j in alns_in_same_taxon: aq.mask_aln(j) - logger.info(("Masking alignment %d as it is taxon %d"), + logger.info(("Masking alignment %d as it is taxon %d"), j + 1, taxid) - # Also mask any specific against metadata sequence lists that do not + # Also mask any specific against metadata sequence lists that do not # match the one for this alignment if specific_against_metadata_index is not None: logger.info(("Masking all sequence lists for specificity except the one " @@ -694,7 +694,7 @@ def guide_is_suitable(guide): args.out_tsv[i], window_step=args.window_step, sort=args.sort_out, - print_analysis=(args.log_level==logging.INFO)) + print_analysis=(args.log_level<=logging.INFO)) elif args.search_cmd == 'complete-targets': # Find optimal targets (primer and guide set combinations), # and write them to a file From 0238ef9db6a33d9394d20c68abb2eac4008154e6 Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Fri, 9 Apr 2021 19:37:20 -0400 Subject: [PATCH 2/7] Add mutated activity argument to TargetSearcher Change GTRSubstitutionModel into slightly more generic mutator. make mutation predictor, add mutation predictor object to TargetSearcher. Does not yet have command line argument access, add anything to output, nor aggregate the mutated activities. --- adapt/target_search.py | 22 +++++++++++- .../mutate.py} | 22 ++++-------- adapt/utils/predict_activity.py | 35 +++++++++++++++++++ 3 files changed, 62 insertions(+), 17 deletions(-) rename adapt/{gtr_substitution_model.py => utils/mutate.py} (85%) diff --git a/adapt/target_search.py b/adapt/target_search.py index b12280b..87ad173 100644 --- a/adapt/target_search.py +++ b/adapt/target_search.py @@ -28,7 +28,8 @@ def __init__(self, ps, gs, obj_type='min', max_primers_at_site=None, max_target_length=None, obj_weights=None, only_account_for_amplified_seqs=False, - halt_early=False, obj_value_shift=None): + halt_early=False, obj_value_shift=None, + mutation_predictor=None): """ Args: ps: PrimerSearcher object @@ -62,6 +63,8 @@ def __init__(self, ps, gs, obj_type='min', positive so that there is not confusion about differences between positive/negative. If None (default), defaults are set below based on obj_type. + mutation_predictor: a adapt.utils.predict_activity MutationPredictor + object. If None (default), do not predict activity after mutations. """ self.ps = ps self.gs = gs @@ -99,6 +102,8 @@ def __init__(self, ps, gs, obj_type='min', # negative, so shift all up so that most are positive self.obj_value_shift = 4.0 + self.mutation_predictor = mutation_predictor + def _find_primer_pairs(self): """Find suitable primer pairs using self.ps. @@ -115,6 +120,19 @@ def _find_primer_pairs(self): p2 = primers[j] yield (p1, p2) + def _find_mutated_activity(self): + mutated_activities = [None] * len(targets) + for i, (obj_value, target) in enumerate(targets): + ((p1, p2), (guides_stats, guides)) = target + guides_seqs_sorted = sorted(list(guides)) + mutated_activities[i] = [[self.gs.aln.compute_activity( + start_pos, + guide, + self.mutation_predictor) \ + for start_pos in self.gs._selected_guide_positions[guide]] \ + for guide in guides_seqs_sorted] + return mutated_activities + def find_targets(self, best_n=10, no_overlap=True): """Find targets across an alignment. @@ -502,6 +520,8 @@ def find_and_write_targets(self, out_fn, best_n=10): objective value """ targets = self.find_targets(best_n=best_n) + if self.mutation_predictor: + mutated_activity = self._find_mutated_activity(targets) with open(out_fn, 'w') as outf: # Write a header diff --git a/adapt/gtr_substitution_model.py b/adapt/utils/mutate.py similarity index 85% rename from adapt/gtr_substitution_model.py rename to adapt/utils/mutate.py index 8109ede..d767256 100644 --- a/adapt/gtr_substitution_model.py +++ b/adapt/utils/mutate.py @@ -7,7 +7,7 @@ __author__ = 'David K. Yang ' -class GTRSubstitutionModel: +class GTRSubstitutionMutator: """GTR Substitution model to model distribution of future viral sequences """ def __init__(self, piA, piC, piG, piT, @@ -70,9 +70,8 @@ def compute_sequence_probability(self, wild_seq, mut_seq): """Computes probability of wild_seq mutating into mut_seq, under transition probability matrix P Args: - str: reference sequence - str: mutated sequence to compute probability for - list(list(float)): transition probability matrix P + wild_seq: reference sequence + mut_seq: mutated sequence to compute probability for Returns: float: probability of mutated sequence @@ -87,13 +86,12 @@ def compute_sequence_probability(self, wild_seq, mut_seq): return prob - def naive_sampler(self, wild_seq, n): + def mutate(self, wild_seq, n): """Sample future sequences under transition probability matrix P Args: - str: reference sequence - list(list(float)): transition probability matrix P - int: number of samples + wild_seq: reference sequence + n: number of samples (int) Returns: list(str): list of samples from the distribution of future sequences @@ -108,11 +106,3 @@ def naive_sampler(self, wild_seq, n): sampled_seqs_list = np.array(sampled_seq_matrix).transpose() return ["".join(seq) for seq in sampled_seqs_list] - - -class MutationEffectPredictor: - """Methods to predict the effect of mutations on guide activity""" - def __init__(self, alignment, gsm, predictor): - self.alignment = alignment - self.gsm = gsm - self.predictor = predictor diff --git a/adapt/utils/predict_activity.py b/adapt/utils/predict_activity.py index c04702f..bc3a56e 100644 --- a/adapt/utils/predict_activity.py +++ b/adapt/utils/predict_activity.py @@ -472,3 +472,38 @@ def cleanup_memoized(self, start_pos): no memoizations. """ pass + + +class MutationPredictor: + """Methods to predict the effect of mutations on guide activity""" + def __init__(self, mutator, predictor, n): + """ + Args: + mutator: a adapt.utils.mutate Mutator object + predictor: a Predictor object + n: number of child mutations per sequence + """ + self.mutator = mutator + self.predictor = predictor + self.n = n + self.context_nt = 0 + if not isinstance(predictor, predict_activity.SimpleBinaryPredictor): + self.context_nt = predictor.context_nt + + def compute_activity(self, start_pos, pairs): + """Compute a single activity measurement for pairs. + + Args: + start_pos: start position of all guides in pairs; used for + memoizations + pairs: list of tuples (target with context, guide) + + Returns: + activity value for each pair + """ + mutated_activities = [None * len(pairs)] + for i, pair in enumerate(pairs): + mutated_target_seqs = self.mutator.mutate(pair[0], self.n) + pairs = [(mutated_target_seq, guide_seq) for mutated_target_seq in mutated_target_seqs] + mutated_activities[i] = self.predictor.compute_activity(start_pos, pairs) + return mutated_activities From 938bf6cba8d7698b362ae59fb608898fc5fce190 Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Fri, 23 Apr 2021 20:02:43 -0400 Subject: [PATCH 3/7] Remove MutationPredictor, tests Calculates mutated activity, writes to output. Make number of sequences input to mutate. Add percentile argument to compute_activity for predictors. Tests for alignment and mutate. Still need tests for target search --- adapt/alignment.py | 52 +++++++++++--- adapt/guide_search.py | 10 +-- adapt/target_search.py | 41 ++++++----- adapt/tests/test_alignment.py | 50 ++++++++++---- adapt/utils/mutate.py | 113 +++++++++++++++++++++++++------ adapt/utils/predict_activity.py | 54 ++++----------- adapt/utils/tests/test_mutate.py | 77 +++++++++++++++++++++ bin/design.py | 34 +++++++++- 8 files changed, 328 insertions(+), 103 deletions(-) create mode 100644 adapt/utils/tests/test_mutate.py diff --git a/adapt/alignment.py b/adapt/alignment.py index 1c541b0..18ee8ad 100644 --- a/adapt/alignment.py +++ b/adapt/alignment.py @@ -732,7 +732,7 @@ def determine_representative_guides(self, start, guide_length, representatives.add(gd) return representatives - def compute_activity(self, start, gd_sequence, predictor): + def compute_activity(self, start, gd_sequence, predictor, mutator=None): """Compute activity between a guide sequence and every target sequence in the alignment. @@ -743,10 +743,13 @@ def compute_activity(self, start, gd_sequence, predictor): start: start position in alignment at which to target gd_sequence: str representing guide sequence predictor: a adapt.utils.predict_activity.Predictor object + mutator: a adapt.utils.mutate.Mutator object Returns: numpy array x where x[i] gives the predicted activity between - gd_sequence and the sequence in the alignment at index i + gd_sequence and the sequence in the alignment at index i. If + mutator is not None, x[i] gives the predicted activity after the + mutations specified in the mutator. """ guide_length = len(gd_sequence) assert start + guide_length <= self.seq_length @@ -785,12 +788,24 @@ def compute_activity(self, start, gd_sequence, predictor): # is best to batch these pairs_to_eval = [] pairs_to_eval_seq_idx = [] - for seq_with_context, seq_idx in seq_rows_with_context: - pair = (seq_with_context, gd_sequence) - pairs_to_eval += [pair] - pairs_to_eval_seq_idx += [seq_idx] - # Evaluate activity - evals = predictor.compute_activity(start, pairs_to_eval) + evals = [] + + if mutator: + for seq_with_context, seq_idx in seq_rows_with_context: + activity = mutator.computed_mutated_activity(predictor, + seq_with_context, + gd_sequence, + start=start) + evals.append(activity) + pairs_to_eval_seq_idx.append(seq_idx) + else: + for seq_with_context, seq_idx in seq_rows_with_context: + pair = (seq_with_context, gd_sequence) + pairs_to_eval.append(pair) + pairs_to_eval_seq_idx.append(seq_idx) + # Evaluate activity + evals = predictor.compute_activity(start, pairs_to_eval) + for activity, seq_idx in zip(evals, pairs_to_eval_seq_idx): # Fill in the activity for seq_idx activities[seq_idx] = activity @@ -860,6 +875,27 @@ def position_entropy(self): return position_entropy + def base_percentages(self): + """Determines the percentage of each base pair in the alignment. + + Returns: + dictionary of base pair to its percentage in the alignment + """ + counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0} + for i in range(self.seq_length): + for b in [self.seqs[i][j] for j in range(self.num_sequences)]: + if b in counts: + counts[b] += 1 + elif b in guide.FASTA_CODES: + for c in guide.FASTA_CODES[b]: + counts[c] += 1.0 / len(guide.FASTA_CODES[b]) + elif b != '-': + raise ValueError("Unknown base call %s" % b) + total = sum(counts.values()) + for base in counts: + counts[base] /= total + return counts + @staticmethod def from_list_of_seqs(seqs): """Construct a Alignment from aligned list of sequences. diff --git a/adapt/guide_search.py b/adapt/guide_search.py index c5e1cfc..5a598b2 100644 --- a/adapt/guide_search.py +++ b/adapt/guide_search.py @@ -349,7 +349,7 @@ def guide_set_activities_percentile(self, window_start, window_end, """Compute percentiles of activity across target sequences for a guide set in a window. - For example, when percentiles is 0.5, this returns the median + For example, when percentiles is 50, this returns the median activity across the target sequences that the guide set provides. Args: @@ -481,7 +481,7 @@ def _find_guides_for_each_window(self, window_size, raise ValueError(("window size must be < the length of the " "alignment")) if window_size < self.guide_length: - raise ValueError("window size must be >= guide length") + raise ValueError("window size must be >= guide length") for start in range(0, self.aln.seq_length - window_size + 1, window_step): @@ -676,7 +676,7 @@ def make_key(): key = (seqs_to_consider_frozen, num_needed_frozen) return key - + p = super()._compute_guide_memoized(start, construct_p, make_key, use_last=use_last) return p @@ -1170,7 +1170,7 @@ def find_guides_with_sliding_window(self, window_size, out_fn, num_with_min_count = sum(1 for x in guide_collections if len(x[2]) == min_count) - min_count_str = (str(min_count) + " guide" + + min_count_str = (str(min_count) + " guide" + ("s" if min_count > 1 else "")) stat_display = [ @@ -1287,7 +1287,7 @@ def obj_value(self, window_start, window_end, guide_set, activities=None): if activities is None: activities = self.guide_set_activities(window_start, window_end, guide_set) - + # Use the mean (i.e., uniform prior over target sequences) expected_activity = np.mean(activities) diff --git a/adapt/target_search.py b/adapt/target_search.py index 87ad173..2000c4d 100644 --- a/adapt/target_search.py +++ b/adapt/target_search.py @@ -12,6 +12,7 @@ import logging import math import os +import numpy as np from adapt import guide_search from adapt import primer_search @@ -29,7 +30,7 @@ def __init__(self, ps, gs, obj_type='min', max_target_length=None, obj_weights=None, only_account_for_amplified_seqs=False, halt_early=False, obj_value_shift=None, - mutation_predictor=None): + mutator=None): """ Args: ps: PrimerSearcher object @@ -63,8 +64,8 @@ def __init__(self, ps, gs, obj_type='min', positive so that there is not confusion about differences between positive/negative. If None (default), defaults are set below based on obj_type. - mutation_predictor: a adapt.utils.predict_activity MutationPredictor - object. If None (default), do not predict activity after mutations. + mutator: a adapt.utils.mutate Mutator object. If None (default), + do not predict activity after mutations. """ self.ps = ps self.gs = gs @@ -102,7 +103,7 @@ def __init__(self, ps, gs, obj_type='min', # negative, so shift all up so that most are positive self.obj_value_shift = 4.0 - self.mutation_predictor = mutation_predictor + self.mutator = mutator def _find_primer_pairs(self): """Find suitable primer pairs using self.ps. @@ -120,17 +121,18 @@ def _find_primer_pairs(self): p2 = primers[j] yield (p1, p2) - def _find_mutated_activity(self): + def _find_mutated_activity(self, targets): mutated_activities = [None] * len(targets) for i, (obj_value, target) in enumerate(targets): ((p1, p2), (guides_stats, guides)) = target - guides_seqs_sorted = sorted(list(guides)) - mutated_activities[i] = [[self.gs.aln.compute_activity( - start_pos, - guide, - self.mutation_predictor) \ - for start_pos in self.gs._selected_guide_positions[guide]] \ - for guide in guides_seqs_sorted] + mutated_activities[i] = max([max([np.average( + self.gs.aln.compute_activity( + start_pos, + guide, + self.gs.predictor, + self.mutator)) \ + for start_pos in self.gs._selected_guide_positions[guide]]) \ + for guide in guides]) return mutated_activities def find_targets(self, best_n=10, no_overlap=True): @@ -520,12 +522,12 @@ def find_and_write_targets(self, out_fn, best_n=10): objective value """ targets = self.find_targets(best_n=best_n) - if self.mutation_predictor: + if self.mutator: mutated_activity = self._find_mutated_activity(targets) with open(out_fn, 'w') as outf: # Write a header - outf.write('\t'.join(['objective-value', 'target-start', 'target-end', + headers = ['objective-value', 'target-start', 'target-end', 'target-length', 'left-primer-start', 'left-primer-num-primers', 'left-primer-frac-bound', 'left-primer-target-sequences', @@ -535,10 +537,13 @@ def find_and_write_targets(self, out_fn, best_n=10): 'guide-set-expected-activity', 'guide-set-median-activity', 'guide-set-5th-pctile-activity', 'guide-expected-activities', - 'guide-target-sequences', 'guide-target-sequence-positions']) + - '\n') + 'guide-target-sequences', 'guide-target-sequence-positions'] + if self.mutator: + headers.append('guide-set-5th-pctile-mutated-activity') - for (obj_value, target) in targets: + outf.write('\t'.join(headers) + '\n') + + for i, (obj_value, target) in enumerate(targets): ((p1, p2), (guides_stats, guides)) = target # Break out guides_stats @@ -587,6 +592,8 @@ def find_and_write_targets(self, out_fn, best_n=10): guides_activity_median, guides_activity_5thpctile, expected_activities_per_guide_str, guides_seqs_str, guides_positions_str] + if self.mutator: + line.append(mutated_activity[i]) outf.write('\t'.join([str(x) for x in line]) + '\n') diff --git a/adapt/tests/test_alignment.py b/adapt/tests/test_alignment.py index 7628826..6bd4aa5 100644 --- a/adapt/tests/test_alignment.py +++ b/adapt/tests/test_alignment.py @@ -572,17 +572,16 @@ def test_position_entropy_simple(self): 'AAAAA'] aln = alignment.Alignment.from_list_of_seqs(seqs) all_ps = [[1], - [0.25, 0.75], - [0.25, 0.25, .5], + [0.25, 0.75], + [0.25, 0.25, .5], [0.25, 0.25, 0.25, 0.25], [0.5, 0.5]] entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps] - self.assertEqual(aln.position_entropy(), - entropy) + self.assertEqual(aln.position_entropy(), entropy) def test_position_entropy_with_ambiguity(self): - seqs = ['MRWSYKVHDBN'] + seqs = ['MRWSYKVHDBN-'] aln = alignment.Alignment.from_list_of_seqs(seqs) all_ps = [[0.5, 0.5], [0.5, 0.5], @@ -590,15 +589,42 @@ def test_position_entropy_with_ambiguity(self): [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], - [1/3.0, 1/3.0, 1/3.0], - [1/3.0, 1/3.0, 1/3.0], - [1/3.0, 1/3.0, 1/3.0], - [1/3.0, 1/3.0, 1/3.0], - [0.25, 0.25, 0.25, 0.25]] + [1/3.0, 1/3.0, 1/3.0], + [1/3.0, 1/3.0, 1/3.0], + [1/3.0, 1/3.0, 1/3.0], + [1/3.0, 1/3.0, 1/3.0], + [0.25, 0.25, 0.25, 0.25], + [1]] entropy = [sum([-p*log2(p) for p in ps]) for ps in all_ps] - self.assertEqual(aln.position_entropy(), - entropy) + self.assertEqual(aln.position_entropy(), entropy) + + def test_base_percentages_simple(self): + seqs = ['ACCCC', + 'AAGGC', + 'AAATA', + 'AAAAA'] + aln = alignment.Alignment.from_list_of_seqs(seqs) + base_p = { + 'A': 0.6, + 'C': 0.25, + 'G': 0.1, + 'T': 0.05 + } + + self.assertEqual(aln.base_percentages(), base_p) + + def test_base_percentages_with_ambiguity(self): + seqs = ['MRWSYKVHDBN-'] + aln = alignment.Alignment.from_list_of_seqs(seqs) + base_p = { + 'A': 0.25, + 'C': 0.25, + 'G': 0.25, + 'T': 0.25 + } + + self.assertEqual(aln.base_percentages(), base_p) def test_construct_from_0_seqs(self): with self.assertRaises(Exception): diff --git a/adapt/utils/mutate.py b/adapt/utils/mutate.py index d767256..edc698c 100644 --- a/adapt/utils/mutate.py +++ b/adapt/utils/mutate.py @@ -2,6 +2,8 @@ """ import numpy as np +from adapt import alignment +from adapt.utils import predict_activity from scipy.linalg import expm __author__ = 'David K. Yang ' @@ -10,13 +12,14 @@ class GTRSubstitutionMutator: """GTR Substitution model to model distribution of future viral sequences """ - def __init__(self, piA, piC, piG, piT, + def __init__(self, aln, rAC, rAG, rAT, rCG, rCT, rGT, - mu, t, guide_length): - self.piA = piA - self.piC = piC - self.piG = piG - self.piT = piT + mu, t, n): + base_percentages = aln.base_percentages() + self.piA = base_percentages['A'] + self.piC = base_percentages['C'] + self.piG = base_percentages['G'] + self.piT = base_percentages['T'] self.rAC = rAC self.rAG = rAG self.rAT = rAT @@ -25,38 +28,69 @@ def __init__(self, piA, piC, piG, piT, self.rGT = rGT self.mu = mu self.t = t + self.n = n self.Q = self._construct_rateQ() self.P = self._construct_P() def _construct_rateQ(self): - """Computes transition rate matrix Q, given base frequencies and transition rates under GTR model (Tavaré 1986). + """Compute transition rate matrix + + Computes transition rate matrix Q, given base frequencies and + transition rates under GTR model (Tavaré 1986). """ - beta = 1 / (2*(self.piA * (self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT) + self.piC * (self.rCG*self.piG + self.rCT*self.piT) + self.piG*self.piT)) + beta = 1 / (2 * ( + self.piA * (self.rAC*self.piC + + self.rAG*self.piG + + self.rAT*self.piT) + + self.piC * (self.rCG*self.piG + + self.rCT*self.piT) + + self.piG * (self.rGT*self.piT) + )) Q = np.array([ - [-(self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT), self.rAC*self.piC, self.rAG*self.piG, self.rAT*self.piT], - [self.rAC*self.piA, -(self.rAC*self.piA + self.rCG*self.piG + self.rCT*self.piT), self.rCG*self.piG, self.rCT*self.piT], - [self.rAG*self.piA, self.rCG*self.piC, -(self.rAG*self.piA + self.rCG*self.piC + self.rGT*self.piT), self.rGT*self.piT], - [self.rAT*self.piA, self.rCT*self.piC, self.rGT*self.piG, -(self.rAT*self.piA + self.rCT*self.piC + self.rGT*self.piG)] + [-(self.rAC*self.piC + self.rAG*self.piG + self.rAT*self.piT), + self.rAC*self.piC, + self.rAG*self.piG, + self.rAT*self.piT], + [self.rAC*self.piA, + -(self.rAC*self.piA + self.rCG*self.piG + self.rCT*self.piT), + self.rCG*self.piG, + self.rCT*self.piT], + [self.rAG*self.piA, + self.rCG*self.piC, + -(self.rAG*self.piA + self.rCG*self.piC + self.rGT*self.piT), + self.rGT*self.piT], + [self.rAT*self.piA, + self.rCT*self.piC, + self.rGT*self.piG, + -(self.rAT*self.piA + self.rCT*self.piC + self.rGT*self.piG)] ]) return beta * Q def _construct_P(self): - """Computes transition probability matrix P from rate matrix Q, substitution rate m, and time t + """Compute transition probability matrix + + Computes transition probability matrix P from rate matrix Q, + substitution rate m, and time t """ P = expm(self.Q * self.mu * self.t) + # Normalize so each row sums to 1. + # Matrix exponentiation should already do this in principle row_sums = P.sum(axis=1) - normalized_matrix = P / row_sums[:, np.newaxis] # normalize so each row sums to 1. Matrix exponentiation should already do this in principle + normalized_matrix = P / row_sums[:, np.newaxis] return normalized_matrix def _seq_to_encoding(self, seq): - """Encodes string sequence into an AA index list. e.g. "ACGT" returns [0, 1, 2, 3] + """Encode string sequence into an AA index list + + Map 'A' to 0, 'C' to 1, 'G' to 2, and 'T' to 3 + (e.g. "ACGT" returns [0, 1, 2, 3]) Args: - str: NT sequence + str: nucleotide sequence Returns: list(int): list of AA idxs. @@ -67,7 +101,10 @@ def _seq_to_encoding(self, seq): return [base_key.index(base) for base in seq_as_list] def compute_sequence_probability(self, wild_seq, mut_seq): - """Computes probability of wild_seq mutating into mut_seq, under transition probability matrix P + """Compute probability of wild_seq mutating into mut_seq + + Under transition probability matrix P, finds probability of wild_seq + mutating into mut_seq Args: wild_seq: reference sequence @@ -86,23 +123,57 @@ def compute_sequence_probability(self, wild_seq, mut_seq): return prob - def mutate(self, wild_seq, n): + def mutate(self, wild_seq): """Sample future sequences under transition probability matrix P Args: wild_seq: reference sequence - n: number of samples (int) Returns: - list(str): list of samples from the distribution of future sequences + list(str): list of samples from the distribution of future + sequences """ wild_seq_encoded = self._seq_to_encoding(wild_seq) sampled_seq_matrix = [] for res in wild_seq_encoded: - samples = np.random.choice(["A", "C", "G", "T"], n, p=self.P[res]) + samples = np.random.choice(["A", "C", "G", "T"], self.n, + p=self.P[res]) sampled_seq_matrix.append(samples) sampled_seqs_list = np.array(sampled_seq_matrix).transpose() return ["".join(seq) for seq in sampled_seqs_list] + + def computed_mutated_activity(self, predictor, target_seq, + guide_seq, start=None): + """Calculate the activity of the guide after mutating the target + + Args: + predictor: an adapt.utils.predict_activity Predictor object + target_seq: string of what sequence the guide targets. Includes + context if predictor requires it + guide_seq: string of what the guide is + start: int, start position for the guide sequence in the alignment + Required to use predictor memoization, if it exists + + Returns: + The 5th percentile of activity values from simulated mutations + """ + mutated_target_seqs = self.mutate(target_seq) + if isinstance(predictor, predict_activity.SimpleBinaryPredictor): + mutated_aln = alignment.Alignment.from_list_of_seqs( + mutated_target_seqs) + start = 0 + if predictor.required_flanking_seqs[0]: + start = len(predictor.required_flanking_seqs[0]) + _, activity = predictor.compute_activity(start, guide_seq, + mutated_aln, percentile=5) + else: + pairs_to_eval = [] + for mutated_target_seq in mutated_target_seqs: + pair = (mutated_target_seq, guide_seq) + pairs_to_eval.append(pair) + _, activity = predictor.compute_activity(start, pairs_to_eval, + percentile=5) + return activity diff --git a/adapt/utils/predict_activity.py b/adapt/utils/predict_activity.py index bc3a56e..552826d 100644 --- a/adapt/utils/predict_activity.py +++ b/adapt/utils/predict_activity.py @@ -377,16 +377,19 @@ def determine_highly_active(self, start_pos, pairs): mem = self._memoized_evaluations[start_pos] return [mem[pair][1] for pair in pairs] - def compute_activity(self, start_pos, pairs): + def compute_activity(self, start_pos, pairs, percentile=None): """Compute a single activity measurement for pairs. Args: start_pos: start position of all guides in pairs; used for memoizations pairs: list of tuples (target with context, guide) + percentile: a percentile between 0 and 100 of the activity + values to return Returns: - activity value for each pair + activity value for each pair, if percentile is included, + activity value for the percentile amongst the pairs """ # Determine which pairs do not have memoized results, and call # these @@ -397,7 +400,11 @@ def compute_activity(self, start_pos, pairs): if pair not in mem] self._run_models_and_memoize(start_pos, unique_pairs_to_evaluate) - return [mem[pair][0] for pair in pairs] + activities = [mem[pair][0] for pair in pairs] + if percentile: + percentile_activity = np.percentile(activities, percentile) + return (activities, percentile_activity) + return activities def cleanup_memoized(self, start_pos): """Cleanup memoizations no longer needed at a start position. @@ -438,7 +445,7 @@ def __init__(self, mismatches, allow_gu_pairs, self.rough_max_activity = 1.0 - def compute_activity(self, start_pos, gd_sequence, aln): + def compute_activity(self, start_pos, gd_sequence, aln, percentile=None): """Compute activity by checking hybridization across an alignment. This says the activity is 1.0 if a guide is deemed to bind to @@ -465,6 +472,10 @@ def compute_activity(self, start_pos, gd_sequence, aln): for seq_idx in seqs_bound: activities[seq_idx] = 1.0 + if percentile: + percentile_activity = np.percentile(activities, percentile) + return (activities, percentile_activity) + return activities def cleanup_memoized(self, start_pos): @@ -472,38 +483,3 @@ def cleanup_memoized(self, start_pos): no memoizations. """ pass - - -class MutationPredictor: - """Methods to predict the effect of mutations on guide activity""" - def __init__(self, mutator, predictor, n): - """ - Args: - mutator: a adapt.utils.mutate Mutator object - predictor: a Predictor object - n: number of child mutations per sequence - """ - self.mutator = mutator - self.predictor = predictor - self.n = n - self.context_nt = 0 - if not isinstance(predictor, predict_activity.SimpleBinaryPredictor): - self.context_nt = predictor.context_nt - - def compute_activity(self, start_pos, pairs): - """Compute a single activity measurement for pairs. - - Args: - start_pos: start position of all guides in pairs; used for - memoizations - pairs: list of tuples (target with context, guide) - - Returns: - activity value for each pair - """ - mutated_activities = [None * len(pairs)] - for i, pair in enumerate(pairs): - mutated_target_seqs = self.mutator.mutate(pair[0], self.n) - pairs = [(mutated_target_seq, guide_seq) for mutated_target_seq in mutated_target_seqs] - mutated_activities[i] = self.predictor.compute_activity(start_pos, pairs) - return mutated_activities diff --git a/adapt/utils/tests/test_mutate.py b/adapt/utils/tests/test_mutate.py new file mode 100644 index 0000000..bea38f6 --- /dev/null +++ b/adapt/utils/tests/test_mutate.py @@ -0,0 +1,77 @@ +"""Tests mutate module. +""" + +import unittest + +import numpy as np + +from adapt import alignment +from adapt.utils import mutate, predict_activity + +__author__ = 'Priya Pillai ' + + +class TestGTRSubstitutionMutator(unittest.TestCase): + """Tests methods in the GTRSubstitutionMutator class. + """ + def setUp(self): + self.seqs = ['AC', + 'GT'] + aln_a = alignment.Alignment.from_list_of_seqs(self.seqs) + self.mutator_a = mutate.GTRSubstitutionMutator(aln_a, + 1, 1, 1, 1, 1, 1, + 1, 1000, 10) + self.wild_seq = 'AAAAAAAA' + self.mutated_seqs = self.mutator_a.mutate(self.wild_seq) + + aln_b = alignment.Alignment.from_list_of_seqs(self.mutated_seqs) + rates = np.random.rand(6) + self.mutator_b = mutate.GTRSubstitutionMutator(aln_b, + *rates, + 1, 1, 5) + + def test_construct_rateQ(self): + expected_a_Q = [[-1, 1/3, 1/3, 1/3], + [1/3, -1, 1/3, 1/3], + [1/3, 1/3, -1, 1/3], + [1/3, 1/3, 1/3, -1]] + np.testing.assert_allclose(self.mutator_a.Q, expected_a_Q) + + Q = self.mutator_b.Q + for row in Q: + self.assertAlmostEqual(sum(row), 0) + + def test_construct_P(self): + expected_a_P = [[0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25], + [0.25, 0.25, 0.25, 0.25]] + np.testing.assert_allclose(self.mutator_a.P, expected_a_P) + + P = self.mutator_b.P + for row in P: + self.assertAlmostEqual(sum(row), 1) + + def test_mutate(self): + self.assertEqual(len(self.mutated_seqs), self.mutator_a.n) + for mutated_seq in self.mutated_seqs: + self.assertEqual(len(mutated_seq), len(self.wild_seq)) + + def test_compute_sequence_probability(self): + for mutated_seq in self.mutated_seqs: + self.assertAlmostEqual(self.mutator_a.compute_sequence_probability( + self.wild_seq, mutated_seq), (0.25**8)) + + def test_compute_mutated_activity(self): + predictor_a = predict_activity.SimpleBinaryPredictor(8, False) + activity = self.mutator_a.computed_mutated_activity(predictor_a, + self.wild_seq, + self.wild_seq) + self.assertEqual(activity, 1) + + predictor_a.mismatches = 0 + predictor_a.required_flanking_seqs = ('A', None) + activity = self.mutator_a.computed_mutated_activity(predictor_a, + 'A'+self.wild_seq, + self.wild_seq) + self.assertEqual(activity, 0) diff --git a/bin/design.py b/bin/design.py index 08fb600..11e16e0 100644 --- a/bin/design.py +++ b/bin/design.py @@ -26,6 +26,7 @@ from adapt.utils import predict_activity from adapt.utils import seq_io from adapt.utils import year_cover +from adapt.utils import mutate try: import boto3 @@ -652,6 +653,15 @@ def guide_is_suitable(guide): # Do not predict activity predictor = None + # Construct mutator, if necessary + if args.predict_activity_degradation: + mutator = mutate.GTRSubstitutionMutator(aln, *args.predict_activity_degradation, + args.predict_activity_degradation_mu, + args.predict_activity_degradation_t, + args.predict_activity_degradation_n) + else: + mutator = None + # Find an optimal set of guides for each window in the genome, # and write them to a file; ensure that the selected guides are # specific to this alignment @@ -719,7 +729,7 @@ def guide_is_suitable(guide): max_target_length=args.max_target_length, obj_weights=args.obj_fn_weights, only_account_for_amplified_seqs=args.only_account_for_amplified_seqs, - halt_early=args.halt_search_early) + halt_early=args.halt_search_early, mutator=mutator) ts.find_and_write_targets(args.out_tsv[i], best_n=args.best_n_targets) else: @@ -1041,6 +1051,28 @@ def __call__(self, parser, namespace, values, option_string=None): "does not use a serialized model for predicting activity, so " "--predict-activity-model-path should not be set when this " "is set.")) + # Predict activity degradation + base_subparser.add_argument('--predict-activity-degradation', + nargs=6, type=float, + help=("If set, predict the degradation in activity over time due to " + "substitution using the General Time-Reversible model. Six " + "arguments should be set: the relative rates of substitutions " + "per year between (1) A and C, (2) A and G, (3) A and T, " + "(4) C and G, (5) C and T, & (6) G and T. Each of these should " + "be between 0 and 1. Base pair frequencies will be calculated " + "from input sequences. The 5th percentile of simulated activities " + "will be reported.")) + base_subparser.add_argument('--predict-activity-degradation-t', + type=int, default=5, + help=("Amount of time to simulate substitutions over, in years " + "(defaults to 5)")) + base_subparser.add_argument('--predict-activity-degradation-mu', + type=float, default=0.001, + help=("Overall rate of substitutions per year (defaults to 0.001)")) + base_subparser.add_argument('--predict-activity-degradation-n', + type=int, default=500, + help=("Number of sequences to simulate mutations over (defaults " + "to 500)")) # Technical options base_subparser.add_argument('--do-not-memoize-guide-computations', From 4ce8e93b20dbe6ca12801df01f967c025e1c57bf Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Mon, 26 Apr 2021 12:48:01 -0400 Subject: [PATCH 4/7] Test target searcher, fix SimpleBinaryPredictor case Add a test in test_target_search for the mutations; throw error in design.py if model not set but mutation of activity wanted, add code to compute_activity to handle the SimpleBinaryPredictor case; add documentation --- adapt/alignment.py | 27 +++++++++++++++++++++++++++ adapt/target_search.py | 27 ++++++++++++++++++++------- adapt/tests/test_target_search.py | 26 +++++++++++++++++++++++--- adapt/utils/mutate.py | 8 ++++---- bin/design.py | 4 ++++ 5 files changed, 78 insertions(+), 14 deletions(-) diff --git a/adapt/alignment.py b/adapt/alignment.py index 18ee8ad..08e7a34 100644 --- a/adapt/alignment.py +++ b/adapt/alignment.py @@ -755,6 +755,33 @@ def compute_activity(self, start, gd_sequence, predictor, mutator=None): assert start + guide_length <= self.seq_length if isinstance(predictor, predict_activity.SimpleBinaryPredictor): + if mutator: + # Extract the target sequences, including context to use with + # prediction (i.e. the flanking sequences, if they exist) + left_context = 0 + right_context = 0 + if predictor: + if predictor.required_flanking_seqs[0]: + left_context = len(predictor.required_flanking_seqs[0]) + if predictor.required_flanking_seqs[1]: + right_context = len(predictor.required_flanking_seqs[1]) + + aln_for_guide_with_context = self.extract_range( + start - left_context, + start + guide_length + right_context) + seq_rows_with_context = aln_for_guide_with_context.make_list_of_seqs() + + # Start array of predicted activities; make this all 0s so that + # sequences for which activities are not computed (e.g., gap) + # have activity=0 + activities = np.zeros(self.num_sequences) + for i, seq_with_context in enumerate(seq_rows_with_context): + activity = mutator.computed_mutated_activity(predictor, + seq_with_context, + gd_sequence, + start=start) + activities[i] = activity + return activities # Do not use a model; just predict binary activity (1 or 0) # based on distance between guide and targets return predictor.compute_activity(start, gd_sequence, self) diff --git a/adapt/target_search.py b/adapt/target_search.py index 2000c4d..1c9f6bf 100644 --- a/adapt/target_search.py +++ b/adapt/target_search.py @@ -122,17 +122,30 @@ def _find_primer_pairs(self): yield (p1, p2) def _find_mutated_activity(self, targets): + """Find the activities of the guides after mutation + + For each target, find the activity of the best of the guide + set across each potential starting location after mutation, + averaged across the sequences. + + Args: + targets: list of tuples (obj_value, target) where target is a tuple + ((p1, p2), (guides_stats, guides)) + Returns: + list of mutated activities, ordered by target's ordering. + + """ mutated_activities = [None] * len(targets) for i, (obj_value, target) in enumerate(targets): ((p1, p2), (guides_stats, guides)) = target mutated_activities[i] = max([max([np.average( - self.gs.aln.compute_activity( - start_pos, - guide, - self.gs.predictor, - self.mutator)) \ - for start_pos in self.gs._selected_guide_positions[guide]]) \ - for guide in guides]) + self.gs.aln.compute_activity( + start_pos, + guide, + self.gs.predictor, + self.mutator)) + for start_pos in self.gs._selected_guide_positions[guide]]) + for guide in guides]) return mutated_activities def find_targets(self, best_n=10, no_overlap=True): diff --git a/adapt/tests/test_target_search.py b/adapt/tests/test_target_search.py index da8bdfd..eab89d6 100644 --- a/adapt/tests/test_target_search.py +++ b/adapt/tests/test_target_search.py @@ -9,7 +9,7 @@ from adapt import guide_search from adapt import primer_search from adapt import target_search -from adapt.utils import lsh +from adapt.utils import lsh, mutate, predict_activity __author__ = 'Hayden Metsky ' @@ -67,7 +67,7 @@ def test_find_primer_pairs_simple_minimize(self): break self.assertTrue(in_aln) - def test_find_targets_allowing_overlap_minmize(self): + def test_find_targets_allowing_overlap_minimize(self): for best_n in [1, 2, 3, 4, 5, 6]: targets = self.a_min.find_targets(best_n=best_n, no_overlap=False) self.assertEqual(len(targets), best_n) @@ -300,6 +300,26 @@ def test_find_targets_allowing_overlap_maximize_random_greedy(self): 'random-greedy', 1, 3, 100, check_for_one_guide=True, check_for_no_targets=True) + def test_find_mutated_activity(self): + mutator = mutate.GTRSubstitutionMutator(self.a_aln, 1, 1, 1, 1, 1, 1, + 1, 1, 1) + predictor = predict_activity.SimpleBinaryPredictor( + 6, + False, + required_flanking_seqs=('A', None)) + c_ps = primer_search.PrimerSearcher(self.a_aln, 4, 0, 1.0, (1, 1, 100)) + c_gs = guide_search.GuideSearcherMaximizeActivity( + self.a_aln, 6, 1, 5, 0.05, (1, 1, 100), algorithm='greedy', + predictor=predictor) + c = target_search.TargetSearcher(c_ps, c_gs, max_primers_at_site=2, + obj_type='max', mutator=mutator) + for best_n in [1, 2, 3, 4, 5, 6]: + targets = c.find_targets(best_n=best_n, no_overlap=False) + mut_activities = c._find_mutated_activity(targets) + self.assertEqual(len(mut_activities), best_n) + for mut_activity in mut_activities: + self.assertIsInstance(mut_activity, float) + def tearDown(self): # Re-enable logging - logging.disable(logging.NOTSET) + logging.disable(logging.NOTSET) diff --git a/adapt/utils/mutate.py b/adapt/utils/mutate.py index edc698c..8819d9a 100644 --- a/adapt/utils/mutate.py +++ b/adapt/utils/mutate.py @@ -146,7 +146,7 @@ def mutate(self, wild_seq): return ["".join(seq) for seq in sampled_seqs_list] def computed_mutated_activity(self, predictor, target_seq, - guide_seq, start=None): + guide_seq, start=0): """Calculate the activity of the guide after mutating the target Args: @@ -164,10 +164,10 @@ def computed_mutated_activity(self, predictor, target_seq, if isinstance(predictor, predict_activity.SimpleBinaryPredictor): mutated_aln = alignment.Alignment.from_list_of_seqs( mutated_target_seqs) - start = 0 + left_context = 0 if predictor.required_flanking_seqs[0]: - start = len(predictor.required_flanking_seqs[0]) - _, activity = predictor.compute_activity(start, guide_seq, + left_context = len(predictor.required_flanking_seqs[0]) + _, activity = predictor.compute_activity(left_context, guide_seq, mutated_aln, percentile=5) else: pairs_to_eval = [] diff --git a/bin/design.py b/bin/design.py index 11e16e0..03c7f4e 100644 --- a/bin/design.py +++ b/bin/design.py @@ -650,6 +650,10 @@ def guide_is_suitable(guide): raise Exception(("--predict-activity-model-path must be " "specified if --obj is 'maximize-activity' (unless " "--use-simple-binary-activity-prediction is set)")) + if args.predict_activity_degradation: + raise Exception(("--predict-activity-model-path must be " + "specified if --predict-activity-degradation is set " + "(unless --use-simple-binary-activity-prediction is set)")) # Do not predict activity predictor = None From 194da64b796552348046e7312519f45be2095865 Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Wed, 28 Apr 2021 10:13:14 -0400 Subject: [PATCH 5/7] Fix test_find_mutated_activity Change overlap in test_target_search.test_find_mutated_activity to 'none' to match with merge. --- adapt/tests/test_target_search.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/adapt/tests/test_target_search.py b/adapt/tests/test_target_search.py index 75fd50a..750d161 100644 --- a/adapt/tests/test_target_search.py +++ b/adapt/tests/test_target_search.py @@ -343,9 +343,7 @@ def test_find_mutated_activity(self): mutator = mutate.GTRSubstitutionMutator(self.a_aln, 1, 1, 1, 1, 1, 1, 1, 1, 1) predictor = predict_activity.SimpleBinaryPredictor( - 6, - False, - required_flanking_seqs=('A', None)) + 6, False, required_flanking_seqs=('A', None)) c_ps = primer_search.PrimerSearcher(self.a_aln, 4, 0, 1.0, (1, 1, 100)) c_gs = guide_search.GuideSearcherMaximizeActivity( self.a_aln, 6, 1, 5, 0.05, (1, 1, 100), algorithm='greedy', @@ -353,7 +351,7 @@ def test_find_mutated_activity(self): c = target_search.TargetSearcher(c_ps, c_gs, max_primers_at_site=2, obj_type='max', mutator=mutator) for best_n in [1, 2, 3, 4, 5, 6]: - targets = c.find_targets(best_n=best_n, no_overlap=False) + targets = c.find_targets(best_n=best_n, no_overlap='none') mut_activities = c._find_mutated_activity(targets) self.assertEqual(len(mut_activities), best_n) for mut_activity in mut_activities: From 653c9f3f69c421bdedf2aae163ba93935bf72cb2 Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Thu, 6 May 2021 19:30:54 -0400 Subject: [PATCH 6/7] Clarify comments, rename compute_mutated_activity, remove lists from max --- adapt/alignment.py | 16 +++++++------- adapt/target_search.py | 6 +++--- adapt/utils/mutate.py | 36 +++++++++++++++++++++++++------- adapt/utils/predict_activity.py | 5 +++-- adapt/utils/tests/test_mutate.py | 12 +++++------ bin/design.py | 5 +++-- 6 files changed, 52 insertions(+), 28 deletions(-) diff --git a/adapt/alignment.py b/adapt/alignment.py index 08e7a34..5481e13 100644 --- a/adapt/alignment.py +++ b/adapt/alignment.py @@ -776,10 +776,10 @@ def compute_activity(self, start, gd_sequence, predictor, mutator=None): # have activity=0 activities = np.zeros(self.num_sequences) for i, seq_with_context in enumerate(seq_rows_with_context): - activity = mutator.computed_mutated_activity(predictor, - seq_with_context, - gd_sequence, - start=start) + activity = mutator.compute_mutated_activity(predictor, + seq_with_context, + gd_sequence, + start=start) activities[i] = activity return activities # Do not use a model; just predict binary activity (1 or 0) @@ -819,10 +819,10 @@ def compute_activity(self, start, gd_sequence, predictor, mutator=None): if mutator: for seq_with_context, seq_idx in seq_rows_with_context: - activity = mutator.computed_mutated_activity(predictor, - seq_with_context, - gd_sequence, - start=start) + activity = mutator.compute_mutated_activity(predictor, + seq_with_context, + gd_sequence, + start=start) evals.append(activity) pairs_to_eval_seq_idx.append(seq_idx) else: diff --git a/adapt/target_search.py b/adapt/target_search.py index 4b41bb2..4a326af 100644 --- a/adapt/target_search.py +++ b/adapt/target_search.py @@ -138,14 +138,14 @@ def _find_mutated_activity(self, targets): mutated_activities = [None] * len(targets) for i, (obj_value, target) in enumerate(targets): ((p1, p2), (guides_stats, guides)) = target - mutated_activities[i] = max([max([np.average( + mutated_activities[i] = max(max(np.average( self.gs.aln.compute_activity( start_pos, guide, self.gs.predictor, self.mutator)) - for start_pos in self.gs._selected_guide_positions[guide]]) - for guide in guides]) + for start_pos in self.gs._selected_guide_positions[guide]) + for guide in guides) return mutated_activities def find_targets(self, best_n=10, no_overlap='amplicon'): diff --git a/adapt/utils/mutate.py b/adapt/utils/mutate.py index 8819d9a..50dfb10 100644 --- a/adapt/utils/mutate.py +++ b/adapt/utils/mutate.py @@ -6,15 +6,28 @@ from adapt.utils import predict_activity from scipy.linalg import expm -__author__ = 'David K. Yang ' +__author__ = 'David K. Yang , Priya P. Pillai ' class GTRSubstitutionMutator: - """GTR Substitution model to model distribution of future viral sequences + """Use the GTR Substitution model to mutate viral sequences """ def __init__(self, aln, rAC, rAG, rAT, rCG, rCT, rGT, mu, t, n): + """ + Args: + aln: An alignment of sequences to use to determine base percentages + rAC: Relative rate of conversion between A and C + rAG: Relative rate of conversion between A and G + rAT: Relative rate of conversion between A and T + rCG: Relative rate of conversion between C and G + rCT: Relative rate of conversion between C and T + rGT: Relative rate of conversion between G and T + mu: Overall rate of substitutions per site per year + t: Years to simulate substitutions over + n: Number of sequences to simulate mutations over + """ base_percentages = aln.base_percentages() self.piA = base_percentages['A'] self.piC = base_percentages['C'] @@ -38,6 +51,11 @@ def _construct_rateQ(self): Computes transition rate matrix Q, given base frequencies and transition rates under GTR model (Tavaré 1986). + + The transition rate matrix defines the rate each base is mutated to + each other base. The rows indicate the starting base; the columns + indicate the final base. Bases are ordered A, C, G, T. Diagonal + elements are set such that the row sums to 0. """ beta = 1 / (2 * ( @@ -74,6 +92,10 @@ def _construct_P(self): Computes transition probability matrix P from rate matrix Q, substitution rate m, and time t + + The transition probablility matrix defines the likelihood each base is + mutated to each other base. The rows indicate the starting base; the + columns indicate the final base. Bases are ordered A, C, G, T. """ P = expm(self.Q * self.mu * self.t) @@ -84,7 +106,7 @@ def _construct_P(self): return normalized_matrix def _seq_to_encoding(self, seq): - """Encode string sequence into an AA index list + """Encode string sequence into a nucleotide index list Map 'A' to 0, 'C' to 1, 'G' to 2, and 'T' to 3 (e.g. "ACGT" returns [0, 1, 2, 3]) @@ -93,7 +115,7 @@ def _seq_to_encoding(self, seq): str: nucleotide sequence Returns: - list(int): list of AA idxs. + list(int): list of nucleotide indexs. """ base_key = "ACGT" @@ -145,8 +167,8 @@ def mutate(self, wild_seq): sampled_seqs_list = np.array(sampled_seq_matrix).transpose() return ["".join(seq) for seq in sampled_seqs_list] - def computed_mutated_activity(self, predictor, target_seq, - guide_seq, start=0): + def compute_mutated_activity(self, predictor, target_seq, + guide_seq, start=0): """Calculate the activity of the guide after mutating the target Args: @@ -154,7 +176,7 @@ def computed_mutated_activity(self, predictor, target_seq, target_seq: string of what sequence the guide targets. Includes context if predictor requires it guide_seq: string of what the guide is - start: int, start position for the guide sequence in the alignment + start: int, start position for the guide sequence in the alignment. Required to use predictor memoization, if it exists Returns: diff --git a/adapt/utils/predict_activity.py b/adapt/utils/predict_activity.py index 552826d..620df39 100644 --- a/adapt/utils/predict_activity.py +++ b/adapt/utils/predict_activity.py @@ -388,8 +388,9 @@ def compute_activity(self, start_pos, pairs, percentile=None): values to return Returns: - activity value for each pair, if percentile is included, - activity value for the percentile amongst the pairs + If percentile is not defined, list of activity value for each pair + If percentile is defined, tuple of (list of activity value for each + pair, the percentile of those activities) """ # Determine which pairs do not have memoized results, and call # these diff --git a/adapt/utils/tests/test_mutate.py b/adapt/utils/tests/test_mutate.py index bea38f6..c4b9cdf 100644 --- a/adapt/utils/tests/test_mutate.py +++ b/adapt/utils/tests/test_mutate.py @@ -64,14 +64,14 @@ def test_compute_sequence_probability(self): def test_compute_mutated_activity(self): predictor_a = predict_activity.SimpleBinaryPredictor(8, False) - activity = self.mutator_a.computed_mutated_activity(predictor_a, - self.wild_seq, - self.wild_seq) + activity = self.mutator_a.compute_mutated_activity(predictor_a, + self.wild_seq, + self.wild_seq) self.assertEqual(activity, 1) predictor_a.mismatches = 0 predictor_a.required_flanking_seqs = ('A', None) - activity = self.mutator_a.computed_mutated_activity(predictor_a, - 'A'+self.wild_seq, - self.wild_seq) + activity = self.mutator_a.compute_mutated_activity(predictor_a, + 'A'+self.wild_seq, + self.wild_seq) self.assertEqual(activity, 0) diff --git a/bin/design.py b/bin/design.py index bb21058..3e03608 100644 --- a/bin/design.py +++ b/bin/design.py @@ -1068,11 +1068,12 @@ def __call__(self, parser, namespace, values, option_string=None): "(defaults to 5)")) base_subparser.add_argument('--predict-activity-degradation-mu', type=float, default=0.001, - help=("Overall rate of substitutions per year (defaults to 0.001)")) + help=("Overall rate of substitutions per site per year (defaults to 0.001)")) base_subparser.add_argument('--predict-activity-degradation-n', type=int, default=500, help=("Number of sequences to simulate mutations over (defaults " - "to 500)")) + "to 500). Higher values increase accuracy, but also increase" + "runtime.")) # Technical options base_subparser.add_argument('--do-not-memoize-guide-computations', From b0f1fd84ac5613eff5ad160bbeb07d054345d612 Mon Sep 17 00:00:00 2001 From: "Priya P. Pillai" <34429433+priyappillai@users.noreply.github.com> Date: Mon, 10 May 2021 11:53:18 -0400 Subject: [PATCH 7/7] Change percentile to an (optional) list --- adapt/utils/mutate.py | 4 ++-- adapt/utils/predict_activity.py | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/adapt/utils/mutate.py b/adapt/utils/mutate.py index 50dfb10..cf7cef5 100644 --- a/adapt/utils/mutate.py +++ b/adapt/utils/mutate.py @@ -190,12 +190,12 @@ def compute_mutated_activity(self, predictor, target_seq, if predictor.required_flanking_seqs[0]: left_context = len(predictor.required_flanking_seqs[0]) _, activity = predictor.compute_activity(left_context, guide_seq, - mutated_aln, percentile=5) + mutated_aln, percentiles=5) else: pairs_to_eval = [] for mutated_target_seq in mutated_target_seqs: pair = (mutated_target_seq, guide_seq) pairs_to_eval.append(pair) _, activity = predictor.compute_activity(start, pairs_to_eval, - percentile=5) + percentiles=5) return activity diff --git a/adapt/utils/predict_activity.py b/adapt/utils/predict_activity.py index 620df39..f605fe1 100644 --- a/adapt/utils/predict_activity.py +++ b/adapt/utils/predict_activity.py @@ -377,20 +377,20 @@ def determine_highly_active(self, start_pos, pairs): mem = self._memoized_evaluations[start_pos] return [mem[pair][1] for pair in pairs] - def compute_activity(self, start_pos, pairs, percentile=None): + def compute_activity(self, start_pos, pairs, percentiles=None): """Compute a single activity measurement for pairs. Args: start_pos: start position of all guides in pairs; used for memoizations pairs: list of tuples (target with context, guide) - percentile: a percentile between 0 and 100 of the activity - values to return + percentiles: single percentile or list of percentiles to compute, + each in [0,100] (0 is minimum, 100 is maximum) Returns: If percentile is not defined, list of activity value for each pair If percentile is defined, tuple of (list of activity value for each - pair, the percentile of those activities) + pair, the percentile(s) of those activities) """ # Determine which pairs do not have memoized results, and call # these @@ -402,8 +402,8 @@ def compute_activity(self, start_pos, pairs, percentile=None): self._run_models_and_memoize(start_pos, unique_pairs_to_evaluate) activities = [mem[pair][0] for pair in pairs] - if percentile: - percentile_activity = np.percentile(activities, percentile) + if percentiles: + percentile_activity = np.percentile(activities, percentiles) return (activities, percentile_activity) return activities @@ -446,7 +446,7 @@ def __init__(self, mismatches, allow_gu_pairs, self.rough_max_activity = 1.0 - def compute_activity(self, start_pos, gd_sequence, aln, percentile=None): + def compute_activity(self, start_pos, gd_sequence, aln, percentiles=None): """Compute activity by checking hybridization across an alignment. This says the activity is 1.0 if a guide is deemed to bind to @@ -456,10 +456,15 @@ def compute_activity(self, start_pos, gd_sequence, aln, percentile=None): start_pos: start position of guide sequence in aln gd_sequence: str representing guide sequence aln: alignment.Alignment object + percentiles: single percentile or list of percentiles to compute, + each in [0,100] (0 is minimum, 100 is maximum) Returns: - numpy array x where x[i] gives the activity (1 or 0) between - gd_sequence and the sequence in the alignment at index i + If percentile is not defined, numpy array x where x[i] gives the + activity (1 or 0) between gd_sequence and the sequence in the + alignment at index i + If percentile is defined, tuple of (the above, the percentile(s) of + the activities) """ # Start with all 0s activities = np.zeros(aln.num_sequences) @@ -473,8 +478,8 @@ def compute_activity(self, start_pos, gd_sequence, aln, percentile=None): for seq_idx in seqs_bound: activities[seq_idx] = 1.0 - if percentile: - percentile_activity = np.percentile(activities, percentile) + if percentiles: + percentile_activity = np.percentile(activities, percentiles) return (activities, percentile_activity) return activities