Skip to content

Commit

Permalink
Merge pull request #43 from broadinstitute/write-mean-activity-in-ana…
Browse files Browse the repository at this point in the history
…lysis

Add option to write mean activity of guides in coverage analysis
  • Loading branch information
haydenm authored Dec 15, 2020
2 parents cf1c007 + 35c81df commit dff7383
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 4 deletions.
74 changes: 70 additions & 4 deletions adapt/coverage_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from collections import defaultdict
import logging

import numpy as np

from adapt.utils import guide

__author__ = 'Hayden Metsky <hayden@mit.edu>'
Expand Down Expand Up @@ -117,7 +119,8 @@ def guide_bind_fn(self, seq, target_seq, pos):
"""
raise NotImplementedError()

def find_binding_pos(self, target_seq_name, seq, bind_fn):
def find_binding_pos(self, target_seq_name, seq, bind_fn,
save_activities=None):
"""Find if and where a sequence binds to in a target sequence.
This takes a naive (but fully sensitive) approach: it simply
Expand All @@ -136,6 +139,8 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
seq: guide or primer sequence to lookup
bind_fn: function accepting (seq, target_seq, pos) and outputs
positions in pos to which seq binds to target_seq
save_activities: if set, pass along to guide_fn to save computed
activities
Returns:
collection of start positions in target_seq to which seq binds;
Expand All @@ -156,6 +161,13 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
if not self.seqs_are_indexed:
self._index_seqs()

def bind_fn_with_save(seq, target_seq, pos):
if save_activities is None:
return bind_fn(seq, target_seq, pos)
else:
return bind_fn(seq, target_seq, pos,
save_activities=save_activities)

target_seq = self.seqs[target_seq_name]

bind_pos = set()
Expand All @@ -177,7 +189,7 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
if (kmer_start - j >= 0 and
kmer_start - j + len(seq) <= len(target_seq)):
pos_to_evaluate = [kmer_start - j]
bind_pos.update(bind_fn(seq, target_seq,
bind_pos.update(bind_fn_with_save(seq, target_seq,
pos_to_evaluate))
if len(bind_pos) > 0:
# We have found at least 1 binding position; this may
Expand All @@ -191,7 +203,7 @@ def find_binding_pos(self, target_seq_name, seq, bind_fn):
# sliding approach

pos_to_evaluate = list(range(len(target_seq) - len(seq) + 1))
bind_pos = bind_fn(seq, target_seq, pos_to_evaluate)
bind_pos = bind_fn_with_save(seq, target_seq, pos_to_evaluate)
return bind_pos

def seqs_where_guides_bind(self, guide_seqs):
Expand Down Expand Up @@ -320,6 +332,21 @@ def frac_of_seqs_bound(self):
frac_bound[design_id] = float(len(seqs_bound)) / len(self.seqs)
return frac_bound

def mean_activity_of_guides(self):
"""Determine the mean activity of guides from each design.
Returns:
dict mapping design identifier (self.designs.keys()) to the
mean activity, across the target sequences, of its guide set
"""
mean_activities = {}
for design_id, design in self.designs.items():
logger.info(("Computing mean activities of guides in design "
"'%s'"), str(design_id))
activities_across_targets = self.activities_where_guide_binds(design.guides)
mean_activities[design_id] = np.mean(list(activities_across_targets.values()))
return mean_activities


class CoverageAnalyzerWithMismatchModel(CoverageAnalyzer):
"""Methods to analyze coverage of a design using model with fixed number
Expand Down Expand Up @@ -348,6 +375,9 @@ def __init__(self, seqs, designs, guide_mismatches, primer_mismatches=None,
self.guide_mismatches = guide_mismatches
self.allow_gu_pairs = allow_gu_pairs

def activities_where_guide_binds(self, guide_seqs):
raise NotImplementedError()

def guide_bind_fn(self, seq, target_seq, pos):
"""Evaluate binding with a mismatch model.
Expand Down Expand Up @@ -393,7 +423,34 @@ def __init__(self, seqs, designs, predictor, primer_mismatches=None,
self.predictor = predictor
self.highly_active = highly_active

def guide_bind_fn(self, seq, target_seq, pos):
def activities_where_guide_binds(self, guide_seqs):
"""Determine activities across the target sequences.
Args:
guide_seqs: guide sequences to lookup (treat as a guide set)
Returns:
dict {target seq in self.seqs: predicted activity}
"""
target_act = {}
for seq_name, target_seq in self.seqs.items():
activities_for_target = []
for guide_seq in guide_seqs:
activities = {}
bind_pos = self.find_binding_pos(seq_name, guide_seq,
self.guide_bind_fn, save_activities=activities)
if len(bind_pos) > 0:
# guide_seq binds somewhere in target_seq
# As its activity, take the maximum over multiple
# positions (if there is more than one where it may bind)
activities_for_target += [max(activities.values())]
else:
activities_for_target += [0]
# If there are multiple guides, take the max across them
target_act[target_seq] = max(activities_for_target)
return target_act

def guide_bind_fn(self, seq, target_seq, pos, save_activities=None):
"""Evaluate binding with a predictor -- i.e., based on what is
predicted to be highly active.
Expand All @@ -402,6 +459,9 @@ def guide_bind_fn(self, seq, target_seq, pos):
target_seq: target sequence against which to check
pos: list of positions indicating subsequence in target_seq
to check for binding of seq
save_activities: if set, this saves computed activities as
save_activites[p]=activity for each p in pos for which
this can compute activity; self.highly_active must be False
Returns:
set of positions in pos where seq binds to target_seq
Expand All @@ -427,9 +487,15 @@ def guide_bind_fn(self, seq, target_seq, pos):
if self.highly_active:
predictions = self.predictor.determine_highly_active(-1,
pairs_to_evaluate)
if save_activities is not None:
raise Exception(("Cannot use save_activities when using "
"'highly active' as a criterion"))
else:
predictions = self.predictor.compute_activity(-1,
pairs_to_evaluate)
if save_activities is not None:
for i, p in zip(pos_evaluating, predictions):
save_activities[i] = p
predictions = [bool(p > 0) for p in predictions]
bind_pos = set()
for i, p in zip(pos_evaluating, predictions):
Expand Down
44 changes: 44 additions & 0 deletions bin/analyze_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,30 @@ def write_frac_bound(designs, frac_bound, out_fn):
fw.write('\t'.join([str(x) for x in row]) + '\n')


def write_mean_activity_of_guides(designs, mean_activities, out_fn):
"""Write table giving the mean activity of guide sets.
Args:
designs: dict {design_id: design} where design_id is an
identifier for design and design is a coverage_analysis.
Design object
mean_activities: dict {design_id: activity} where design_id is
an identifier for a design and activity is the mean activity,
across the target sequences, of its guide set
out_fn: path to TSV file at which to write table
"""
header = ['design_id',
'guide-target-sequences',
'mean-activity']
with open(out_fn, 'w') as fw:
fw.write('\t'.join(header) + '\n')
for design_id in sorted(list(designs.keys())):
guides = designs[design_id].guides
guides = ' '.join(sorted(guides))
row = [design_id, guides, mean_activities[design_id]]
fw.write('\t'.join([str(x) for x in row]) + '\n')


def main(args):
# Allow G-U base pairing, unless it is explicitly disallowed
allow_gu_pairs = not args.do_not_allow_gu_pairing
Expand Down Expand Up @@ -169,6 +193,17 @@ def main(args):
frac_bound = analyzer.frac_of_seqs_bound()
write_frac_bound(designs, frac_bound, args.write_frac_bound)
performed_analysis = True
if args.write_mean_activity_of_guides:
if (not args.predict_activity_model_path or
args.predict_activity_require_highly_active):
raise Exception(("To use --write-mean-activity-of-guides, "
"a predictive model must be set and "
"--predict-activity-require-highly-active must *not* "
"be set"))
mean_activity = analyzer.mean_activity_of_guides()
write_mean_activity_of_guides(designs, mean_activity,
args.write_mean_activity_of_guides)
performed_analysis = True

if not performed_analysis:
logger.warning(("No analysis was requested"))
Expand Down Expand Up @@ -198,6 +233,15 @@ def main(args):
"the row number of the design in the designs input (1 for "
"the first design). The provided argument is a path to "
"a TSV file at which to the write the table."))
parser.add_argument('--write-mean-activity-of-guides',
help=("If set, write a table in which each row represents an "
"input design and gives the mean activity across the target "
"sequences of the guide set. The 'design_id' column gives "
"the row number of the design in the designs input (1 for "
"the first design). The provided argument is a path to "
"a TSV file at which to write the table. If set, a predictive "
"model must be set without "
"--predict-activity-require-highly-active"))

# Parameter determining whether a primer binds to target
parser.add_argument('-pm', '--primer-mismatches',
Expand Down

0 comments on commit dff7383

Please sign in to comment.