Skip to content

Commit

Permalink
Merge pull request #28 from broadinstitute/hm-virus-curation-edge-cases
Browse files Browse the repository at this point in the history
Resolve curation edge cases for 4 viral species
  • Loading branch information
haydenm authored Aug 21, 2020
2 parents 7df194f + 181473d commit d4e4bed
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions adapt/prepare/prepare_alignment.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,7 @@

def prepare_for(taxid, segment, ref_accs, out,
aln_memoizer=None, aln_stat_memoizer=None,
sample_seqs=None, filter_warn=0.25, min_seq_len=200,
sample_seqs=None, filter_warn=0.25, min_seq_len=150,
min_cluster_size=2, prep_influenza=False, years_tsv=None,
cluster_threshold=0.1, accessions_to_use=None,
sequences_to_use=None):
Expand Down Expand Up @@ -212,10 +212,22 @@ def prepare_for(taxid, segment, ref_accs, out,
"during curation for tax %d (segment: %s) using references %s") %
(frac_filtered, taxid, segment, ref_accs))

# Check if there are no sequences left; if that's the case, don't
# proceed
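# Check if there are no sequences left after curation; if that's the
# case, fall back to a single downloaded reference sequence (one whose
# accession is in ref_accs) and log that design is proceeding from it.
# If no such reference was downloaded, raise an exception.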
if len(seqs_unaligned_curated) == 0:
raise Exception("No sequences remain after curation")
# Find a reference genome that was downloaded
ref_accver_used = None
for accver, seq in seqs_unaligned.items():
if accver.split('.')[0] in ref_accs:
seqs_unaligned_curated[accver] = seq
ref_accver_used = accver
break
if len(seqs_unaligned_curated) > 0:
logger.critical(("No sequences remained after curation, so "
"proceeding with design from a single reference sequence "
"(%s)") % (ref_accver_used))
else:
raise Exception("No sequences are available for design")

# Produce clusters of unaligned sequences
logger.info(("Clustering %d sequences"), len(seqs_unaligned_curated))
Expand Down

0 comments on commit d4e4bed

Please sign in to comment.