diff --git a/README.md b/README.md index bac7b8c..1cbbd92 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,11 @@ ADAPT requires: * [SciPy](https://www.scipy.org) == 1.4.1 * [TensorFlow](https://www.tensorflow.org) >= 2.3.0 -Installing ADAPT with `pip`, as described below, will install NumPy, SciPy, and TensorFlow if they are not already installed. +ADAPT with AWS cloud features requires: +* [Boto3](https://aws.amazon.com/sdk-for-python/) >= 1.14.54 +* [Botocore](https://botocore.amazonaws.com/v1/documentation/api/latest/index.html) >= 1.17.54 + +Installing ADAPT with `pip`, as described below, will install NumPy, SciPy, and TensorFlow if they are not already installed. Installing ADAPT with `pip` using the AWS cloud features, as described below, will install Boto3 and Botocore if they are not already installed as well. If using alignment features in subcommands below, ADAPT also requires a path to an executable of [MAFFT](https://mafft.cbrc.jp/alignment/software/). @@ -83,6 +87,11 @@ pip install -e . ``` Depending on your setup (i.e., if you do not have write permissions in the installation directory), you may need to supply `--user` to `pip install`. +If you want to be able to use AWS cloud features, replace the last line with the following: +```bash +pip install -e ".[AWS]" +``` + ## Testing The package uses Python's `unittest` framework. @@ -227,7 +236,7 @@ Below are some additional arguments when INPUT-TYPE is `auto-from-{file,args}`: * `--mafft-path MAFFT_PATH`: Use the [MAFFT](https://mafft.cbrc.jp/alignment/software/) executable at MAFFT_PATH for generating alignments. * `--prep-memoize-dir PREP_MEMOIZE_DIR`: Memoize alignments and statistics on these alignments in PREP_MEMOIZE_DIR. -If not set (default), do not memoize this information. 
+This can save the memo to an S3 bucket by using the syntax `s3://BUCKET/PATH`, though this requires the AWS cloud installation mentioned in [Downloading and installing](#downloading-and-installing) and setting access key information. Access key information can either be set using AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY or by installing and configuring [AWS CLI](https://aws.amazon.com/cli/). If not set (default), do not memoize this information. If repeatedly re-running on the same taxonomies, using this can significantly improve runtime. * `--prep-influenza`: If set, use NCBI's influenza database for fetching data. This must be specified if design is for influenza A/B/C viruses. @@ -238,6 +247,8 @@ The distance is average nucleotide dissimilarity (1-ANI); higher values result i (Default: 0.2.) * `--use-accessions USE_ACCESSIONS`: Use the specified accessions, in a file at the path USE_ACCESSIONS, for generating input. This is performed instead of fetching neighbors from NCBI. +* `--aws-access-key-id AWS_ACCESS_KEY_ID`: Use AWS_ACCESS_KEY_ID to log in to AWS Cloud Services. Both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are necessary to log in. This is only necessary if saving the memo to an S3 bucket using PREP_MEMOIZE_DIR and AWS CLI has not been installed and configured. If AWS CLI has been installed and configured and this argument is passed in, AWS_ACCESS_KEY_ID will override the AWS CLI configuration. +* `--aws-secret-access-key AWS_SECRET_ACCESS_KEY`: Use AWS_SECRET_ACCESS_KEY to log in to AWS Cloud Services. Both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are necessary to log in. This is only necessary if saving the memo to an S3 bucket using PREP_MEMOIZE_DIR and AWS CLI has not been installed and configured. If AWS CLI has been installed and configured and this argument is passed in, AWS_SECRET_ACCESS_KEY will override the AWS CLI configuration. See `design.py [SEARCH-TYPE] auto-from-{file,args} --help` for details on the format of the file. 
## Output diff --git a/adapt/prepare/align.py b/adapt/prepare/align.py index 50f070d..79d74ae 100644 --- a/adapt/prepare/align.py +++ b/adapt/prepare/align.py @@ -9,6 +9,13 @@ import re import subprocess import tempfile +try: + import boto3 + from botocore.exceptions import ClientError +except: + cloud = False +else: + cloud = True from adapt.utils import seq_io @@ -52,27 +59,42 @@ class AlignmentMemoizer: This stores alignments as fasta files named by the hash. """ - def __init__(self, path): + def __init__(self, path, aws_access_key_id=None, aws_secret_access_key=None): """ Args: path: path to directory in which to read and store - memoized alignments + memoized alignments; if path begins with "s3://", + stores bucket """ - self.path = path - - if not os.path.exists(self.path): - os.makedirs(self.path) + if path[:5] == "s3://": + # Store path as an S3 Bucket Object + folders = path.split("/") + self.path = "/".join(folders[3:]) + self.bucket = folders[2] + if aws_access_key_id is not None and aws_secret_access_key is not None: + self.S3 = boto3.client("s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key) + else: + self.S3 = boto3.client("s3") + else: + self.path = path + self.bucket = None + if not os.path.exists(self.path): + os.makedirs(self.path) def _hash_accvers_filepath(self, accvers): - """Generate a path to use as the filename for an alignment. + """Generate a path or S3 key to use as the filename for an alignment. 
Args: accvers: collection of accession.version in an alignment Returns: - path to file that should store accvers + path to file or S3 key that should store accvers """ h = hashlib.md5(str(sorted(set(accvers))).encode('utf-8')).hexdigest() + if self.bucket: + return "/".join([self.path, h]) return os.path.join(self.path, h) def get(self, accvers): @@ -86,11 +108,29 @@ def get(self, accvers): if accvers is not memoized """ p = self._hash_accvers_filepath(accvers) + seqs = None + + if self.bucket: + # Download file from source if it exists, + # otherwise return None + try: + f = self.S3.get_object(Bucket = self.bucket, Key = p) + except ClientError as e: + if e.response['Error']['Code'] == "NoSuchKey": + return None + else: + raise + else: + lines = [line.decode('utf-8') for line in f["Body"].iter_lines()] + seqs = seq_io.process_fasta(lines) - if os.path.exists(p): - # Read the fasta, and verify that the accessions in it - # match accs (i.e., that there is not a collision) + elif os.path.exists(p): + # Read the fasta seqs = seq_io.read_fasta(p) + + if seqs: + # Verify that the accessions in the file match accs + # (i.e., that there is not a collision) if set(accvers) == set(seqs.keys()): return seqs else: @@ -106,12 +146,20 @@ def save(self, seqs): """ p = self._hash_accvers_filepath(seqs.keys()) - # Generate a random 8-digit hex to append to p temporarily, so that - # two files don't write to p at the same time - p_tmp = p + '.' 
+ ("%08x" % random.getrandbits(32)) - - seq_io.write_fasta(seqs, p_tmp) - os.replace(p_tmp, p) + if self.bucket: + # Make temporary file to store FASTA, then upload to S3 + p_tmp = tempfile.NamedTemporaryFile(delete=False) + p_tmp.close() + seq_io.write_fasta(seqs, p_tmp.name) + with open(p_tmp.name, 'rb') as f: + self.S3.put_object(Bucket=self.bucket, Key=p, Body=f) + os.unlink(p_tmp.name) + else: + # Generate a random 8-digit hex to append to p temporarily, so that + # two files don't write to p at the same time + p_tmp = "%s.%08x" % (p, random.getrandbits(32)) + seq_io.write_fasta(seqs, p_tmp) + os.replace(p_tmp, p) class AlignmentStatMemoizer: @@ -123,19 +171,31 @@ class AlignmentStatMemoizer: This stores, for each alignment, the tuple (aln_identity, aln_identity_ccg). """ - def __init__(self, path): + def __init__(self, path, aws_access_key_id=None, aws_secret_access_key=None): """ Args: path: path to directory in which to read and store memoized - stats + stats; if path begins with "s3://", stores bucket """ - self.path = path - - if not os.path.exists(self.path): - os.makedirs(self.path) + if path[:5] == "s3://": + # Store path as an S3 Bucket Object + folders = path.split("/") + self.path = "/".join(folders[3:]) + self.bucket = folders[2] + if aws_access_key_id is not None and aws_secret_access_key is not None: + self.S3 = boto3.client("s3", + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key) + else: + self.S3 = boto3.client("s3") + else: + self.path = path + self.bucket = None + if not os.path.exists(self.path): + os.makedirs(self.path) def _hash_accvers_filepath(self, accvers): - """Generate a path to use as the filename for an alignment. + """Generate a path or S3 key to use as the filename for an alignment. Let h be the hash of accers and and h_2 be the first two (hexadecimal) digits of h. 
This stores h in a directory @@ -146,11 +206,15 @@ def _hash_accvers_filepath(self, accvers): accvers: collection of accession.version in an alignment Returns: - path to file that should store stats on accvers + path to file or S3 key that should store stats on accvers """ h = hashlib.md5(str(sorted(set(accvers))).encode('utf-8')).hexdigest() h_2 = h[:2] + + if self.bucket: + return "/".join([self.path, h_2, h]) + h_dir = os.path.join(self.path, h_2) if not os.path.exists(h_dir): os.makedirs(h_dir) @@ -168,14 +232,32 @@ def get(self, accvers): accvers; or None if accvers is not memoized """ p = self._hash_accvers_filepath(accvers) + ls = None + + if self.bucket: + # Download file from source if it exists, + # otherwise return None + try: + f = self.S3.get_object(Bucket = self.bucket, Key = p) + except ClientError as e: + if e.response['Error']['Code'] == 'NoSuchKey': + return None + else: + raise e + else: + lines = f["Body"].read().decode('utf-8').rstrip() + ls = lines.split('\t') - if os.path.exists(p): - # Read the file, and verify that the accessions in it - # match accvers (i.e., that there is not a collision) + elif os.path.exists(p): + # Read the file with open(p) as f: lines = [line.rstrip() for line in f] - assert len(lines) == 1 # should be just 1 line - ls = lines[0].split('\t') + assert len(lines) == 1 # should be just 1 line + ls = lines[0].split('\t') + + if ls: + # Verify that the accessions in file match accvers + # (i.e., that there is not a collision) accvers_in_p = ls[0].split(',') if set(accvers) == set(accvers_in_p): stats = (float(ls[1]), float(ls[2])) @@ -193,17 +275,20 @@ def save(self, accvers, stats): stats: tuple (aln_identity, aln_identity_ccg) """ p = self._hash_accvers_filepath(accvers) - - # Generate a random 8-digit hex to append to p temporarily, so that - # two files don't write to p at the same time - p_tmp = p + '.' 
+ ("%08x" % random.getrandbits(32)) - accvers_str = ','.join(accvers) - with open(p_tmp, 'w') as fw: - fw.write(('\t'.join([accvers_str, str(stats[0]), str(stats[1])]) + - '\n')) - os.rename(p_tmp, p) + if self.bucket: + body = '\t'.join([accvers_str, str(stats[0]), str(stats[1])]) + '\n' + self.S3.put_object(Bucket=self.bucket, Key=p, + Body=body.encode('utf-8')) + else: + # Generate a random 8-digit hex to append to p temporarily, so that + # two files don't write to p at the same time + p_tmp = "%s.%08x" % (p, random.getrandbits(32)) + with open(p_tmp, 'w') as fw: + fw.write(('\t'.join([accvers_str, str(stats[0]), str(stats[1])]) + + '\n')) + os.rename(p_tmp, p) _mafft_exec = None diff --git a/adapt/prepare/prepare_alignment.py b/adapt/prepare/prepare_alignment.py index ce0bbfa..471e73b 100644 --- a/adapt/prepare/prepare_alignment.py +++ b/adapt/prepare/prepare_alignment.py @@ -133,6 +133,7 @@ def prepare_for(taxid, segment, ref_accs, out, # show up multiple times in neighbors due to having multiple RefSeq # entries acc_to_sample = list(set([n.acc for n in neighbors])) + acc_to_sample.sort() acc_to_fetch = random.choices(acc_to_sample, k=sample_seqs) neighbors = [n for n in neighbors if n.acc in acc_to_fetch] # Because this is sampling with replacement, an accession may diff --git a/adapt/prepare/tests/test_align.py b/adapt/prepare/tests/test_align.py index c4a22d6..37a57db 100644 --- a/adapt/prepare/tests/test_align.py +++ b/adapt/prepare/tests/test_align.py @@ -5,6 +5,17 @@ import tempfile from os import unlink import unittest +try: + import boto3 + import botocore + S3 = boto3.client() + resource = S3.list_buckets() +except: + cloud = False +else: + cloud = True + import random + import warnings from adapt.prepare import align @@ -86,103 +97,120 @@ class TestMemoization(unittest.TestCase): """Tests memoizers. 
""" + def setUp(self): + self.tempdir = tempfile.TemporaryDirectory() + if cloud: + cloudtempfold = random.getrandbits(32) + s3 = boto3.resource("s3") + self.bucket = s3.Bucket("%08x" % cloudtempfold) + self.bucket.create() + self.cloudtempdir = "s3://%08x" % cloudtempfold + def test_alignment_memoizer(self): - # Make a directory in which to save alignments - tempdir = tempfile.TemporaryDirectory() - - # Create some fake alignments - seqs1 = OrderedDict([('AB123.1', 'ATCG--A'), - ('AB456.1', 'TTTTCCA')]) - seqs2 = OrderedDict([('KY123.1', 'AAAACCA'), - ('AB456.1', 'TTTTCCA')]) - seqs3 = OrderedDict([('AB123.1', 'ATCG--A'), - ('KY123.1', 'AAAACCA'), - ('AB456.1', 'TTTTCCA')]) - seqs4 = OrderedDict([('AB123.1', 'ATCG--A'), - ('YZ123.1', 'GGGGCCA'), - ('DE456.1', 'CCCCCCA')]) - am = align.AlignmentMemoizer(tempdir.name) - - # Progressively save these and check the results - am.save(seqs1) - am.save(seqs2) - self.assertEqual(am.get(seqs1.keys()), seqs1) - self.assertEqual(am.get(seqs2.keys()), seqs2) - self.assertIsNone(am.get(seqs3.keys())) - self.assertIsNone(am.get(seqs4.keys())) - - am.save(seqs3) - self.assertEqual(am.get(seqs1.keys()), seqs1) - self.assertEqual(am.get(seqs2.keys()), seqs2) - self.assertEqual(am.get(seqs3.keys()), seqs3) - self.assertIsNone(am.get(seqs4.keys())) - - am.save(seqs4) - self.assertEqual(am.get(seqs1.keys()), seqs1) - self.assertEqual(am.get(seqs2.keys()), seqs2) - self.assertEqual(am.get(seqs3.keys()), seqs3) - self.assertEqual(am.get(seqs4.keys()), seqs4) - - seqs3 = OrderedDict([('AB123.1', 'ATCG--A'), - ('KY123.1', 'GGGGGGG'), - ('AB456.1', 'TTTTCCA')]) - am.save(seqs3) - self.assertEqual(am.get(seqs1.keys()), seqs1) - self.assertEqual(am.get(seqs2.keys()), seqs2) - self.assertEqual(am.get(seqs3.keys()), seqs3) - self.assertEqual(am.get(seqs4.keys()), seqs4) - del am - - # Try reading from a new AlignmentMemoizer at the same path - am_new = align.AlignmentMemoizer(tempdir.name) - self.assertEqual(am_new.get(seqs1.keys()), seqs1) - 
self.assertEqual(am_new.get(seqs2.keys()), seqs2) - self.assertEqual(am_new.get(seqs3.keys()), seqs3) - self.assertEqual(am_new.get(seqs4.keys()), seqs4) - - # Reorder keys and check that output is the same - seqs4_keys = list(seqs4.keys()) - self.assertEqual(am_new.get(seqs4_keys), seqs4) - self.assertEqual(am_new.get(seqs4_keys[::-1]), seqs4) - - # Cleanup - tempdir.cleanup() + def test_on_dir(directory): + # Create some fake alignments + seqs1 = OrderedDict([('AB123.1', 'ATCG--A'), + ('AB456.1', 'TTTTCCA')]) + seqs2 = OrderedDict([('KY123.1', 'AAAACCA'), + ('AB456.1', 'TTTTCCA')]) + seqs3 = OrderedDict([('AB123.1', 'ATCG--A'), + ('KY123.1', 'AAAACCA'), + ('AB456.1', 'TTTTCCA')]) + seqs4 = OrderedDict([('AB123.1', 'ATCG--A'), + ('YZ123.1', 'GGGGCCA'), + ('DE456.1', 'CCCCCCA')]) + am = align.AlignmentMemoizer(directory) + + # Progressively save these and check the results + am.save(seqs1) + am.save(seqs2) + self.assertEqual(am.get(seqs1.keys()), seqs1) + self.assertEqual(am.get(seqs2.keys()), seqs2) + self.assertIsNone(am.get(seqs3.keys())) + self.assertIsNone(am.get(seqs4.keys())) + + am.save(seqs3) + self.assertEqual(am.get(seqs1.keys()), seqs1) + self.assertEqual(am.get(seqs2.keys()), seqs2) + self.assertEqual(am.get(seqs3.keys()), seqs3) + self.assertIsNone(am.get(seqs4.keys())) + + am.save(seqs4) + self.assertEqual(am.get(seqs1.keys()), seqs1) + self.assertEqual(am.get(seqs2.keys()), seqs2) + self.assertEqual(am.get(seqs3.keys()), seqs3) + self.assertEqual(am.get(seqs4.keys()), seqs4) + + seqs3 = OrderedDict([('AB123.1', 'ATCG--A'), + ('KY123.1', 'GGGGGGG'), + ('AB456.1', 'TTTTCCA')]) + am.save(seqs3) + self.assertEqual(am.get(seqs1.keys()), seqs1) + self.assertEqual(am.get(seqs2.keys()), seqs2) + self.assertEqual(am.get(seqs3.keys()), seqs3) + self.assertEqual(am.get(seqs4.keys()), seqs4) + del am + + # Try reading from a new AlignmentMemoizer at the same path + am_new = align.AlignmentMemoizer(directory) + self.assertEqual(am_new.get(seqs1.keys()), seqs1) + 
self.assertEqual(am_new.get(seqs2.keys()), seqs2) + self.assertEqual(am_new.get(seqs3.keys()), seqs3) + self.assertEqual(am_new.get(seqs4.keys()), seqs4) + + # Reorder keys and check that output is the same + seqs4_keys = list(seqs4.keys()) + self.assertEqual(am_new.get(seqs4_keys), seqs4) + self.assertEqual(am_new.get(seqs4_keys[::-1]), seqs4) + + test_on_dir(self.tempdir.name) + + # If Boto3 is installed, test AWS cloud services + if cloud: + test_on_dir(self.cloudtempdir) def test_alignment_stat_memoizer(self): - # Create a new directory in which to store memoizations - tempdir = tempfile.TemporaryDirectory() - - # Create some fake stats - asm = align.AlignmentStatMemoizer(tempdir.name) - asm.save(['AB123.1', 'KY456.2'], (0.8, 0.9)) - asm.save(['AB123.1', 'KY789.2'], (0.85, 0.95)) - self.assertEqual(asm.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) - self.assertEqual(asm.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) - self.assertEqual(asm.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) - self.assertIsNone(asm.get(['KY456.2', 'KY789.2'])) - del asm - - # Check that a new AlignmentStatMemoizer can read these - asm_new = align.AlignmentStatMemoizer(tempdir.name) - self.assertEqual(asm_new.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) - self.assertEqual(asm_new.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) - self.assertEqual(asm_new.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) - self.assertIsNone(asm_new.get(['KY456.2', 'KY789.2'])) - - # Add a new stat to the memoizer - asm_new.save(['AB123.1', 'KY000.1'], (0.5, 0.6)) - del asm_new - - # Check that we can read all the stats - asm_new2 = align.AlignmentStatMemoizer(tempdir.name) - self.assertEqual(asm_new2.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) - self.assertEqual(asm_new2.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) - self.assertEqual(asm_new2.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) - self.assertIsNone(asm_new2.get(['KY456.2', 'KY789.2'])) - self.assertEqual(asm_new2.get(['AB123.1', 'KY000.1']), (0.5, 0.6)) - - # Cleanup - 
tempdir.cleanup() + def test_on_dir(directory): + # Create some fake stats + asm = align.AlignmentStatMemoizer(directory) + asm.save(['AB123.1', 'KY456.2'], (0.8, 0.9)) + asm.save(['AB123.1', 'KY789.2'], (0.85, 0.95)) + self.assertEqual(asm.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) + self.assertEqual(asm.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) + self.assertEqual(asm.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) + self.assertIsNone(asm.get(['KY456.2', 'KY789.2'])) + del asm + + # Check that a new AlignmentStatMemoizer can read these + asm_new = align.AlignmentStatMemoizer(directory) + self.assertEqual(asm_new.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) + self.assertEqual(asm_new.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) + self.assertEqual(asm_new.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) + self.assertIsNone(asm_new.get(['KY456.2', 'KY789.2'])) + + # Add a new stat to the memoizer + asm_new.save(['AB123.1', 'KY000.1'], (0.5, 0.6)) + del asm_new + + # Check that we can read all the stats + asm_new2 = align.AlignmentStatMemoizer(directory) + self.assertEqual(asm_new2.get(['AB123.1', 'KY456.2']), (0.8, 0.9)) + self.assertEqual(asm_new2.get(['AB123.1', 'KY789.2']), (0.85, 0.95)) + self.assertEqual(asm_new2.get(['KY789.2', 'AB123.1']), (0.85, 0.95)) + self.assertIsNone(asm_new2.get(['KY456.2', 'KY789.2'])) + self.assertEqual(asm_new2.get(['AB123.1', 'KY000.1']), (0.5, 0.6)) + + test_on_dir(self.tempdir.name) + + # If Boto3 is installed, test AWS cloud services + if cloud: + test_on_dir(self.cloudtempdir) + + def tearDown(self): + self.tempdir.cleanup() + if cloud: + self.bucket.objects.delete() + self.bucket.delete() class TestCurateAgainstRef(unittest.TestCase): diff --git a/adapt/utils/seq_io.py b/adapt/utils/seq_io.py index 78189fe..601c563 100644 --- a/adapt/utils/seq_io.py +++ b/adapt/utils/seq_io.py @@ -13,6 +13,37 @@ logger = logging.getLogger(__name__) +def process_fasta(f, replace_degenerate=False, + skip_gaps=False, make_uppercase=True): + + degenerate_pattern = 
re.compile('[YRWSMKBDHV]') + m = OrderedDict() + curr_seq_name = "" + for line in f: + line = line.rstrip() + if len(line) == 0: + # Reset the sequence being read on an empty line + curr_seq_name = "" + continue + if curr_seq_name == "": + # Must encounter a new sequence + assert line.startswith('>') + if line.startswith('>'): + curr_seq_name = line[1:] + m[curr_seq_name] = '' + else: + # Append the sequence + if make_uppercase: + line = line.upper() + if replace_degenerate: + line = degenerate_pattern.sub('N', line) + if skip_gaps: + line = line.replace('-', '') + m[curr_seq_name] += line + + return m + + def read_fasta(fn, replace_degenerate=False, skip_gaps=False, make_uppercase=True): """Read a FASTA file. @@ -36,40 +67,14 @@ def read_fasta(fn, replace_degenerate=False, """ logger.debug("Reading fasta file %s", fn) - degenerate_pattern = re.compile('[YRWSMKBDHV]') - - def process(f): - m = OrderedDict() - curr_seq_name = "" - for line in f: - line = line.rstrip() - if len(line) == 0: - # Reset the sequence being read on an empty line - curr_seq_name = "" - continue - if curr_seq_name == "": - # Must encounter a new sequence - assert line.startswith('>') - if line.startswith('>'): - curr_seq_name = line[1:] - m[curr_seq_name] = '' - else: - # Append the sequence - if make_uppercase: - line = line.upper() - if replace_degenerate: - line = degenerate_pattern.sub('N', line) - if skip_gaps: - line = line.replace('-', '') - m[curr_seq_name] += line - return m - if fn.endswith('.gz'): with gzip.open(fn, 'rt') as f: - m = process(f) + m = process_fasta(f, replace_degenerate, + skip_gaps, make_uppercase) else: with open(fn, 'r') as f: - m = process(f) + m = process_fasta(f, replace_degenerate, + skip_gaps, make_uppercase) return m diff --git a/bin/design.py b/bin/design.py index 688296d..bcb13ec 100644 --- a/bin/design.py +++ b/bin/design.py @@ -8,6 +8,8 @@ import re import shutil import tempfile +import random +import numpy as np from adapt import alignment from adapt 
import guide_search @@ -23,6 +25,14 @@ from adapt.utils import seq_io from adapt.utils import year_cover +try: + import boto3 + from botocore.exceptions import ClientError +except: + cloud = False +else: + cloud = True + __author__ = 'Hayden Metsky ' logger = logging.getLogger(__name__) @@ -214,18 +224,47 @@ def prepare_alignments(args): # Setup alignment and alignment stat memoizers if args.prep_memoize_dir: - if not os.path.isdir(args.prep_memoize_dir): - raise Exception(("Path '%s' does not exist") % - args.prep_memoize_dir) - align_memoize_dir = os.path.join(args.prep_memoize_dir, 'aln') - if not os.path.exists(align_memoize_dir): - os.makedirs(align_memoize_dir) - align_stat_memoize_dir = os.path.join(args.prep_memoize_dir, 'stats') - if not os.path.exists(align_stat_memoize_dir): - os.makedirs(align_stat_memoize_dir) - - am = align.AlignmentMemoizer(align_memoize_dir) - asm = align.AlignmentStatMemoizer(align_stat_memoize_dir) + if args.prep_memoize_dir[:5] == "s3://": + bucket = args.prep_memoize_dir.split("/")[2] + try: + if args.aws_access_key_id is not None and args.aws_secret_access_key is not None: + S3 = boto3.client("s3", aws_access_key_id = args.aws_access_key_id, + aws_secret_access_key = args.aws_secret_access_key) + else: + S3 = boto3.client("s3") + S3.head_bucket(Bucket=bucket) + except ClientError as e: + if e.response['Error']['Code'] == "404": + S3.create_bucket(Bucket=bucket) + elif e.response['Error']['Code'] == "403": + raise Exception(("Incorrect AWS Access Key ID or Secret Access Key")) from e + else: + raise + except ConnectionError as e: + raise Exception(("Cannot connect to Amazon S3")) from e + if args.prep_memoize_dir[-1] == "/": + align_memoize_dir = '%saln' %args.prep_memoize_dir + align_stat_memoize_dir = '%sstats' %args.prep_memoize_dir + else: + align_memoize_dir = '%s/aln' %args.prep_memoize_dir + align_stat_memoize_dir = '%s/stats' %args.prep_memoize_dir + else: + if not os.path.isdir(args.prep_memoize_dir): + raise 
Exception(("Path '%s' does not exist") % + args.prep_memoize_dir) + align_memoize_dir = os.path.join(args.prep_memoize_dir, 'aln') + if not os.path.exists(align_memoize_dir): + os.makedirs(align_memoize_dir) + align_stat_memoize_dir = os.path.join(args.prep_memoize_dir, 'stats') + if not os.path.exists(align_stat_memoize_dir): + os.makedirs(align_stat_memoize_dir) + + am = align.AlignmentMemoizer(align_memoize_dir, + aws_access_key_id = args.aws_access_key_id, + aws_secret_access_key = args.aws_secret_access_key) + asm = align.AlignmentStatMemoizer(align_stat_memoize_dir, + aws_access_key_id = args.aws_access_key_id, + aws_secret_access_key = args.aws_secret_access_key) else: am = None asm = None @@ -301,7 +340,8 @@ def prepare_alignments(args): nc = prepare_alignment.prepare_for( tax_id, segment, ref_accs, aln_file_dir.name, aln_memoizer=am, aln_stat_memoizer=asm, - sample_seqs=args.sample_seqs, prep_influenza=args.prep_influenza, + sample_seqs=args.sample_seqs, + prep_influenza=args.prep_influenza, years_tsv=years_tsv_tmp_name, cluster_threshold=args.cluster_threshold, accessions_to_use=accessions_to_use_for_tax, @@ -639,6 +679,10 @@ def guide_is_suitable(guide): def main(args): logger = logging.getLogger(__name__) + # Set random seed for entire program + random.seed(args.seed) + np.random.seed(args.seed) + check_obj_args(args) logger.info("Running design.py with arguments: %s", args) @@ -904,6 +948,10 @@ def __call__(self, parser, namespace, values, option_string=None): "equivalently, avoids guides flanked by 'G'). Note that " "this is the 3' end in the target sequence (not the spacer " "sequence).")) + base_subparser.add_argument('--seed', type=int, + help=("SEED will set the random seed, guaranteeing the same output " + "given the same inputs. 
If SEED is not set to the same value, " + "output may vary across different runs.")) # Use a model to predict activity base_subparser.add_argument('--predict-activity-model-path', @@ -1070,8 +1118,11 @@ def __call__(self, parser, namespace, values, option_string=None): help=("Path to mafft executable, used for generating alignments")) input_auto_common_subparser.add_argument('--prep-memoize-dir', help=("Path to directory in which to memoize alignments and " - "statistics on them; if not set, this does not memoize " - "this information")) + "statistics on them. If set to \"s3://BUCKET/PATH\", it " + "will save to the S3 bucket if boto3 and botocore are " + "installed and access key information exists via " + "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY or via AWS CLI. " + "If not set, this does not memoize this information.")) input_auto_common_subparser.add_argument('--sample-seqs', type=int, help=("After fetching accessions, randomly select SAMPLE_SEQS of them " "with replacement from each taxonomy any move forward " @@ -1129,6 +1180,14 @@ def __call__(self, parser, namespace, values, option_string=None): help=("API key to use for NCBI e-utils. Using this increases the " "limit on requests/second and may prevent an IP address " "from being block due to too many requests")) + input_auto_common_subparser.add_argument('--aws-access-key-id', + help=("User Account Access Key for AWS. This is only necessary " + "if using S3 for memoization via PREP_MEMOIZE_DIR and AWS CLI " + "is not installed and configured.")) + input_auto_common_subparser.add_argument('--aws-secret-access-key', + help=("User Account Secret Access Key for AWS. 
This is only " + "necessary if using S3 for memoization via PREP_MEMOIZE_DIR " + "and AWS CLI is not installed and configured.")) # Auto prepare from file input_autofile_subparser = argparse.ArgumentParser(add_help=False) diff --git a/requirements-with-aws.txt b/requirements-with-aws.txt new file mode 100644 index 0000000..0e45a73 --- /dev/null +++ b/requirements-with-aws.txt @@ -0,0 +1,5 @@ +numpy==1.18.2 +scipy==1.4.1 +tensorflow==2.3.0 +boto3==1.14.54 +botocore==1.17.54 diff --git a/setup.py b/setup.py index 2456d72..511beda 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,9 @@ author_email='hayden@mit.edu', packages=find_packages(), install_requires=['numpy>=1.16.0,<1.19.0', 'scipy==1.4.1', 'tensorflow>=2.3.0'], + extras_require={ + 'AWS': ['boto3>=1.14.54', 'botocore>=1.17.54'] + }, scripts=[ 'bin/design.py', 'bin/design_naively.py',