geohot · KacieAhmed · Mar 18, 2021 · Mar 18, 2021 · Apr 16, 2021
diff --git a/corona b/corona
diff --git a/corona.py b/corona.py
@@ -1,4 +1,4 @@
-from lib import cc, translate
+from lib import genome, translate
 # entire diff: https://www.ncbi.nlm.nih.gov/projects/msaviewer/?rid=7FYNU14F01R&coloring=
 # protein alignments: http://virological.org/t/alignment-of-58-sarbecovirus-genomes-for-conservation-analysis-of-sars-cov-2/430
 
@@ -44,15 +44,17 @@
 #        https://en.wikipedia.org/wiki/MRNA_(nucleoside-2%27-O-)-methyltransferase
 
 # in front "the untranslated leader sequence that ends with the Transcription Regulation Sequence"
-corona['untranslated_region'] = cc[0:265]
+corona['untranslated_region'] = genome.get_nucleotides()[0:265]
 
-corona['orf1a'] = translate(cc[266-1:13483], True)
+corona['orf1a'] = translate(genome.get_nucleotides()[266-1:13483], True)
 
 # cc[266-1+4398*3:13468] = 'TTT_TTA_AAC' aka 'X_XXY_YYZ'
 # https://en.wikipedia.org/wiki/Ribosomal_frameshift
 # Programmed −1 Ribosomal Frameshifting
 # TODO: add this to the translate function with automatic detection
-corona['orf1b'] = translate(cc[13468-1:21555], False).strip("*")  # chop off the stop, note this doesn't have a start
+# chop off the stop, note this doesn't have a start
+corona['orf1b'] = translate(genome.get_nucleotides()[
+                            13468-1:21555], False).strip("*")
 
 # exploit vector, this attaches to ACE2. also called "surface glycoprotein"
 # https://www.ncbi.nlm.nih.gov/Structure/pdb/6VYB -- open state
@@ -62,28 +64,32 @@
 #   S2  = 686-1273
 #   S2' = 816-1273
 # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2750777/
-corona['spike_glycoprotein'] = translate(cc[21563-1:25384], True)
+corona['spike_glycoprotein'] = translate(
+    genome.get_nucleotides()[21563-1:25384], True)
 
 # Forms homotetrameric potassium sensitive ion channels (viroporin) and may modulate virus release.
-corona['orf3a'] = translate(cc[25393-1:26220], True)
+corona['orf3a'] = translate(genome.get_nucleotides()[25393-1:26220], True)
 
 # these two things stick out, used in assembly aka they package the virus
-corona['envelope_protein'] = translate(cc[26245-1:26472], True)  # also known as small membrane
-corona['membrane_glycoprotein'] = translate(cc[26523-1:27191], True)
+corona['envelope_protein'] = translate(
+    genome.get_nucleotides()[26245-1:26472], True)  # also known as small membrane
+corona['membrane_glycoprotein'] = translate(
+    genome.get_nucleotides()[26523-1:27191], True)
 
-corona['orf6'] = translate(cc[27202-1:27387], True)
+corona['orf6'] = translate(genome.get_nucleotides()[27202-1:27387], True)
 
-corona['orf7a'] = translate(cc[27394-1:27759], True)
-corona['orf7b'] = translate(cc[27756-1:27887], True)  # is this one real?
+corona['orf7a'] = translate(genome.get_nucleotides()[27394-1:27759], True)
+corona['orf7b'] = translate(genome.get_nucleotides()[
+                            27756-1:27887], True)  # is this one real?
 
-corona['orf8'] = translate(cc[27894-1:28259], True)
+corona['orf8'] = translate(genome.get_nucleotides()[27894-1:28259], True)
 
 # https://en.wikipedia.org/wiki/Capsid
 # Packages the positive strand viral genome RNA into a helical ribonucleocapsid
 # Includes the "internal" protein (from Coronavirus Pathogenesis)
 # https://www.sciencedirect.com/topics/veterinary-science-and-veterinary-medicine/human-coronavirus-oc43
-corona['nucleocapsid_phosphoprotein'] = translate(cc[28274-1:29533], True)
+corona['nucleocapsid_phosphoprotein'] = translate(
+    genome.get_nucleotides()[28274-1:29533], True)
 
 # might be called the internal protein (Coronavirus Pathogenesis)
-corona['orf10'] = translate(cc[29558-1:29674], True)
-
+corona['orf10'] = translate(genome.get_nucleotides()[29558-1:29674], True)
diff --git a/fold.py b/fold.py
@@ -7,35 +7,19 @@
 import sys
 import argparse
 
-parser = argparse.ArgumentParser(description='Fold some proteins.')
-parser.add_argument('--scratch', action='store_true')
-parser.add_argument('--temp', type=int, default=300)
-parser.add_argument('--steps', type=int, default=100000, help="2500000000 should fold the protein")
-parser.add_argument('--writes', type=int, default=1000, help="default is 1000")
-parser.add_argument('--out', type=str, default="/tmp/output.pdb")
-parser.add_argument('--pdb', type=str, default="proteins/villin/1vii.pdb")
-parser.add_argument('--fasta', type=str, default=None)
-args = parser.parse_args(sys.argv[1:])
+from parser import Parser
+from unfold import Unfold
+from simulate import Simulate
+
+
+args = Parser(argparse).parse()
 
 try:
-  platform = Platform.getPlatformByName("CUDA")
+    platform = Platform.getPlatformByName("CUDA")
 except Exception:
-  platform = Platform.getPlatformByName("OpenCL")
-
-if args.scratch:
-  # unfolded protein
-  if args.fasta is not None:
-    fasta = args.fasta
-  else:
-    protein_fasta = "proteins/villin/1vii.fasta"
-    fasta = open(protein_fasta).read().split("\n")[1]
-  print("folding %s" % fasta)
-  from lib import write_unfolded
-  write_unfolded(fasta, "/tmp/unfolded.pdb")
-  pdb = PDBFile("/tmp/unfolded.pdb")
-else:
-  # already folded protein
-  pdb = PDBFile(args.pdb)
+    platform = Platform.getPlatformByName("OpenCL")
+
+pdb = Unfold(args).unfold
 
 #forcefield = ForceField('amber99sb.xml', 'tip3p.xml')
 forcefield = ForceField('amber03.xml', 'amber03_obc.xml')
@@ -44,19 +28,4 @@
 modeller.addHydrogens(forcefield)
 print(modeller.topology)
 
-system = forcefield.createSystem(modeller.topology,
-  implicitSolvent=OBC2,   # matches https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2980750/#bib39
-  nonbondedMethod=NoCutoff, nonbondedCutoff=1*nanometer,
-  constraints=HBonds)
-integrator = LangevinIntegrator(args.temp*kelvin, 1/picosecond, 2*femtoseconds)
-simulation = Simulation(modeller.topology, system, integrator, platform)
-simulation.context.setPositions(modeller.positions)
-simulation.minimizeEnergy()
-
-steps = args.steps
-steps_write = max(1, steps//args.writes)
-print("writing every %d steps" % steps_write)
-simulation.reporters.append(PDBReporter(args.out, steps_write))
-simulation.reporters.append(StateDataReporter(stdout, steps_write, step=True, potentialEnergy=True, temperature=True))
-simulation.step(steps)
-
+Simulate(forcefield, modeller, args).simulate()
diff --git a/genome.py b/genome.py
@@ -0,0 +1,23 @@
+# A genome sequence is the complete list of the nucleotides
+# (A, C, G, and T for DNA genomes) that make up all the
+# chromosomes of an individual or a species
+
+# TODO: make nucleotides have the appropriate property...
+# specify that nucleotides is a string containing only A, C, G, T
+
+# nucleotides -- is every sequence of A, C, G, T a valid genome?
+
+class Genome:
+    def __init__(self, nucleotides):
+        self.nucleotides = nucleotides
+
+    def get_nucleotides(self):
+        return self.nucleotides
+
+
+class GenomeBuilder():
+    def __init__(self, nucleotides):
+        self.genome = Genome(nucleotides)
+
+    def build(self):
+        return self.genome
diff --git a/lib.py b/lib.py
@@ -1,5 +1,9 @@
+import pathlib
+import json
+import os
 import random
-
+from genome import Genome
+from genome import GenomeBuilder
 # Asn or Asp / B  AAU, AAC; GAU, GAC
 # Gln or Glu / Z  CAA, CAG; GAA, GAG
 # START AUG
@@ -27,87 +31,95 @@
 """.strip()
 dec = {}
 for t in tt.split("\n"):
-  k = t[:len("Val / V")].strip()
-  v = t[len("Val / V "):]
-  if '/' in k:
-    k = k.split("/")[-1].strip()
-  k = k.replace("STOP", "*")
-  v = v.replace(",", "").replace(";", "").lower().replace("u", "t").split(" ")
-  for vv in v:
-    if vv in dec:
-      print("dup", vv)
-    dec[vv.strip()] = k
+    k = t[:len("Val / V")].strip()
+    v = t[len("Val / V "):]
+    if '/' in k:
+        k = k.split("/")[-1].strip()
+    k = k.replace("STOP", "*")
+    v = v.replace(",", "").replace(
+        ";", "").lower().replace("u", "t").split(" ")
+    for vv in v:
+        if vv in dec:
+            print("dup", vv)
+        dec[vv.strip()] = k
+
 
 def translate(x, protein=False):
-  x = x.lower()
-  aa = []
-  for i in range(0, len(x)-2, 3):
-    aa.append(dec[x[i:i+3]])
-  aa = ''.join(aa)
-  if protein:
-    if aa[0] != "M" or aa[-1] != "*":
-      print("BAD PROTEIN")
-      print(aa)
-      return None
-    aa = aa[:-1]
-  return aa
+    x = x.lower()
+    aa = []
+    for i in range(0, len(x)-2, 3):
+        aa.append(dec[x[i:i+3]])
+    aa = ''.join(aa)
+    if protein:
+        if aa[0] != "M" or aa[-1] != "*":
+            print("BAD PROTEIN")
+            print(aa)
+            return None
+        aa = aa[:-1]
+    return aa
+
 
 ltl = 'Asp D Glu E Arg R Lys K His H Asn N Gln Q Ser S Thr T Tyr Y Ala A Gly G Val V Leu L Ile I Pro P Phe F Met M Trp W Cys C'
 ltl = ltl.split(" ")
 ltl = dict(zip(ltl[1::2], ltl[0::2]))
 
+
 def get_atoms():
-  from data import get_amber99sb
-  amber99sb = get_amber99sb()
-  residues = amber99sb.getElementsByTagName("Residue")
-  atoms = {}
-  for r in residues:
-    name = r.attributes['name'].value
-    atoms[name] = [x.attributes['name'].value for x in r.getElementsByTagName("Atom")]
-  return atoms
+    from data import get_amber99sb
+    amber99sb = get_amber99sb()
+    residues = amber99sb.getElementsByTagName("Residue")
+    atoms = {}
+    for r in residues:
+        name = r.attributes['name'].value
+        atoms[name] = [
+            x.attributes['name'].value for x in r.getElementsByTagName("Atom")]
+    return atoms
+
 
 def write_unfolded(fasta, fn):
-  atoms = get_atoms()
-  atom_num = 1
-  res_num = 1
-  ss = []
-  random.seed(1337)
-  for i, aa in enumerate(fasta):
-    tl = ltl[aa].upper()
-    for a in atoms[tl] + ([] if i != len(fasta)-1 else ["OXT"]):
-      if len(a) < 4:
-        pa = " " + a
-      else:
-        pa = a
-      gr = lambda: 1.0*(random.random()-0.5)
-      x,y,z = gr(), gr(), gr()
-      x += res_num*5
-      s = "ATOM %6d %-4s %3s A %3d    %8.3f%8.3f%8.3f  1.00  1.00           %s" % \
-        (atom_num, pa, tl, res_num, x, y, z, a[0:1])
-      ss.append(s)
-      atom_num += 1
-    res_num += 1
+    atoms = get_atoms()
+    atom_num = 1
+    res_num = 1
+    ss = []
+    random.seed(1337)
+    for i, aa in enumerate(fasta):
+        tl = ltl[aa].upper()
+        for a in atoms[tl] + ([] if i != len(fasta)-1 else ["OXT"]):
+            if len(a) < 4:
+                pa = " " + a
+            else:
+                pa = a
+
+            def gr(): return 1.0*(random.random()-0.5)
+            x, y, z = gr(), gr(), gr()
+            x += res_num*5
+            s = "ATOM %6d %-4s %3s A %3d    %8.3f%8.3f%8.3f  1.00  1.00           %s" % \
+                (atom_num, pa, tl, res_num, x, y, z, a[0:1])
+            ss.append(s)
+            atom_num += 1
+        res_num += 1
+
+    with open(fn, "w") as f:
+        f.write('\n'.join(ss))
+
 
-  with open(fn, "w") as f:
-    f.write('\n'.join(ss))
-
 def invert(dd):
-  dd = dd.upper()
-  def _invert(x):
-    if x == 'A':
-      return 'T'
-    elif x == 'T':
-      return 'A'
-    elif x == 'C':
-      return 'G'
-    elif x == 'G':
-      return 'C'
-  return (''.join([_invert(x) for x in dd]))[::-1]
+    dd = dd.upper()
 
-import pathlib
-import os
-import json
-with open(os.path.join(pathlib.Path(__file__).parent.absolute(), "data", "allseq.json")) as f:
-  allseq = json.load(f)
-cc = allseq['MN908947']
+    def _invert(x):
+        if x == 'A':
+            return 'T'
+        elif x == 'T':
+            return 'A'
+        elif x == 'C':
+            return 'G'
+        elif x == 'G':
+            return 'C'
+    return (''.join([_invert(x) for x in dd]))[::-1]
 
+
+with open(os.path.join(pathlib.Path(__file__).parent.absolute(), "data", "allseq.json")) as f:
+    allseq = json.load(f)
+nucleotides_list = allseq['MN908947']
+builder = GenomeBuilder(nucleotides_list)
+genome = builder.build()
diff --git a/opt.py b/opt.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
-from lib import cc as virus
+from lib import genome as virus
 from vaccine.load import dat as vaccine
 from corona import corona
 
-virus = virus.replace("T", "U")
+virus = virus.get_nucleotides().replace("T", "U")
 vaccine = vaccine.replace("Ψ", "U")
 
 """
@@ -21,4 +21,3 @@
 
 print(vvirus)
 print(vaccine)
-
diff --git a/parser.py b/parser.py
@@ -0,0 +1,22 @@
+import sys
+
+
+class Parser:
+    def __init__(self, argparse):
+        self.argparse = argparse
+
+    def parse(self):
+        parser = self.argparse.ArgumentParser(
+            description='Fold some proteins.')
+        parser.add_argument('--scratch', action='store_true')
+        parser.add_argument('--temp', type=int, default=300)
+        parser.add_argument('--steps', type=int, default=100000,
+                            help="2500000000 should fold the protein")
+        parser.add_argument('--writes', type=int,
+                            default=1000, help="default is 1000")
+        parser.add_argument('--out', type=str, default="/tmp/output.pdb")
+        parser.add_argument('--pdb', type=str,
+                            default="proteins/villin/1vii.pdb")
+        parser.add_argument('--fasta', type=str, default=None)
+        args = parser.parse_args(sys.argv[1:])
+        return args