enhancements

msmbuilder · Jan 24, 2015 · 98af8cf · 98af8cf
1 parent 411166b
commit 98af8cf
Show file tree

Hide file tree

Showing 2 changed files with 131 additions and 64 deletions.
diff --git a/dihedral_mutinf.py b/dihedral_mutinf.py
@@ -3,8 +3,10 @@
 import argparse
 import cPickle
 import time
+import pandas as pd
 from multiprocessing import Pool
 from itertools import combinations_with_replacement as combinations
+from scipy import stats
 from sklearn.metrics import mutual_info_score
 from contextlib import closing
 
@@ -24,44 +26,84 @@ def __exit__(self, ty, val, tb):
         return False
 
 
-def rbins(n=30):
-    return np.linspace(-np.pi, np.pi, n+3)[1:-1]
+def shuffle(df, n=1):
+    ind = df.index
+    sampler = np.random.permutation
+    for i in range(n):
+        new_vals = df.take(sampler(df.shape[0])).values
+        df = pd.DataFrame(new_vals, index=ind)
+    return df
 
 
-def mi(X, Y, r=rbins()):
-    H = np.histogram2d(X, Y, [r, r])[0]
-    return mutual_info_score(None, None, contingency=H)
+def ent(nbins, *args):
+    W = np.vstack((args)).T
+    n = len(args)
+    H = np.histogramdd(W, bins=nbins, range=n*[[-180, 180]])
+    dx = H[1][0][1] - H[1][0][0]
+    return stats.entropy(H[0].flatten()) + n*np.log(dx)
+
+
+def mi(nbins, X, Y):
+    H = np.histogram2d(X, Y, bins=nbins, range=2*[[-180, 180]])
+    return mutual_info_score(None, None, contingency=H[0])
+
+
+class getDihedrals(object):
+    def __call__(self, traj):
+        atoms, angles = self.method(traj)
+        idx = [traj.topology.atom(i).residue.index
+               for i in atoms[:, self.type]]
+        return pd.DataFrame(180*angles/np.pi, columns=idx)
+
+    def __init__(self, method, type):
+        assert type < 3 or type > -1
+        self.type = type
+        self.method = method
 
 
 def dihedrals(traj):
-    kinds = [md.compute_phi,
-             md.compute_psi]
-    return [kind(traj)[1].T for kind in kinds]
+    kinds = [
+        getDihedrals(md.compute_phi, 2),
+        getDihedrals(md.compute_psi, 1),
+        getDihedrals(md.compute_chi1, 1),
+        getDihedrals(md.compute_chi2, 0)
+        ]
+    return [kind(traj) for kind in kinds]
 
 
 class f(object):
     def __call__(self, i):
-        return sum([mi(d[0][i[0]], d[1][i[1]])
-                    for d in combinations(self.D, 2)])
-
-    def __init__(self, D):
+        return np.divide(sum([mi(self.n, d[0][i[0]], d[1][i[1]])
+                              if i[0] in d[0].columns
+                              and i[1] in d[1].columns
+                              else 0.0
+                              for d in combinations(self.D, 2)]),
+                         sum([np.sqrt(
+                              ent(self.n, d[0][i[0]])*ent(self.n, d[1][i[1]]))
+                              if i[0] in d[0].columns
+                              and i[1] in d[1].columns
+                              else 0.0
+                              for d in combinations(self.D, 2)]))
+
+    def __init__(self, nbins, D):
         self.D = D
+        self.n = nbins
 
 
-def run(traj, iter, N):
+def run(traj, nbins, iter, N):
     D = dihedrals(traj)
-    n = D[0].shape[0]
+    n = np.unique(np.hstack(tuple(map(np.array, [df.columns for df in D]))))
     R = []
     for i in range(iter+1):
-        r = np.zeros((n, n))
-        g = f(D)
+        r = np.zeros((n.size, n.size))
+        g = f(nbins, D)
         with timing(i):
             with closing(Pool(processes=N)) as pool:
-                r[np.triu_indices(n)] = pool.map(g, combinations(range(n), 2))
+                r[np.triu_indices(n.size)] = pool.map(g, combinations(n, 2))
                 pool.terminate()
-            r[np.triu_indices(n)[::-1]] = r[np.triu_indices(n)]
+            r[np.triu_indices(n.size)[::-1]] = r[np.triu_indices(n.size)]
             R.append(r)
-            [np.random.shuffle(d) for d in D]
+            [shuffle(df) for df in D]
     return R[0] - np.mean(R[1:], axis=0)
 
 
@@ -76,6 +118,8 @@ def parse_cmdln():
                         default=100, type=int)
     parser.add_argument('-t', '--topology', dest='top',
                         help='File containing topology.', default=None)
+    parser.add_argument('-b', '--n-bins', dest='nbins',
+                        help='Number of bins', default=30, type=int)
     parser.add_argument('-n', '--n-proc', dest='N',
                         help='Number of processors to be used.',
                         default=4, type=int)
@@ -88,5 +132,5 @@ def parse_cmdln():
 if __name__ == "__main__":
     options = parse_cmdln()
     traj = md.load(options.traj, top=options.top)
-    M = run(traj, options.iter, options.N)
+    M = run(traj, options.nbins, options.iter, options.N)
     cPickle.dump(M, open(options.out, 'wb'))
diff --git a/dihedral_tent.py b/dihedral_tent.py
@@ -1,9 +1,11 @@
 import numpy as np
 import mdtraj as md
+import pandas as pd
 import time
 import argparse
 import cPickle
 from multiprocessing import Pool
+from scipy import stats
 from itertools import product
 from itertools import combinations_with_replacement as combinations
 from contextlib import closing
@@ -24,88 +26,107 @@ def __exit__(self, ty, val, tb):
         return False
 
 
-def rbins(n=30):
-    return np.linspace(-np.pi, np.pi, n+3)[1:-1]
+def shuffle(df, n=1):
+    ind = df.index
+    sampler = np.random.permutation
+    for i in range(n):
+        new_vals = df.take(sampler(df.shape[0])).values
+        df = pd.DataFrame(new_vals, index=ind)
+    return df
 
 
-def ent(H):
-    H /= H.sum()
-    return -np.sum(H*np.nan_to_num(np.log2(H)))
+def ent(nbins, *args):
+    W = np.vstack((args)).T
+    n = len(args)
+    H = np.histogramdd(W, bins=nbins, range=n*[[-180, 180]])
+    dx = H[1][0][1] - H[1][0][0]
+    return stats.entropy(H[0].flatten()) + n*np.log(dx)
 
 
-def ent1D(X, r=rbins()):
-    H = np.histogram(X, r)[0].astype(float)
-    return ent(H)
+def ce(nbins, X, Y):
+    return ent(nbins, X, Y) - ent(nbins, Y)
 
 
-def ent2D(X, Y, r=rbins()):
-    H = np.histogram2d(X, Y, 2*[r])[0].astype(float)
-    return ent(H)
+def cmi(nbins, X, Y, Z):
+    return sum([ent(nbins, X, Z),
+                ent(nbins, Y, Z),
+                -ent(nbins, X, Y, Z),
+                - ent(nbins, Z)])
 
 
-def ent3D(X, Y, Z, r=rbins()):
-    W = np.vstack((X, Y, Z)).T
-    H = np.histogramdd(W, 3*[r])[0].astype(float)
-    return ent(H)
+class getDihedrals(object):
+    def __call__(self, traj):
+        atoms, angles = self.method(traj)
+        idx = [traj.topology.atom(i).residue.index
+               for i in atoms[:, self.type]]
+        return pd.DataFrame(180*angles/np.pi, columns=idx)
 
-
-def ce(X, Y):
-    return ent2D(X, Y) - ent1D(Y)
-
-
-def cmi(X, Y, Z):
-    return ent2D(X, Z) + ent2D(Y, Z) - ent3D(X, Y, Z) - ent1D(Z)
+    def __init__(self, method, type):
+        assert type < 3 or type > -1
+        self.type = type
+        self.method = method
 
 
 def dihedrals(traj):
-    kinds = [md.compute_phi,
-             md.compute_psi]
-    return [kind(traj)[1].T for kind in kinds]
+    kinds = [
+        getDihedrals(md.compute_phi, 2),
+        getDihedrals(md.compute_psi, 1),
+        getDihedrals(md.compute_chi1, 1),
+        getDihedrals(md.compute_chi2, 0)
+        ]
+    return [kind(traj) for kind in kinds]
 
 
-class f(object):
+class F(object):
     def __call__(self, i):
-        return sum([cmi(self.cD[d[0]][i[0]], self.pD[d[1]][i[1]],
-                    self.pD[d[0]][i[0]])
+        return sum([cmi(self.n, self.cD[d[0]][i[0]], self.pD[d[1]][i[1]],
+                        self.pD[d[0]][i[0]])
+                    if i[0] in self.cD[d[0]].columns
+                    and i[1] in self.cD[d[1]].columns
+                    else 0.0
                     for d in combinations(range(len(self.cD)), 2)])
 
-    def __init__(self, cD, pD):
+    def __init__(self, nbins, cD, pD):
         self.cD = cD
         self.pD = pD
+        self.n = nbins
 
 
-class h(object):
+class H(object):
     def __call__(self, i):
-        return sum([ce(self.cD[d[0]][i[0]], self.pD[d[0]][i[0]])
+        return sum([ce(self.n, self.cD[d[0]][i], self.pD[d[1]][i])
+                    if i in self.cD[d[0]].columns
+                    and i in self.pD[d[1]].columns
+                    else 0.0
                     for d in combinations(range(len(self.cD)), 2)])
 
-    def __init__(self, cD, pD):
+    def __init__(self, nbins, cD, pD):
         self.cD = cD
         self.pD = pD
+        self.n = nbins
 
 
-def run(current, past, iter, N):
+def run(current, past, nbins, iter, N):
     cD = dihedrals(current)
     pD = dihedrals(past)
-    n = cD[0].shape[0]
+    n = np.unique(np.hstack(tuple(map(np.array, [df.columns for df in cD]))))
     R = []
-    q = h(cD, pD)
+    q = H(nbins, cD, pD)
     for i in range(iter+1):
-        g = f(cD, pD)
+        g = F(nbins, cD, pD)
         with timing(i):
             with closing(Pool(processes=N)) as pool:
-                R.append(np.reshape(pool.map(g, product(range(n),
-                                             range(n))),
-                                            (n, n)))
+                R.append(np.reshape(pool.map(g, product(n, n)),
+                                            (n.size, n.size)))
                 pool.terminate()
-            [np.random.shuffle(d) for d in cD]
-            [np.random.shuffle(d) for d in pD]
+            cD = [shuffle(df) for df in cD]
+            pD = [shuffle(df) for df in pD]
     CMI = R[0] - np.mean(R[1:], axis=0)
     with closing(Pool(processes=N)) as pool:
-        CH = (pool.map(q, zip(*(2*[range(n)])))*np.ones((n, n))).T
+        CH = (pool.map(q, n)*np.ones((n.size, n.size))).T
         pool.terminate()
     T = CMI/CH
-    return T - T.T
+    return pd.DataFrame(T - T.T, columns=n)
 
 
 def parse_cmdln():
@@ -122,6 +143,8 @@ def parse_cmdln():
     parser.add_argument('-s', '--shuffle-iter', dest='iter',
                         help='Number of shuffle iterations.',
                         default=100, type=int)
+    parser.add_argument('-b', '--n-bins', dest='nbins',
+                        help='Number of bins', default=30, type=int)
     parser.add_argument('-n', '--n-proc', dest='N',
                         help='Number of processors', default=4, type=int)
     parser.add_argument('-o', '--output', dest='out',
@@ -134,5 +157,5 @@ def parse_cmdln():
     options = parse_cmdln()
     current = md.load(options.current, top=options.top)
     past = md.load(options.past, top=options.top)
-    D = run(current, past, options.iter, options.N)
+    D = run(current, past, options.nbins, options.iter, options.N)
     cPickle.dump(D, open(options.out, 'wb'))