forked from jmp1985/metrix-database
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xia2_parser.py
380 lines (308 loc) · 11.1 KB
/
xia2_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# -*- coding: utf-8 -*-
from __future__ import division
from metrix_db.initialiser import processing_statistic_name_mapping
# Names of the statistics tables (overall, high-resolution shell and
# low-resolution shell). Each name is used as a table name when a row is
# created for a sweep.
stat_name_list = [
    'Overall_Stats',
    'High_Res_Stats',
    'Low_Res_Stats',
]
class XIA2Parser(object):
    '''
    A class to represent the xia2 parser

    The parser reads the DIALS version from a xia2.txt file and the scaling
    statistics from a xia2.json file, and writes them to the database behind
    the supplied handle (tables PDB_id, SWEEPS, Dev_Stats_json and the
    statistics tables named in stat_name_list).

    All values are bound with '?' placeholders. Table and column names cannot
    be bound as parameters, so they are interpolated into the SQL text and
    must only ever come from trusted, code-defined names.

    '''

    def __init__(self, handle):
        '''
        Init the class with the handle

        :param handle: open database connection; it must provide cursor()
                       and commit()

        '''
        self.handle = handle
        self.cur = self.handle.cursor()

    def _select_id_from_pdb_id(self, pdb_id):
        '''
        Find individual id for a PDB code in table PDB_id

        :param pdb_id: the PDB code to look up
        :returns: the id of the first matching row
        :raises RuntimeError: if the PDB code is not in table PDB_id

        '''
        self.cur.execute('''
          SELECT id FROM PDB_id WHERE PDB_id.pdb_id = ?
        ''', (pdb_id,))
        row = self.cur.fetchone()
        if row is None:
            # Fail with a readable message instead of subscripting None
            raise RuntimeError('No entry found in PDB_id for %s' % pdb_id)
        return row[0]

    def _insert_or_ignore_into_sweeps(self, pdb_id):
        '''
        Insert a row into SWEEPS that refers to the given PDB code

        The new row's pdb_id_id is taken from the id of the matching row in
        PDB_id; the insert is skipped silently if it violates a constraint.

        :param pdb_id: the PDB code the sweep belongs to

        '''
        self.cur.execute('''
          INSERT OR IGNORE INTO SWEEPS
          (pdb_id_id) SELECT id FROM PDB_id
          WHERE PDB_id.pdb_id = ?
        ''', (pdb_id,))

    def _select_id_from_sweeps(self, pdb_pk):
        '''
        Find the sweep ids belonging to a particular PDB_id row and return
        the last one

        :param pdb_pk: the id of the row in PDB_id
        :returns: the id of the last SWEEPS row returned by the query
        :raises IndexError: if there is no sweep for pdb_pk

        '''
        self.cur.execute('''
          SELECT id FROM SWEEPS WHERE SWEEPS.pdb_id_id = ?
        ''', (pdb_pk,))
        # NOTE(review): "last" relies on the order in which the database
        # returns the rows (no ORDER BY) - presumably the newest sweep.
        sweep_pk = self.cur.fetchall()[-1][0]
        return sweep_pk

    def _insert_into_sweep_id(self, name, sweep_pk):
        '''
        Create a row for the sweep in the given table

        :param name: the table name (trusted, interpolated into the SQL)
        :param sweep_pk: the id of the sweep

        '''
        self.cur.execute('''
          INSERT INTO %s (sweep_id) VALUES (?)
        ''' % (name,), (sweep_pk,))

    def _update_stat_column(self, table, name, value, sweep_pk):
        '''
        Set one column of one statistics table for a selected sweep_id

        :param table: the table name (trusted, interpolated into the SQL)
        :param name: the column name (trusted, interpolated into the SQL)
        :param value: the value to store (bound as a parameter)
        :param sweep_pk: the id of the sweep

        '''
        self.cur.execute('''
          UPDATE %s SET %s = ?
          WHERE sweep_id = ?
        ''' % (table, name), (value, sweep_pk))

    def _update_high_res_stats(self, name, value, sweep_pk):
        '''
        Update high resolution stats for a selected sweep_id

        '''
        self._update_stat_column('High_Res_Stats', name, value, sweep_pk)

    def _update_low_res_stats(self, name, value, sweep_pk):
        '''
        Update low resolution stats for a selected sweep_id

        '''
        self._update_stat_column('Low_Res_Stats', name, value, sweep_pk)

    def _update_overall_stats(self, name, value, sweep_pk):
        '''
        Update overall stats for a selected sweep_id

        '''
        self._update_stat_column('Overall_Stats', name, value, sweep_pk)

    def _update_stats(self, name, overall, low, high, sweep_pk):
        '''
        Update all stats for overall, low and high resolution at once

        A value of None means "nothing to store" and leaves the
        corresponding table untouched.

        :param name: the column name of the statistic
        :param overall: the overall value or None
        :param low: the low resolution value or None
        :param high: the high resolution value or None
        :param sweep_pk: the id of the sweep

        '''
        if overall is not None:
            self._update_overall_stats(name, overall, sweep_pk)
        if low is not None:
            self._update_low_res_stats(name, low, sweep_pk)
        if high is not None:
            self._update_high_res_stats(name, high, sweep_pk)

    def _update_wavelength(self, sweep_pk, wavelength):
        '''
        Update the wavelength for a given sweep_id

        '''
        self.cur.execute('''
          UPDATE SWEEPS SET wavelength = ? WHERE id = ?
        ''', (wavelength, sweep_pk))

    def _insert_into_dev_stats(self, sweep_pk):
        '''
        Enter sweep_id into Dev_Stats_json table

        '''
        self.cur.execute('''
          INSERT INTO Dev_Stats_json (sweep_id) VALUES (?)
        ''', (sweep_pk,))

    def _update_dev_stats_date_time(self, sweep_pk):
        '''
        Set the timestamp of the sweep in Dev_Stats_json to the current
        local date and time

        '''
        import datetime
        self.cur.execute('''
          UPDATE Dev_Stats_json SET date_time = ?
          WHERE Dev_Stats_json.sweep_id = ?
        ''', (str(datetime.datetime.today()), sweep_pk))

    def _get_number_of_executions(self, pdb_pk):
        '''
        Get the current execution number, i.e. the number of rows in SWEEPS
        that refer to the given PDB_id row

        '''
        self.cur.execute('''
          SELECT COUNT(*) FROM SWEEPS WHERE SWEEPS.pdb_id_id = ?
        ''', (pdb_pk,))
        number_of_executions = self.cur.fetchone()[0]
        return number_of_executions

    def _update_dev_stats_execution_number(self, sweep_pk, number_of_executions):
        '''
        Update the execution number for the sweep_id in the Dev_Stats_json
        table

        '''
        self.cur.execute('''
          UPDATE Dev_Stats_json SET execution_number = ?
          WHERE Dev_Stats_json.sweep_id = ?
        ''', (number_of_executions, sweep_pk))

    def _update_dev_stats_dials_version(self, sweep_pk, dials_version):
        '''
        Update the dials version for the sweep_id in the Dev_Stats_json table

        '''
        self.cur.execute('''
          UPDATE Dev_Stats_json SET dials_version = ?
          WHERE Dev_Stats_json.sweep_id = ?
        ''', (dials_version, sweep_pk))

    def _update_dev_stats(self, pdb_pk, sweep_pk, dials_version):
        '''
        Update some statistic metadata for each sweep_id

        Creates the Dev_Stats_json row for the sweep and fills in the
        timestamp, the execution number and the DIALS version.

        '''
        self._insert_into_dev_stats(sweep_pk)
        self._update_dev_stats_date_time(sweep_pk)
        number_of_executions = self._get_number_of_executions(pdb_pk)
        self._update_dev_stats_execution_number(sweep_pk, number_of_executions)
        self._update_dev_stats_dials_version(sweep_pk, dials_version)

    def _update_sweep_and_dev_stats(self, pdb_id, pdb_pk, wavelength, statistics, dials_version):
        '''
        Update all the information for a sweep with wavelength and statistics

        :param pdb_id: the PDB code
        :param pdb_pk: the id of the row in PDB_id
        :param wavelength: the wavelength to store for the sweep
        :param statistics: mapping of xia2 statistic name to a sequence of
                           either one value (overall) or three values
                           (overall, low, high)
        :param dials_version: the DIALS version string

        '''
        # Create a new sweep ID and get the sweep database id
        self._insert_or_ignore_into_sweeps(pdb_id)
        sweep_pk = self._select_id_from_sweeps(pdb_pk)

        # Create the row for this sweep in each statistics table
        for name in stat_name_list:
            self._insert_into_sweep_id(name, sweep_pk)

        # Update the wavelength of the sweep
        self._update_wavelength(sweep_pk, wavelength)

        # For each known statistic present in the input, enter into the
        # database under its mapped column name
        for stat, name in processing_statistic_name_mapping.items():
            if stat not in statistics:
                continue
            values = statistics[stat]
            assert len(values) in [1, 3], \
                'Expected 1 or 3 values for %s, found %d' % (stat, len(values))
            if len(values) == 3:
                overall, low, high = values
            else:
                overall, low, high = values[0], None, None
            self._update_stats(name, overall, low, high, sweep_pk)

        # Update the dev stats stuff
        self._update_dev_stats(pdb_pk, sweep_pk, dials_version)

    def _update_data_type(self, data_type, pdb_pk):
        '''
        Update the data type for the PDB ID (e.g. SAD, MAD, MR)

        '''
        self.cur.execute('''
          UPDATE PDB_id SET
          data_type = ? WHERE id = ?
        ''', (data_type, pdb_pk))

    def _commit(self):
        '''
        Commit changes back to the database

        '''
        self.handle.commit()

    def _is_sad_mad_or_mr(self, data):
        '''
        Decide if data is SAD, MAD or MR

        A crystal with a 'NATIVE' wavelength counts as MR, one with a 'SAD'
        wavelength as SAD and one whose wavelength names all start with
        'WAVE' as MAD. All crystals have to agree.

        :param data: the loaded xia2.json content
        :returns: 'SAD', 'MAD', 'MR', or None if a crystal matches none of
                  them or there are no crystals
        :raises AssertionError: if the crystals are of different types

        '''
        data_type = None
        for crystal in data['_crystals'].values():
            wavelengths = crystal['_wavelengths'].keys()
            if 'NATIVE' in wavelengths:
                found = 'MR'
            elif 'SAD' in wavelengths:
                found = 'SAD'
            elif all(w.startswith('WAVE') for w in wavelengths):
                found = 'MAD'
            else:
                return None
            assert data_type is None or data_type == found, \
                'Crystals are of mixed type: %s and %s' % (data_type, found)
            data_type = found
        return data_type

    def _iter_scaled_crystals(self, data):
        '''
        Yield (crystal name, scaler statistics, wavelengths) for every
        crystal in the data that has a scaler; other crystals are skipped

        '''
        crystals = data['_crystals']
        for crystal_name in crystals:
            crystal = crystals[crystal_name]
            if '_scaler' not in crystal or crystal['_scaler'] is None:
                continue
            scalr_statistics = crystal['_scaler']['_scalr_statistics']
            wavelengths = crystal['_wavelengths']
            yield crystal_name, scalr_statistics, wavelengths

    def _parse_xia2_wavelength(self, pdb_id, pdb_pk, data, dials_version, wavelength_name):
        '''
        Store the statistics of the one named wavelength of each scaled
        crystal as a sweep

        '''
        for crystal_name, scalr_statistics, wavelengths in self._iter_scaled_crystals(data):
            # The statistics are keyed by a JSON-formatted list held as a string
            key = '["AUTOMATIC", "%s", "%s"]' % (crystal_name, wavelength_name)
            result = scalr_statistics[key]
            wavelength = wavelengths[wavelength_name]['_wavelength']
            self._update_sweep_and_dev_stats(pdb_id, pdb_pk, wavelength, result, dials_version)

    def _parse_xia2_sad(self, pdb_id, pdb_pk, data, dials_version):
        '''
        Parse XIA2 SAD Data

        '''
        self._parse_xia2_wavelength(pdb_id, pdb_pk, data, dials_version, 'SAD')

        # Update the data type
        self._update_data_type("SAD", pdb_pk)
        print('SAD data input for %s completed.' % (pdb_id,))

    def _parse_xia2_mad(self, pdb_id, pdb_pk, data, dials_version):
        '''
        Parse XIA2 MAD Data

        Every scaled crystal is expected to have one statistics entry per
        wavelength, named WAVE1, WAVE2, ...

        '''
        for crystal_name, scalr_statistics, wavelengths in self._iter_scaled_crystals(data):
            # Loop through the wavelengths
            for wave in range(1, len(scalr_statistics) + 1):
                # Get the statistics and wavelength for the sweep
                result = scalr_statistics['["AUTOMATIC", "%s", "WAVE%d"]' % (crystal_name, wave)]
                wavelength = wavelengths['WAVE%d' % wave]['_wavelength']

                # Update the statistics
                self._update_sweep_and_dev_stats(
                    pdb_id, pdb_pk, wavelength, result, dials_version)

        # Update the data type
        self._update_data_type("MAD", pdb_pk)
        print('MAD data input for %s completed.' % (pdb_id,))

    def _parse_xia2_mr(self, pdb_id, pdb_pk, data, dials_version):
        '''
        Parse XIA2 MR data

        '''
        self._parse_xia2_wavelength(pdb_id, pdb_pk, data, dials_version, 'NATIVE')

        # Update the data type
        self._update_data_type("MR", pdb_pk)
        print('MR data input for %s completed. ' % (pdb_id,))

    def _parse_xia2_json(self, pdb_id, filename, dials_version):
        '''
        Parse a xia2.json file

        :param pdb_id: the PDB code the file belongs to
        :param filename: the path of the xia2.json file
        :param dials_version: the DIALS version string
        :raises RuntimeError: if the PDB code is not in the database or the
                              data is not SAD, MAD or MR

        '''
        import json

        # Load the XIA2 Json file, closing it again afterwards
        with open(filename) as infile:
            data = json.load(infile)

        # Perform check for SAD, MAD or MR
        check = self._is_sad_mad_or_mr(data)

        # Select entry for pdb_id
        pdb_pk = self._select_id_from_pdb_id(pdb_id)

        # Execute function based on data type
        parsers = {
            'SAD': self._parse_xia2_sad,
            'MAD': self._parse_xia2_mad,
            'MR': self._parse_xia2_mr,
        }
        if check not in parsers:
            raise RuntimeError('Data needs to be SAD, MAD or MR: found %s' % check)
        parsers[check](pdb_id, pdb_pk, data, dials_version)

        # Commit changes back to the database
        self._commit()

    def _parse_xia2_txt(self, filename):
        '''
        Get the DIALS version

        :param filename: the path of the xia2.txt file
        :returns: the text after the first six characters of the first line
                  starting with 'DIALS'; the line ending is kept
        :raises RuntimeError: if no line starts with 'DIALS'

        '''
        with open(filename) as infile:
            for line in infile:
                if line.startswith('DIALS'):
                    return line[6:]
        raise RuntimeError("Couldn't read DIALS version from %s" % filename)

    def add_entry(self, pdb_id, xia2_txt_filename, xia2_json_filename):
        '''
        Add the xia2 entry

        :param pdb_id: the PDB code; it has to exist in table PDB_id
        :param xia2_txt_filename: the path of the xia2.txt file
        :param xia2_json_filename: the path of the xia2.json file

        '''
        # Parse the xia2.txt
        dials_version = self._parse_xia2_txt(xia2_txt_filename)

        # Parse the xia2 json file
        self._parse_xia2_json(pdb_id, xia2_json_filename, dials_version)