"""Compute and plot learning curves of the predictor.
"""
import argparse
import gzip
import random

import numpy as np
import tensorflow as tf

import parse_data
import predictor
import predictor_hyperparam_search

__author__ = 'Hayden Metsky <hayden@mit.edu>'

def compute_learning_curve(x, y, regression, context_nt, data_parser,
        num_splits=5, num_inner_splits=5, num_sizes=10,
        outer_splits_to_run=None):
    """Compute a learning curve.

    This splits the data num_splits ways. In each split S, there is a
    training set S_T and a validation (test) set S_V. S_T gets further split
    so that a fraction of it can be used for early stopping during training.
    Let S_T' be what remains of it. Then, this trains the model using
    (n / num_sizes)*|S_T'| training data points for n in [1,...,num_sizes].
    For each number of training data points, it computes metrics on the
    held-out validation set.

    Note that, even at the largest sampling, not all training data points
    will be used: some are reserved for validation/testing, and others are
    reserved for early stopping during training.

    Args:
        x: input data
        y: labels/output
        regression: if True, perform regression; if False, classification
        context_nt: number of nt on each side of the guide to use for context
        data_parser: data parser object; needed for sampling
        num_splits: number of splits of the data to make (i.e., k in k-fold);
            at each size, this yields num_splits different train/validation
            metrics based on the different splits of the data
        num_inner_splits: number of splits to perform for the hyperparameter
            search on each outer fold and sampling size
        num_sizes: number of different sizes of the training data at which to
            compute train/validation metrics
        outer_splits_to_run: if set, a list of outer splits to run (0-based);
            if not set, run all

    Returns:
        dict
        {fold index:
            {sampling approach ('sample_all' indicating sampling across all
             data points, or 'sample_crrnas' indicating sampling from the
             crRNAs):
                {sample size:
                    {'val': validation metrics}
                }
            }
        }
    """
    def sample_over_all_data(num, xx, yy):
        # Produce a subsample, of size num, across all unique data points
        # in xx and yy; sample to include all replicates for each unique
        # data point included
        assert len(xx) == len(yy)
        assert num <= data_parser.num_unique_points(xx)
        idx = data_parser.unique_sampling_idx(num, xx)
        return xx[idx], yy[idx]

    def sample_over_crrnas(num, xx, yy):
        # Produce a subsample, of num crRNAs, across the different
        # crRNAs in xx and yy; for each crRNA, include all data points
        # This should produce the same number of data points as
        # sample_over_all_data(num*(number of targets per crRNA), xx, yy)
        # but with fewer crRNAs; the output of sample_over_all_data() may
        # encompass all crRNAs even with a small fraction of all data points
        # being sampled
        assert len(xx) == len(yy)
        # Determine the crRNA position of each data point in xx; this
        # effectively serves as an identifier for the crRNA
        xx_with_pos = [(xi, data_parser.pos_for_input(xi))
                for xi in xx]
        all_pos = set([pos for xi, pos in xx_with_pos])
        assert num <= len(all_pos)
        crrna_pos_sampled = set(random.sample(list(all_pos), num))
        idx = [i for i, (xi, xi_pos) in enumerate(xx_with_pos)
                if xi_pos in crrna_pos_sampled]
        return xx[idx], yy[idx]
    # Determine the sizes to sample for each sampling approach
    # This should be the minimum number, for each sampling fraction, taken
    # across the folds (the numbers to sample should be very similar
    # across the folds, but might be slightly different; this helps ensure
    # that we sample the same set of sizes for each fold)
    sample_sizes_all = [None]*num_sizes
    sample_sizes_crrnas = [None]*num_sizes
    split_iter = data_parser.split(x, y, num_splits=num_splits,
            stratify_by_pos=True)
    for x_train, y_train, _, _ in split_iter:
        # Reserve some of x_train for early stopping
        train_split_iter = data_parser.split(x_train, y_train, num_splits=4,
                stratify_by_pos=True)
        x_train_for_train, y_train_for_train, _, _ = next(train_split_iter)
        num_unique_points = data_parser.num_unique_points(x_train_for_train)

        # Use num_sizes different subsamplings of x_train_for_train, in
        # increasing size
        for i in range(num_sizes):
            n = i + 1
            frac_to_sample = float(n) / num_sizes

            # Number to sample, sampling from unique data points
            num_points_to_sample = round(frac_to_sample * num_unique_points)
            if sample_sizes_all[i] is None or num_points_to_sample < sample_sizes_all[i]:
                sample_sizes_all[i] = num_points_to_sample

            # Number to sample, sampling from crRNAs
            crrna_pos = set([data_parser.pos_for_input(xi) for xi in
                x_train_for_train])
            num_crrnas_to_sample = round(frac_to_sample * len(crrna_pos))
            if sample_sizes_crrnas[i] is None or num_crrnas_to_sample < sample_sizes_crrnas[i]:
                sample_sizes_crrnas[i] = num_crrnas_to_sample
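
    # For example (hypothetical counts): if the folds' S_T' sets contain
    # 1000, 998, and 1002 unique points, their candidate sizes at the
    # largest fraction are 1000, 998, and 1002; keeping the minimum (998)
    # ensures every fold is later sampled at exactly the same sizes.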

    # Construct the learning curve
    learning_curve = {}
    fold = 0
    split_iter = data_parser.split(x, y, num_splits=num_splits,
            stratify_by_pos=True)
    for x_train, y_train, x_validate, y_validate in split_iter:
        print('STARTING SPLIT {} of {}'.format(fold+1, num_splits))
        print('  Training up to n={}, validating on n={}'.format(
            len(x_train), len(x_validate)))

        if outer_splits_to_run is not None:
            if fold not in outer_splits_to_run:
                print('  Skipping this outer split')
                fold += 1
                # Advance the random number generators, so that skipped
                # splits do not alter the random state seen by the splits
                # that do run
                random.random()
                np.random.random()
                tf.random.uniform([1])
                continue

        learning_curve[fold] = {'sample_all': {}, 'sample_crrnas': {}}

        # Use x_validate, y_validate for validation of the model on
        # this fold

        # Reserve some of x_train for early stopping, mirroring the split
        # above, so that samples are drawn only from this fold's S_T'
        # (without this, x_train_for_train would be left over from the
        # size-determination loop above and could overlap x_validate)
        train_split_iter = data_parser.split(x_train, y_train, num_splits=4,
                stratify_by_pos=True)
        x_train_for_train, y_train_for_train, _, _ = next(train_split_iter)

        # Use num_sizes different subsamplings of x_train_for_train, in
        # increasing size; each sampling has some number of *unique* data
        # points (and all replicates for each of those)
        for size_i in range(num_sizes):
            print('Training on sampling {} of {} of the training set'.format(
                size_i+1, num_sizes))

            if regression:
                # Print what the MSE would be if only predicting the mean of
                # the training data
                print('  MSE on train data if predicting mean of train data:',
                        np.mean(np.square(np.mean(y_train) - y_train.ravel())))
                print('  MSE on validation data if predicting mean of train data:',
                        np.mean(np.square(np.mean(y_train) - y_validate.ravel())))

            # Sample from all data points, train, and then run
            # predictor.test_with_keras() on the validation data for this
            # fold to get the validation results
            size = sample_sizes_all[size_i]
            print('Sampling {} unique points from all data points'.format(size))
            x_train_sample, y_train_sample = sample_over_all_data(
                    size, x_train_for_train, y_train_for_train)

            # Perform hyperparameter search; do this because hyperparameters
            # may vary based on sampling size
            hyperparams, _, _ = predictor_hyperparam_search.search_for_hyperparams(
                    x_train_sample, y_train_sample, 'random', regression,
                    context_nt, num_splits=num_inner_splits,
                    num_random_samples=50, dp=data_parser)

            # Split the training data ({x,y}_train_sample) *again* to get a
            # separate validation set ({x,y}_train_sample_for_es) to pass to
            # predictor.train_with_keras(), which will use it for early
            # stopping
            # This way, we do not use the same validation set both for early
            # stopping and for measuring the performance of the model
            # (otherwise our 'training' of the model, which includes deciding
            # when to stop, would have seen and factored in the data used for
            # evaluating it on the fold)
            # Only use one of the splits, with 3/4 being used to train the
            # model and 1/4 for early stopping
            train_split_iter = data_parser.split(x_train_sample,
                    y_train_sample, num_splits=4, stratify_by_pos=True)
            (x_train_sample_for_train, y_train_sample_for_train,
                    x_train_sample_for_es,
                    y_train_sample_for_es) = next(train_split_iter)

            # Construct model
            model = predictor.construct_model(hyperparams,
                    x_train_sample.shape, regression,
                    compile_for_keras=True, y_train=y_train_sample)
            predictor.train_with_keras(
                    model, x_train_sample_for_train, y_train_sample_for_train,
                    x_train_sample_for_es, y_train_sample_for_es)
            val_results = predictor.test_with_keras(model, x_validate,
                    y_validate, data_parser)
            learning_curve[fold]['sample_all'][size] = {'val': val_results}

            # Sample from the crRNAs, train, and then run
            # predictor.test_with_keras() on the validation data for this
            # fold to get the validation results
            # Above, size is the total number of data points; here, it is
            # the number of crRNAs
            size = sample_sizes_crrnas[size_i]
            if size < 35:
                # There are too few crRNAs sampled at this size
                #
                # This can cause errors downstream -- for example, if the
                # number of inner splits during the hyperparameter search
                # cross-validation is greater than the number of samples
                #
                # It can also cause an error in which there is no validation
                # data during a cross-validation of the hyperparameter
                # search; this happens because the inner split avoids overlap
                # between the validation data and the training data (removing
                # points from the validation data to achieve this). When
                # there is no validation data, Keras raises 'ValueError:
                # Empty training data.'
                #
                # So skip this sampling
                print(('Skipping crRNA sampling because there are too '
                    'few sampled crRNAs ({})').format(size))
                continue

            print('Sampling {} crRNAs'.format(size))
            x_train_sample, y_train_sample = sample_over_crrnas(
                    size, x_train_for_train, y_train_for_train)

            # Perform hyperparameter search; do this because hyperparameters
            # may vary based on sampling size
            hyperparams, _, _ = predictor_hyperparam_search.search_for_hyperparams(
                    x_train_sample, y_train_sample, 'random', regression,
                    context_nt, num_splits=num_inner_splits,
                    num_random_samples=50, dp=data_parser)

            # Split the training data ({x,y}_train_sample), as explained above
            train_split_iter = data_parser.split(x_train_sample,
                    y_train_sample, num_splits=4, stratify_by_pos=True)
            (x_train_sample_for_train, y_train_sample_for_train,
                    x_train_sample_for_es,
                    y_train_sample_for_es) = next(train_split_iter)

            # Construct model
            model = predictor.construct_model(hyperparams,
                    x_train_sample.shape, regression,
                    compile_for_keras=True, y_train=y_train_sample)
            predictor.train_with_keras(
                    model, x_train_sample_for_train, y_train_sample_for_train,
                    x_train_sample_for_es, y_train_sample_for_es)
            val_results = predictor.test_with_keras(model, x_validate,
                    y_validate, data_parser)
            learning_curve[fold]['sample_crrnas'][size] = {'val': val_results}

        print('FINISHED SPLIT {} of {}'.format(fold+1, num_splits))
        fold += 1

    return learning_curve


def main(args):
    # Seed the predictor
    predictor.set_seed(args.seed)

    # Make data_parser module-wide so that the sampling functions can use it
    global data_parser

    # Read the data
    # Do not use a validation set; read the test set if desired, but
    # do not use it for anything
    train_split_frac = 1.0 - args.test_split_frac
    if args.dataset == 'cas13':
        parser_class = parse_data.Cas13ActivityParser
        subset = args.cas13_subset
        if args.cas13_classify:
            regression = False
        else:
            regression = True
    data_parser = parser_class(
            subset=subset,
            context_nt=args.context_nt,
            split=(train_split_frac, 0, args.test_split_frac),
            shuffle_seed=args.seed,
            stratify_by_pos=True)
    if args.dataset == 'cas13':
        classify_activity = args.cas13_classify
        regress_on_all = args.cas13_regress_on_all
        regress_only_on_active = args.cas13_regress_only_on_active
        data_parser.set_activity_mode(
                classify_activity, regress_on_all, regress_only_on_active)
        if args.cas13_normalize_crrna_activity:
            data_parser.set_normalize_crrna_activity()
    data_parser.read()
    parse_data._split_parser = data_parser

    x, y = data_parser.train_set()
    x_test, y_test = data_parser.test_set()

    # Compute the learning curve
    learning_curve = compute_learning_curve(x, y,
            regression, args.context_nt, data_parser,
            num_splits=args.num_splits, num_sizes=args.num_sizes,
            outer_splits_to_run=args.outer_splits_to_run)

    # Write the learning curve to a TSV file
    with gzip.open(args.write_tsv, 'wt') as fw:
        def write_row(row):
            fw.write('\t'.join(str(c) for c in row) + '\n')

        # Write header
        header = ['fold', 'sampling_approach', 'size', 'dataset', 'metric',
                'value']
        write_row(header)

        for fold in learning_curve.keys():
            for sampling_approach in learning_curve[fold].keys():
                for size in learning_curve[fold][sampling_approach].keys():
                    for dataset in learning_curve[fold][sampling_approach][size].keys():
                        for metric in learning_curve[fold][sampling_approach][size][dataset].keys():
                            value = learning_curve[fold][sampling_approach][size][dataset][metric]
                            write_row([fold, sampling_approach, size,
                                dataset, metric, value])
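
# The TSV written by main() is long-format: one row per (fold, sampling
# approach, size, dataset, metric) combination. For example (metric names
# and values here are illustrative only):
#   fold  sampling_approach  size  dataset  metric  value
#   0     sample_all         150   val      mse     0.42
#   0     sample_crrnas      40    val      mse     0.47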


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--write-tsv',
            required=True,
            help=("Path to .tsv.gz file to which to write learning curve "
                  "results"))
    parser.add_argument('--dataset',
            choices=['cas13'],
            default='cas13',
            help=("Dataset to use."))
    parser.add_argument('--cas13-subset',
            choices=['exp', 'pos', 'neg', 'exp-and-pos'],
            help=("Use a subset of the Cas13 data. See the parse_data module "
                  "for descriptions of the subsets. To use all data, do not "
                  "set."))
    parser.add_argument('--cas13-classify',
            action='store_true',
            help=("If set, only classify Cas13 activity into inactive/active"))
    parser.add_argument('--cas13-regress-on-all',
            action='store_true',
            help=("If set, perform regression for Cas13 data on all data "
                  "(this can be reduced using --cas13-subset)"))
    parser.add_argument('--cas13-regress-only-on-active',
            action='store_true',
            help=("If set, perform regression for Cas13 data only on the "
                  "active class"))
    parser.add_argument('--cas13-normalize-crrna-activity',
            action='store_true',
            help=("If set, normalize the activity of each crRNA (guide) "
                  "across its targets to have mean 0 and stdev 1; this means "
                  "prediction is performed based on target differences (e.g., "
                  "mismatches) rather than the inherent sequence of the "
                  "crRNA"))
    parser.add_argument('--context-nt',
            type=int,
            default=10,
            help=("nt of target sequence context to include alongside each "
                  "guide"))
    parser.add_argument('--num-splits',
            type=int,
            default=5,
            help=("Number of splits of the data to make (i.e., k in k-fold); "
                  "at each size, this yields num_splits different train/"
                  "validation metrics based on different splits of the data"))
    parser.add_argument('--num-sizes',
            type=int,
            default=10,
            help=("Number of different sizes of the training data to compute "
                  "train/validation metrics for"))
    parser.add_argument('--outer-splits-to-run',
            nargs='+',
            type=int,
            help=("If set, only run the given outer splits (0-based). If "
                  "not set, run them all."))
    parser.add_argument('--test-split-frac',
            type=float,
            default=0.3,
            help=("Fraction of the dataset to leave out (completely ignore) "
                  "in this analysis"))
    parser.add_argument('--seed',
            type=int,
            default=1,
            help=("Random seed"))
    args = parser.parse_args()

    # Print the arguments provided
    print(args)

    main(args)
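
# Example invocation (illustrative; the output path and option values are
# placeholders, and all flags are defined above):
#   python predictor_learning_curve.py \
#       --cas13-subset exp-and-pos --cas13-regress-only-on-active \
#       --context-nt 10 --num-splits 5 --num-sizes 10 --seed 1 \
#       --write-tsv out/learning-curve.tsv.gz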