-
Notifications
You must be signed in to change notification settings - Fork 11
/
31-plotting_sample_size_biological_context_coverage.Rmd
461 lines (379 loc) · 16.5 KB
/
31-plotting_sample_size_biological_context_coverage.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
---
title: "Plotting: sample size and biological context pathway coverage, number
of latent variables"
output:
html_notebook:
toc: true
toc_float: true
---
**J. Taroni 2018**
In this notebook, we will be plotting:
* The number of latent variables for models trained on 1) different biological
contexts 2) random subsampling (various sample sizes) and the 3) full recount2
model (MultiPLIER)
* The pathway coverage and proportion of the latent variables associated with
pathways from the same three conditions
## Set up
```{r}
# magrittr pipe
`%>%` <- dplyr::`%>%`
```
We'll use `ggplot2` for plotting.
```{r}
library(ggplot2)
```
#### Custom functions
```{r}
# custom ggplot2 theme
custom_theme <- function() {
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45,
hjust = 1),
legend.position = "none",
plot.title = ggplot2::element_text(hjust = 0.5),
axis.title.x = ggplot2::element_blank(),
text = ggplot2::element_text(size = 15))
}
```
```{r}
# this expects that data.frames will already be cleaned and re-ordered!
# all data.frames must have the same column for use as the y.variable
# (e.g., matching y.variable.name)
ThreePlotsWrapper <- function(size.df, context.df, recount2.df,
y.variable.name, y.label, y.limits = "c(0, 1)") {
# Given results for sample size, biological context, and MultiPLIER make
# plots for the three conditions (using custom_theme from above)
#
# Args:
# size.df: data.frame with the sample size results, requires a column named
# "sample_size"
# context.df: data.frame with the biological context results, requires a
# column named "biological_context"
# recount2.df: data.frame with multiplier results, requires a column named
# "condition"
# y.variable.name: the column name, shared by all three data.frames, that
# will be used as the y variable in the plots
# y.label: what should the y-axis label be?
# y.limits: passed to ggplot2::ylim, should follow the format
# "c(<LOWER LIMIT>, <UPPER LIMIT>)"
#
# Returns:
# a list of the follow plots: size, context, recount2
# standardized geom_boxplot + geom_jitter plot for sample size and biological
# context plots
BoxplotJitter <- function(df, x.var, y.var, y.lim){
ggplot(data = df, aes_string(x = x.var,
y = y.var,
group = x.var)) +
# will be using jitter so including an outlier shape in the boxplot could
# confuse things a bit
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 0.5, width = 0.3) +
custom_theme() +
ylim(eval(parse(text = y.lim)))
}
size.plot <- size.df %>%
BoxplotJitter(x.var = "sample_size",
y.var = y.variable.name,
y.lim = y.limits) +
# only the sample size plot which will be on the left in multipanel figures
# will have a y-axis label
labs(y = y.label, title = "Sample Size")
context.plot <- context.df %>%
BoxplotJitter(x.var = "biological_context",
y.var = y.variable.name,
y.lim = y.limits) +
labs(y = NULL, title = "Biological Context")
# the multiplier plot is a single point. the custom theme is shared.
recount2.plot <- recount2.df %>%
ggplot(aes_string(x = "condition", y = y.variable.name)) +
geom_point(fill = "#0000FF", shape = 23, size = 4) +
custom_theme() +
ylim(eval(parse(text = y.limits))) +
labs(y = NULL, title = "MultiPLIER")
# return a list of the three plots
return(list("size" = size.plot,
"context" = context.plot,
"recount2" = recount2.plot))
}
```
```{r}
MultiPanelWrapper <- function(size.plot, context.plot, recount2.plot,
plot.title, output.file) {
# given the plots for the sample size experiment, the biological context
# experiment and the full MultiPLIER model, respectively, make a multipanel
# plot titled with the plot.title argument and saved as an 11"W x 5"H plot
# in the location specified by output.file
# for the three conditions, make a 3 column multipanel plot
panels <- cowplot::plot_grid(size.plot, context.plot, recount2.plot,
align = "h",
# the size plot will have the y-axis label
# and the MultiPLIER panel should be quite s
# small
rel_widths = c(1.125, 1, 0.375),
ncol = 3)
# the title "panel"
p.title <- cowplot::ggdraw() +
cowplot::draw_label(plot.title, fontface = "bold",
size = 20)
# combine to get the final plot
final.plot <- cowplot::plot_grid(p.title, panels, ncol = 1,
rel_heights = c(0.1, 1))
# save 11"W x 5"H file (the type of file is determined by the file extension
# provided in output.file)
ggsave(output.file, plot = final.plot, width = 11, height = 5)
}
```
#### Directory setup
```{r}
# plot and result directory setup for this notebook
plot.dir <- file.path("plots", "31")
dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE)
results.dir <- file.path("results", "31")
dir.create(results.dir, recursive = TRUE, showWarnings = FALSE)
```
## Read in data
For both the biological context and sample size evaluations, the results we
will be plotting are from `30-evaluate_sample_size_and_biological_context`.
### Biological context
```{r}
# pathway coverage and proportion of LVs associated with pathways
context.coverage.file <- file.path("results", "30",
"biological_context_pathway_coverage.tsv")
# we'll want to reorder the biological contexts by sample size
context.levels <- c("blood", "cancer", "tissue", "cell line", "other tissues")
context.coverage.df <- readr::read_tsv(context.coverage.file) %>%
# clean up the context information for plotting
dplyr::mutate(biological_context = gsub("_", " ", biological_context)) %>%
# reorder by the sample size (low -> high)
dplyr::mutate(biological_context = factor(biological_context,
levels = context.levels))
# number of latent variables
context.num.file <- file.path("results", "30",
"biological_context_number_of_lvs.tsv")
context.num.df <- readr::read_tsv(context.num.file) %>%
# clean up the context information for plotting
dplyr::mutate(biological_context = gsub("_", " ", biological_context)) %>%
# reorder by the sample size (low -> high)
dplyr::mutate(biological_context = factor(biological_context,
levels = context.levels))
```
### Sample size
```{r}
# pathway coverage and proportion of LVs associated with pathways
size.coverage.file <- file.path("results", "30",
"subsampled_pathway_coverage.tsv")
# order by sample size
size.levels <- c("500", "1000", "2000", "4000", "8000", "16000", "32000")
# for plotting, we want the sample size as a factor (otherwise it's hard to
# see what is happening on the lower end of things) but to order by the integer
# value
size.coverage.df <- readr::read_tsv(size.coverage.file) %>%
dplyr::mutate(sample_size = factor(sample_size, levels = size.levels))
# Number of latent variables
size.num.file <- file.path("results", "30",
"subsampled_number_of_lvs.tsv")
size.num.df <- readr::read_tsv(size.num.file) %>%
dplyr::mutate(sample_size = factor(sample_size, levels = size.levels))
```
### MultiPLIER
For the full recount2 bits, we'll use the same values that are in
`figure_notebooks/subsampling_figures`.
(The full model was trained on ~37K samples, because we subset recount2 to
include only those samples with metadata.)
```{r}
# number of latent variables in the MultiPLIER model
recount2.num.lvs <- 987
# pathway coverage
recount2.coverage <- 0.4187898
# proportion of the latent variables associated with a pathway
recount2.lv.prop <- 0.2016211
```
Make into `data.frames` for plotting purposes.
We'll always use "condition" as one of the column names (see
`ThreePlotsWrapper` above).
```{r}
# proportion of LVs associated with pathways
recount2.prop.df <- data.frame(value = recount2.lv.prop,
condition = "full recount2")
# pathway coverage
recount2.coverage.df <- data.frame(value = recount2.coverage,
condition = "full recount2")
# number of latent variables
recount2.num.df <-
data.frame(number_of_latent_variables = as.integer(recount2.num.lvs),
condition = "full recount2")
```
## Plotting
### Number of latent variables
```{r}
num.plots <- ThreePlotsWrapper(size.df = size.num.df,
context.df = context.num.df,
recount2.df = recount2.num.df,
y.variable.name = "number_of_latent_variables",
y.label = "number of latent variables",
y.limits = "c(0, 1000)")
```
Save to file
```{r}
MultiPanelWrapper(size.plot = num.plots$size,
context.plot = num.plots$context,
recount2.plot = num.plots$recount2,
plot.title = "Number of Latent Variables",
output.file = file.path(plot.dir,
"number_of_latent_variables.pdf"))
```
### Pathway coverage
The sample size and biological context `data.frames` contain the information
about pathway coverage _and_ the proportion of LVs associated with a pathway,
so we'll need to filter them to the correct metric.
```{r}
coverage.plots <-
ThreePlotsWrapper(size.df = dplyr::filter(size.coverage.df,
metric == "pathway coverage"),
context.df = dplyr::filter(context.coverage.df,
metric == "pathway coverage"),
recount2.df = recount2.coverage.df,
y.variable.name = "value",
y.label = "proportion of input pathways")
```
```{r}
MultiPanelWrapper(size.plot = coverage.plots$size,
context.plot = coverage.plots$context,
recount2.plot = coverage.plots$recount2,
plot.title = "Pathway Coverage",
output.file = file.path(plot.dir, "pathway_coverage.pdf"))
```
### Latent variables significantly associated with pathways
```{r}
# for line length/style purposes
metric.to.use <- "LV associated with pathways"
prop.plots <-
ThreePlotsWrapper(size.df = dplyr::filter(size.coverage.df,
metric == metric.to.use),
context.df = dplyr::filter(context.coverage.df,
metric == metric.to.use),
recount2.df = recount2.coverage.df,
y.variable.name = "value",
y.label = "proportion of latent variables")
```
```{r}
MultiPanelWrapper(size.plot = prop.plots$size,
context.plot = prop.plots$context,
recount2.plot = prop.plots$recount2,
plot.title = "LVs significantly associated with pathways",
output.file = file.path(plot.dir, "lv_proportion.pdf"))
```
### Following up on the proportion of LVs associated with pathways results
If we look at the `prop.plots`
```{r}
prop.plots$size
prop.plots$context
prop.plots$recount2
```
We can see in the sample size plot that, generally speaking, the larger the
sample size, the lower the proportion of latent variables that are associated
with at least one pathway.
`PLIER::PLIER` has a parameter, `frac`, that determines the fraction of latent
variables that meets that criterion.
(`frac = 0.7` by default in the version of PLIER we have on this Docker
container; we did not specify `frac`.)
This parameter is used to set `L3`, which is the penalty on `U` that controls
sparsity ([Mao, et al. _bioRxiv_. 2017.](http://dx.doi.org/10.1101/116061)).
Based on my understanding, this controls the proportion of positive entries in
`U` (i.e., the number of gene sets that an LV has some association with), but
it _does not_ guarantee that all of these associations will be significant
([Mao, et al. _bioRxiv_. 2017.](http://dx.doi.org/10.1101/116061)) and
the cutoff we happen to be using (`FDR < 0.05`) is not taken into account.
For reference, this is the sample size information for these different training
sets:
| Biological condition | Sample size |
|:-----------------------|:----------------|
| blood | 3862 |
| cancer | 8807 |
| tissue | 12396 |
| cell line | 14532 |
| other tissues (not blood) | 32984 |
Once we take the biological context experiments into account, it gets a bit
more interesting:
* The `blood` training set has ~4000 samples, but models trained on blood have a
much higher proportion of latent variables significantly associated with
pathways than the models trained on 4000 randomly selected samples.
We'd expect that there'd be less "technical noise" and more "biological signal"
in the `blood` training set.
It's also likely to be a function of the pathways under consideration.
Specifically, we supply immune cell-related gene sets to the models, and
we expect `blood` samples to be _particularly_ relevant for those gene sets.
* It looks like the _more latent variables_, the _lower proportion of LVs
significantly associated with a pathway_ based on the sample size results alone.
And it's certainly the case that the more LVs -> the more pathways that are
captured by the model (higher pathway coverage).
However, `tissue` models have higher number of LVs and more pathway coverage
than `cell line` models but they have approximately the same proportion of LVs
significantly associated with a pathway.
These values are also perhaps a bit lower than we would expect based on sample
size alone (looking at `16000` from sample size plot).
## Check `U` sparsity
To follow up on the observations above, let's check that, _in practice_, the
fraction of the LVs that have at least one association with a pathway should be
**~0.7**.
First, removing everything from the workspace.
```{r}
rm(list = ls())
```
#### Custom functions just for this
```{r}
# this is intended to be used with lists of models output from
# scripts/subsampling_PLIER.R
GetSparsityList <- function(list.of.models) {
# we assume that the list of models has names
# these sparsity calculations are different from what we've done before
CheckUSparsity <- function(plier.model) {
u.matrix <- plier.model$U
# what proportion of values are positive?
total.positive <- sum(u.matrix > 0) / (dim(u.matrix)[1] * dim(u.matrix)[2])
# what proportion of columns have at least 1 positive value (association)?
col.wise <- sum(apply(u.matrix, 2, function(x) any(x > 0))) / ncol(u.matrix)
return(list("overall" = total.positive,
"column_wise" = col.wise))
}
sparsity.list <- lapply(list.of.models,
function(x) CheckUSparsity(x$PLIER))
}
# we'll use this twice -- once for biological context and once for sample size
# we assume that the files.vector is named
GetResults <- function(files.vector) {
lapply(files.vector,
function(x) {
# for each file, read in the list of models
model.list <- readRDS(x)
# get the sparsity list for the current list of models
GetSparsityList(model.list)
})
}
```
#### Files
```{r}
models.dir <- "models"
# sample size
size.model.files <- list.files(models.dir, pattern = "subsampled",
full.names = TRUE)
names(size.model.files) <- sub(".RDS", "", sub(".*[_]", "", size.model.files))
# biological context
context.model.files <- list.files(models.dir, pattern = "accessions",
full.names = TRUE)
names(context.model.files) <-
stringr::str_match(context.model.files, "recount2_(.*?)_accessions")[, 2]
```
#### Check sparsity
```{r}
size.results <- GetResults(size.model.files)
size.df <- reshape2::melt(size.results)
summary(dplyr::filter(size.df, L3 == "column_wise")$value)
```
```{r}
context.results <- GetResults(context.model.files)
context.df <- reshape2::melt(context.results)
summary(dplyr::filter(context.df, L3 == "column_wise")$value)
```
These results are consistent with our expectations.