J. Taroni 2018
In this notebook, we will be plotting:
The number of latent variables for models trained on 1) different biological contexts 2) random subsampling (various sample sizes) and the 3) full recount2 model (MultiPLIER)
The pathway coverage and proportion of the latent variables associated with pathways from the same three conditions
# magrittr pipe
`%>%` <- dplyr::`%>%`
We’ll use ggplot2
for plotting.
library(ggplot2)
# custom ggplot2 theme
custom_theme <- function() {
ggplot2::theme_bw() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45,
hjust = 1),
legend.position = "none",
plot.title = ggplot2::element_text(hjust = 0.5),
axis.title.x = ggplot2::element_blank(),
text = ggplot2::element_text(size = 15))
}
# this expects that data.frames will already be cleaned and re-ordered!
# all data.frames must have the same column for use as the y.variable
# (e.g., matching y.variable.name)
ThreePlotsWrapper <- function(size.df, context.df, recount2.df,
y.variable.name, y.label, y.limits = "c(0, 1)") {
# Given results for sample size, biological context, and MultiPLIER make
# plots for the three conditions (using custom_theme from above)
#
# Args:
# size.df: data.frame with the sample size results, requires a column named
# "sample_size"
# context.df: data.frame with the biological context results, requires a
# column named "biological_context"
# recount2.df: data.frame with multiplier results, requires a column named
# "condition"
# y.variable.name: the column name, shared by all three data.frames, that
# will be used as the y variable in the plots
# y.label: what should the y-axis label be?
# y.limits: passed to ggplot2::ylim, should follow the format
# "c(<LOWER LIMIT>, <UPPER LIMIT>)"
#
# Returns:
# a list of the follow plots: size, context, recount2
# standardized geom_boxplot + geom_jitter plot for sample size and biological
# context plots
BoxplotJitter <- function(df, x.var, y.var, y.lim){
ggplot(data = df, aes_string(x = x.var,
y = y.var,
group = x.var)) +
# will be using jitter so including an outlier shape in the boxplot could
# confuse things a bit
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = 0.5, width = 0.3) +
custom_theme() +
ylim(eval(parse(text = y.lim)))
}
size.plot <- size.df %>%
BoxplotJitter(x.var = "sample_size",
y.var = y.variable.name,
y.lim = y.limits) +
# only the sample size plot which will be on the left in multipanel figures
# will have a y-axis label
labs(y = y.label, title = "Sample Size")
context.plot <- context.df %>%
BoxplotJitter(x.var = "biological_context",
y.var = y.variable.name,
y.lim = y.limits) +
labs(y = NULL, title = "Biological Context")
# the multiplier plot is a single point. the custom theme is shared.
recount2.plot <- recount2.df %>%
ggplot(aes_string(x = "condition", y = y.variable.name)) +
geom_point(fill = "#0000FF", shape = 23, size = 4) +
custom_theme() +
ylim(eval(parse(text = y.limits))) +
labs(y = NULL, title = "MultiPLIER")
# return a list of the three plots
return(list("size" = size.plot,
"context" = context.plot,
"recount2" = recount2.plot))
}
MultiPanelWrapper <- function(size.plot, context.plot, recount2.plot,
plot.title, output.file) {
# given the plots for the sample size experiment, the biological context
# experiment and the full MultiPLIER model, respectively, make a multipanel
# plot titled with the plot.title argument and saved as an 11"W x 5"H plot
# in the location specified by output.file
# for the three conditions, make a 3 column multipanel plot
panels <- cowplot::plot_grid(size.plot, context.plot, recount2.plot,
align = "h",
# the size plot will have the y-axis label
# and the MultiPLIER panel should be quite s
# small
rel_widths = c(1.125, 1, 0.375),
ncol = 3)
# the title "panel"
p.title <- cowplot::ggdraw() +
cowplot::draw_label(plot.title, fontface = "bold",
size = 20)
# combine to get the final plot
final.plot <- cowplot::plot_grid(p.title, panels, ncol = 1,
rel_heights = c(0.1, 1))
# save 11"W x 5"H file (the type of file is determined by the file extension
# provided in output.file)
ggsave(output.file, plot = final.plot, width = 11, height = 5)
}
# plot and result directory setup for this notebook
plot.dir <- file.path("plots", "31")
dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE)
results.dir <- file.path("results", "31")
dir.create(results.dir, recursive = TRUE, showWarnings = FALSE)
For both the biological context and sample size evaluations, the results we will be plotting are from 30-evaluate_sample_size_and_biological_context
.
# pathway coverage and proportion of LVs associated with pathways
context.coverage.file <- file.path("results", "30",
"biological_context_pathway_coverage.tsv")
# we'll want to reorder the biological contexts by sample size
context.levels <- c("blood", "cancer", "tissue", "cell line", "other tissues")
context.coverage.df <- readr::read_tsv(context.coverage.file) %>%
# clean up the context information for plotting
dplyr::mutate(biological_context = gsub("_", " ", biological_context)) %>%
# reorder by the sample size (low -> high)
dplyr::mutate(biological_context = factor(biological_context,
levels = context.levels))
Parsed with column specification:
cols(
value = col_double(),
metric = col_character(),
seed = col_integer(),
biological_context = col_character()
)
# number of latent variables
context.num.file <- file.path("results", "30",
"biological_context_number_of_lvs.tsv")
context.num.df <- readr::read_tsv(context.num.file) %>%
# clean up the context information for plotting
dplyr::mutate(biological_context = gsub("_", " ", biological_context)) %>%
# reorder by the sample size (low -> high)
dplyr::mutate(biological_context = factor(biological_context,
levels = context.levels))
Parsed with column specification:
cols(
number_of_latent_variables = col_integer(),
seed = col_integer(),
biological_context = col_character()
)
# pathway coverage and proportion of LVs associated with pathways
size.coverage.file <- file.path("results", "30",
"subsampled_pathway_coverage.tsv")
# order by sample size
size.levels <- c("500", "1000", "2000", "4000", "8000", "16000", "32000")
# for plotting, we want the sample size as a factor (otherwise it's hard to
# see what is happening on the lower end of things) but to order by the integer
# value
size.coverage.df <- readr::read_tsv(size.coverage.file) %>%
dplyr::mutate(sample_size = factor(sample_size, levels = size.levels))
Parsed with column specification:
cols(
value = col_double(),
metric = col_character(),
seed = col_integer(),
sample_size = col_integer()
)
# Number of latent variables
size.num.file <- file.path("results", "30",
"subsampled_number_of_lvs.tsv")
size.num.df <- readr::read_tsv(size.num.file) %>%
dplyr::mutate(sample_size = factor(sample_size, levels = size.levels))
Parsed with column specification:
cols(
number_of_latent_variables = col_integer(),
seed = col_integer(),
sample_size = col_integer()
)
For the full recount2 bits, we’ll use the same values that are in figure_notebooks/subsampling_figures
. (The full model was trained on ~37K samples, because we subset recount2 to include only those samples with metadata.)
# number of latent variables in the MultiPLIER model
recount2.num.lvs <- 987
# pathway coverage
recount2.coverage <- 0.4187898
# proportion of the latent variables associated with a pathway
recount2.lv.prop <- 0.2016211
Make into data.frames
for plotting purposes. We’ll always use “condition” as one of the column names (see ThreePlotsWrapper
above).
# proportion of LVs associated with pathways
recount2.prop.df <- data.frame(value = recount2.lv.prop,
condition = "full recount2")
# pathway coverage
recount2.coverage.df <- data.frame(value = recount2.coverage,
condition = "full recount2")
# number of latent variables
recount2.num.df <-
data.frame(number_of_latent_variables = as.integer(recount2.num.lvs),
condition = "full recount2")
num.plots <- ThreePlotsWrapper(size.df = size.num.df,
context.df = context.num.df,
recount2.df = recount2.num.df,
y.variable.name = "number_of_latent_variables",
y.label = "number of latent variables",
y.limits = "c(0, 1000)")
Save to file
MultiPanelWrapper(size.plot = num.plots$size,
context.plot = num.plots$context,
recount2.plot = num.plots$recount2,
plot.title = "Number of Latent Variables",
output.file = file.path(plot.dir,
"number_of_latent_variables.pdf"))
The sample size and biological context data.frames
contain the information about pathway coverage and the proportion of LVs associated with a pathway, so we’ll need to filter them to the correct metric.
coverage.plots <-
ThreePlotsWrapper(size.df = dplyr::filter(size.coverage.df,
metric == "pathway coverage"),
context.df = dplyr::filter(context.coverage.df,
metric == "pathway coverage"),
recount2.df = recount2.coverage.df,
y.variable.name = "value",
y.label = "proportion of input pathways")
MultiPanelWrapper(size.plot = coverage.plots$size,
context.plot = coverage.plots$context,
recount2.plot = coverage.plots$recount2,
plot.title = "Pathway Coverage",
output.file = file.path(plot.dir, "pathway_coverage.pdf"))
# for line length/style purposes
metric.to.use <- "LV associated with pathways"
prop.plots <-
ThreePlotsWrapper(size.df = dplyr::filter(size.coverage.df,
metric == metric.to.use),
context.df = dplyr::filter(context.coverage.df,
metric == metric.to.use),
recount2.df = recount2.coverage.df,
y.variable.name = "value",
y.label = "proportion of latent variables")
MultiPanelWrapper(size.plot = prop.plots$size,
context.plot = prop.plots$context,
recount2.plot = prop.plots$recount2,
plot.title = "LVs significantly associated with pathways",
output.file = file.path(plot.dir, "lv_proportion.pdf"))
If we look at the prop.plots
prop.plots$size
prop.plots$context
prop.plots$recount2
We can see in the sample size plot that, generally speaking, the larger the sample size, the lower the proportion of latent variables that are associated with at least one pathway.
PLIER::PLIER
has a parameter, frac
, that determines the fraction of latent variables that meets that criterion. (frac = 0.7
by default in the version of PLIER we have on this Docker container; we did not specify frac
.)
This parameter is used to set L3
, which is the penalty on U
that controls sparsity (Mao, et al. bioRxiv. 2017.). Based on my understanding, this controls the proportion of positive entries in U
(i.e., the number of gene sets that an LV has some association with), but it does not guarantee that all of these associations will be significant (Mao, et al. bioRxiv. 2017.) and the cutoff we happen to be using (FDR < 0.05
) is not taken into account.
For reference, this is the sample size information for these different training sets:
Biological condition | Sample size |
---|---|
blood | 3862 |
cancer | 8807 |
tissue | 12396 |
cell line | 14532 |
other tissues (not blood) | 32984 |
Once we take the biological context experiments into account, it gets a bit more interesting:
blood
training set has ~4000 samples, but models trained on blood have a much higher proportion of latent variables significantly associated with pathways than the models trained on 4000 randomly selected samples. We’d expect that there’d be less “technical noise” and more “biological signal” in the blood
training set. It’s also likely to be a function of the pathways under consideration. Specifically, we supply immune cell-related gene sets to the models, and we expect blood
samples to be particularly relevant for those gene sets.tissue
models have higher number of LVs and more pathway coverage than cell line
models but they have approximately the same proportion of LVs significantly associated with a pathway. These values are also perhaps a bit lower than we would expect based on sample size alone (looking at 16000
from sample size plot).U
sparsityTo follow up on the observations above, let’s check that, in practice, the fraction of the LVs that have at least one association with a pathway should be ~0.7. First, removing everything from the workspace.
rm(list = ls())
# this is intended to be used with lists of models output from
# scripts/subsampling_PLIER.R
GetSparsityList <- function(list.of.models) {
# we assume that the list of models has names
# these sparsity calculations are different from what we've done before
CheckUSparsity <- function(plier.model) {
u.matrix <- plier.model$U
# what proportion of values are positive?
total.positive <- sum(u.matrix > 0) / (dim(u.matrix)[1] * dim(u.matrix)[2])
# what proportion of columns have at least 1 positive value (association)?
col.wise <- sum(apply(u.matrix, 2, function(x) any(x > 0))) / ncol(u.matrix)
return(list("overall" = total.positive,
"column_wise" = col.wise))
}
sparsity.list <- lapply(list.of.models,
function(x) CheckUSparsity(x$PLIER))
}
# we'll use this twice -- once for biological context and once for sample size
# we assume that the files.vector is named
GetResults <- function(files.vector) {
lapply(files.vector,
function(x) {
# for each file, read in the list of models
model.list <- readRDS(x)
# get the sparsity list for the current list of models
GetSparsityList(model.list)
})
}
models.dir <- "models"
# sample size
size.model.files <- list.files(models.dir, pattern = "subsampled",
full.names = TRUE)
names(size.model.files) <- sub(".RDS", "", sub(".*[_]", "", size.model.files))
# biological context
context.model.files <- list.files(models.dir, pattern = "accessions",
full.names = TRUE)
names(context.model.files) <-
stringr::str_match(context.model.files, "recount2_(.*?)_accessions")[, 2]
size.results <- GetResults(size.model.files)
size.df <- reshape2::melt(size.results)
summary(dplyr::filter(size.df, L3 == "column_wise")$value)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.6667 0.6983 0.7010 0.7019 0.7093 0.7273
context.results <- GetResults(context.model.files)
context.df <- reshape2::melt(context.results)
summary(dplyr::filter(context.df, L3 == "column_wise")$value)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.6824 0.6953 0.6989 0.6978 0.7016 0.7072
These results are consistent with our expectations.