J. Taroni 2018
Part of the motivation behind using PLIER (particularly in a systemic lupus erythematosus whole blood setting) is that we can automatically extract information about cell type proportion variation. One of the datasets included in the SLE WB compendium comes from Banchereau, et al.. The original publication contained some interesting findings about neutrophils and plasmablasts. In this notebook, we explore whether latent variables from the PLIER model trained on the SLE WB compendium associated with these cell type (gene sets) show similar trends to what was found by Banchereau, et al. By extension, this also tells us whether the cell type-specific patterns are retained following all the processing we did in greenelab/rheum-plier-data/sle-wb
.
`%>%` <- dplyr::`%>%`
library(PLIER)
# custom functions
source(file.path("util", "plier_util.R"))
# plot and result directory setup for this notebook
plot.dir <- file.path("plots", "06")
dir.create(plot.dir, recursive = TRUE, showWarnings = FALSE)
results.dir <- file.path("results", "06")
dir.create(results.dir, recursive = TRUE, showWarnings = FALSE)
# SLE WB model trained in notebook 05
model.file <- file.path("results", "05", "SLE-WB_PLIER_model.RDS")
plier.results <- readRDS(model.file)
# summary data.frame (pathway to LV info like AUC, FDR)
sle.summary <- plier.results$summary
# LVs
b.matrix <- plier.results$B
# phenotype information
# read in sample data relationship file
file.65391 <- file.path("data", "sample_info", "E-GEOD-65391.sdrf.txt")
e.65391.sdrf <- readr::read_tsv(file.65391)
Duplicated column names deduplicated: 'Term Source REF' => 'Term Source REF_1' [31], 'Term Accession Number' => 'Term Accession Number_1' [32], 'Characteristics [nephritis_class]' => 'Characteristics [nephritis_class]_1' [62], 'Term Source REF' => 'Term Source REF_2' [70], 'Term Accession Number' => 'Term Accession Number_2' [71], 'Term Source REF' => 'Term Source REF_3' [73], 'Term Accession Number' => 'Term Accession Number_3' [74], 'Term Source REF' => 'Term Source REF_4' [82], 'Term Accession Number' => 'Term Accession Number_4' [83], 'Term Source REF' => 'Term Source REF_5' [89], 'Term Accession Number' => 'Term Accession Number_5' [90], 'Term Source REF' => 'Term Source REF_6' [92], 'Term Accession Number' => 'Term Accession Number_6' [93], 'Term Source REF' => 'Term Source REF_7' [108], 'Protocol REF' => 'Protocol REF_1' [109], 'Term Source REF' => 'Term Source REF_8' [110], 'Protocol REF' => 'Protocol REF_2' [111], 'Term Source REF' => 'Term Source REF_9' [112], 'Protocol REF' => 'Protocol REF_3' [115], 'Term Source REF' => 'Term Source REF_10' [116], 'Protocol REF' => 'Protocol REF_4' [119], 'Term Source REF' => 'Term Source REF_11' [120], 'Term Source REF' => 'Term Source REF_12' [123], 'Protocol REF' => 'Protocol REF_5' [125], 'Term Source REF' => 'Term Source REF_13' [126], 'Protocol REF' => 'Protocol REF_6' [127], 'Term Source REF' => 'Term Source REF_14' [128], 'Term Source REF' => 'Term Source REF_15' [155], 'Term Accession Number' => 'Term Accession Number_7' [156], 'Term Source REF' => 'Term Source REF_16' [199], 'Term Accession Number' => 'Term Accession Number_8' [200], 'Term Source REF' => 'Term Source REF_17' [206], 'Term Accession Number' => 'Term Accession Number_9' [207], 'Term Source REF' => 'Term Source REF_18' [209], 'Term Accession Number' => 'Term Accession Number_10' [210]Parsed with column specification:
cols(
.default = col_character(),
`Characteristics [age]` = col_double(),
`Characteristics [batch]` = col_integer(),
`Characteristics [batch_replicate]` = col_logical(),
`Characteristics [visit_count]` = col_integer(),
`FactorValue [age]` = col_double(),
`FactorValue [batch]` = col_integer(),
`FactorValue [batch_replicate]` = col_logical(),
`FactorValue [visit_count]` = col_integer()
)
See spec(...) for full column specifications.
# use Sample instead of Source Name
colnames(e.65391.sdrf)[1] <- c("Sample")
# get rid of trailing " 1" so they match the sample names from B matrix
e.65391.sdrf$Sample <- gsub(" 1", "", e.65391.sdrf$Sample)
sle.summary %>%
dplyr::filter(grepl("Neutrophil", pathway),
FDR < 0.05)
Based on these results, we select LV2
,LV27
, LV34
, and LV87
for further exploration. If these latent variables are also strongly associated with other pathways or cell types, that is not desirable. That could make intepretation more difficult.
sle.summary %>%
dplyr::filter(`LV index` %in% c(2, 27, 34, 87)) %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::arrange(`LV index`, desc(AUC))
Some notes on these results:
LV2
looks like it captures the myeloid lineage in a broad sense, as a monocyte gene set, SVM Monocytes
, and a neutrophil progenitor gene set, DMAP_GRAN2
(reported to be a neutrophilic metamyelocyte signature in the DMAP paper).
LV27
and LV34
also look like they may capture some information about monocytes because of their association with the DMAP_MONO2
gene set.
LV87
looks the most “neutrophil-specific” and therefore, the most desirable for this kind of analysis.
PLIER::plotU(plierRes = plier.results,
pval.cutoff = 1e-06,
indexCol = c(2, 27, 34, 87),
top = 10)
png(file.path(plot.dir, "SLE_WB_neutrophil_Uplot.png"),
res = 300, width = 7, height = 7, units = "in")
PLIER::plotU(plierRes = plier.results,
pval.cutoff = 1e-06,
indexCol = c(2, 27, 34, 87),
top = 10)
dev.off()
null device
1
neutrophil.count.df <- e.65391.sdrf[, c("Sample",
"Characteristics [neutrophil_count]")]
colnames(neutrophil.count.df)[2] <- c("Neutrophil.Count")
neutrophil.count.df <-
neutrophil.count.df %>%
dplyr::filter(Neutrophil.Count != "Data Not Available") %>%
dplyr::filter(Neutrophil.Count != "Not Applicable")
# combine with neutrophil latent variables
neutro.lv.df <- as.data.frame(cbind(colnames(b.matrix),
t(b.matrix[c(2, 27, 34, 87), ])))
colnames(neutro.lv.df) <- c("Sample", "LV2", "LV27", "LV34", "LV87")
neutro.df <- dplyr::inner_join(neutro.lv.df, neutrophil.count.df,
by = "Sample") %>%
dplyr::mutate(LV2 = as.numeric(as.character(LV2)),
LV27 = as.numeric(as.character(LV27)),
LV34 = as.numeric(as.character(LV34)),
LV87 = as.numeric(as.character(LV87)),
Neutrophil.Count = as.numeric(as.character(Neutrophil.Count)))
Column `Sample` joining factor and character vector, coercing into character vector
neutro.df
# write to file, will use to compare to results with recount2
count.file <- file.path(results.dir,
"Banchereau_count_SLE_model_neutrophil_LV.tsv")
readr::write_tsv(neutro.df, path = count.file)
# function for scatter plots, given lv (a string indicating the LV of interest)
# make a scatter plot where the LV is the x variable, neutrophil count is the
# y variable & fit a line with geom_smooth(method = "lm")
# also will annotate the plot with the supplied r-squared value (rsq arg)
# where the text is placed is automatically chosen from the x and y values
# needs to be used in this global environment
LVScatter <- function(lv, rsq) {
y.var <- "Neutrophil.Count"
# calculate where to put the r-squared value
x.range <- max(neutro.df[, lv]) - min(neutro.df[, lv])
x.coord <- min(neutro.df[, lv]) + (x.range * 0.15)
y.range <- max(neutro.df[, y.var]) - min(neutro.df[, y.var])
y.coord <- max(neutro.df[, y.var]) - (y.range * 0.15)
ggplot2::ggplot(neutro.df, ggplot2::aes_string(x = lv, y = y.var)) +
ggplot2::geom_point(alpha = 0.7) +
ggplot2::geom_smooth(method = "lm") +
ggplot2::theme_bw() +
ggplot2::labs(y = "Neutrophil Count") +
ggplot2::theme(legend.position = "none",
text = ggplot2::element_text(size = 15)) +
ggplot2::annotate("text", x = x.coord, y = y.coord,
label = paste("r-squared =", rsq))
}
summary(lm(neutro.df$Neutrophil.Count ~ neutro.df$LV2))
Call:
lm(formula = neutro.df$Neutrophil.Count ~ neutro.df$LV2)
Residuals:
Min 1Q Median 3Q Max
-4.3999 -1.4245 -0.4961 0.9463 13.9831
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.81000 0.08434 45.18 <2e-16 ***
neutro.df$LV2 8.43183 0.77393 10.89 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.246 on 851 degrees of freedom
Multiple R-squared: 0.1224, Adjusted R-squared: 0.1214
F-statistic: 118.7 on 1 and 851 DF, p-value: < 2.2e-16
LVScatter(lv = "LV2", rsq = 0.12)
summary(lm(neutro.df$Neutrophil.Count ~ neutro.df$LV27))
Call:
lm(formula = neutro.df$Neutrophil.Count ~ neutro.df$LV27)
Residuals:
Min 1Q Median 3Q Max
-5.0670 -1.0934 -0.1809 0.7934 12.2135
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.99216 0.06501 61.41 <2e-16 ***
neutro.df$LV27 4.66659 0.20285 23.00 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.882 on 851 degrees of freedom
Multiple R-squared: 0.3834, Adjusted R-squared: 0.3827
F-statistic: 529.2 on 1 and 851 DF, p-value: < 2.2e-16
LVScatter(lv = "LV27", rsq = 0.38)
summary(lm(neutro.df$Neutrophil.Count ~ neutro.df$LV34))
Call:
lm(formula = neutro.df$Neutrophil.Count ~ neutro.df$LV34)
Residuals:
Min 1Q Median 3Q Max
-4.9315 -1.2393 -0.2902 0.8375 11.4418
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.03189 0.06972 57.83 <2e-16 ***
neutro.df$LV34 4.11942 0.22162 18.59 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.022 on 851 degrees of freedom
Multiple R-squared: 0.2888, Adjusted R-squared: 0.2879
F-statistic: 345.5 on 1 and 851 DF, p-value: < 2.2e-16
LVScatter(lv = "LV34", rsq = 0.29)
summary(lm(neutro.df$Neutrophil.Count ~ neutro.df$LV87))
Call:
lm(formula = neutro.df$Neutrophil.Count ~ neutro.df$LV87)
Residuals:
Min 1Q Median 3Q Max
-3.9316 -1.2844 -0.3642 0.7306 12.5832
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.09067 0.06936 58.98 <2e-16 ***
neutro.df$LV87 3.37195 0.18090 18.64 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.02 on 851 degrees of freedom
Multiple R-squared: 0.2899, Adjusted R-squared: 0.2891
F-statistic: 347.5 on 1 and 851 DF, p-value: < 2.2e-16
LVScatter(lv = "LV87", rsq = 0.29)
When comparing the results to the recount2 model, we’ll pursue LV27
because it’s the best performing and LV87
because of its lack of overlap with monocyte signatures. So, we’ll save the plots for these two LVs.
plot.file <- file.path(plot.dir, "Banchereau_neutrophil_count_LV27_scatter.png")
ggplot2::ggsave(plot.file, plot = LVScatter("LV27", rsq = 0.38))
Saving 7 x 7 in image
plot.file <- file.path(plot.dir, "Banchereau_neutrophil_count_LV87_scatter.png")
ggplot2::ggsave(plot.file, plot = LVScatter("LV87", rsq = 0.29))
In Banchereau, et al., the authors demonstrated that plasmablast counts were different between patients stratified by 3 disease activity (DA) groups.
While there were no plasmablast gene sets given to PLIER during training, we did use plasma cell gene sets (isolated from PBMCs) (A plasmablast is a plasma cell precursor that is typically found in a germinal center.)
What LVs, if any, are significantly associated with the plasma cell gene sets?
sle.summary %>%
dplyr::filter(grepl("Plasma", pathway),
FDR < 0.05)
sle.summary %>%
dplyr::filter(`LV index` %in% c(52, 136),
FDR < 0.05) %>%
dplyr::arrange(desc(AUC))
PLIER::plotU(plierRes = plier.results,
pval.cutoff = 1e-06,
indexCol = c(52, 136),
top = 10)
png(file.path(plot.dir, "SLE_WB_plasma_cell_Uplot.png"),
res = 300, width = 7, height = 7, units = "in")
PLIER::plotU(plierRes = plier.results,
pval.cutoff = 1e-06,
indexCol = c(52, 136),
top = 10)
dev.off()
null device
1
Disease activity was defined based on SLEDAI (SLE Disease Activity Index, Bombardier, et al. 1992.). Banchereau, et al. defined the disease activity groups as follows:
Samples were categorized as DA1 (SLEDAI: 0–2), DA2 (SLEDAI: 3–7), or DA3 (SLEDAI > 7), based on SLEDAI distribution across the cohort.
# get DA information from the sample-data relationship file
da.group.df <- e.65391.sdrf[, c("Sample", "Characteristics [disease_activity]")]
colnames(da.group.df)[2] <- "Disease.Activity"
da.group.df <- da.group.df %>%
dplyr::filter(Disease.Activity != "Not Applicable")
# plasma cell LV
plasma.lv.df <- as.data.frame(cbind(colnames(b.matrix),
t(b.matrix[c(52, 136), ])))
colnames(plasma.lv.df) <- c("Sample", "LV52", "LV136")
# join
plasma.df <- dplyr::inner_join(plasma.lv.df, da.group.df) %>%
dplyr::mutate(Disease.Activity = dplyr::recode(Disease.Activity,
`1` = "DA1",
`2` = "DA2",
`3` = "DA3"),
Disease.Activity = factor(Disease.Activity),
LV52 = as.numeric(as.character(LV52)),
LV136 = as.numeric(as.character(LV136)))
Joining, by = "Sample"
Column `Sample` joining factor and character vector, coercing into character vector
head(plasma.df)
# write to file, will be used to compare to recount2 model
plasma.file <- file.path(results.dir,
"Banchereau_DA_group_SLE_model_plasma_cell_LVs.tsv")
readr::write_tsv(plasma.df, path = plasma.file)
plasma.df %>%
ggplot2::ggplot(ggplot2::aes(x = Disease.Activity,
y = LV52)) +
ggplot2::geom_boxplot(notch = TRUE) +
ggplot2::theme_bw() +
ggplot2::labs(x = "Disease Activity") +
ggplot2::theme(text = ggplot2::element_text(size = 15))
plot.file <- file.path(plot.dir, "Banchereau_DA_group_LV52_boxplot.png")
ggplot2::ggsave(filename = plot.file, plot = ggplot2::last_plot())
Saving 7 x 7 in image
# check for statistical significance
# (pairwise t-test is consistent with original publication as far as I can tell)
pairwise.t.test(x = plasma.df$LV52,
g = plasma.df$Disease.Activity,
p.adjust.method = "bonferroni")
Pairwise comparisons using t tests with pooled SD
data: plasma.df$LV52 and plasma.df$Disease.Activity
DA1 DA2
DA2 0.012 -
DA3 < 2e-16 6.5e-08
P value adjustment method: bonferroni
plasma.df %>%
ggplot2::ggplot(ggplot2::aes(x = Disease.Activity,
y = LV136)) +
ggplot2::geom_boxplot(notch = TRUE) +
ggplot2::theme_bw() +
ggplot2::labs(x = "Disease Activity") +
ggplot2::theme(text = ggplot2::element_text(size = 15))
plot.file <- file.path(plot.dir, "Banchereau_DA_group_LV136_boxplot.png")
ggplot2::ggsave(filename = plot.file, plot = ggplot2::last_plot())
Saving 7 x 7 in image
# check for statistical significance
# (pairwise t-test is consistent with original publication as far as I can tell)
pairwise.t.test(x = plasma.df$LV136,
g = plasma.df$Disease.Activity,
p.adjust.method = "bonferroni")
Pairwise comparisons using t tests with pooled SD
data: plasma.df$LV136 and plasma.df$Disease.Activity
DA1 DA2
DA2 0.236 -
DA3 0.018 1.000
P value adjustment method: bonferroni