Observed vs. expected
load('Rdata/raws.Rdata')
iscb_aff_country <-
keynotes %>%
separate_rows(afflcountries, sep = '\\|') %>%
filter(!is.na(afflcountries)) %>%
add_count(year, conference, full_name, name = 'num_affls') %>%
mutate(probabilities = 1 / num_affls,
publication_date = ymd(year, truncated = 2),
year = ymd(year, truncated = 2)) %>%
left_join(nat_to_reg, by = c('afflcountries' = 'country_name'))
pubmed_aff_country <- corr_authors %>%
filter(!is.na(countries)) %>%
add_count(pmid, name = 'num_corr_authors') %>% # number of corresponding authors per pmid
select(pmid, journal, publication_date, year, countries, fore_name_simple, last_name_simple, num_corr_authors) %>%
separate_rows(countries) %>%
add_count(pmid, fore_name_simple, last_name_simple, name = 'num_affls') %>%
mutate(probabilities = 1 / num_corr_authors / num_affls)
country_rep <- iscb_aff_country %>%
group_by(countries) %>%
summarise(Observed = sum(probabilities)) %>%
arrange(desc(Observed))
## `summarise()` ungrouping output (override with `.groups` argument)
num_papers <- corr_authors %>%
filter(!is.na(countries)) %>%
pull(pmid) %>%
unique() %>%
length()
num_papers
## [1] 84137
num_honorees <- sum(iscb_aff_country$probabilities)
obs_vs_exp_all <- pubmed_aff_country %>%
group_by(countries) %>%
summarise(num_authors = sum(probabilities)) %>%
ungroup() %>%
mutate(freq_affi = num_authors / sum(num_authors)) %>%
arrange(desc(num_authors)) %>%
mutate(Expected = freq_affi * num_honorees) %>%
left_join(country_rep, by = 'countries') %>%
left_join(nat_to_reg, by = 'countries') %>%
select(country_name, everything()) %>%
select(-c(region, countries)) %>%
mutate(
Observed = replace_na(Observed, 0),
over_rep = Observed - Expected,
other_honorees = num_honorees - Observed,
other_authors = num_papers - num_authors
)
## `summarise()` ungrouping output (override with `.groups` argument)
Null hypothesis: For each country, the proportion of honorees affiliated with an institution/company from that country is similar to the proportion of authors affiliated with an institution/company from that country.
my_fish <- function(df) {
res <- df %>%
unlist() %>%
matrix(ncol = 2, byrow = TRUE) %>%
my_riskratio(correction = TRUE)
res_fish <- df %>%
unlist() %>%
matrix(ncol = 2) %>%
fisher.test()
or <- res_fish$estimate
l_or <- res_fish$conf.int[1]
u_or <- res_fish$conf.int[2]
p0 <- df[2]/(df[2] + df[1])
fish_rr <- or/(1 - p0 + p0*or)
fish_rr_lower <- l_or/(1 - p0 + p0*l_or)
fish_rr_upper <- u_or/(1 - p0 + p0*u_or)
res$measure[2, 1:3] %>%
c(p_value = res$p.value[2,2]) %>%
as.matrix() %>% t() %>% data.frame()
}
nested_obs_exp <- obs_vs_exp_all %>%
filter(!is.na(country_name)) %>%
select(country_name, other_authors, num_authors, other_honorees, Observed) %>%
group_by(country_name) %>%
nest()
fish_obs_exp <- nested_obs_exp %>%
mutate(fish = map(data, my_fish)) %>%
dplyr::select(-data) %>%
unnest()
fish_obs_exp %>%
mutate(ci = paste0('(', round(log2(lower), 1), ', ',
round(log2(upper), 1), ')'),
lestimate = log2(estimate)) %>%
left_join(obs_vs_exp_all, by = 'country_name') %>%
select(country_name, freq_affi, Observed, Expected, over_rep,
# log2fc,
estimate, lestimate, ci) %>%
rename('Country' = 'country_name',
'Author proportion' = 'freq_affi',
'Observed - Expected' = 'over_rep',
'Enrichment' = 'estimate',
'Log2(enrichment)' = 'lestimate',
'95% Confidence interval' = 'ci') %>%
datatable(rownames = FALSE) %>%
formatPercentage('Author proportion', 2) %>%
formatRound(c('Observed', 'Expected', 'Observed - Expected',
'Enrichment', 'Log2(enrichment)'), 1)
Adapted from epitools::riskratio()
.
Presentation enrichment/depletion of 20 countries that have the most publications.
filtered_obs_exp <- obs_vs_exp_all %>%
left_join(fish_obs_exp, by = 'country_name') %>%
top_n(25, num_authors) %>%
mutate(
distance_to_null = case_when(
lower > 1 ~ lower - 1,
TRUE ~ upper - 2
),
presentation = case_when(
lower > 1 ~ '#377EB8',
upper < 1 ~ '#E41A1C',
TRUE ~ 'grey20'
),
country_name = as.factor(country_name) %>% fct_reorder(num_authors)) %>%
arrange(desc(num_authors))
plot_obs_exp <- filtered_obs_exp %>%
mutate(lestimate = log2(estimate),
llower = log2(lower),
lupper = log2(upper)) %>%
select(country_name, Expected, Observed, lestimate, llower, lupper, presentation, over_rep) %>%
pivot_longer(- c(country_name, presentation, over_rep), names_to = 'type') %>%
mutate(subtype = ifelse(type == 'Expected' | type == 'Observed', 'Sqrt(number of honorees)', 'Log2 enrichment, 95% CI'))
save(plot_obs_exp, file = 'Rdata/affiliations.Rdata')
plot_obs_exp_right <- plot_obs_exp %>% filter(subtype == 'Sqrt(number of honorees)')
plot_obs_exp_left <- plot_obs_exp %>% filter(subtype != 'Sqrt(number of honorees)')
enrichment_plot_left <- plot_obs_exp_left %>%
ggplot(aes(x = country_name)) +
coord_flip() +
labs(x = NULL, y = bquote(Log[2] ~ 'enrichment, 95% CI')) +
theme(
legend.position = c(0.9, 0.3),
axis.title = element_text(size = 9),
plot.margin = margin(5.5, 2, 5.5, 5.5, unit = 'pt')
) +
scale_color_brewer(type = 'qual', palette = 'Set1') +
geom_pointrange(
data = plot_obs_exp_left %>%
pivot_wider(names_from = type),
aes(y = lestimate,
ymin = llower,
ymax = lupper,
group = country_name),
color = filtered_obs_exp$presentation,
stroke = 0.3, fatten = 2
) +
scale_x_discrete(position = 'top', labels = NULL) +
scale_y_reverse() +
geom_hline(data = plot_obs_exp_left, aes(yintercept = 0), linetype = 2)
overrep_countries <- plot_obs_exp_right %>%
filter(over_rep > 0) %>%
pull(country_name)
enrichment_plot_right <- plot_obs_exp_right %>%
ggplot(aes(x = country_name, y = value)) +
geom_line(aes(group = country_name),
color = rev(plot_obs_exp_right$presentation)) +
geom_point(aes(shape = type), color = 'grey20') +
labs(x = NULL, y = 'Number of honorees') +
theme(
axis.title = element_text(size = 9),
legend.position = c(0.75 , 0.3),
axis.text.y = element_text(
color = rev(filtered_obs_exp$presentation),
hjust = 0.5),
plot.margin = margin(5.5, 5.5, 8.5, 2, unit = 'pt')
) +
scale_y_sqrt(breaks = c(0, 1, 4, 16, 50, 120, 225)) +
# scale_color_brewer(type = 'qual', palette = 'Set1') +
coord_flip() +
geom_text(
data = . %>%
filter(type == 'Expected', !(country_name %in% overrep_countries)),
nudge_y = 1.2, aes(label = round(over_rep, 1)), size = 2.5) +
geom_text(
data = . %>%
filter(type == 'Expected', country_name %in% overrep_countries),
nudge_y = -1.2, aes(label = round(over_rep, 1)), size = 2.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
enrichment_plot <- cowplot::plot_grid(enrichment_plot_left, enrichment_plot_right,
rel_widths = c(1, 1.3))
enrichment_plot
ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5, dpi = 600)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] DT_0.16 epitools_0.5-10.1 gdtools_0.2.2
## [4] wru_0.1-10 rnaturalearth_0.1.0 lubridate_1.7.9.2
## [7] caret_6.0-86 lattice_0.20-41 forcats_0.5.0
## [10] stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [13] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4
## [16] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] class_7.3-17 rprojroot_1.3-2
## [5] fs_1.5.0 rstudioapi_0.12
## [7] farver_2.0.3 remotes_2.2.0
## [9] prodlim_2019.11.13 fansi_0.4.1
## [11] xml2_1.3.2 codetools_0.2-16
## [13] splines_4.0.3 knitr_1.30
## [15] pkgload_1.1.0 jsonlite_1.7.1
## [17] pROC_1.16.2 broom_0.7.2
## [19] dbplyr_2.0.0 rgeos_0.5-5
## [21] compiler_4.0.3 httr_1.4.2
## [23] backports_1.2.0 assertthat_0.2.1
## [25] Matrix_1.2-18 cli_2.1.0
## [27] htmltools_0.5.0 prettyunits_1.1.1
## [29] tools_4.0.3 gtable_0.3.0
## [31] glue_1.4.2 rnaturalearthdata_0.1.0
## [33] reshape2_1.4.4 Rcpp_1.0.5
## [35] cellranger_1.1.0 vctrs_0.3.4
## [37] svglite_1.2.3.2 nlme_3.1-149
## [39] iterators_1.0.13 crosstalk_1.1.0.1
## [41] timeDate_3043.102 gower_0.2.2
## [43] xfun_0.19 ps_1.4.0
## [45] testthat_3.0.0 rvest_0.3.6
## [47] lifecycle_0.2.0 devtools_2.3.2
## [49] MASS_7.3-53 scales_1.1.1
## [51] ipred_0.9-9 hms_0.5.3
## [53] RColorBrewer_1.1-2 yaml_2.2.1
## [55] curl_4.3 memoise_1.1.0
## [57] rpart_4.1-15 stringi_1.5.3
## [59] desc_1.2.0 foreach_1.5.1
## [61] e1071_1.7-4 pkgbuild_1.1.0
## [63] lava_1.6.8.1 systemfonts_0.3.2
## [65] rlang_0.4.8 pkgconfig_2.0.3
## [67] evaluate_0.14 sf_0.9-6
## [69] recipes_0.1.15 htmlwidgets_1.5.2
## [71] labeling_0.4.2 cowplot_1.1.0
## [73] tidyselect_1.1.0 processx_3.4.4
## [75] plyr_1.8.6 magrittr_1.5
## [77] R6_2.5.0 generics_0.1.0
## [79] DBI_1.1.0 mgcv_1.8-33
## [81] pillar_1.4.6 haven_2.3.1
## [83] withr_2.3.0 units_0.6-7
## [85] survival_3.2-7 sp_1.4-4
## [87] nnet_7.3-14 modelr_0.1.8
## [89] crayon_1.3.4 KernSmooth_2.23-17
## [91] utf8_1.1.4 rmarkdown_2.5
## [93] usethis_1.6.3 grid_4.0.3
## [95] readxl_1.3.1 data.table_1.13.2
## [97] callr_3.5.1 ModelMetrics_1.2.2.2
## [99] reprex_0.3.0 digest_0.6.27
## [101] classInt_0.4-3 stats4_4.0.3
## [103] munsell_0.5.0 viridisLite_0.3.0
## [105] sessioninfo_1.1.1