Only keep articles from 2002 because few authors had nationality predictions before 2002 (mostly due to missing metadata). See 093.summary-stats for more details.
load("Rdata/raws.Rdata")
alpha_threshold <- qnorm(0.975)
pubmed_nat_df <- corr_authors %>%
filter(year(year) >= 2002) %>%
separate_rows(countries, sep = ",") %>%
filter(countries == "US") %>%
left_join(nationalize_df, by = c("fore_name", "last_name")) %>%
group_by(pmid, journal, publication_date, year, adjusted_citations) %>%
summarise_at(vars(African:SouthAsian), mean, na.rm = T) %>%
ungroup()
iscb_nat_df <- keynotes %>%
separate_rows(afflcountries, sep = "\\|") %>%
filter(afflcountries == "United States") %>%
left_join(nationalize_df, by = c("fore_name", "last_name"))
start_year <- 1992
end_year <- 2019
n_years <- end_year - start_year
my_jours <- unique(pubmed_nat_df$journal)
my_confs <- unique(iscb_nat_df$conference)
n_jours <- length(my_jours)
n_confs <- length(my_confs)
region_levels <- paste(c("Celtic/English", "European", "East Asian", "Hispanic", "South Asian", "Arabic", "Hebrew", "African", "Nordic", "Greek"), "names")
region_cols <- c("#ffffb3", "#fccde5", "#b3de69", "#fdb462", "#80b1d3", "#8dd3c7", "#bebada", "#fb8072", "#bc80bd", "#ccebc5")
Prepare data frames for later analyses:
iscb_pubmed_oth <- iscb_nat_df %>%
rename("journal" = conference) %>%
select(year, journal, African:SouthAsian, publication_date) %>%
mutate(
type = "Keynote speakers/Fellows",
adjusted_citations = 1
) %>%
bind_rows(
pubmed_nat_df %>%
select(year, journal, African:SouthAsian, publication_date, adjusted_citations) %>%
mutate(type = "Pubmed authors")
) %>%
mutate(OtherCategories = SouthAsian + Hispanic + Jewish + Muslim + Nordic + Greek + African) %>%
pivot_longer(c(African:SouthAsian, OtherCategories),
names_to = "region",
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
group_by(type, year, region)
iscb_pubmed_sum_oth <- iscb_pubmed_oth %>%
summarise(
mean_prob = mean(probabilities),
se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
)
iscb_pubmed_sum <- iscb_pubmed_sum_oth %>%
filter(region != "OtherCategories")
fig_us_name_origina <- iscb_pubmed_sum %>%
filter(year < "2020-01-01") %>%
region_breakdown("main", region_levels, fct_rev(type)) +
guides(fill = guide_legend(nrow = 2))
large_regions <- c("CelticEnglish", "EastAsian", "European", "OtherCategories")
## Mean and standard deviation of predicted probabilities:
fig_us_name_originb <- iscb_pubmed_sum_oth %>%
filter(region %in% large_regions) %>%
recode_region() %>%
gam_and_ci(
df2 = iscb_pubmed_oth %>%
filter(region %in% large_regions) %>%
recode_region(),
start_y = start_year, end_y = end_year
) +
theme(
legend.position = c(0.88, 0.83),
panel.grid.minor = element_blank(),
legend.margin = margin(-0.5, 0, 0, 0, unit = "cm"),
legend.text = element_text(size = 6)
) +
facet_wrap(vars(fct_relevel(region, large_regions)), nrow = 1)
fig_us_name_origin <- cowplot::plot_grid(fig_us_name_origina, fig_us_name_originb, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1))
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
fig_us_name_origin
ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5, dpi = 600)
ggsave("figs/us_name_origin.svg", fig_us_name_origin, width = 6.5, height = 5.5)
iscb_lm <- iscb_pubmed_oth %>%
ungroup() %>%
mutate(
# year = c(scale(year)),
# year = as.factor(year),
type = relevel(as.factor(type), ref = "Pubmed authors")
)
main_lm <- function(regioni) {
glm(type ~ year + probabilities,
data = iscb_lm %>%
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
inte_lm <- function(regioni) {
glm(type ~ probabilities * year,
data = iscb_lm %>%
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
main_list <- lapply(large_regions, main_lm)
names(main_list) <- large_regions
lapply(main_list, broom::tidy)
## $CelticEnglish
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.76 0.535 8.89 5.86e-19
## 2 year -0.000611 0.0000348 -17.6 3.87e-69
## 3 probabilities 0.269 0.185 1.46 1.46e- 1
##
## $EastAsian
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.73 0.519 9.12 7.71e-20
## 2 year -0.000592 0.0000346 -17.1 9.97e-66
## 3 probabilities -1.89 0.455 -4.14 3.43e- 5
##
## $European
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.78 0.524 9.12 7.75e-20
## 2 year -0.000614 0.0000345 -17.8 6.78e-71
## 3 probabilities 0.446 0.194 2.30 2.16e- 2
##
## $OtherCategories
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.93 0.520 9.48 2.58e-21
## 2 year -0.000618 0.0000345 -17.9 1.18e-71
## 3 probabilities 0.0957 0.212 0.451 6.52e- 1
inte_list <- lapply(large_regions, inte_lm)
lapply(inte_list, broom::tidy)
## [[1]]
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.81 0.730 6.60 4.16e-11
## 2 probabilities 0.128 1.32 0.0969 9.23e- 1
## 3 year -0.000615 0.0000479 -12.8 9.81e-38
## 4 probabilities:year 0.00000953 0.0000883 0.108 9.14e- 1
##
## [[2]]
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4.71 0.536 8.78 1.57e-18
## 2 probabilities -1.29 3.81 -0.339 7.34e- 1
## 3 year -0.000591 0.0000357 -16.5 1.68e-61
## 4 probabilities:year -0.0000380 0.000243 -0.157 8.76e- 1
##
## [[3]]
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.17 0.674 7.67 1.77e-14
## 2 probabilities -0.809 1.40 -0.577 5.64e- 1
## 3 year -0.000640 0.0000449 -14.3 3.42e-46
## 4 probabilities:year 0.0000840 0.0000926 0.906 3.65e- 1
##
## [[4]]
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.22 0.670 7.79 6.76e-15
## 2 probabilities -0.964 1.59 -0.605 5.45e- 1
## 3 year -0.000637 0.0000447 -14.2 4.57e-46
## 4 probabilities:year 0.0000701 0.000104 0.673 5.01e- 1
for (i in 1:4) {
print(anova(main_list[[i]], inte_list[[i]], test = "Chisq"))
}
## Analysis of Deviance Table
##
## Model 1: type ~ year + probabilities
## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 26398 2064.9
## 2 26397 2064.9 1 0.011635 0.9141
## Analysis of Deviance Table
##
## Model 1: type ~ year + probabilities
## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 26398 2036.7
## 2 26397 2036.7 1 0.024177 0.8764
## Analysis of Deviance Table
##
## Model 1: type ~ year + probabilities
## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 26398 2061.9
## 2 26397 2061.1 1 0.82361 0.3641
## Analysis of Deviance Table
##
## Model 1: type ~ year + probabilities
## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 26398 2066.8
## 2 26397 2066.3 1 0.45537 0.4998
Interaction terms do not predict type
over and above the main effect of name origin probability and year (p > 0.01).
An East Asian name has 0.1516018 the odds of being selected as an honoree, significantly lower compared to other names (\(\beta_\textrm{East Asian} =\) -1.8865, P = 3.4282e-05). The two groups of scientists did not have a significant association with names predicted to be Celtic/English (P = 0.14566), European (P = 0.021596), or in Other categories (P = 0.65199).
It’s difficult to come to a conclusion for other regions with so few data points and the imperfect accuracy of our prediction. There seems to be little difference between the proportion of keynote speakers of African, Arabic, South Asian and Hispanic origin than those in the field. However, just because a nationality isn’t underrepresented against the field doesn’t mean scientists from that nationality are appropriately represented.
df2 <- iscb_pubmed_oth %>%
filter(region != "OtherCategories") %>%
recode_region()
fig_s7 <- iscb_pubmed_sum %>%
recode_region() %>%
gam_and_ci(
df2 = df2,
start_y = start_year, end_y = end_year
) +
theme(legend.position = c(0.8, 0.1)) +
facet_wrap(vars(fct_relevel(region, region_levels)), ncol = 3)
fig_s7
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
ggsave("figs/fig_s7.png", fig_s7, width = 6, height = 6)
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
ggsave("figs/fig_s7.svg", fig_s7, width = 6, height = 6)
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] broom_0.7.2 DT_0.16 epitools_0.5-10.1
## [4] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
## [7] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
## [10] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [16] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] class_7.3-17 rprojroot_1.3-2
## [5] fs_1.5.0 rstudioapi_0.12
## [7] farver_2.0.3 remotes_2.2.0
## [9] prodlim_2019.11.13 fansi_0.4.1
## [11] xml2_1.3.2 codetools_0.2-16
## [13] splines_4.0.3 knitr_1.30
## [15] pkgload_1.1.0 jsonlite_1.7.1
## [17] pROC_1.16.2 dbplyr_2.0.0
## [19] rgeos_0.5-5 compiler_4.0.3
## [21] httr_1.4.2 backports_1.2.0
## [23] assertthat_0.2.1 Matrix_1.2-18
## [25] cli_2.1.0 htmltools_0.5.0
## [27] prettyunits_1.1.1 tools_4.0.3
## [29] gtable_0.3.0 glue_1.4.2
## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4
## [33] Rcpp_1.0.5 cellranger_1.1.0
## [35] vctrs_0.3.4 svglite_1.2.3.2
## [37] nlme_3.1-149 iterators_1.0.13
## [39] crosstalk_1.1.0.1 timeDate_3043.102
## [41] gower_0.2.2 xfun_0.19
## [43] ps_1.4.0 testthat_3.0.0
## [45] rvest_0.3.6 lifecycle_0.2.0
## [47] devtools_2.3.2 MASS_7.3-53
## [49] scales_1.1.1 ipred_0.9-9
## [51] hms_0.5.3 RColorBrewer_1.1-2
## [53] yaml_2.2.1 curl_4.3
## [55] memoise_1.1.0 rpart_4.1-15
## [57] stringi_1.5.3 desc_1.2.0
## [59] foreach_1.5.1 e1071_1.7-4
## [61] pkgbuild_1.1.0 lava_1.6.8.1
## [63] systemfonts_0.3.2 rlang_0.4.8
## [65] pkgconfig_2.0.3 evaluate_0.14
## [67] sf_0.9-6 recipes_0.1.15
## [69] htmlwidgets_1.5.2 labeling_0.4.2
## [71] cowplot_1.1.0 tidyselect_1.1.0
## [73] processx_3.4.4 plyr_1.8.6
## [75] magrittr_1.5 R6_2.5.0
## [77] generics_0.1.0 DBI_1.1.0
## [79] mgcv_1.8-33 pillar_1.4.6
## [81] haven_2.3.1 withr_2.3.0
## [83] units_0.6-7 survival_3.2-7
## [85] sp_1.4-4 nnet_7.3-14
## [87] modelr_0.1.8 crayon_1.3.4
## [89] KernSmooth_2.23-17 utf8_1.1.4
## [91] rmarkdown_2.5 usethis_1.6.3
## [93] grid_4.0.3 readxl_1.3.1
## [95] data.table_1.13.2 callr_3.5.1
## [97] ModelMetrics_1.2.2.2 reprex_0.3.0
## [99] digest_0.6.27 classInt_0.4-3
## [101] stats4_4.0.3 munsell_0.5.0
## [103] viridisLite_0.3.0 sessioninfo_1.1.1