library(tidyverse)
library(lubridate)
library(rnaturalearth)
library(wru)
# Load project helper functions (presumably where gender_breakdown() and
# gam_and_ci(), used further down, are defined -- TODO confirm).
source("utils/r-utils.R")
# Make black-and-white the default ggplot2 theme and hide legend titles.
theme_set(theme_bw() + theme(legend.title = element_blank()))
Only keep articles from 2002 onward, because few authors had gender predictions before 2002. See 093.summary-stats for more details.
# Restore previously saved objects; the code below uses `corr_authors` and
# `keynotes`, which are presumably provided by this file -- TODO confirm.
load(file = "Rdata/raws.Rdata")

# Two-sided 95% critical value of the standard normal (about 1.96),
# used below to turn standard errors into margins of error.
alpha_threshold <- qnorm(p = 0.975)

# Gender predictions keyed by `fore_name_simple`.
gender_df <- read_tsv(file = "data/gender/genderize.tsv")
##
## ── Column specification ──────────────────────────────────────────
## cols(
## fore_name_simple = col_character(),
## n_authors = col_double(),
## genderize_sample_size = col_double(),
## query_date = col_date(format = ""),
## probability_male = col_double()
## )
# `corr_authors` rows from 2002 onward, annotated with the per-fore-name
# gender prediction (rows without a match get NA).
pubmed_gender_df <- left_join(
  filter(corr_authors, year(year) >= 2002),
  gender_df,
  by = "fore_name_simple"
)

# Same annotation for the `keynotes` table (no year restriction here).
iscb_gender_df <- left_join(keynotes, gender_df, by = "fore_name_simple")
# Year range covered by the analysis and its span (end minus start).
start_year <- 1993
end_year <- 2019
n_years <- end_year - start_year

# Distinct journals among the PubMed records, and how many there are.
my_jours <- pubmed_gender_df %>%
  pull(journal) %>%
  unique()
n_jours <- length(my_jours)

# Distinct conference labels among the ISCB records, and how many there are.
my_confs <- iscb_gender_df %>%
  pull(conference) %>%
  unique()
n_confs <- length(my_confs)
# Stack the ISCB records and the PubMed records into one long table with one
# row per (record, gender) pair.
iscb_pubmed <- bind_rows(
  # ISCB side: the conference label is stored in `journal` so both sources
  # share a column; each record gets a constant adjusted_citations of 1.
  iscb_gender_df %>%
    rename(journal = conference) %>%
    select(year, journal, probability_male, publication_date) %>%
    mutate(
      type = "Keynote speakers/Fellows",
      adjusted_citations = 1
    ),
  # PubMed side: keeps its own adjusted_citations values.
  pubmed_gender_df %>%
    select(
      year, journal, probability_male, publication_date, adjusted_citations
    ) %>%
    mutate(type = "Pubmed authors")
) %>%
  # The female probability is the complement of the male probability.
  mutate(probability_female = 1 - probability_male) %>%
  pivot_longer(
    cols = contains("probability"),
    names_to = "gender",
    values_to = "probabilities"
  ) %>%
  # Drop records whose fore name had no gender prediction.
  filter(!is.na(probabilities)) %>%
  # Left grouped on purpose: the summary computed next relies on this grouping.
  group_by(type, year, gender)
# Yearly summary per (type, year, gender) group of `iscb_pubmed`, which is
# already grouped by those columns:
#   mean_prob - mean predicted probability
#   se_prob   - standard error of that mean, sd / sqrt(n)
#   me_prob   - margin of error (normal-approximation CI half-width)
# The standard error must be divided by sqrt(n); using the raw standard
# deviation here would make `me_prob` far too wide and inconsistent with the
# per-group summary used for fig_1d. Groups with a single observation get
# NA for se_prob and me_prob because sd() is undefined there.
iscb_pubmed_sum <- iscb_pubmed %>%
  summarise(
    mean_prob = mean(probabilities, na.rm = TRUE),
    se_prob = sd(probabilities, na.rm = TRUE) /
      sqrt(sum(!is.na(probabilities))),
    me_prob = alpha_threshold * se_prob,
    .groups = "drop"
  )
# https://stats.stackexchange.com/questions/25895/computing-standard-error-in-weighted-mean-estimation
# save(iscb_pubmed, file = 'Rdata/iscb-pubmed_gender.Rdata')
Additional Fig. 1, with keynote speakers and fellows shown separately
# Variant of the main figure that separates ISCB Fellows from keynote
# speakers. For every (type2, year, gender) group it computes the mean
# probability, its standard error and the corresponding margin of error,
# then hands the summary to gender_breakdown() for plotting.
fig_1d <- iscb_pubmed %>%
  ungroup() %>%
  group_by(
    # Three-way label: PubMed authors, ISCB Fellows, or keynote speakers.
    # The conditions are checked in order, so fellows are peeled off before
    # the remaining honorees are labelled as keynote speakers.
    type2 = case_when(
      type == "Pubmed authors" ~ "Pubmed authors",
      journal == "ISCB Fellow" ~ "ISCB Fellows",
      type == "Keynote speakers/Fellows" ~ "Keynote speakers"
    ),
    year,
    gender
  ) %>%
  summarise(
    mean_prob = mean(probabilities),
    se_prob = sd(probabilities) / sqrt(n()),
    me_prob = alpha_threshold * se_prob,
    .groups = "drop"
  ) %>%
  gender_breakdown("main", fct_rev(type2)) +
  # Two-digit year labels with no padding on the x axis; this replaces the
  # x scale already set by gender_breakdown().
  scale_x_date(
    labels = scales::date_format("'%y"),
    expand = c(0, 0)
  )
## Scale for 'x' is already present. Adding another scale for 'x',
## which will replace the existing scale.
# Plot the male-probability trend: yearly summaries go in as the first
# argument and the record-level data as `df2`, restricted to the same gender.
gam_and_ci(
  filter(iscb_pubmed_sum, gender == "probability_male"),
  df2 = filter(iscb_pubmed, gender == "probability_male"),
  start_y = start_year,
  end_y = end_year
) +
  # Place the legend inside the panel, near the bottom-right corner.
  theme(legend.position = c(0.88, 0.2))
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 13171 rows containing non-finite values
## (stat_smooth).
# Record-level data for the logistic regressions: one row per record with its
# female-name probability, and `type` as a factor whose reference level is
# "Pubmed authors" (so the models describe the odds of being an honoree).
#
# `iscb_pubmed` is still grouped by (type, year, gender). Converting and
# relevelling `type` while grouped would evaluate the factor code once per
# group on a grouping column, which is slow and only works by accident of
# how per-group factors are recombined. Ungroup first and build the factor
# once over the whole column.
iscb_lm <- iscb_pubmed %>%
  ungroup() %>%
  filter(gender == "probability_female", !is.na(probabilities)) %>%
  mutate(type = relevel(factor(type), ref = "Pubmed authors"))

# Restrict both groups to 2002 onward so honorees and PubMed authors cover
# the same period.
scaled_iscb <- iscb_lm %>%
  filter(year(year) >= 2002)
# scaled_iscb$s_prob <- scale(scaled_iscb$probabilities, scale = F)
# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F)
# Main-effects logistic regression: does group membership (`type`) depend on
# `year` and on the female-name probability?
main_lm <- glm(
  formula = type ~ year + probabilities,
  family = "binomial",
  data = scaled_iscb
)
broom::tidy(main_lm)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.06 0.478 -4.30 1.67e- 5
## 2 year -0.000271 0.0000320 -8.47 2.46e-17
## 3 probabilities 0.193 0.146 1.33 1.85e- 1
# Same logistic regression with a year-by-probability interaction added.
# Predictors are used as-is (not centred).
inte_lm <- glm(
  formula = type ~ year * probabilities,
  family = "binomial",
  data = scaled_iscb
)
broom::tidy(inte_lm)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.77 0.568 -3.11 1.85e- 3
## 2 year -0.000291 0.0000383 -7.59 3.22e-14
## 3 probabilities -0.992 1.29 -0.771 4.41e- 1
## 4 year:probabilities 0.0000787 0.0000846 0.930 3.52e- 1
# Analysis-of-deviance (chi-square) comparison of the two nested models:
# tests whether adding the year:probabilities interaction improves the fit.
anova(main_lm, inte_lm, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: type ~ year + probabilities
## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 153942 4582.8
## 2 153941 4582.0 1 0.86975 0.351
# mean(scaled_iscb$year)
# mean(scaled_iscb$probabilities)
Membership in the two groups of scientists (keynote speakers/fellows vs. PubMed authors) was not significantly associated with the gender predicted from fore names (P = 0.18469). The interaction term does not predict type over and above the main effects of gender probability and year.
# Print the R version, platform and package versions used for this run.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
## [4] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
## [7] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [10] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [13] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] class_7.3-17 rprojroot_1.3-2
## [5] fs_1.5.0 rstudioapi_0.12
## [7] farver_2.0.3 remotes_2.2.0
## [9] DT_0.16 prodlim_2019.11.13
## [11] fansi_0.4.1 xml2_1.3.2
## [13] codetools_0.2-16 splines_4.0.3
## [15] knitr_1.30 pkgload_1.1.0
## [17] jsonlite_1.7.1 pROC_1.16.2
## [19] broom_0.7.2 dbplyr_2.0.0
## [21] rgeos_0.5-5 compiler_4.0.3
## [23] httr_1.4.2 backports_1.2.0
## [25] assertthat_0.2.1 Matrix_1.2-18
## [27] cli_2.1.0 htmltools_0.5.0
## [29] prettyunits_1.1.1 tools_4.0.3
## [31] gtable_0.3.0 glue_1.4.2
## [33] rnaturalearthdata_0.1.0 reshape2_1.4.4
## [35] Rcpp_1.0.5 cellranger_1.1.0
## [37] vctrs_0.3.4 svglite_1.2.3.2
## [39] nlme_3.1-149 iterators_1.0.13
## [41] crosstalk_1.1.0.1 timeDate_3043.102
## [43] gower_0.2.2 xfun_0.19
## [45] ps_1.4.0 testthat_3.0.0
## [47] rvest_0.3.6 lifecycle_0.2.0
## [49] devtools_2.3.2 MASS_7.3-53
## [51] scales_1.1.1 ipred_0.9-9
## [53] hms_0.5.3 RColorBrewer_1.1-2
## [55] yaml_2.2.1 curl_4.3
## [57] memoise_1.1.0 rpart_4.1-15
## [59] stringi_1.5.3 desc_1.2.0
## [61] foreach_1.5.1 e1071_1.7-4
## [63] pkgbuild_1.1.0 lava_1.6.8.1
## [65] systemfonts_0.3.2 rlang_0.4.8
## [67] pkgconfig_2.0.3 evaluate_0.14
## [69] sf_0.9-6 recipes_0.1.15
## [71] htmlwidgets_1.5.2 labeling_0.4.2
## [73] cowplot_1.1.0 tidyselect_1.1.0
## [75] processx_3.4.4 plyr_1.8.6
## [77] magrittr_1.5 R6_2.5.0
## [79] generics_0.1.0 DBI_1.1.0
## [81] mgcv_1.8-33 pillar_1.4.6
## [83] haven_2.3.1 withr_2.3.0
## [85] units_0.6-7 survival_3.2-7
## [87] sp_1.4-4 nnet_7.3-14
## [89] modelr_0.1.8 crayon_1.3.4
## [91] KernSmooth_2.23-17 utf8_1.1.4
## [93] rmarkdown_2.5 usethis_1.6.3
## [95] grid_4.0.3 readxl_1.3.1
## [97] data.table_1.13.2 callr_3.5.1
## [99] ModelMetrics_1.2.2.2 reprex_0.3.0
## [101] digest_0.6.27 classInt_0.4-3
## [103] stats4_4.0.3 munsell_0.5.0
## [105] viridisLite_0.3.0 sessioninfo_1.1.1