# Critical value of the standard normal at the 97.5th percentile (about 1.96),
# i.e. the multiplier for a two-sided 95% margin of error.
alpha_threshold <- qnorm(p = 0.975)

# Full-name lookup table, with exact duplicate rows removed.
all_full_names <- distinct(read_tsv('data/names/full-names.tsv.xz'))

# Gender predictions table (joined later on `fore_name_simple`).
gender_df <- read_tsv('data/gender/genderize.tsv')

# NOTE(review): `world` is not created anywhere in this view; the line below
# that would build it is commented out. Presumably it is loaded earlier in the
# session -- confirm before running this section on its own.
# world <- ne_countries(scale='medium',returnclass = 'sf')

# Country code -> country name -> region lookup derived from `world`.
nat_to_reg <- world %>%
  select(iso_a2, name, region_wb) %>%
  rename(
    countries = iso_a2,
    country_name = name,
    region = region_wb
  )
# Keynote speakers annotated with gender predictions.
# Both `publication_date` and `year` end up holding the same Date: the
# original `year` value parsed by ymd() with truncated = 2, which lets a bare
# year be read as 1 January of that year.
# Only non-PSB rows dated 2020-01-01 (i.e. year 2020) are kept.
iscb_gender_df <- read_tsv('data/iscb/keynotes.tsv') %>%
  mutate(
    publication_date = ymd(year, truncated = 2),
    year = publication_date
  ) %>%
  left_join(all_full_names, by = c('fore_name', 'last_name')) %>%
  left_join(gender_df, by = 'fore_name_simple') %>%
  filter(conference != 'PSB') %>%
  filter(year == '2020-01-01')
# Year range constants.
start_year <- 1993
end_year <- 2019
# NOTE(review): this is the difference between the two years (26), not the
# number of years in the inclusive range (27) -- confirm which is intended.
n_years <- end_year - start_year

# Distinct conferences present after filtering, and how many there are.
my_confs <- unique(pull(iscb_gender_df, conference))
n_confs <- length(my_confs)

# Frequency table of speakers' affiliation countries.
table(pull(iscb_gender_df, afflcountries))
##
## China Italy Japan United Kingdom
## 1 1 1 1
## United States
## 13
# Average predicted probability of being male across speakers, ignoring
# speakers with no prediction. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mean(iscb_gender_df$probability_male, na.rm = TRUE)
## [1] 0.584375
Proportion of US affiliation: 76.47%. Mean probability of being male: 58.44%.
# Name-origin predictions, read from a URL pinned to a specific commit so the
# input cannot change underneath the analysis.
# The first column is referenced as `X1` and renamed to `full_name`.
# NOTE(review): `X1` is presumably the name readr auto-assigns to an unnamed
# first column; newer readr versions may name it differently -- confirm when
# upgrading.
# One row is kept per `full_name` (the first occurrence, with all columns),
# then name parts are attached from `all_full_names`.
nationalize_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_authors.tsv') %>%
  rename('full_name' = X1) %>%
  distinct(full_name, .keep_all = TRUE) %>%
  left_join(all_full_names, by = 'full_name')
# Keynote speakers annotated with name-origin predictions.
# `year` is parsed into a Date (a bare year becomes 1 January of that year)
# and copied into `publication_date`.
# The final filters drop PSB and keep only rows dated 2020-01-01, i.e. this
# table contains the 2020 speakers only.
iscb_nat_df <- read_tsv('data/iscb/keynotes.tsv') %>%
  mutate(
    publication_date = ymd(year, truncated = 2),
    year = publication_date
  ) %>%
  left_join(all_full_names, by = c('fore_name', 'last_name')) %>%
  left_join(nationalize_df, by = c('fore_name', 'last_name_simple')) %>%
  filter(conference != 'PSB') %>%
  filter(year == '2020-01-01')
# Distinct conferences in the name-origin table, and how many there are.
my_confs <- unique(pull(iscb_nat_df, conference))
n_confs <- length(my_confs)

# Display labels for name-origin groups: each group followed by " names".
region_levels <- c(
  'Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian',
  'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'
) %>%
  paste('names')
# Per name-origin group: mean predicted probability, its standard deviation,
# the number of non-missing predictions, and a normal-approximation margin of
# error (alpha_threshold * sd / sqrt(n)). Rows are sorted from the highest
# mean probability to the lowest.
iscb_nat_df %>%
  select(African:SouthAsian, publication_date) %>%
  # One row per (speaker, name-origin column) pair.
  pivot_longer(African:SouthAsian,
               names_to = 'region',
               values_to = 'probabilities') %>%
  filter(!is.na(probabilities)) %>%
  group_by(region) %>%
  # On grouped data this adds a column `n` holding each group's row count.
  add_count() %>%
  summarise(
    mean_prob = mean(probabilities, na.rm = TRUE),
    sd_prob = sd(probabilities, na.rm = TRUE),
    # `n` is constant within a group, so its mean is simply the group size.
    n = mean(n),
    me_prob = alpha_threshold * sd_prob / sqrt(n)
  ) %>%
  ungroup() %>%
  # `recode_region()` is defined outside this section; judging by the printed
  # result it relabels `region` with display names -- confirm at its definition.
  recode_region() %>%
  arrange(desc(mean_prob))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Problem with `mutate()` input `region`.
## ℹ Unknown levels in `f`: OtherCategories
## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
## # A tibble: 10 x 5
## region mean_prob sd_prob n me_prob
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 East Asian names 0.330 0.473 15 0.239
## 2 Celtic/English names 0.147 0.261 15 0.132
## 3 European names 0.146 0.225 15 0.114
## 4 South Asian names 0.136 0.348 15 0.176
## 5 Hebrew names 0.0736 0.171 15 0.0867
## 6 Hispanic names 0.0626 0.179 15 0.0904
## 7 Greek names 0.0596 0.218 15 0.110
## 8 African names 0.0214 0.0424 15 0.0215
## 9 Nordic names 0.0190 0.0460 15 0.0233
## 10 Arab/Turk/Pers names 0.00555 0.0117 15 0.00594
# Print the R version, platform, locale and attached/loaded package versions
# so the environment that produced these results is recorded with them.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] broom_0.7.2 DT_0.16 epitools_0.5-10.1
## [4] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
## [7] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
## [10] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
## [16] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] class_7.3-17 rprojroot_1.3-2
## [5] fs_1.5.0 rstudioapi_0.12
## [7] farver_2.0.3 remotes_2.2.0
## [9] prodlim_2019.11.13 fansi_0.4.1
## [11] xml2_1.3.2 codetools_0.2-16
## [13] splines_4.0.3 knitr_1.30
## [15] pkgload_1.1.0 jsonlite_1.7.1
## [17] pROC_1.16.2 dbplyr_2.0.0
## [19] rgeos_0.5-5 compiler_4.0.3
## [21] httr_1.4.2 backports_1.2.0
## [23] assertthat_0.2.1 Matrix_1.2-18
## [25] cli_2.1.0 htmltools_0.5.0
## [27] prettyunits_1.1.1 tools_4.0.3
## [29] gtable_0.3.0 glue_1.4.2
## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4
## [33] Rcpp_1.0.5 cellranger_1.1.0
## [35] vctrs_0.3.4 svglite_1.2.3.2
## [37] nlme_3.1-149 iterators_1.0.13
## [39] crosstalk_1.1.0.1 timeDate_3043.102
## [41] gower_0.2.2 xfun_0.19
## [43] ps_1.4.0 testthat_3.0.0
## [45] rvest_0.3.6 lifecycle_0.2.0
## [47] devtools_2.3.2 MASS_7.3-53
## [49] scales_1.1.1 ipred_0.9-9
## [51] hms_0.5.3 RColorBrewer_1.1-2
## [53] yaml_2.2.1 curl_4.3
## [55] memoise_1.1.0 rpart_4.1-15
## [57] stringi_1.5.3 desc_1.2.0
## [59] foreach_1.5.1 e1071_1.7-4
## [61] pkgbuild_1.1.0 lava_1.6.8.1
## [63] systemfonts_0.3.2 rlang_0.4.8
## [65] pkgconfig_2.0.3 evaluate_0.14
## [67] sf_0.9-6 recipes_0.1.15
## [69] htmlwidgets_1.5.2 labeling_0.4.2
## [71] cowplot_1.1.0 tidyselect_1.1.0
## [73] processx_3.4.4 plyr_1.8.6
## [75] magrittr_1.5 R6_2.5.0
## [77] generics_0.1.0 DBI_1.1.0
## [79] mgcv_1.8-33 pillar_1.4.6
## [81] haven_2.3.1 withr_2.3.0
## [83] units_0.6-7 survival_3.2-7
## [85] sp_1.4-4 nnet_7.3-14
## [87] modelr_0.1.8 crayon_1.3.4
## [89] KernSmooth_2.23-17 utf8_1.1.4
## [91] rmarkdown_2.5 usethis_1.6.3
## [93] grid_4.0.3 readxl_1.3.1
## [95] data.table_1.13.2 callr_3.5.1
## [97] ModelMetrics_1.2.2.2 reprex_0.3.0
## [99] digest_0.6.27 classInt_0.4-3
## [101] stats4_4.0.3 munsell_0.5.0
## [103] viridisLite_0.3.0 sessioninfo_1.1.1