# looking at only US affiliation
pubmed_race_pmids <- corr_authors %>%
tidyr::separate_rows(countries, sep = ',') %>%
filter(countries == 'US') %>%
rename('surname' = last_name_simple) %>%
predict_race(surname.only = T, impute.missing = F)
## [1] "Proceeding with surname-only predictions..."
## Warning in merge_surnames(voter.file, impute.missing =
## impute.missing): 5166 surnames were not matched.
pubmed_us_race <- pubmed_race_pmids %>%
group_by(pmid, journal, publication_date, year, adjusted_citations) %>%
summarise_at(vars(contains('pred.')), mean, na.rm = T, .groups = 'drop') %>%
{.}
iscb_us_race <- keynotes %>%
rename('surname' = last_name_simple) %>%
predict_race(surname.only = T, impute.missing = F)
## [1] "Proceeding with surname-only predictions..."
## Warning in merge_surnames(voter.file, impute.missing =
## impute.missing): 100 surnames were not matched.
my_jours <- unique(pubmed_us_race$journal)
my_confs <- unique(iscb_us_race$conference)
n_jours <- length(my_jours)
n_confs <- length(my_confs)
iscb_pubmed <- iscb_us_race %>%
rename('journal' = conference) %>%
select(year, journal, contains('pred')) %>%
mutate(publication_date = year,
type = 'Keynote speakers/Fellows',
adjusted_citations = 1) %>%
bind_rows(
pubmed_us_race %>%
select(year, journal, contains('pred'), publication_date, adjusted_citations) %>%
mutate(type = 'Pubmed authors')
) %>%
mutate(pred_sum_others = pred.his + pred.oth + pred.bla) %>%
tidyr::pivot_longer(contains('pred'),
names_to = 'Race',
values_to = 'probabilities') %>%
filter(!is.na(probabilities)) %>%
recode_race() %>%
group_by(type, year, Race) %>%
mutate(
pmc_citations_year = mean(adjusted_citations),
weight = adjusted_citations/pmc_citations_year,
weighted_probs = probabilities*weight
)
## Adding missing grouping variables: `pmid`
iscb_pubmed_sum <- iscb_pubmed %>%
summarise(
mean_prob = mean(weighted_probs),
mean_raw = mean(probabilities),
se_prob = sqrt(var(probabilities) * sum(weight^2)/(sum(weight)^2)),
me_prob = alpha_threshold * se_prob,
.groups = 'drop'
)
iscb_race <- vector('list', length = n_confs)
i <- 0
for (conf in my_confs){
i <- i + 1
iscb_race[[i]] <- iscb_pubmed %>%
filter(type != 'Pubmed authors' & journal == conf & (Race != 'Other categories')) %>%
group_by(year, Race, journal) %>%
summarise(mean_prob = mean(probabilities, na.rm = T), .groups = 'drop') %>%
# ungroup() %>%
{.}
}
save(iscb_pubmed, iscb_race, file = 'Rdata/iscb-us-race.Rdata')
fig_stats <- iscb_pubmed_sum %>%
filter(Race %in% c('White', 'Asian', 'Other categories')) %>%
gam_and_ci(
df2 = iscb_pubmed %>%
filter(Race %in% c('White', 'Asian', 'Other categories')),
start_y = start_year, end_y = end_year) +
theme(legend.position = c(0.84, 0.78),
panel.grid.minor = element_blank(),
legend.margin = margin(-0.5, 0, 0, 0, unit='cm')) +
facet_wrap(vars(forcats::fct_rev(Race)))
fig_2a <- iscb_pubmed_sum %>%
mutate(type = fct_rev(as.factor(type))) %>%
filter(Race != 'Other categories') %>%
# group_by(year, type, Race) %>%
# summarise(mean_prob = mean(probabilities, na.rm = T), .groups = 'drop') %>%
race_breakdown(category = 'main', race_levels, type)
fig_2 <- cowplot::plot_grid(
fig_2a, fig_stats, labels = 'AUTO', ncol = 1, rel_heights = c(1.5,1))
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
fig_2
ggsave('figs/us_racial_makeup.png', fig_2, width = 6.5, height = 5.5)
ggsave('figs/us_racial_makeup.svg', fig_2, width = 6.5, height = 5.5)
Regression of the probability of a name of a certain race on the type of scientists (authors vs. speakers) and year (interaction term included):
main_lm <- function(racei){
lm(weighted_probs ~ year + type,
data = iscb_pubmed %>%
ungroup() %>%
filter(Race == racei, !is.na(weighted_probs)) %>%
mutate(year = c(scale(year)),
type = relevel(as.factor(type), ref = 'Pubmed authors')))
}
inte_lm <- function(racei){
lm(weighted_probs ~ year * type,
data = iscb_pubmed %>%
ungroup() %>%
filter(Race == racei, !is.na(weighted_probs)) %>%
mutate(year = c(scale(year)),
type = relevel(as.factor(type), ref = 'Pubmed authors')))
}
inte_list <- lapply(c('White', 'Asian', 'Other categories'), main_lm)
lapply(inte_list, summary)
## [[1]]
##
## Call:
## lm(formula = weighted_probs ~ year + type, data = iscb_pubmed %>%
## ungroup() %>% filter(Race == racei, !is.na(weighted_probs)) %>%
## mutate(year = c(scale(year)), type = relevel(as.factor(type),
## ref = "Pubmed authors")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8113 -0.5414 -0.0942 0.2651 16.6148
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.612841 0.004475 136.937
## year -0.041522 0.004554 -9.118
## typeKeynote speakers/Fellows 0.083082 0.038991 2.131
## Pr(>|t|)
## (Intercept) <2e-16 ***
## year <2e-16 ***
## typeKeynote speakers/Fellows 0.0331 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6672 on 22553 degrees of freedom
## Multiple R-squared: 0.004462, Adjusted R-squared: 0.004374
## F-statistic: 50.54 on 2 and 22553 DF, p-value: < 2.2e-16
##
##
## [[2]]
##
## Call:
## lm(formula = weighted_probs ~ year + type, data = iscb_pubmed %>%
## ungroup() %>% filter(Race == racei, !is.na(weighted_probs)) %>%
## mutate(year = c(scale(year)), type = relevel(as.factor(type),
## ref = "Pubmed authors")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2870 -0.2596 -0.2303 0.1112 6.2454
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.247463 0.003178 77.861
## year 0.043328 0.003234 13.397
## typeKeynote speakers/Fellows -0.099391 0.027691 -3.589
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## year < 2e-16 ***
## typeKeynote speakers/Fellows 0.000332 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4739 on 22553 degrees of freedom
## Multiple R-squared: 0.009848, Adjusted R-squared: 0.00976
## F-statistic: 112.2 on 2 and 22553 DF, p-value: < 2.2e-16
##
##
## [[3]]
##
## Call:
## lm(formula = weighted_probs ~ year + type, data = iscb_pubmed %>%
## ungroup() %>% filter(Race == racei, !is.na(weighted_probs)) %>%
## mutate(year = c(scale(year)), type = relevel(as.factor(type),
## ref = "Pubmed authors")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1573 -0.1143 -0.0837 0.0159 9.6671
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 0.139696 0.001617 86.395
## year -0.001807 0.001645 -1.098
## typeKeynote speakers/Fellows 0.016309 0.014088 1.158
## Pr(>|t|)
## (Intercept) <2e-16 ***
## year 0.272
## typeKeynote speakers/Fellows 0.247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2411 on 22553 degrees of freedom
## Multiple R-squared: 0.0001446, Adjusted R-squared: 5.593e-05
## F-statistic: 1.631 on 2 and 22553 DF, p-value: 0.1958
Interaction terms do not predict probabilities
over and above the main effect of group of scientists and year.
compare_lm <- function(racei) anova(main_lm(racei), inte_lm(racei))
lapply(c('White', 'Asian', 'Other categories'), compare_lm)
## [[1]]
## Analysis of Variance Table
##
## Model 1: weighted_probs ~ year + type
## Model 2: weighted_probs ~ year * type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22553 10041
## 2 22552 10040 1 0.39995 0.8983 0.3432
##
## [[2]]
## Analysis of Variance Table
##
## Model 1: weighted_probs ~ year + type
## Model 2: weighted_probs ~ year * type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22553 5064.0
## 2 22552 5063.6 1 0.43339 1.9302 0.1647
##
## [[3]]
## Analysis of Variance Table
##
## Model 1: weighted_probs ~ year + type
## Model 2: weighted_probs ~ year * type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 22553 1310.7
## 2 22552 1310.7 1 0.00067119 0.0115 0.9144
Specifically, a name coming from the group of honorees has significantly higher probability of being white (\(\beta_\textrm{white} =\) 0.083082, p = 0.033118) and lower probability of being Asian, (\(\beta_\textrm{Asian} =\) -0.099391, p = 0.00033221). The two groups of scientists did not have a significant association with names predicted to be in Other categories (p = 0.24699).