##
## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## fore_name = col_character(),
## last_name = col_character(),
## fore_name_simple = col_character(),
## last_name_simple = col_character(),
## full_name = col_character()
## )
# Lookup table derived from `world`: one row per entry, holding its ISO
# alpha-2 code (`countries`), its name (`country_name`) and its World Bank
# region (`region`).
nat_to_reg <- world %>%
  select(- geometry) %>%
  as_tibble() %>%
  select(countries = iso_a2, country_name = name, region = region_wb) %>%
  mutate(
    # Use the short form of the US name.
    country_name = gsub(
      'United States of America', 'United States', country_name
    ),
    # Hand-assigned ISO codes for three entries; every other row keeps the
    # code it already has.
    countries = case_when(
      country_name == 'Norway' ~ 'NO',
      country_name == 'Somaliland' ~ 'SO',
      country_name == 'France' ~ 'FR',
      TRUE ~ countries
    )
  )
# Show the entries that still have no ISO code after the manual fixes.
filter(nat_to_reg, is.na(countries))
## # A tibble: 5 x 3
## countries country_name region
## <chr> <chr> <chr>
## 1 <NA> Ashmore and Cartier Is. East Asia & Pacific
## 2 <NA> N. Cyprus Europe & Central Asia
## 3 <NA> Indian Ocean Ter. East Asia & Pacific
## 4 <NA> Siachen Glacier South Asia
## 5 <NA> Kosovo Europe & Central Asia
# Read the PubMed article table and parse its dates.
#   year             - first four characters of the raw date, parsed as a Date
#   publication_date - the raw date string parsed as a Date
# `truncated = 2` lets `ymd()` accept dates missing the month and/or day.
# `year` must be computed first, while `publication_date` is still a string.
articles <- readr::read_tsv('data/pubmed/articles.tsv.xz') %>%
  mutate(
    year = ymd(substr(publication_date, 1, 4), truncated = 2),
    publication_date = ymd(publication_date, truncated = 2)
  ) %>%
  # Keep only articles published before 2020.
  filter(lubridate::year(publication_date) < 2020)
##
## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## pmid = col_double(),
## pmcid = col_character(),
## doi = col_character(),
## journal = col_character(),
## publication_date = col_character(),
## pmc_cited_by_count = col_double(),
## title = col_character()
## )
# citations <- xml2::read_xml('data/pubmed/esummary/compbio-english.xml.xz')
# Read the corresponding-authors table (forcing `fore_name_simple` to be read
# as character), keep only rows whose `pmid` is present in `articles`, and add
# a square-root-transformed citation count.
corr_authors <- 'data/names/corresponding-authors.tsv.xz' %>%
  readr::read_tsv(
    col_types = readr::cols(fore_name_simple = readr::col_character())
  ) %>%
  inner_join(articles, by = 'pmid') %>%
  mutate(adjusted_citations = sqrt(pmc_cited_by_count + 1))
# Read the keynotes table, turn the numeric `year` into a Date (also stored as
# `publication_date`), and attach the simplified name columns from
# `all_full_names`, matching on fore and last name.
keynotes <- readr::read_tsv('data/iscb/keynotes.tsv') %>%
  mutate(
    # `truncated = 2` lets `ymd()` parse a bare year.
    publication_date = ymd(year, truncated = 2),
    year = publication_date
  ) %>%
  left_join(
    all_full_names %>% select(- full_name),
    by = c('fore_name', 'last_name')
  ) %>%
  # remove PSB, exclude ISCB Fellows and ISMB speakers in 2020 for now
  filter(lubridate::year(year) < 2020, conference != 'PSB')
##
## ── Column specification ───────────────────────────────────────────────────────────────────────────────
## cols(
## year = col_double(),
## full_name = col_character(),
## fore_name = col_character(),
## last_name = col_character(),
## conference = col_character(),
## source = col_character(),
## affiliations = col_character(),
## afflcountries = col_character()
## )
## # A tibble: 0 x 11
## # … with 11 variables: year <date>, full_name <chr>, fore_name <chr>, last_name <chr>,
## # conference <chr>, source <chr>, affiliations <chr>, afflcountries <chr>, publication_date <date>,
## # fore_name_simple <chr>, last_name_simple <chr>
# The ten journals with the most rows in `articles`, most frequent first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
large_jours <- articles %>%
  count(journal, sort = TRUE) %>%
  head(10)
# Download the NamePrism results (pinned to a specific commit), keep one row
# per full name, and attach the name columns from `all_full_names`.
nationalize_df <- readr::read_tsv(
  'https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/6ab0feeca430ae9997dbaf8f81707359be50a17d/data/NamePrism_results_authors.tsv'
) %>%
  # The first column has no header in the file. Rename it by position rather
  # than by readr's auto-generated name, which depends on the readr version
  # (`X1` before readr 2.0, `...1` from 2.0 on).
  rename('full_name' = 1) %>%
  distinct(full_name, .keep_all = TRUE) %>%
  left_join(all_full_names, by = 'full_name')
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ──────────────────────────────────────────
## cols(
## X1 = col_character(),
## African = col_double(),
## CelticEnglish = col_double(),
## EastAsian = col_double(),
## European = col_double(),
## Greek = col_double(),
## Hispanic = col_double(),
## Jewish = col_double(),
## Muslim = col_double(),
## Nordic = col_double(),
## SouthAsian = col_double()
## )
Number of articles from 1993-2019: 176773 (~ 100 articles with no authors).
Number of last authors: 176609.
# Interactive table of the number of rows in `corr_authors` for each `year`.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
corr_authors %>%
  count(year, name = 'Number of articles with last authors') %>%
  DT::datatable(rownames = FALSE)
If we require more than 200 articles a year, we should only consider articles from 1998 on.
# Keep only the years that have more than 200 rows in `corr_authors`, then
# report how many rows remain.
corr_authors <- corr_authors %>%
  group_by(year) %>%
  filter(n() > 200) %>%
  ungroup()
nrow(corr_authors)
## [1] 176110