Show code
library(tidyverse)
## get my transliteration table (I tried to guess the PsycInfo ASCII name from the PsycTESTS name)
translit <- readRDS("../sober_rubric/raw_data/psycinfo_psyctests_names.rds")
## get our first scrape (by journal, checking counts for each year in each journal for top tests)
psycinfo_scrape_by_journal <- read_tsv('../sober_rubric/raw_data/merged_table_all.tsv') %>%
drop_na(Name) %>%
# this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
# mutate(Name = toTitleCase(Name)) %>%
rename(usage_count = "Hit Count") %>%
group_by(Name, Year) %>%
summarise(usage_count = sum(usage_count))
## get our second scrape (by test DOI and year)
overview <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv")
byyear <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv")
# byyear %>% group_by(DOI) %>% summarise(Hits = sum(Hits, na.rm=T)) %>% pull(Hits) %>% table()
one_hit_wonders <- overview %>% filter(Hits == 1) %>%
mutate(Year = first_pub_year) %>%
mutate(Hits = coalesce(Hits, 1))
# for some few, the call was repeated by year for some reason
one_hit_wonders %>% select(DOI, first_pub_year) %>% inner_join(byyear, by = "DOI") %>% arrange(DOI)
# A tibble: 13,280 × 4
DOI first_pub_year Year Hits
<chr> <dbl> <dbl> <dbl>
1 10.1037/t00002-000 2014 2014 1
2 10.1037/t00046-000 2009 2009 1
3 10.1037/t00053-000 2013 2013 1
4 10.1037/t00077-000 2015 2015 1
5 10.1037/t00113-000 2012 2012 1
6 10.1037/t00119-000 2015 2015 1
7 10.1037/t00127-000 1949 1949 1
8 10.1037/t00128-000 2014 2014 1
9 10.1037/t00131-000 1961 1961 1
10 10.1037/t00193-000 2016 2016 1
# ℹ 13,270 more rows
Show code
byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")
psycinfo_by_doi <- one_hit_wonders %>%
select(DOI, Year, Hits) %>%
bind_rows(byyear) %>%
left_join(overview %>% rename(total_hits = Hits), by = "DOI")
## don't use tests with names that occur many times
dupe_names <- translit %>% group_by(name_psycinfo) %>% filter(n() > 1) %>% ungroup()
translit <- translit %>% group_by(name_psycinfo) %>%
mutate(non_unique_name = n() > 1) %>%
filter(row_number() == 1) %>% ungroup()
# merge it all
psycinfo <- psycinfo_by_doi %>%
full_join(translit %>% select(DOI, name_psycinfo, NameOC), by = "DOI") %>%
full_join(psycinfo_scrape_by_journal, by = c("name_psycinfo" = "Name", "Year")) %>%
rename(hits_scrape_1 = usage_count,
hits_scrape_2 = Hits,
total_hits_scrape_2 = total_hits) %>%
group_by(name_psycinfo) %>%
mutate(total_hits_scrape_1 = sum(hits_scrape_1))
psycinfo %>% is.na() %>% colSums()
DOI Year hits_scrape_2
96747 39022 135768
first_pub_year last_pub_year total_hits_scrape_2
135768 135768 135768
name_psycinfo NameOC hits_scrape_1
3079 99825 218121
total_hits_scrape_1
265989
Show code
## aggregate it all
psycinfo_overall <- psycinfo %>%
group_by(name_psycinfo) %>%
summarise(total_hits_scrape_1 = sum(hits_scrape_1, na.rm = T),
total_hits_scrape_2 = sum(hits_scrape_2, na.rm = T)) %>%
left_join(translit %>% select(DOI, name_psycinfo))
## correlate totals
cor.test(psycinfo_overall$total_hits_scrape_1, psycinfo_overall$total_hits_scrape_2)
Pearson's product-moment correlation
data: psycinfo_overall$total_hits_scrape_1 and psycinfo_overall$total_hits_scrape_2
t = 249.62, df = 104320, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6076982 0.6152964
sample estimates:
cor
0.6115114
Show code
psycinfo_overall %>%
filter(total_hits_scrape_1 > 0, total_hits_scrape_2 > 0) %>%
summarise(cor(total_hits_scrape_1, total_hits_scrape_2))
# A tibble: 1 × 1
`cor(total_hits_scrape_1, total_hits_scrape_2)`
<dbl>
1 0.904
Show code
## correlate by year, diffs, proportions
cor.test(psycinfo$hits_scrape_1, psycinfo$hits_scrape_2)
Pearson's product-moment correlation
data: psycinfo$hits_scrape_1 and psycinfo$hits_scrape_2
t = 467.52, df = 39014, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9196473 0.9226533
sample estimates:
cor
0.921164
Show code
[1] 12.3914
Show code
Show code
[1] 11.99798
Show code
# psycinfo %>% filter(hits_scrape_1 > hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()
psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% nrow()
[1] 27545
Show code
# psycinfo %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% pull(diff) %>% table() %>% sort()
# psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()
Top Tests in each
Only in PsycInfo Scrape 1
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 35864 254316 7.09
Show code
options(cols.min.print = 2, cols.print = 2)
Only in PsycTests Scrape 2
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_2)` `sum(total_hits_scrape_2)/n()`
<int> <dbl> <dbl>
1 22052 57411 2.60
Show code
psycinfo_overall %>%
ungroup() %>%
filter(total_hits_scrape_1 == 0, total_hits_scrape_2 >= 1) %>%
# filter(!is.na(DOI), is.na(total_hits_scrape_1) | total_hits_scrape_1 == 0) %>%
drop_na(name_psycinfo, total_hits_scrape_2) %>%
arrange(desc(total_hits_scrape_2)) %>%
select( name_psycinfo, total_hits_scrape_2) %>%
DT::datatable()
Hits only in scrape 1, even though we have a match for the name
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 1453 22239 15.3
Hits only in scrape 1 without a clear match for the name
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 34411 232077 6.74
Merge Scrape 1 and 2
Show code
psycinfo_scrape_1_without_hits_in_2 <- psycinfo_overall %>%
ungroup() %>%
filter(total_hits_scrape_1 > 0, is.na(total_hits_scrape_2) | total_hits_scrape_2 == 0) %>%
select(DOI, name_psycinfo) %>%
distinct(name_psycinfo, .keep_all = TRUE) %>%
left_join(psycinfo_scrape_by_journal %>%
rename(name_psycinfo = Name, Hits = usage_count), by = "name_psycinfo", multiple = "all") %>%
mutate(DOI = coalesce(DOI, name_psycinfo)) %>%
group_by(DOI) %>%
mutate(first_pub_year = min(Year, na.rm = T),
last_pub_year = max(Year, na.rm = T),
total_hits = sum(Hits, na.rm = T)) %>%
ungroup()
psycinfo_scrape_1_without_hits_in_2 %>%
summarise(n_distinct(DOI), sum(Hits), sum(Hits)/n_distinct(DOI))
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits)` `sum(Hits)/n_distinct(DOI)`
<int> <dbl> <dbl>
1 35864 254316 7.09
Show code
[1] 3078
Show code
sum(!is.na(psycinfo_by_doi_with_hits$name_psycinfo))
[1] 215037
Show code
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
<int> <dbl> <dbl>
1 31118 876009 28.2
# ℹ abbreviated name: ¹`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
psycinfo_merged <- bind_rows(
scrape_2 = psycinfo_by_doi_with_hits,
scrape_1 = psycinfo_scrape_1_without_hits_in_2, .id = "source")
psycinfo_merged %>%
summarise(n_distinct(DOI), sum(Hits, na.rm = T), sum(Hits, na.rm = T)/n_distinct(DOI))
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
<int> <dbl> <dbl>
1 66982 1130325 16.9
# ℹ abbreviated name: ¹`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
saveRDS(psycinfo_merged, "../sober_rubric/raw_data/psycinfo_merged_scrape_1_and_2.rds")
Joint top list