Preprocessing APA PsycInfo

Show code
library(tidyverse)
## get my transliteration table (I tried to guess the PsycInfo ASCII name from the PsycTESTS name)
translit <- readRDS("../sober_rubric/raw_data/psycinfo_psyctests_names.rds")

## get our first scrape (by journal, checking counts for each year in each journal for top tests)
psycinfo_scrape_by_journal <- read_tsv('../sober_rubric/raw_data/merged_table_all.tsv') %>% 
  drop_na(Name) %>% 
  # this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
#  mutate(Name = toTitleCase(Name)) %>% 
  rename(usage_count = "Hit Count") %>% 
  group_by(Name, Year) %>% 
  summarise(usage_count = sum(usage_count))

## get our second scrape (by test DOI and year)
overview <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv")
byyear <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv")
# byyear %>% group_by(DOI) %>% summarise(Hits = sum(Hits, na.rm=T)) %>% pull(Hits) %>% table()

one_hit_wonders <- overview %>% filter(Hits == 1) %>% 
  mutate(Year = first_pub_year) %>% 
  mutate(Hits = coalesce(Hits, 1))
# for some few, the call was repeated by year for some reason
one_hit_wonders %>% select(DOI, first_pub_year) %>% inner_join(byyear, by = "DOI") %>% arrange(DOI)
# A tibble: 13,280 × 4
   DOI                first_pub_year  Year  Hits
   <chr>                       <dbl> <dbl> <dbl>
 1 10.1037/t00002-000           2014  2014     1
 2 10.1037/t00046-000           2009  2009     1
 3 10.1037/t00053-000           2013  2013     1
 4 10.1037/t00077-000           2015  2015     1
 5 10.1037/t00113-000           2012  2012     1
 6 10.1037/t00119-000           2015  2015     1
 7 10.1037/t00127-000           1949  1949     1
 8 10.1037/t00128-000           2014  2014     1
 9 10.1037/t00131-000           1961  1961     1
10 10.1037/t00193-000           2016  2016     1
# ℹ 13,270 more rows
Show code
byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")

psycinfo_by_doi <- one_hit_wonders %>% 
  select(DOI, Year, Hits) %>% 
  bind_rows(byyear) %>% 
  left_join(overview %>% rename(total_hits = Hits), by = "DOI")


## don't use tests with names that occur many times
dupe_names <- translit %>% group_by(name_psycinfo) %>% filter(n() > 1) %>% ungroup()
translit <- translit %>% group_by(name_psycinfo) %>% 
  mutate(non_unique_name = n() > 1) %>% 
  filter(row_number() == 1) %>% ungroup()

# merge it all
psycinfo <- psycinfo_by_doi %>% 
  full_join(translit %>% select(DOI, name_psycinfo, NameOC), by = "DOI") %>% 
  full_join(psycinfo_scrape_by_journal, by = c("name_psycinfo" = "Name", "Year")) %>% 
  rename(hits_scrape_1 = usage_count,
         hits_scrape_2 = Hits,
         total_hits_scrape_2 = total_hits) %>% 
  group_by(name_psycinfo) %>% 
  mutate(total_hits_scrape_1 = sum(hits_scrape_1))
psycinfo %>% is.na() %>% colSums()
                DOI                Year       hits_scrape_2 
              96747               39022              135768 
     first_pub_year       last_pub_year total_hits_scrape_2 
             135768              135768              135768 
      name_psycinfo              NameOC       hits_scrape_1 
               3079               99825              218121 
total_hits_scrape_1 
             265989 
Show code
## aggregate it all
psycinfo_overall <- psycinfo %>% 
  group_by(name_psycinfo) %>% 
  summarise(total_hits_scrape_1 = sum(hits_scrape_1, na.rm = T),
         total_hits_scrape_2 = sum(hits_scrape_2, na.rm = T)) %>% 
  left_join(translit %>% select(DOI, name_psycinfo))

## correlate totals
cor.test(psycinfo_overall$total_hits_scrape_1, psycinfo_overall$total_hits_scrape_2)

    Pearson's product-moment correlation

data:  psycinfo_overall$total_hits_scrape_1 and psycinfo_overall$total_hits_scrape_2
t = 249.62, df = 104320, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6076982 0.6152964
sample estimates:
      cor 
0.6115114 
Show code
psycinfo_overall %>% 
  filter(total_hits_scrape_1 > 0, total_hits_scrape_2 > 0) %>% 
  summarise(cor(total_hits_scrape_1, total_hits_scrape_2))
# A tibble: 1 × 1
  `cor(total_hits_scrape_1, total_hits_scrape_2)`
                                            <dbl>
1                                           0.904
Show code
## correlate by year, diffs, proportions
cor.test(psycinfo$hits_scrape_1, psycinfo$hits_scrape_2)

    Pearson's product-moment correlation

data:  psycinfo$hits_scrape_1 and psycinfo$hits_scrape_2
t = 467.52, df = 39014, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9196473 0.9226533
sample estimates:
     cor 
0.921164 
Show code
psycinfo %>%  mutate(diff = hits_scrape_2 - hits_scrape_1) %>% pull(diff) %>% abs() %>% mean(na.rm=T)
[1] 12.3914
Show code
psycinfo %>%  mutate(prop = hits_scrape_2/ hits_scrape_1) %>% pull(prop) %>%  qplot() + scale_x_log10()
Show code
psycinfo %>%  mutate(diff = hits_scrape_2 - hits_scrape_1) %>% pull(diff) %>%  mean(na.rm=T)
[1] 11.99798
Show code
# psycinfo %>% filter(hits_scrape_1 > hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% nrow()
[1] 27545
Show code
# psycinfo %>%  mutate(diff = hits_scrape_2 - hits_scrape_1) %>% pull(diff) %>% table() %>% sort()

# psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()

Top Tests in each

Only in PsycInfo Scrape 1

Show code
psycinfo_overall %>% 
  ungroup() %>% 
  filter(total_hits_scrape_1 > 0,
         total_hits_scrape_2 == 0) %>% 
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())
# A tibble: 1 × 3
  `n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
  <int>                      <dbl>                          <dbl>
1 35864                     254316                           7.09
Show code
options(cols.min.print = 2, cols.print = 2)
Show code
psycinfo_overall %>% 
  ungroup() %>% 
  # filter(is.na(DOI)) %>%
  filter(total_hits_scrape_2 == 0, total_hits_scrape_1 >= 1) %>% 
  arrange(desc(total_hits_scrape_1)) %>% 
  select(name_psycinfo, total_hits_scrape_1) %>% 
  arrange(desc(total_hits_scrape_1)) %>% 
  DT::datatable()

Only in PsycTests Scrape 2

Show code
psycinfo_overall %>% 
  ungroup() %>% 
  filter(total_hits_scrape_1 == 0,
         total_hits_scrape_2 > 0) %>% 
  summarise(n(), sum(total_hits_scrape_2), sum(total_hits_scrape_2)/n())
# A tibble: 1 × 3
  `n()` `sum(total_hits_scrape_2)` `sum(total_hits_scrape_2)/n()`
  <int>                      <dbl>                          <dbl>
1 22052                      57411                           2.60
Show code
psycinfo_overall %>% 
  ungroup() %>% 
  filter(total_hits_scrape_1 == 0, total_hits_scrape_2 >= 1) %>% 
  # filter(!is.na(DOI), is.na(total_hits_scrape_1) | total_hits_scrape_1 == 0) %>% 
  drop_na(name_psycinfo, total_hits_scrape_2) %>% 
  arrange(desc(total_hits_scrape_2)) %>% 
  select( name_psycinfo, total_hits_scrape_2) %>% 
  DT::datatable()

Hits only in scrape 1, even though we have a match for the name

Show code
psycinfo_overall %>% 
  ungroup() %>% 
  filter(!is.na(DOI),
         total_hits_scrape_1 > 0,
         total_hits_scrape_2 == 0) %>% 
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())
# A tibble: 1 × 3
  `n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
  <int>                      <dbl>                          <dbl>
1  1453                      22239                           15.3

Hits only in scrape 1 without a clear match for the name

Show code
psycinfo_overall %>% 
  ungroup() %>% 
  filter(is.na(DOI),
         total_hits_scrape_1 > 0,
         total_hits_scrape_2 == 0) %>% 
  summarise(n(), sum(total_hits_scrape_1), sum(total_hits_scrape_1)/n())
# A tibble: 1 × 3
  `n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
  <int>                      <dbl>                          <dbl>
1 34411                     232077                           6.74

Merge Scrape 1 and 2

Show code
psycinfo_scrape_1_without_hits_in_2 <- psycinfo_overall %>% 
    ungroup() %>% 
    filter(total_hits_scrape_1 > 0, is.na(total_hits_scrape_2) | total_hits_scrape_2 == 0) %>% 
    select(DOI, name_psycinfo) %>% 
    distinct(name_psycinfo, .keep_all = TRUE) %>% 
    left_join(psycinfo_scrape_by_journal %>% 
     rename(name_psycinfo = Name, Hits = usage_count), by = "name_psycinfo", multiple = "all") %>% 
    mutate(DOI = coalesce(DOI, name_psycinfo)) %>% 
    group_by(DOI) %>% 
    mutate(first_pub_year = min(Year, na.rm = T),
           last_pub_year = max(Year, na.rm = T),
           total_hits = sum(Hits, na.rm = T)) %>% 
  ungroup()

psycinfo_scrape_1_without_hits_in_2 %>% 
  summarise(n_distinct(DOI), sum(Hits), sum(Hits)/n_distinct(DOI))
# A tibble: 1 × 3
  `n_distinct(DOI)` `sum(Hits)` `sum(Hits)/n_distinct(DOI)`
              <int>       <dbl>                       <dbl>
1             35864      254316                        7.09
Show code
psycinfo_by_doi_with_hits <- psycinfo_by_doi %>%
  drop_na(Hits, Year) %>% 
  anti_join(psycinfo_overall %>% filter(total_hits_scrape_2 == 0) %>% select(DOI), by = "DOI") %>% 
  left_join(translit %>% select(DOI, name_psycinfo), by = "DOI")
sum(is.na(psycinfo_by_doi_with_hits$name_psycinfo))
[1] 3078
Show code
sum(!is.na(psycinfo_by_doi_with_hits$name_psycinfo))
[1] 215037
Show code
psycinfo_by_doi_with_hits %>% 
  summarise(n_distinct(DOI), sum(Hits, na.rm = T), sum(Hits, na.rm = T)/n_distinct(DOI))
# A tibble: 1 × 3
  `n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
              <int>                  <dbl>                       <dbl>
1             31118                 876009                        28.2
# ℹ abbreviated name: ¹​`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
psycinfo_merged <- bind_rows(
  scrape_2 = psycinfo_by_doi_with_hits, 
  scrape_1 = psycinfo_scrape_1_without_hits_in_2, .id = "source")

psycinfo_merged %>% 
  summarise(n_distinct(DOI), sum(Hits, na.rm = T), sum(Hits, na.rm = T)/n_distinct(DOI))
# A tibble: 1 × 3
  `n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
              <int>                  <dbl>                       <dbl>
1             66982                1130325                        16.9
# ℹ abbreviated name: ¹​`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
saveRDS(psycinfo_merged, "../sober_rubric/raw_data/psycinfo_merged_scrape_1_and_2.rds")

Joint top list

Show code
psycinfo_merged %>% 
  group_by(DOI, name_psycinfo, source) %>%
  summarise(total_hits = sum(Hits, na.rm  = T)) %>% 
  arrange(desc(total_hits)) %>% 
  ungroup() %>% 
  select( source, name_psycinfo, total_hits) %>% 
  DT::datatable()