Import

APA PsycTests

Show code

records_wide <- readRDS("../sober_rubric/raw_data/preprocessed_records.rds")

records_wide %>% group_by(Name)  %>% filter(n()>1) %>% ungroup() %>% summarise(n_distinct(Name), n())

# A tibble: 1 × 2
  `n_distinct(Name)` `n()`
               <int> <int>
1               1237  3043

Show code

records_wide$first_construct <- str_trim(str_replace_all(str_to_lower(records_wide$first_construct), "[:space:]+", " "))

First EBSCO scrape of APA PsycInfo

Show code

psycinfo <- read_tsv('../sober_rubric/raw_data/merged_table_all.tsv') %>% 
  # this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
#  mutate(Name = toTitleCase(Name)) %>% 
  rename(usage_count = "Hit Count") %>% 
  group_by(Name, Year) %>% 
  summarise(usage_count = sum(usage_count))

Second EBSCO scrape of APA PsycInfo

Show code

overview <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv")
byyear <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv")
n_distinct(byyear$DOI)

[1] 31145

Show code

overview %>% filter(is.na(Hits)) %>% nrow()

[1] 40574

Show code

nrow(overview)

[1] 71692

Show code

overview %>% filter(Hits >= 1) %>% nrow()

[1] 31118

Show code

one_hit_wonders <- overview %>% filter(Hits == 1) %>% 
  mutate(Year = first_pub_year)

nrow(one_hit_wonders)

[1] 13280

Show code

# for some few, the call was repeated by year for some reason
# one_hit_wonders %>% select(DOI, first_pub_year) %>% inner_join(byyear, by = "DOI") %>% arrange(DOI)

byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")

all <- one_hit_wonders %>% 
  select(DOI, Year, Hits) %>% 
  bind_rows(byyear) %>% 
  left_join(overview %>% rename(total_hits = Hits), by = "DOI")

n_distinct(all$DOI)

[1] 31145

Show code

all %>% filter(Hits > 0) %>% filter(Year < first_pub_year | Year > last_pub_year) %>% nrow()

[1] 0

Show code

all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year > total_hits) %>% ungroup() %>% select(DOI, everything()) %>% mutate(diff = hits_by_year - total_hits) %>% nrow()

[1] 0

Show code

# all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(usage_count, na.rm = T)) %>% filter(hits_by_year < total_hits) %>% select(DOI, everything()) %>% mutate(diff = hits_by_year - total_hits) %>%  View()
all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year == total_hits) %>% nrow()

[1] 31118

Show code

all %>% group_by(DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year == 0)

# A tibble: 27 × 2
   DOI                hits_by_year
   <chr>                     <dbl>
 1 10.1037/t00747-000            0
 2 10.1037/t00875-000            0
 3 10.1037/t00878-000            0
 4 10.1037/t00879-000            0
 5 10.1037/t02477-000            0
 6 10.1037/t02488-000            0
 7 10.1037/t04670-000            0
 8 10.1037/t04771-000            0
 9 10.1037/t04776-000            0
10 10.1037/t04779-000            0
# ℹ 17 more rows

Show code

all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(is.na(total_hits)) %>% pull(hits_by_year) %>% table()

.
 0 
27

Show code

psyctests_info <- records_wide %>% 
  select(DOI, TestYear, Name, first_construct, 
         original_test_DOI, original_DOI_combined, 
         test_type, ConstructList, subdiscipline_1, subdiscipline_2, 
         classification_1, classification_2, instrument_type_broad, 
         InstrumentType,
         number_of_factors_subscales, Name_base) %>%
  distinct() %>% 
  inner_join(all, by = c("DOI" = "DOI"), multiple = "all") %>% 
  rename(usage_count = "Hits")


write_rds(psyctests_info, "../sober_rubric/raw_data/psyctests_info.rds")

2016 changes in standards

Show code

test_data <- records_wide %>%
  filter(TestYear <= 2022) %>%
    rowwise() %>%
    mutate(Methodology = length(MethodologyList) >0) %>%
    mutate(AdministrationMethod = length(AdministrationMethodList) >0) %>%
    mutate(PopulationGroup = length(PopulationGroupList) >0) %>%
    mutate(AgeGroup = length(AgeGroupList) >0) %>%
    group_by(TestYear) %>%
    summarise(Reliability = mean(Reliability!="No reliability indicated."),
              FactorAnalysis = mean(FactorAnalysis!="No factor analysis indicated."),
              # Unidimensional = mean(FactorAnalysis=="This is a unidimensional measure."),
              FactorsAndSubscales = mean(!is.na(FactorsAndSubscales)),
              Validity = mean(Validity!="No validity indicated."),
              Format = mean(!is.na(Format)),
              # Fee = mean(Fee == "Yes"),
              Methodology = mean(Methodology),
              AdministrationMethod = mean(AdministrationMethod),
              # AgeGroup = mean(AgeGroup),
              # PopulationGroup = mean(PopulationGroup),
              TestItems = mean(TestItemsAvailable == "Yes")) %>%
    pivot_longer(-TestYear)
test_data %>%
  ggplot(aes(TestYear, value, color = name)) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030),
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1.2))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  geom_text_repel(
    data = test_data %>% drop_na() %>% group_by(name) %>% filter(TestYear == max(TestYear, na.rm = T)),
    aes(label = name),
    segment.color = 'grey',
    xlim = c(2022, 2033),
    box.padding = 0.1,
    # point.padding = 0.6,
    nudge_x = 1.2,
    # nudge_y = 0,
    force = 0.5,
    hjust = 0,
    direction="y",
    na.rm = TRUE
  ) +
  ylab("PsycTests contains information about...") +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/changed_standards_2016.pdf", width = 8, height = 4)
ggsave("figures/changed_standards_2016.png", width = 8, height = 4)

Show code

records_wide %>% 
  group_by(TestYear) %>% 
  summarise(number_of_test_items = mean(number_of_test_items, na.rm = T)) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
ggplot(aes(TestYear, number_of_test_items)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, NA), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
  xlab("Publication year in APA PsycTests") +
  ylab("Number of items in measure")

Show code

records_wide %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
ggplot(aes(TestYear, number_of_factors_subscales)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_pointrange(stat = 'summary', fun.data = 'mean_se') +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Number of factors/subscales")

Show code

records_wide %>% 
  group_by(InstrumentType, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = InstrumentType)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_color_viridis_d() +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type")

Show code

records_wide %>% 
  # filter(instrument_type_broad !=)
  group_by(instrument_type_broad, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = instrument_type_broad)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type")

Show code

records_wide %>% 
  filter(instrument_type_broad != "questionnaire") %>% 
  group_by(InstrumentType, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = InstrumentType)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type") +
  ggtitle("Without questionnaires")

Show code

psyctests_info %>% group_by(DOI, TestYear) %>% 
  summarise(used = sum(usage_count, na.rm = T)) %>% 
  full_join(records_wide %>% select(DOI, TestYear)) %>% 
  mutate(used = coalesce(used, 0)) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  group_by(TestYear) %>% 
  summarise(never_reused = mean(used == 0),
            used_once = mean(used == 1),
            used_twice = mean(used == 2),
            used_thrice = mean(used == 3),
            used_more_10 = mean(used > 9)) %>% 
  pivot_longer(-TestYear) %>% 
  ggplot(aes(TestYear, value, color = name)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Frequency")

New measures by publication year

Show code

count_all <- records_wide %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear)

count_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear)

count_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI))

count_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(first_construct)) %>% 
  arrange(TestYear)

counts <- bind_rows(
  "novel constructs" = count_construct,
  "novel measures" = count_orig,
  "with translations\n and revisions" = count_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(counts, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Number of measures/constructs published") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) +
  geom_text_repel(data = counts %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  lineheight = .9,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.5,
                  nudge_y = -15,
                  force = 1,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/counts.pdf", width = 8, height = 4)
ggsave("figures/counts.png", width = 8, height = 4)

Cumulative number of measures and constructs

Show code

cumsum_all <- records_wide %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsums <- bind_rows(
  "novel constructs" = cumsum_construct,
  "novel measures" = cumsum_orig,
  "with translations\n and revisions" = cumsum_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", tests, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE) +
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

Show code

ggsave("figures/cumsums.pdf", width = 8, height = 4)
ggsave("figures/cumsums.png", width = 8, height = 4)

By subdiscipline

Show code

cumsum_all <- records_wide %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 


cumsum_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsums <- bind_rows(
  "constructs" = cumsum_construct,
  "measures" = cumsum_orig,
  "with translations & revisions" = cumsum_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)

cumsums$origin <- factor(cumsums$origin, levels = c("with translations & revisions", "constructs", "measures"))
my_colors <- c("with translations & revisions" = "#7570B3",
               "constructs" = "#1B9E77", 
               "measures" = "#D95F02") 

ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  facet_wrap(~ subdiscipline_1, scales = "free_y", ncol = 2) + 
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2022), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_manual(values = my_colors, guide = guide_legend(title = NULL)) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = c(0.58, 0.08),
        legend.justification = c(0, 0),
        legend.box.just = "right",
        legend.text = element_text(size = 11))

Show code

ggsave("figures/cumsums_subdiscipline.pdf", width = 8, height = 7)
ggsave("figures/cumsums_subdiscipline.png", width = 8, height = 7)

Tests by usage frequency

Show code

test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n)

test_frequency <- records_wide %>% 
  filter(test_type == "Original", TestYear <= 2022) %>% 
  select(Test = DOI) %>% 
  full_join(test_frequency) %>% 
  mutate(n = coalesce(n, 0.5))

test_frequency <- test_frequency %>% 
  group_by(n) %>% 
  summarise(count = n()) %>% 
  ungroup() %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  # facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures") +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(0.5, 1, 2, 5, 10, 100, 1000, 25000),
               labels = c(0, 1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count + 700)) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot

Show code

ggsave("figures/frequency_across.pdf", width = 8, height = 4)
ggsave("figures/frequency_across.png", width = 8, height = 4)

test_frequency <- test_frequency %>% 
  filter(n >= 1) %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures") +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(1, 2, 5, 10, 100, 1000, 25000),
               labels = c(1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count + 700)) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot

Show code

ggsave("figures/frequency_across_no0.pdf", width = 8, height = 4)
ggsave("figures/frequency_across_no0.png", width = 8, height = 4)

Show code

test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(subdiscipline_1, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n)

test_frequency <- records_wide %>% 
  filter(test_type == "Original", TestYear <= 2022) %>% 
  select(subdiscipline_1, Test = DOI) %>% 
  full_join(test_frequency) %>% 
  mutate(n = coalesce(n, 0.5))

test_frequency <- test_frequency %>% 
  # mutate(n = if_else(n >= 1000, 1000, n)) %>% 
  mutate(subdiscipline_1 = str_replace(subdiscipline_1, " Psychology", "")) %>% 
  group_by(subdiscipline_1, n) %>% 
  summarise(count = n()) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures", expand = expansion(c(0, 0.1))) +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(0.5, 1, 2, 10, 100, 1000, 25000),
               labels = c(0, 1, 2, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count ), size = 3, vjust = -0.4, hjust = 0.4) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

freq_plot

Show code

ggsave("figures/frequency.pdf", width = 8, height = 4)
ggsave("figures/frequency.png", width = 8, height = 4)

test_frequency <- test_frequency %>% 
  filter(n >= 1) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
    scale_y_continuous("Number of measures", expand = expansion(c(0, 0.1))) +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(1, 2, 5, 10, 100, 1000, 25000),
               labels = c(1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count ), size = 3, vjust = -0.11, hjust = 0.4) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot

Show code

ggsave("figures/frequency_no0.pdf", width = 8, height = 4)

By instrument type

Show code

usage_by_year_instrument_type <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(instrument_type_broad, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(instrument_type_broad) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(instrument_type_broad, n_tests, Year) %>% 
  summarise(n = sum(n),
            diff_tests = n()) %>% 
  ungroup()



usage_by_year_instrument_type %>% 
  ggplot(., aes(Year, n, color = instrument_type_broad)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Times tests were coded in PsycInfo") +
  scale_x_continuous(limits = c(1993, 2030), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", instrument_type_broad)), # This will force the correct position of the link's right end.
                  data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(instrument_type_broad, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/usage_by_year_instrument_type.pdf", width = 10, height = 4)
ggsave("figures/usage_by_year_instrument_type.png", width = 10, height = 4)

Show code

usage_by_year_instrument_type <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(InstrumentType, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(InstrumentType) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(InstrumentType, n_tests, Year) %>% 
  summarise(n = sum(n),
            diff_tests = n()) %>% 
  ungroup()



usage_by_year_instrument_type %>% 
  ggplot(., aes(Year, n, color = InstrumentType)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Times tests were coded in PsycInfo") +
  scale_x_continuous(limits = c(1993, 2035), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_discrete() +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", InstrumentType)), # This will force the correct position of the link's right end.
                  data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(InstrumentType, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/usage_by_year_instrument_type_narrow.pdf", width = 14, height = 10)
ggsave("figures/usage_by_year_instrument_type_narrow.png", width = 14, height = 10)

Entropy/Fragmentation

Show code

# searchstring:(TM *a* OR TM *e* OR TM *o* OR TM *i* OR TM *u*) AND PY "year (i.e. 1993)"
denominator <- rio::import("TnM_with_vowels_per_year_psycinfo.xlsx") %>% 
  rename(Year = year, number_of_publications_with_tests_and_measures = results)
psyctests_info <- psyctests_info %>% left_join(denominator) %>% 
    filter(between(Year, 1993, 2022))

var_label(psyctests_info$DOI) <- "Test DOI"
var_label(psyctests_info$TestYear) <- "When test was published according to PsycTests"
var_label(psyctests_info$first_pub_year) <- "First recorded year in which test was coded in PsycInfo"
var_label(psyctests_info$last_pub_year) <- "Last recorded year in which test was coded in PsycInfo"
var_label(psyctests_info$subdiscipline_1) <- "Subdiscipline of test"
var_label(psyctests_info$usage_count) <- "Frequency observed in PsycInfo"
write_rds(psyctests_info, "psycinfo_over_time.rds")

Revised approach using Hill diversity and coverage

Total number of tests coded in our scrape vs. total number of pubs with tests and measures are highly correlated. The former probably better represents sampling effort, because lower effort could also result in fewer tests being coded for a single paper. So, using the number of tests coded as our indicator of effort is proper.

Show code

pubs_vs_tests <- psyctests_info %>% group_by(Year) %>% summarise(n = sum(usage_count, na.rm = T), pubs_examined = mean(number_of_publications_with_tests_and_measures))
cor.test(pubs_vs_tests$n, pubs_vs_tests$pubs_examined) %>% broom::tidy() %>% knitr::kable(caption = "Correlation between publications with tests coded vs. number of tests per year")

Table 1: Correlation between publications with tests coded vs. number of tests per year
estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.9791029	25.47591	0	28	0.9560829	0.9901174	Pearson’s product-moment correlation	two.sided

To compare across years, we use either rarefaction or coverage standardization to make every year directly comparable.

Show code

# Set a seed for reproducibility
set.seed(05102019)

### 1. Prepare Counts Data (example: All tests)
# Assume psyctests_info is already loaded and structured as in your original code.

counts_all <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  filter(usage_count > 0) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")

# 2.2. Revisions lumped
counts_lumped <- psyctests_info %>% 
  filter(between(Year, 1993, 2022), usage_count > 0) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>%
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")

# 2.3. By construct
counts_construct <- psyctests_info %>% 
  filter(between(Year, 1993, 2022), usage_count > 0) %>% 
  drop_na(first_construct) %>% 
  mutate(Test = first_construct) %>%
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")

### 2. Reshape Data: Make Tests the Rows and Years the Columns
# Pivot to wide format so that each row is a Test and each column (except Test) is a Year.
counts_all_wide <- counts_all %>% 
  pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))

counts_lumped_wide <- counts_lumped %>% 
  pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))

counts_construct_wide <- counts_construct %>% 
  pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))

### 3. Run iNEXT for Coverage-Based Comparison
# Here, we set q = 0 (species richness). You can set q = 1 for Hill–Shannon or q = 2 for Hill–Simpson.
inext_all <- iNEXT(counts_all_wide %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")
inext_lumped <- iNEXT(counts_lumped_wide %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")
inext_constructs <- iNEXT(counts_construct_wide %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")

Show code

inext_combined = bind_rows("all"  = inext_all$AsyEst, "lumped" = inext_lumped$AsyEst, "constructs" = inext_constructs$AsyEst, .id = "type") %>% 
  mutate(Year = as.numeric(Assemblage)) %>% 
  mutate(type = case_match(type, 
                           "all" ~ "measures",
                           "constructs" ~ "constructs",
                            "lumped" ~ "with translations\n and revisions"))

ggplot(inext_combined, aes(x = Year, y = Estimator, ymin = LCL, ymax = UCL, group = type, color = type)) +
  geom_smooth(stat = "identity", size = 0.9) +
  facet_grid(Diversity ~ ., scales = "free_y") +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  theme_minimal(base_size = 14) +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2022), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  labs(y = "Fragmentation index")

Show code

ggsave("figures/diversity_indices.pdf", width = 8, height = 6)
ggsave("figures/diversity_indices.png", width = 8, height = 6)

Show code

asymptotic_hill_shannon <- inext_combined %>% filter(Diversity %in% c( "Shannon diversity"))

asymptotic_hill_shannon %>% 
  ggplot(., aes(Year, y = Estimator, min=LCL, ymax = UCL, color = type, fill = type)) +
  geom_smooth(size = 0.7, stat = "identity") +
  scale_y_continuous("Fragmentation index") +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_fill_brewer(type = "qual", guide = "none", palette = 2) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_text_repel(data = asymptotic_hill_shannon %>% drop_na() %>% group_by(type) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",type)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/entropy.pdf", width = 8, height = 4)
ggsave("figures/entropy.png", width = 8, height = 4)

By subdiscipline

Show code

# For each subdiscipline, pivot the counts (tests = DOI) across years, run iNEXT (q = 1),
# and extract the asymptotic Shannon diversity estimates.
asy_est_all_sub <- psyctests_info %>% 
  filter(between(Year, 1993, 2022), usage_count > 0) %>% 
  group_by(subdiscipline_1) %>% 
  group_modify(~ {
    # Create a wide matrix: rows = tests (DOI), columns = Years
    counts <- .x %>% 
      group_by(Year, Test = DOI) %>% 
      summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop") %>% 
      pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))
    
    if(nrow(counts) > 0 && ncol(counts) > 1) {
      inext_obj <- iNEXT(counts %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")
      asy <- inext_obj$AsyEst %>% 
        filter(Diversity == "Shannon diversity") %>% 
        mutate(subdiscipline = .y$subdiscipline_1)
      asy
    } else {
      tibble()
    }
  }) %>% 
  ungroup()

asy_est_all_sub <- asy_est_all_sub %>% rename(Year = Assemblage)
asy_est_all_sub %>% 
  ggplot(., aes(x = as.numeric(Year), y = Estimator, color = subdiscipline)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Fragmentation index") +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2033), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 7))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = asy_est_all_sub %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""))),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
  theme_minimal(base_size = 13) +
  ggtitle("All measures, including revisions and translations") +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")

Show code

ggsave("figures/entropy_subdiscipline_all.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_all.png", width = 8, height = 4)

Show code

asy_est_orig_sub <- psyctests_info %>% 
  filter(between(Year, 1993, 2022), usage_count > 0) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(subdiscipline_1) %>% 
  group_modify(~ {
    counts <- .x %>% 
      group_by(Year, Test) %>% 
      summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop") %>% 
      pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))
    
    if(nrow(counts) > 0 && ncol(counts) > 1) {
      inext_obj <- iNEXT(counts %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")
      asy <- inext_obj$AsyEst %>% 
        filter(Diversity == "Shannon diversity") %>% 
        mutate(subdiscipline = .y$subdiscipline_1)
      asy
    } else {
      tibble()
    }
  }) %>% 
  ungroup()


asy_est_orig_sub <- asy_est_orig_sub %>% rename(Year = Assemblage)
asy_est_orig_sub %>% 
  ggplot(., aes(x = as.numeric(Year), y = Estimator, color = subdiscipline)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Fragmentation index") +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2033), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 7))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = asy_est_orig_sub %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""))),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
  theme_minimal(base_size = 13) +
  ggtitle("Lumping revisions and translations with their originals") +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")

Show code

ggsave("figures/entropy_subdiscipline_orig.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_orig.png", width = 8, height = 4)

Show code

asy_est_constructs_sub <- psyctests_info %>% 
  filter(between(Year, 1993, 2022), usage_count > 0) %>% 
  group_by(subdiscipline_1) %>% 
  group_modify(~ {
    counts <- .x %>% 
      group_by(Year, Test = first_construct) %>% 
      summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop") %>% 
      pivot_wider(id_cols = Test, names_from = Year, values_from = n, values_fill = list(n = 0))
    
    if(nrow(counts) > 0 && ncol(counts) > 1) {
      inext_obj <- iNEXT(counts %>% tibble::column_to_rownames("Test"), q = 1, datatype = "abundance")
      asy <- inext_obj$AsyEst %>% 
        filter(Diversity == "Shannon diversity") %>% 
        mutate(subdiscipline = .y$subdiscipline_1)
      asy
    } else {
      tibble()
    }
  }) %>% 
  ungroup()

asy_est_constructs_sub <- asy_est_constructs_sub %>% rename(Year = Assemblage)
asy_est_constructs_sub %>% 
  ggplot(., aes(x = as.numeric(Year), y = Estimator, color = subdiscipline)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Fragmentation index") +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2033), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 7))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = asy_est_constructs_sub %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""))),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
  theme_minimal(base_size = 13) +
  ggtitle("All measures, including revisions and translations") +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")

Show code

ggsave("figures/entropy_subdiscipline_constructs.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_constructs.png", width = 8, height = 4)

Rarefaction

Rarefaction involves resampling the tests in each year a number of times based off of the smallest sample-size (year with the smallest number of measure observations, 1993) to get a distribution of comparisons. This also allows us to get rid of the relativizing from zero to one, no matter what diversity measure we use.

Show code

# Set a seed for reproducibility
set.seed(05102019)

### 1. Define a simulation function for rarefaction

simulate_rarefaction <- function(freq, n_min, k = 50) {
  # freq: named numeric vector of counts (names are test IDs)
  # n_min: rarefaction depth (number of observations to sample)
  # k: number of resampling iterations
  res <- replicate(k, {
    # Expand the frequency vector into a vector of individual observations
    pop <- rep(names(freq), times = freq)
    # Sample without replacement the rarefaction depth number of individuals
    sample_ids <- sample(pop, size = n_min, replace = FALSE)
    tbl <- table(sample_ids)
    
    # Compute diversity metrics:
    richness     <- length(tbl)
    shannon      <- entropy(as.numeric(tbl))
    gini_simpson <- 1 - sum((as.numeric(tbl) / n_min)^2)
    hill_simpson <- hillR::hill_taxa(as.numeric(tbl), q = 2)
    hill_shannon <- hillR::hill_taxa(as.numeric(tbl), q = 1) 
    # effective number of species (Hill number q=1)
    
    
    c(richness = richness,
      shannon = shannon,
      gini_simpson = gini_simpson,
      hill_simpson = hill_simpson,
      hill_shannon = hill_shannon)
  })
  
  # Convert the simulation output to a data frame:
  res_df <- as.data.frame(t(res))
  res_df$replicate <- 1:k
  res_df
}

### 2. Prepare counts for each grouping

# 2.1. All tests (using DOI as the test identifier)
counts_all <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  filter(usage_count > 0) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")

# 2.2. Revisions lumped (using if_else to recode tests)
counts_lumped <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  filter(usage_count > 0) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")


# 2.3. By construct (using first_construct as the test identifier)
counts_construct <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  filter(usage_count > 0) %>% 
  drop_na(first_construct) %>% 
  mutate(Test = first_construct) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE), .groups = "drop")

### 3. A helper function to run rarefaction per count type

run_rarefaction <- function(counts_df, type_label, k = 500) {
  # Compute total counts per year for this type:
  yearly_totals <- counts_df %>% 
    group_by(Year) %>% 
    summarise(total = sum(n), .groups = "drop")
  
  # Set the rarefaction depth to the smallest total count across years:
  rarefaction_depth <- min(yearly_totals$total)
  
  # For each year, create the frequency vector and run the simulation:
  sim_data <- counts_df %>% 
    group_by(Year) %>% 
    summarise(freq = list(setNames(n, Test)), .groups = "drop") %>% 
    left_join(yearly_totals, by = "Year") %>% 
    mutate(sim = map(freq, ~ simulate_rarefaction(.x, n_min = rarefaction_depth, k = k))) %>% 
    select(Year, sim) %>% 
    unnest(sim) %>% 
    mutate(type = type_label)
  
  sim_data
}

### 4. Run rarefaction for all three types

rare_all       <- run_rarefaction(counts_all,       "All tests")
rare_lumped    <- run_rarefaction(counts_lumped,    "Revisions lumped")
rare_construct <- run_rarefaction(counts_construct, "By construct")

# Combine the rarefaction results from all types:
rare_combined <- bind_rows(rare_all, rare_lumped, rare_construct)

# Pivot the data to long format for the metrics
rare_long <- rare_combined %>% 
  pivot_longer(cols = c(richness, shannon, gini_simpson, hill_simpson, hill_shannon),
               names_to = "metric",
               values_to = "value")

# Summarize rarefaction results
rare_long <- rare_long %>%
  group_by(Year, type, metric) %>%
  summarise(
    mean = mean(value),
    lower = quantile(value, 0.025),
    upper = quantile(value, 0.975)
  )

rare_long <- rare_long %>% 
  mutate(type = case_match(type, 
                           "All tests" ~ "measures",
                           "By construct" ~ "constructs",
                            "Revisions lumped" ~ "with translations\n and revisions")) %>% 
  mutate(metric = case_match(metric,
                             'hill_simpson' ~ "Hill-Simpson",
                             'hill_shannon' ~ "Hill-Shannon",
                             'richness' ~ "Species richness"))

### 5. Create a facet plot
# The facet grid is arranged with metrics in rows and count types in columns.
ggplot(rare_long %>% filter(metric %in% c("Hill-Simpson", "Hill-Shannon", "Species richness")), aes(x = Year, y = mean, min=lower, ymax = upper, color = type, fill = type)) +
  geom_smooth(stat = "identity") +
  # geom_line(stat = 'summary', fun = mean, color = "blue", size = 0.7, group = 1) +
  facet_grid(metric ~ ., scales = "free_y") +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_fill_brewer(type = "qual", guide = "none", palette = 2) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2022), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  labs(y = "Fragmentation index",
        title = "Standardized by Rarefaction")

Show code

ggsave("figures/diversity_indices_rarefaction.pdf", width = 8, height = 6)
ggsave("figures/diversity_indices_rarefaction.png", width = 8, height = 6)

Show code

rarefaction_hill_shannon <- rare_long %>% filter(metric %in% c( "Hill-Shannon"))

rarefaction_hill_shannon %>% 
  ggplot(., aes(Year, y = mean, min=lower, ymax = upper, color = type, fill = type)) +
  geom_smooth(size = 0.7, stat = "identity") +
  scale_y_continuous("Fragmentation index") +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_fill_brewer(type = "qual", guide = "none", palette = 2) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_text_repel(data = rarefaction_hill_shannon %>% drop_na() %>% group_by(type) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",type)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/entropy_rarefaction.pdf", width = 8, height = 4)
ggsave("figures/entropy_rarefaction.png", width = 8, height = 4)

Rarefaction plus 25% chance of off by one

Primitive approach to check robustness of conclusions to off-by-one errors.

Show code

# Set a seed for reproducibility
set.seed(05102019)

### 1. Define a simulation function for rarefaction with per-sample error
simulate_rarefaction <- function(freq, pub_year, current_year, n_min, k = 50) {
  # freq: named numeric vector of counts (names are test IDs)
  # pub_year: named numeric vector of TestYear for each test (same names as freq)
  # current_year: the scalar year for this group
  # n_min: rarefaction depth (number of observations to sample)
  # k: number of resampling iterations
  res <- replicate(k, {
    # For each test, adjust the count with measurement error anew:
    freq_adj <- sapply(seq_along(freq), function(i) {
      if (is.na(pub_year[i]) || current_year < pub_year[i]) {
        # Before publication or missing publication info, force count to 0
        0
      } else {
        # With 25% chance, adjust the count by ±1; otherwise leave unchanged.
        adjustment <- if (runif(1) < 0.25) sample(c(-1, 1), size = 1) else 0
        new_val <- freq[i] + adjustment
        if (new_val < 0) 0 else new_val
      }
    })
    
    # Expand the adjusted frequency vector into individual observations
    pop <- rep(names(freq_adj), times = freq_adj)
    
    # If there are too few observations after error adjustment, return NA metrics.
    if (length(pop) < n_min) {
      return(c(richness = NA, shannon = NA, gini_simpson = NA, true_diversity = NA))
    }
    
    # Draw a rarefied sample without replacement.
    sample_ids <- sample(pop, size = n_min, replace = FALSE)
    tbl <- table(sample_ids)
    
    # Compute diversity metrics:
    richness     <- length(tbl)
    shannon      <- entropy(as.numeric(tbl))
    gini_simpson <- 1 - sum((as.numeric(tbl) / n_min)^2)
    true_diversity <- exp(shannon)  # effective number of species (Hill number q=1)
    
    c(richness = richness,
      shannon = shannon,
      gini_simpson = gini_simpson,
      true_diversity = true_diversity)
  })
  
  # Convert the simulation output to a data frame.
  res_df <- as.data.frame(t(res))
  res_df$replicate <- 1:k
  res_df
}

### 2. Prepare counts for each grouping (bringing along TestYear)

# 2.1. All tests (using DOI as the test identifier)
counts_all <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  filter(usage_count > 0) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE),
            TestYear = min(TestYear),
            .groups = "drop")

# 2.2. Revisions lumped (using if_else to recode tests)
counts_lumped <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  filter(usage_count > 0) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE),
            TestYear = min(TestYear),
            .groups = "drop")

# 2.3. By construct (using first_construct as the test identifier)
counts_construct <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  filter(usage_count > 0) %>% 
  drop_na(first_construct) %>% 
  mutate(Test = first_construct) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = TRUE),
            TestYear = min(TestYear),
            .groups = "drop")

### 3. A helper function to run rarefaction per count type with per-sample error
run_rarefaction <- function(counts_df, type_label, k = 500) {
  # Compute total counts per year for this type:
  yearly_totals <- counts_df %>% 
    group_by(Year) %>% 
    summarise(total = sum(n), .groups = "drop")
  
  # Set the rarefaction depth to the smallest total count across years:
  rarefaction_depth <- min(yearly_totals$total)
  
  # For each year, create the frequency vector and the publication year vector,
  # then run the simulation using pmap so that the scalar Year is passed along.
  sim_data <- counts_df %>% 
    group_by(Year) %>% 
    summarise(
      freq = list(setNames(n, Test)),
      pub  = list(setNames(TestYear, Test)),
      .groups = "drop"
    ) %>% 
    left_join(yearly_totals, by = "Year") %>% 
    mutate(sim = pmap(
      list(freq, pub, Year),
      function(freq, pub, Year) {
        simulate_rarefaction(
          freq = freq,
          pub_year = pub,
          current_year = Year,
          n_min = rarefaction_depth,
          k = k
        )
      }
    )) %>% 
    select(Year, sim) %>% 
    unnest(sim) %>% 
    mutate(type = type_label)
  
  sim_data
}

### 4. Run rarefaction for all three types
rare_all       <- run_rarefaction(counts_all,       "All tests")
rare_lumped    <- run_rarefaction(counts_lumped,    "Revisions lumped")
rare_construct <- run_rarefaction(counts_construct, "By construct")

# Combine the rarefaction results from all types:
rare_combined <- bind_rows(rare_all, rare_lumped, rare_construct)


# Pivot the data to long format for the metrics
rare_long <- rare_combined %>% 
  pivot_longer(cols = c(richness, shannon, gini_simpson, true_diversity),
               names_to = "metric",
               values_to = "value")

# Summarize rarefaction results (mean and 95% intervals)
rare_long <- rare_long %>%
  group_by(Year, type, metric) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    lower = quantile(value, 0.025, na.rm = TRUE),
    upper = quantile(value, 0.975, na.rm = TRUE)
  )


rare_long <- rare_long %>% 
  mutate(type = case_match(type, 
                           "All tests" ~ "measures",
                           "By construct" ~ "constructs",
                            "Revisions lumped" ~ "with translations\n and revisions")) %>% 
  mutate(metric = case_match(metric,
                             'hill_simpson' ~ "Hill-Simpson",
                             'hill_shannon' ~ "Hill-Shannon",
                             'richness' ~ "Species richness"))

### 5. Create a facet plot
# The facet grid is arranged with metrics in rows and count types in columns.
ggplot(rare_long %>% filter(metric %in% c("Hill-Simpson", "Hill-Shannon", "Species richness")), aes(x = Year, y = mean, min=lower, ymax = upper, color = type, fill = type)) +
  geom_smooth(stat = "identity") +
  # geom_line(stat = 'summary', fun = mean, color = "blue", size = 0.7, group = 1) +
  facet_grid(metric ~ ., scales = "free_y") +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_fill_brewer(type = "qual", guide = "none", palette = 2) +
  theme_minimal(base_size = 14) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2022), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  labs(y = "Fragmentation index",
        title = "Standardized by Rarefaction + 25% off-by-one errors",
        subtitle = "For published tests, counts are adjusted by ±1 with 25% probability anew per simulation")

Show code

ggsave("figures/diversity_indices_rarefaction_ob1.pdf", width = 8, height = 6)
ggsave("figures/diversity_indices_rarefaction_ob1.png", width = 8, height = 6)

Old approach: Normalised Shannon Entropy

Show code

byorig_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
# what's the point of mutate(n_tests = n_distinct(Test))? just to check?


byconstruct_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(first_construct) %>% 
  group_by(Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()

all_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()

original_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  filter(test_type == "Original") %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()

entropy_by_year <- bind_rows(# "all tests" = all_entropy_by_year,
                             "measures" = original_entropy_by_year,
                             "with translations\n and revisions" = byorig_entropy_by_year,
                             # "by name base" = bybase_entropy_by_year,
                             "constructs" = byconstruct_entropy_by_year,
                             .id = "version")


entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = version)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",version)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

by subdiscipline

all measures

Show code

entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(subdiscipline_1, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()


entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy\n(with revisions and translations)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2033), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 14))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")

original measures

Show code

entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(subdiscipline_1, Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()


entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy\n(novel measures)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 17))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F)  +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")

constructs

Show code

entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(subdiscipline_1, Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()


entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy (constructs)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2038), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 10))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

By instrument type

Show code

entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(instrument_type_broad, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(instrument_type_broad) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(instrument_type_broad, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()



entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = instrument_type_broad)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous(limits = c(1993, 2027), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_brewer(type = "qual", guide = "none", palette = 3) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", instrument_type_broad)), # This will force the correct position of the link's right end.
                  data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(instrument_type_broad, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Lorenz curves

Show code

test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(subdiscipline_1, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n) %>% 
  mutate(decile = Hmisc::cut2(n, g = 10)) %>% 
  mutate(cumsum = cumsum(n),
         sum = sum(n))
# 
# test_frequency %>% 
#     group_by(subdiscipline_1, decile) %>% 
#     summarise(share = sum(n)/first(sum),
#               median_n = median(n),
#               n_measures = n()) %>% 
#   View()


ggplot(test_frequency, aes(n)) +
  stat_lorenz(desc = F) +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
      theme_minimal() +
    hrbrthemes::scale_x_percent("Cumulative percentage of measures") +
    hrbrthemes::scale_y_percent("Cumulative percentage of measure market share") #+

Show code

#    hrbrthemes::theme_ipsum_rc()

ggplot(test_frequency, aes(n, color = subdiscipline_1)) +
  stat_lorenz(desc = F) +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
      theme_minimal() +
    hrbrthemes::scale_x_percent("Cumulative percentage of measures") +
    hrbrthemes::scale_y_percent("Cumulative percentage of measure market share")

Survival

aggregate stats

Show code

constructs <- psyctests_info %>% 
     # filter(between(first_pub_year, 1950, 2015)) %>%
     unnest(ConstructList) %>% 
     rowwise() %>% 
     mutate(construct = unlist(ConstructList)) %>% 
     select(-ConstructList) %>% 
     filter(between(Year, 1993, 2022)) %>% 
     drop_na(construct) %>% 
     mutate(survival = last_pub_year - first_pub_year,
            survived_five = if_else(survival >= 5, T, F),
            survived_ten = if_else(survival >= 10, T, F)) %>%
     distinct(construct, .keep_all = TRUE)

mean(constructs$survival, na.rm = T)

[1] 6.120457

Show code

sd(constructs$survival, na.rm = T)

[1] 8.788988

Show code

median(constructs$survival, na.rm = T)

[1] 2

Show code

max(constructs$survival, na.rm = T)

[1] 122

Show code

min(constructs$survival, na.rm = T)

[1] 0

Show code

measures <- psyctests_info %>% 
     filter(between(Year, 1993, 2022))  %>% 
     mutate(survival = last_pub_year - first_pub_year,
            survived_five = if_else(survival >= 5, T, F),
            survived_ten = if_else(survival >= 10, T, F)) %>%
     distinct(DOI, .keep_all = TRUE)

mean(measures$survival, na.rm = T)

[1] 6.046785

Show code

sd(measures$survival, na.rm = T)

[1] 8.743927

Show code

median(measures$survival, na.rm = T)

[1] 2

Show code

max(measures$survival, na.rm = T)

[1] 122

Show code

min(measures$survival, na.rm = T)

[1] 0

cumulative sum

all constructs

Show code

cumsum_construct <- constructs %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

Show code

ggsave("figures/cumsums_survival_all.pdf", width = 8, height = 4)
ggsave("figures/cumsums_survival_all.png", width = 8, height = 4)

first constructs

Show code

first_constructs <- constructs %>%
  distinct(first_construct, .keep_all = TRUE)


cumsum_construct <- first_constructs %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- first_constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- first_constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

Show code

ggsave("figures/cumsums_survival_first.pdf", width = 8, height = 4)
ggsave("figures/cumsums_survival_first.png", width = 8, height = 4)

measures

Show code

cumsum_all <- measures %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_5 <- measures %>% 
  filter(survived_five == T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_10 <- measures %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 

cumsums <- bind_rows(
  "all measures" = cumsum_all,
  "measures in use\n for => 5 years" = cumsum_5,
  "measures in use\n for => 10 years" = cumsum_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", tests, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE) +
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

Show code

ggsave("figures/cumsums_survival_measures.pdf", width = 8, height = 4)
ggsave("figures/cumsums_survival_measures.png", width = 8, height = 4)

by first use in PsycInfo instead of publication year in PsycTests

Show code

cumsum_construct <- constructs %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = first_pub_year) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("First usage logged in PsycInfo",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

by last use in PsycInfo instead of publication year in PsycTests

Show code

cumsum_construct <- constructs %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = last_pub_year) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Last usage logged in PsycInfo",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )

counts

Show code

count_construct <- constructs %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

count_construct_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

count_construct_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

counts <- bind_rows(
  "all constructs" = count_construct,
  "constructs in use\n for => 5 years" = count_construct_5,
  "constructs in use\n for => 10 years" = count_construct_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)



ggplot(counts, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) +
  geom_text_repel(data = counts %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  lineheight = .9,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.5,
                  nudge_y = -15,
                  force = 1,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/counts_survival.pdf", width = 8, height = 4)
ggsave("figures/counts_survival.png", width = 8, height = 4)

Robustness checks

Show code

all_constructs_over_time <- records_wide %>% select(subdiscipline_1, DOI, TestYear, ConstructList) %>% 
    unnest(ConstructList) %>% 
    rowwise() %>% 
    mutate(construct = unlist(ConstructList)) %>% 
  select(-ConstructList)

cumsum_all_constructs <- all_constructs_over_time %>% 
  arrange(TestYear) %>% 
  distinct(construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(construct)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 


cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(first_construct)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs))

cumsums <- bind_rows(
  "first constructs" = cumsum_construct,
  "all constructs" = cumsum_all_constructs,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2027), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)

Show code

ggsave("figures/cumsum_all_vs_first.pdf", width = 8, height = 4)
ggsave("figures/cumsum_all_vs_first.png", width = 8, height = 4)

All or first constructs

For some 20% of tests, two or more constructs were coded. In most plots, we simply use the first construct for each test.

Show code

records_wide <- records_wide %>% 
  rowwise() %>% 
  mutate(constructs_n = length(ConstructList)) %>% 
  ungroup()

table(records_wide$constructs_n)


    1     2     3     4     5     6     7     8     9    10    12 
56645 12421  2134   326    85    37    13    14     3     3     3 
   15    16    17    21    22    23    25 
    1     1     1     1     1     1     2

Show code

round(prop.table(table(records_wide$constructs_n)),2)


   1    2    3    4    5    6    7    8    9   10   12   15   16   17 
0.79 0.17 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
  21   22   23   25 
0.00 0.00 0.00 0.00

Show code

ggplot(records_wide, aes(constructs_n)) + 
  geom_bar()

Expanding the entropy calculation to all coded constructs makes little difference.

Show code

all_constructs_over_time <- psyctests_info %>% select(subdiscipline_1, DOI, TestYear, Year, ConstructList, usage_count) %>% 
    unnest(ConstructList) %>% 
    rowwise() %>% 
    mutate(construct = unlist(ConstructList)) %>% 
  select(-ConstructList)

entropy_all_constructs <- all_constructs_over_time %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(construct) %>% 
  group_by(Year, construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(construct)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(
    tests = list(as.numeric(n)),
    entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
        hill_shannon = hillR::hill_taxa(n, q = 1),

            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()

byconstruct_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(first_construct) %>% 
  group_by(Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(
    tests = list(as.numeric(n)),
    total_n = sum(n),
    entropy = entropy(n),,
    norm_entropy = calc_norm_entropy(n),
    hill_shannon = hillR::hill_taxa(n, q = 1),
    n = sum(n),
    diff_tests = n()) %>% 
  ungroup()

entropy_by_year <- bind_rows(# "all tests" = all_entropy_by_year,
                             "all constructs" = entropy_all_constructs,
                             # "by name base" = bybase_entropy_by_year,
                             "first constructs" = byconstruct_entropy_by_year,
                             .id = "version")

entropy_by_year %>% 
  ggplot(., aes(Year, hill_shannon, color = version)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Hill-Shannon Diversity") +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2027), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +

  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",version)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

Show code

ggsave("figures/entropy_all_vs_first.pdf", width = 8, height = 4)
ggsave("figures/entropy_all_vs_first.png", width = 8, height = 4)

Entropy by classification

Show code

entropy_by_class <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  group_by(classification_1, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(classification_1) %>% 
  # filter(n > 0) %>% 
  summarise(
    tbl = list(as.numeric(n)),
    total_n = sum(n),
    entropy = entropy(n),
    norm_entropy = calc_norm_entropy(n),
    hill_shannon = hillR::hill_taxa(n, q = 1)) %>% 
  arrange(norm_entropy)

entropy_by_class_list <- entropy_by_class %>% select(classification_1, tbl) %>% 
  deframe()

Show code

entropy_by_class_inext <- iNEXT(entropy_by_class_list) # , endpoint = 1000, knots = 10
ggiNEXT(entropy_by_class_inext, color.var = 'None')

Error in `palette()`:
! Insufficient values in manual scale. 31 needed but only 8
  provided.

Show code

entropy_by_class <- entropy_by_class %>% select(-tbl) %>% full_join(
  entropy_by_class_inext$AsyEst %>% rename(classification_1 = Assemblage) %>% 
    mutate(diversity_estimate = sprintf("%.0f [%.0f;%.0f]", Estimator, LCL, UCL)) %>%
    # mutate(diversity_estimate = Estimator) %>% 
    select(classification_1, diversity_estimate, Diversity) %>% 
    pivot_wider(values_from = diversity_estimate, names_from = Diversity)
)

kable(entropy_by_class %>% arrange(`Shannon diversity` %>% str_extract("([0-9]+) ") %>% as.numeric(), hill_shannon))

classification_1	total_n	entropy	norm_entropy	hill_shannon	Species richness	Shannon diversity	Simpson diversity
Intelligence	7284	1.936077	0.5490299	6.931508	44 [34;80]	7 [7;7]	5 [5;5]
Military Personnel, Adjustment, and Training	4453	1.969452	0.4206315	7.166747	169 [129;210]	7 [7;8]	4 [4;4]
Aptitude and Achievement	4326	1.996436	0.4782585	7.362771	80 [65;104]	7 [7;8]	5 [4;5]
General Assessment Tools	1403	2.209249	0.4999613	9.108874	121 [88;154]	10 [9;11]	4 [3;4]
Anxiety and Depression	46429	2.992137	0.4813136	19.928224	701 [638;764]	20 [20;20]	8 [8;8]
Neuropsychological Assessment	40220	3.084879	0.5202518	21.864832	541 [471;611]	22 [22;22]	10 [10;11]
Functional Status and Adaptive Behavior	13507	3.284067	0.5645529	26.684077	655 [541;768]	27 [26;28]	12 [11;12]
Legal and Forensic Evaluation	4529	3.449960	0.6070047	31.499119	480 [405;555]	33 [31;35]	9 [8;10]
Perceptual, Motor, and Sensory Processing	5738	3.520288	0.6515822	33.794166	410 [327;494]	35 [34;36]	14 [13;15]
Emotional States, Emotional Responses, and Motivation	38778	3.544672	0.5337411	34.628321	1167 [1066;1268]	35 [34;36]	10 [10;10]
Addiction, Gambling, and Substance Abuse/Use	36817	3.573186	0.5433288	35.629915	1132 [1040;1224]	36 [35;37]	10 [10;10]
Sports, Recreation, and Leisure	4825	3.611987	0.6296420	37.039577	518 [448;589]	39 [37;42]	9 [9;10]
Religious and Political Beliefs	4132	3.652164	0.6311091	38.558002	514 [447;581]	41 [39;44]	10 [10;11]
Culture, Racial, and Ethnic Identity	6153	3.726453	0.6439467	41.531547	436 [385;487]	43 [41;45]	13 [12;14]
Communication, Language, and Verbal Processing	2790	3.694919	0.6789113	40.242318	465 [377;553]	44 [40;47]	13 [12;14]
Trauma, Stress, and Coping	38159	3.848689	0.5720890	46.931513	1394 [1258;1530]	48 [47;49]	17 [16;17]
Cognitive Processes, Memory, and Decision Making	49219	3.858750	0.5818379	47.406069	1112 [1031;1192]	48 [47;49]	16 [16;16]
Development and Aging	36420	4.178621	0.6023399	65.275788	1862 [1704;2021]	67 [65;68]	14 [13;14]
Sex, Gender Roles, and Sexual Behavior	10381	4.241755	0.6697565	69.529799	807 [729;885]	73 [70;76]	20 [19;20]
Mental Health/Illness Related Assessment	255535	4.352307	0.5817821	77.657418	2700 [2549;2850]	78 [77;79]	28 [28;28]
Human Factors and Environmental Engineering	1044	4.145539	0.7699030	63.151640	638 [506;770]	83 [73;93]	26 [22;30]
Consumer Behavior, Marketing, and Advertising	2415	4.350168	0.7401094	77.491464	1044 [833;1256]	93 [87;100]	28 [25;30]
Attitudes, Interests, Values, and Expectancies	13711	4.475760	0.6580759	87.861379	1484 [1358;1611]	93 [90;96]	24 [23;25]
Human-Computer Interaction	2081	4.432384	0.7509163	84.131733	656 [560;751]	100 [92;107]	21 [19;23]
Personality	71623	4.599470	0.6361150	99.431584	1977 [1866;2088]	101 [99;103]	23 [23;24]
Family Relationships and Parenting	20446	4.626174	0.6696103	102.122556	1813 [1649;1978]	107 [104;110]	27 [26;28]
Treatment, Rehabilitation, and Therapeutic Processes	30240	4.657907	0.6011396	105.415164	4826 [4546;5106]	114 [110;118]	12 [12;13]
Social, Group, and Interpersonal Relationships	27392	4.852731	0.6805980	128.089695	2099 [1949;2249]	133 [130;137]	33 [32;34]
Organizational, Occupational, and Career Development	21713	4.942498	0.6661604	140.119799	3098 [2848;3347]	151 [146;156]	19 [18;20]
Physical Health/Illness Related Assessment	45177	5.011744	0.6626486	150.166406	3299 [3120;3478]	156 [153;159]	32 [31;33]
Education, Teaching, and Student Characteristics	17691	4.951154	0.6480872	141.338031	4537 [4241;4834]	161 [154;168]	12 [11;12]

Entropy by instrument type

Show code

entropy_by_instrument <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  group_by(instrument_type_broad, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(instrument_type_broad) %>% 
  filter(n > 0) %>% 
  summarise(
    tbl = list(as.numeric(n)),
    total_n = sum(n),
    entropy = entropy(n),
    norm_entropy = calc_norm_entropy(n),
    hill_shannon = hillR::hill_taxa(n, q = 1)) %>% 
  arrange(hill_shannon)


entropy_by_instrument_list <- entropy_by_instrument %>% select(instrument_type_broad, tbl) %>% 
  deframe()
entropy_by_instrument_inext <- iNEXT(entropy_by_instrument_list, endpoint = 1000, knots = 10)
ggiNEXT(entropy_by_instrument_inext)

Show code

entropy_by_instrument <- entropy_by_instrument %>% select(-tbl) %>% full_join(
  entropy_by_instrument_inext$AsyEst %>% rename(instrument_type_broad = Assemblage) %>% 
    mutate(diversity_estimate = sprintf("%.0f [%.0f;%.0f]", Estimator, LCL, UCL)) %>%
    # mutate(diversity_estimate = Estimator) %>% 
    select(instrument_type_broad, diversity_estimate, Diversity) %>% 
    pivot_wider(values_from = diversity_estimate, names_from = Diversity)
)
entropy_by_instrument %>% 
  kable()

instrument_type_broad	total_n	entropy	norm_entropy	hill_shannon	Species richness	Shannon diversity	Simpson diversity
NA	12993	2.549405	0.4904120	12.79949	285 [227;343]	13 [13;13]	6 [6;6]
test	80256	3.576240	0.5167475	35.73890	1669 [1536;1802]	36 [36;37]	10 [10;10]
task	23705	3.763432	0.5971220	43.09610	852 [759;946]	44 [43;45]	16 [16;16]
other-rating	61962	3.908466	0.5656449	49.82249	1702 [1555;1849]	51 [50;51]	19 [18;19]
questionnaire	685715	6.380357	0.6462648	590.13864	33802 [33153;34450]	606 [602;610]	63 [62;63]

Tests by exact same name

Show code

same_name_tests <- records_wide %>% group_by(name_psycinfo) %>% filter(n_distinct(DOI) > 1) %>% summarise(n = n())
nrow(same_name_tests)

[1] 1230

Show code

mean(same_name_tests$n)

[1] 2.44878

Show code

same_name_tests %>% arrange(desc(n)) %>% head(20)

# A tibble: 20 × 2
   name_psycinfo                                n
   <chr>                                    <int>
 1 theory of planned behavior questionnaire    18
 2 job satisfaction scale                      15
 3 self-efficacy scale                         11
 4 self-control scale                          10
 5 behavioral intentions measure                9
 6 parental involvement scale                   9
 7 procedural justice scale                     9
 8 social distance scale                        9
 9 marital satisfaction measure                 8
10 perceived behavioral control measure         8
11 religiosity measure                          8
12 religiosity scale                            8
13 social capital measure                       8
14 social support scale                         8
15 victimization measure                        8
16 attribution questionnaire                    7
17 delinquency measure                          7
18 fear of crime scale                          7
19 food frequency questionnaire                 7
20 life satisfaction scale                      7

Proliferation and fragmentation over time

2016 changes in standards

New measures by publication year

Cumulative number of measures and constructs

By subdiscipline

Tests by usage frequency

By instrument type

Entropy/Fragmentation

Revised approach using Hill diversity and coverage

By subdiscipline

Rarefaction

Rarefaction plus 25% chance of off by one

Old approach: Normalised Shannon Entropy

by subdiscipline

all measures

original measures

constructs

By instrument type

Lorenz curves

Survival

aggregate stats

cumulative sum

all constructs

first constructs

measures

by first use in PsycInfo instead of publication year in PsycTests

by last use in PsycInfo instead of publication year in PsycTests

counts

Robustness checks

All or first constructs

Entropy by classification

Entropy by instrument type

Tests by exact same name