Construct proliferation: Changes over time

Import

APA PsycTests

Show code
records_wide <- readRDS("../sober_rubric/raw_data/preprocessed_records.rds")

records_wide %>% group_by(Name)  %>% filter(n()>1) %>% ungroup() %>% summarise(n_distinct(Name), n())
# A tibble: 1 × 2
  `n_distinct(Name)` `n()`
               <int> <int>
1               1237  3043
Show code
records_wide$first_construct <- str_trim(str_replace_all(str_to_lower(records_wide$first_construct), "[:space:]+", " "))
First EBSCO scrape of APA PsycInfo
Show code
psycinfo <- read_tsv('../sober_rubric/raw_data/merged_table_all.tsv') %>% 
  # this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
#  mutate(Name = toTitleCase(Name)) %>% 
  rename(usage_count = "Hit Count") %>% 
  group_by(Name, Year) %>% 
  summarise(usage_count = sum(usage_count))
Rows: 309223 Columns: 5
── Column specification ──────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Name, Journal
dbl (3): Hit Count, Year, number of search results

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
`summarise()` has grouped output by 'Name'. You can override using the `.groups` argument.
Second EBSCO scrape of APA PsycInfo
Show code
overview <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv")
Rows: 71692 Columns: 4
── Column specification ──────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): DOI
dbl (3): first_pub_year, last_pub_year, Hits

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Show code
byyear <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv")
Rows: 218142 Columns: 3
── Column specification ──────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): DOI
dbl (2): Year, Hits

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Show code
n_distinct(byyear$DOI)
[1] 31145
Show code
overview %>% filter(is.na(Hits)) %>% nrow()
[1] 40574
Show code
nrow(overview)
[1] 71692
Show code
overview %>% filter(Hits >= 1) %>% nrow()
[1] 31118
Show code
one_hit_wonders <- overview %>% filter(Hits == 1) %>% 
  mutate(Year = first_pub_year)

nrow(one_hit_wonders)
[1] 13280
Show code
# for some few, the call was repeated by year for some reason
# one_hit_wonders %>% select(DOI, first_pub_year) %>% inner_join(byyear, by = "DOI") %>% arrange(DOI)

byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")
anti_join: added no columns
           > rows only in x   204,862
           > rows only in y  (      0)
           > matched rows    ( 13,280)
           >                 =========
           > rows total       204,862
Show code
all <- one_hit_wonders %>% 
  select(DOI, Year, Hits) %>% 
  bind_rows(byyear) %>% 
  left_join(overview %>% rename(total_hits = Hits), by = "DOI")
left_join: added 3 columns (first_pub_year, last_pub_year, total_hits)
           > rows only in x         1
           > rows only in y  ( 40,548)
           > matched rows     218,141
           >                 =========
           > rows total       218,142
Show code
n_distinct(all$DOI)
[1] 31145
Show code
all %>% filter(Hits > 0) %>% filter(Year < first_pub_year | Year > last_pub_year) %>% nrow()
[1] 0
Show code
all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year > total_hits) %>% ungroup() %>% select(DOI, everything()) %>% mutate(diff = hits_by_year - total_hits) %>% nrow()
`summarise()` has grouped output by 'total_hits'. You can override using the
`.groups` argument.
[1] 0
Show code
# all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(usage_count, na.rm = T)) %>% filter(hits_by_year < total_hits) %>% select(DOI, everything()) %>% mutate(diff = hits_by_year - total_hits) %>%  View()
all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year == total_hits) %>% nrow()
`summarise()` has grouped output by 'total_hits'. You can override using the
`.groups` argument.
[1] 31118
Show code
all %>% group_by(DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(hits_by_year == 0)
# A tibble: 27 × 2
   DOI                hits_by_year
   <chr>                     <dbl>
 1 10.1037/t00747-000            0
 2 10.1037/t00875-000            0
 3 10.1037/t00878-000            0
 4 10.1037/t00879-000            0
 5 10.1037/t02477-000            0
 6 10.1037/t02488-000            0
 7 10.1037/t04670-000            0
 8 10.1037/t04771-000            0
 9 10.1037/t04776-000            0
10 10.1037/t04779-000            0
# ℹ 17 more rows
Show code
all %>% group_by(total_hits, DOI) %>% summarise(hits_by_year = sum(Hits, na.rm = T)) %>% filter(is.na(total_hits)) %>% pull(hits_by_year) %>% table()
`summarise()` has grouped output by 'total_hits'. You can override using the
`.groups` argument.
.
 0 
27 
Show code
psyctests_info <- records_wide %>% 
  select(DOI, TestYear, Name, first_construct, 
         original_test_DOI, original_DOI_combined, 
         test_type, ConstructList, subdiscipline_1, subdiscipline_2, 
         classification_1, classification_2, instrument_type_broad, 
         InstrumentType,
         number_of_factors_subscales, Name_base) %>%
  distinct() %>% 
  inner_join(all, by = c("DOI" = "DOI"), multiple = "all") %>% 
  rename(usage_count = "Hits")
inner_join: added 5 columns (Year, Hits, first_pub_year, last_pub_year, total_hits)
            > rows only in x  ( 40,548)
            > rows only in y  (      1)
            > matched rows     218,141    (includes duplicates)
            >                 =========
            > rows total       218,141
Show code
write_rds(psyctests_info, "../sober_rubric/raw_data/psyctests_info.rds")

2016 changes in standards

Show code
test_data <- records_wide %>%
  filter(TestYear <= 2022) %>%
    rowwise() %>%
    mutate(Methodology = length(MethodologyList) >0) %>%
    mutate(AdministrationMethod = length(AdministrationMethodList) >0) %>%
    mutate(PopulationGroup = length(PopulationGroupList) >0) %>%
    mutate(AgeGroup = length(AgeGroupList) >0) %>%
    group_by(TestYear) %>%
    summarise(Reliability = mean(Reliability!="No reliability indicated."),
              FactorAnalysis = mean(FactorAnalysis!="No factor analysis indicated."),
              # Unidimensional = mean(FactorAnalysis=="This is a unidimensional measure."),
              FactorsAndSubscales = mean(!is.na(FactorsAndSubscales)),
              Validity = mean(Validity!="No validity indicated."),
              Format = mean(!is.na(Format)),
              # Fee = mean(Fee == "Yes"),
              Methodology = mean(Methodology),
              AdministrationMethod = mean(AdministrationMethod),
              # AgeGroup = mean(AgeGroup),
              # PopulationGroup = mean(PopulationGroup),
              TestItems = mean(TestItemsAvailable == "Yes")) %>%
    pivot_longer(-TestYear)
pivot_longer: reorganized (Reliability, FactorAnalysis, FactorsAndSubscales, Validity, Format, …) into (name, value) [was 117x9, now 936x3]
Show code
test_data %>%
  ggplot(aes(TestYear, value, color = name)) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030),
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1.2))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  geom_text_repel(
    data = test_data %>% drop_na() %>% group_by(name) %>% filter(TestYear == max(TestYear, na.rm = T)),
    aes(label = name),
    segment.color = 'grey',
    xlim = c(2022, 2033),
    box.padding = 0.1,
    # point.padding = 0.6,
    nudge_x = 1.2,
    # nudge_y = 0,
    force = 0.5,
    hjust = 0,
    direction="y",
    na.rm = TRUE
  ) +
  ylab("PsycTests contains information about...") +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
Warning: Removed 696 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/changed_standards_2016.pdf", width = 8, height = 4)
Warning: Removed 696 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/changed_standards_2016.png", width = 8, height = 4)
Warning: Removed 696 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
records_wide %>% 
  group_by(TestYear) %>% 
  summarise(number_of_test_items = mean(number_of_test_items, na.rm = T)) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
ggplot(aes(TestYear, number_of_test_items)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, NA), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
  xlab("Publication year in APA PsycTests") +
  ylab("Number of items in measure")

Show code
records_wide %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
ggplot(aes(TestYear, number_of_factors_subscales)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_pointrange(stat = 'summary', fun.data = 'mean_se') +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Number of factors/subscales")
Warning: Removed 54780 rows containing non-finite outside the scale range
(`stat_summary()`).

Show code
records_wide %>% 
  group_by(InstrumentType, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = InstrumentType)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_color_viridis_d() +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type")
`summarise()` has grouped output by 'InstrumentType'. You can override using the
`.groups` argument.
Warning: Removed 29 rows containing missing values or values outside the scale range
(`geom_line()`).

Show code
records_wide %>% 
  # filter(instrument_type_broad !=)
  group_by(instrument_type_broad, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = instrument_type_broad)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type")
`summarise()` has grouped output by 'instrument_type_broad'. You can override
using the `.groups` argument.

Show code
records_wide %>% 
  filter(instrument_type_broad != "questionnaire") %>% 
  group_by(InstrumentType, TestYear) %>% 
  summarise(tests = n()) %>% 
  group_by(TestYear) %>% 
  mutate(tests = tests/sum(tests, na.rm = T)) %>% 
  arrange(TestYear) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  # mutate(tests = cumsum(tests)) %>% 
  ggplot(aes(TestYear, tests, color = InstrumentType)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Proportion of instrument type") +
  ggtitle("Without questionnaires")
`summarise()` has grouped output by 'InstrumentType'. You can override using the
`.groups` argument.

Show code
psyctests_info %>% group_by(DOI, TestYear) %>% 
  summarise(used = sum(usage_count, na.rm = T)) %>% 
  full_join(records_wide %>% select(DOI, TestYear)) %>% 
  mutate(used = coalesce(used, 0)) %>% 
  filter(TestYear >= 1993, TestYear <= 2022) %>% 
  group_by(TestYear) %>% 
  summarise(never_reused = mean(used == 0),
            used_once = mean(used == 1),
            used_twice = mean(used == 2),
            used_thrice = mean(used == 3),
            used_more_10 = mean(used > 9)) %>% 
  pivot_longer(-TestYear) %>% 
  ggplot(aes(TestYear, value, color = name)) + 
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  geom_line() +
  xlim(1993, 2022) +
  xlab("Publication year in APA PsycTests") +
  ylab("Frequency")
`summarise()` has grouped output by 'DOI'. You can override using the `.groups`
argument.
Joining with `by = join_by(DOI, TestYear)`
full_join: added no columns
> rows only in x 0
> rows only in y 40,548
> matched rows 31,144
> ========
> rows total 71,692
pivot_longer: reorganized (never_reused, used_once, used_twice, used_thrice,
used_more_10) into (name, value) [was 30x6, now 150x3]

New measures by publication year

Show code
count_all <- records_wide %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear)

count_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear)

count_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI))

count_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(first_construct)) %>% 
  arrange(TestYear)

counts <- bind_rows(
  "novel constructs" = count_construct,
  "novel measures" = count_orig,
  "with translations\n and revisions" = count_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(counts, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Number of measures/constructs published") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) +
  geom_text_repel(data = counts %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  lineheight = .9,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.5,
                  nudge_y = -15,
                  force = 1,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/counts.pdf", width = 8, height = 4)
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/counts.png", width = 8, height = 4)
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).

Cumulative number of measures and constructs

Show code
cumsum_all <- records_wide %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsums <- bind_rows(
  "novel constructs" = cumsum_construct,
  "novel measures" = cumsum_orig,
  "with translations\n and revisions" = cumsum_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", tests, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE) +
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums.pdf", width = 8, height = 4)
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums.png", width = 8, height = 4)
Warning: Removed 250 rows containing missing values or values outside the scale range
(`geom_line()`).

By subdiscipline

Show code
cumsum_all <- records_wide %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
cumsum_orig <- records_wide %>% 
  filter(test_type == "Original") %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
cumsum_base <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(subdiscipline_1, TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
cumsums <- bind_rows(
  "constructs" = cumsum_construct,
  "measures" = cumsum_orig,
  "with translations & revisions" = cumsum_all,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)

cumsums$origin <- factor(cumsums$origin, levels = c("with translations & revisions", "constructs", "measures"))
my_colors <- c("with translations & revisions" = "#7570B3",
               "constructs" = "#1B9E77", 
               "measures" = "#D95F02") 

ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  facet_wrap(~ subdiscipline_1, scales = "free_y", ncol = 2) + 
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2022), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_manual(values = my_colors, guide = guide_legend(title = NULL)) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = c(0.58, 0.08),
        legend.justification = c(0, 0),
        legend.box.just = "right",
        legend.text = element_text(size = 11))
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
Warning: Removed 189 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_subdiscipline.pdf", width = 8, height = 7)
Warning: Removed 189 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_subdiscipline.png", width = 8, height = 7)
Warning: Removed 189 rows containing missing values or values outside the scale range
(`geom_line()`).

Tests by usage frequency

Show code
test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n)
drop_na: removed 51,440 rows (24%), 166,701 rows remaining
Show code
test_frequency <- records_wide %>% 
  filter(test_type == "Original", TestYear <= 2022) %>% 
  select(Test = DOI) %>% 
  full_join(test_frequency) %>% 
  mutate(n = coalesce(n, 0.5))
Joining with `by = join_by(Test)`
full_join: added one column (n)
           > rows only in x   25,130
           > rows only in y    3,687
           > matched rows     18,179
           >                 ========
           > rows total       46,996
Show code
test_frequency <- test_frequency %>% 
  group_by(n) %>% 
  summarise(count = n()) %>% 
  ungroup() %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  # facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures") +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(0.5, 1, 2, 5, 10, 100, 1000, 25000),
               labels = c(0, 1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count + 700)) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency_across.pdf", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency_across.png", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
test_frequency <- test_frequency %>% 
  filter(n >= 1) %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures") +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(1, 2, 5, 10, 100, 1000, 25000),
               labels = c(1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count + 700)) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency_across_no0.pdf", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency_across_no0.png", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
Show code
test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(subdiscipline_1, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n)
drop_na: removed 51,440 rows (24%), 166,701 rows remaining
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
test_frequency <- records_wide %>% 
  filter(test_type == "Original", TestYear <= 2022) %>% 
  select(subdiscipline_1, Test = DOI) %>% 
  full_join(test_frequency) %>% 
  mutate(n = coalesce(n, 0.5))
Joining with `by = join_by(subdiscipline_1, Test)`
full_join: added one column (n)
> rows only in x 25,199
> rows only in y 4,456
> matched rows 18,110
> ========
> rows total 47,765
Show code
test_frequency <- test_frequency %>% 
  # mutate(n = if_else(n >= 1000, 1000, n)) %>% 
  mutate(subdiscipline_1 = str_replace(subdiscipline_1, " Psychology", "")) %>% 
  group_by(subdiscipline_1, n) %>% 
  summarise(count = n()) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(percent = count/sum(count))
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
  scale_y_continuous("Number of measures", expand = expansion(c(0, 0.1))) +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(0.5, 1, 2, 10, 100, 1000, 25000),
               labels = c(0, 1, 2, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count ), size = 3, vjust = -0.4, hjust = 0.4) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")

freq_plot
Warning: `position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency.pdf", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency.png", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
Show code
test_frequency <- test_frequency %>% 
  filter(n >= 1) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(percent = count/sum(count))

freq_plot <- ggplot(test_frequency, aes(n, count)) + 
  geom_bar(width = 0.1, fill = colors["novel"], stat = "identity") +
  facet_wrap(~ subdiscipline_1, scales = "free_y") + 
  # scale_y_sqrt("Number of measures", breaks = c(0, 100, 400, 1000, 2000, 4000, 6000, 10000), limits = c(0, 11500)) +
    scale_y_continuous("Number of measures", expand = expansion(c(0, 0.1))) +
  scale_x_log10("Usages recorded in APA PsycInfo 1993-2022",
                breaks = c(1, 2, 5, 10, 100, 1000, 25000),
               labels = c(1, 2, 5, 10, 100, 1000, 25000)) +
  
  geom_text(aes(label = if_else(n <= 2, sprintf("%.0f%%", percent*100), ""),
                x = n, y = count ), size = 3, vjust = -0.11, hjust = 0.4) +

  # scale_x_sqrt(breaks = c(0, 1, 2, 3, 4, 5, 10, 20, 40, 50), labels = c(0, 1, 2, 3, 4, 5, 10, 20, 40, "50+")) +
  # geom_text_repel(aes(x = n, label = first_acronym, y = y), 
  #                 data =
  #                   test_frequency %>% group_by(subdiscipline_1) %>% filter(row_number() > (n() - 10) ) %>% left_join(records_wide %>% select(Test = DOI, first_acronym)) %>% 
  #                   mutate(first_acronym = if_else(first_acronym == "HRSD", "HAM-D",
  #                                                  first_acronym)) %>% 
    # mutate(y = 20 + 50*(1+n()-row_number())),
    #               size = 3.3, force = 5, force_pull  = 0, max.time = 1, 
    #               max.overlaps = Inf,
    #               segment.color = "lightgray",
    #               segment.curvature = 1,
    #               hjust = 1,
    #               nudge_y = 10,
    #               direction = "y"
    # ) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
freq_plot
Warning: `position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
Show code
ggsave("figures/frequency_no0.pdf", width = 8, height = 4)
Warning: `position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.
`position_stack()` requires non-overlapping x intervals.

By instrument type

Show code
usage_by_year_instrument_type <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(instrument_type_broad, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(instrument_type_broad) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(instrument_type_broad, n_tests, Year) %>% 
  summarise(n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'instrument_type_broad', 'Year'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'instrument_type_broad', 'n_tests'. You can
override using the `.groups` argument.
Show code
usage_by_year_instrument_type %>% 
  ggplot(., aes(Year, n, color = instrument_type_broad)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Times tests were coded in PsycInfo") +
  scale_x_continuous(limits = c(1993, 2030), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", instrument_type_broad)), # This will force the correct position of the link's right end.
                  data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(instrument_type_broad, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
Warning: Removed 30 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/usage_by_year_instrument_type.pdf", width = 10, height = 4)
Warning: Removed 30 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/usage_by_year_instrument_type.png", width = 10, height = 4)
Warning: Removed 30 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
usage_by_year_instrument_type <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(InstrumentType, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(InstrumentType) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(InstrumentType, n_tests, Year) %>% 
  summarise(n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'InstrumentType', 'Year'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'InstrumentType', 'n_tests'. You can override
using the `.groups` argument.
Show code
usage_by_year_instrument_type %>% 
  ggplot(., aes(Year, n, color = InstrumentType)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Times tests were coded in PsycInfo") +
  scale_x_continuous(limits = c(1993, 2035), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_discrete() +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", InstrumentType)), # This will force the correct position of the link's right end.
                  data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = usage_by_year_instrument_type %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(InstrumentType, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
Warning: ggrepel: 11 unlabeled data points (too many overlaps).
Consider increasing max.overlaps
Warning: ggrepel: 12 unlabeled data points (too many overlaps).
Consider increasing max.overlaps
Show code
ggsave("figures/usage_by_year_instrument_type_narrow.pdf", width = 14, height = 10)
Warning: ggrepel: 7 unlabeled data points (too many overlaps).
Consider increasing max.overlaps
Warning: ggrepel: 9 unlabeled data points (too many overlaps).
Consider increasing max.overlaps
Show code
ggsave("figures/usage_by_year_instrument_type_narrow.png", width = 14, height = 10)
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 9 unlabeled data points (too many overlaps). Consider increasing max.overlaps

Entropy

Entropy by year

Show code
byorig_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: removed 45,635 rows (22%), 158,814 rows remaining
`summarise()` has grouped output by 'Year'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'n_tests'. You can override using the
`.groups` argument.
Show code
# what's the point of mutate(n_tests = n_distinct(Test))? just to check?


byconstruct_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(first_construct) %>% 
  group_by(Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.`summarise()` has grouped output by 'n_tests'. You can override using the `.groups` argument.
Show code
all_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.`summarise()` has grouped output by 'n_tests'. You can override using the `.groups` argument.
Show code
original_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  filter(test_type == "Original") %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'Year'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'n_tests'. You can override using the
`.groups` argument.
Show code
entropy_by_year <- bind_rows(# "all tests" = all_entropy_by_year,
                             "measures" = original_entropy_by_year,
                             "with translations\n and revisions" = byorig_entropy_by_year,
                             # "by name base" = bybase_entropy_by_year,
                             "constructs" = byconstruct_entropy_by_year,
                             .id = "version")


entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = version)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",version)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
Show code
ggsave("figures/entropy.pdf", width = 8, height = 4)
ggsave("figures/entropy.png", width = 8, height = 4)

by subdiscipline

all measures

Show code
entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(subdiscipline_1, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'subdiscipline_1', 'Year'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'subdiscipline_1', 'n_tests'. You can
override using the `.groups` argument.
Show code
entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy\n(with revisions and translations)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2033), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 14))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")
drop_na: removed one row (1%), 149 rows remaining
Show code
ggsave("figures/entropy_subdiscipline_all.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_all.png", width = 8, height = 4)

original measures

Show code
entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(subdiscipline_1, Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: removed 45,635 rows (22%), 158,814 rows remaining
`summarise()` has grouped output by 'subdiscipline_1', 'Year'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'subdiscipline_1', 'n_tests'. You can
override using the `.groups` argument.
Show code
entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy\n(novel measures)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 17))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
 # geom_text_repel(aes(label = gsub("^.*$", " ", subdiscipline_1)), # This will force the correct position of the link's right end.
 #                  data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
 #                  segment.curvature = -0.1,
 #                  segment.square = TRUE,
 #                  segment.color = 'grey',
 #                  box.padding = 0.1,
 #                  point.padding = 0.6,
 #                  max.overlaps = Inf,
 #                  nudge_x = 1.3,
 #                  # nudge_y = 0,
 #                  force = 20,
 #                  hjust = 0,
 #                  direction="y",
 #                  na.rm = TRUE
 #  ) +  
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F)  +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none") +
    coord_cartesian(clip = "off")
drop_na: removed one row (1%), 148 rows remaining
Show code
ggsave("figures/entropy_subdiscipline_orig.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_orig.png", width = 8, height = 4)

constructs

Show code
entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(subdiscipline_1, Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(subdiscipline_1) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(subdiscipline_1, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'subdiscipline_1', 'Year'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'subdiscipline_1', 'n_tests'. You can
override using the `.groups` argument.
Show code
entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = subdiscipline_1)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy (constructs)", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2038), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 10))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(subdiscipline_1) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(subdiscipline_1, " Psychology", ""), " (n=", n_tests, ")")),
                  # segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  max.overlaps = Inf,
                  point.padding = 0.6,
                  xlim = c(2022, NA),
                  nudge_x = 2,
                  # nudge_y = 0.0,
                  force = 5,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: removed one row (1%), 149 rows remaining
Show code
ggsave("figures/entropy_subdiscipline_constructs.pdf", width = 8, height = 4)
ggsave("figures/entropy_subdiscipline_constructs.png", width = 8, height = 4)

By instrument type

Show code
entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  group_by(instrument_type_broad, Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  group_by(instrument_type_broad) %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(instrument_type_broad, n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'instrument_type_broad', 'Year'. You can
override using the `.groups` argument.
`summarise()` has grouped output by 'instrument_type_broad', 'n_tests'. You can
override using the `.groups` argument.
Show code
entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = instrument_type_broad)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous(limits = c(1993, 2027), breaks = c(1993, 1998, 2003, 2008, 2013, 2018, 2022)) +
  scale_color_brewer(type = "qual", guide = "none", palette = 3) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = gsub("^.*$", " ", instrument_type_broad)), # This will force the correct position of the link's right end.
                  data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE
  ) +
  geom_text_repel(data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0("  ",str_replace(instrument_type_broad, " Psychology", ""), " (n=", n_tests, ")")),
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 0.15,
                  nudge_y = 0.05,
                  force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE)+
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
Warning: Removed 31 rows containing missing values or values outside the scale range
(`geom_line()`).

Lorenz curves

Show code
test_frequency <- psyctests_info %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  # filter(TestYear >= 1990) %>%
  filter(between(Year, 1993, 2022)) %>%
  group_by(subdiscipline_1, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  arrange(n) %>% 
  mutate(decile = Hmisc::cut2(n, g = 10)) %>% 
  mutate(cumsum = cumsum(n),
         sum = sum(n))
drop_na: removed 51,440 rows (24%), 166,701 rows remaining
`summarise()` has grouped output by 'subdiscipline_1'. You can override using the
`.groups` argument.
Show code
# 
# test_frequency %>% 
#     group_by(subdiscipline_1, decile) %>% 
#     summarise(share = sum(n)/first(sum),
#               median_n = median(n),
#               n_measures = n()) %>% 
#   View()


ggplot(test_frequency, aes(n)) +
  stat_lorenz(desc = F) +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
      theme_minimal() +
    hrbrthemes::scale_x_percent("Cumulative percentage of measures") +
    hrbrthemes::scale_y_percent("Cumulative percentage of measure market share") #+
Show code
#    hrbrthemes::theme_ipsum_rc()

ggplot(test_frequency, aes(n, color = subdiscipline_1)) +
  stat_lorenz(desc = F) +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
      theme_minimal() +
    hrbrthemes::scale_x_percent("Cumulative percentage of measures") +
    hrbrthemes::scale_y_percent("Cumulative percentage of measure market share")

Survival

aggregate stats

Show code
constructs <- psyctests_info %>% 
     # filter(between(first_pub_year, 1950, 2015)) %>%
     unnest(ConstructList) %>% 
     rowwise() %>% 
     mutate(construct = unlist(ConstructList)) %>% 
     select(-ConstructList) %>% 
     filter(between(Year, 1993, 2022)) %>% 
     drop_na(construct) %>% 
     mutate(survival = last_pub_year - first_pub_year,
            survived_five = if_else(survival >= 5, T, F),
            survived_ten = if_else(survival >= 10, T, F)) %>%
     distinct(construct, .keep_all = TRUE)
drop_na: no rows removed
Show code
mean(constructs$survival, na.rm = T)
[1] 6.120457
Show code
sd(constructs$survival, na.rm = T)
[1] 8.788988
Show code
median(constructs$survival, na.rm = T)
[1] 2
Show code
max(constructs$survival, na.rm = T)
[1] 122
Show code
min(constructs$survival, na.rm = T)
[1] 0
Show code
measures <- psyctests_info %>% 
     filter(between(Year, 1993, 2022))  %>% 
     mutate(survival = last_pub_year - first_pub_year,
            survived_five = if_else(survival >= 5, T, F),
            survived_ten = if_else(survival >= 10, T, F)) %>%
     distinct(DOI, .keep_all = TRUE)

mean(measures$survival, na.rm = T)
[1] 6.046785
Show code
sd(measures$survival, na.rm = T)
[1] 8.743927
Show code
median(measures$survival, na.rm = T)
[1] 2
Show code
max(measures$survival, na.rm = T)
[1] 122
Show code
min(measures$survival, na.rm = T)
[1] 0

cumulative sum

all constructs

Show code
cumsum_construct <- constructs %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_all.pdf", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_all.png", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).

first constructs

Show code
first_constructs <- constructs %>%
  distinct(first_construct, .keep_all = TRUE)


cumsum_construct <- first_constructs %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- first_constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- first_constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_first.pdf", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_first.png", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).

measures

Show code
cumsum_all <- measures %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_5 <- measures %>% 
  filter(survived_five == T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n()) %>% 
  arrange(TestYear) %>% 
  mutate(tests = cumsum(tests)) 

cumsum_10 <- measures %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>% 
  distinct(Name_base, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(DOI)) %>% 
  mutate(tests = cumsum(tests)) 

cumsums <- bind_rows(
  "all measures" = cumsum_all,
  "measures in use\n for => 5 years" = cumsum_5,
  "measures in use\n for => 10 years" = cumsum_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of measures") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", tests, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE) +
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed
Warning: Removed 220 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_measures.pdf", width = 8, height = 4)
Warning: Removed 220 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsums_survival_measures.png", width = 8, height = 4)
Warning: Removed 220 rows containing missing values or values outside the scale range
(`geom_line()`).

by first use in PsycInfo instead of publication year in PsycTests

Show code
cumsum_construct <- constructs %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(first_pub_year) %>% 
  group_by(first_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(first_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = first_pub_year) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("First usage logged in PsycInfo",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed
Warning: Removed 186 rows containing missing values or values outside the scale range
(`geom_line()`).

by last use in PsycInfo instead of publication year in PsycTests

Show code
cumsum_construct <- constructs %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 

cumsum_construct_survived_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 
  
cumsum_construct_survived_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(last_pub_year) %>% 
  group_by(last_pub_year) %>% 
  summarise(constructs = n_distinct(DOI)) %>% 
  arrange(last_pub_year) %>% 
  mutate(constructs = cumsum(constructs)) 



cumsums <- bind_rows(
  "all constructs" = cumsum_construct,
  "constructs in use\n for => 5 years" = cumsum_construct_survived_5,
  "constructs in use\n for => 10 years" = cumsum_construct_survived_10,
  .id = "origin"
  ) %>% 
  rename(Year = last_pub_year) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Last usage logged in PsycInfo",
                     limits = c(1993, 2030), 
                     breaks = seq(1993,2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022"),
                     expand = expansion(add = c(0, 1))) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   )  +
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)+
   # theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")  +
    guides(
    x = guide_axis(cap = "both"), # Cap both ends
  )
drop_na: no rows removed

counts

Show code
count_construct <- constructs %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

count_construct_5 <- constructs %>% 
  filter(survived_five == T) %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

count_construct_10 <- constructs %>% 
  filter(survived_ten == T) %>% 
  arrange(TestYear) %>%
  group_by(TestYear) %>% 
  summarise(tests = n_distinct(construct)) %>% 
  arrange(TestYear)

counts <- bind_rows(
  "all constructs" = count_construct,
  "constructs in use\n for => 5 years" = count_construct_5,
  "constructs in use\n for => 10 years" = count_construct_10,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)



ggplot(counts, aes(Year, tests, color = origin)) + 
  geom_line() +
  geom_vline(xintercept = 2016, linetype = 'dashed') +
  scale_y_continuous("Number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2030), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) +
  geom_text_repel(data = counts %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  lineheight = .9,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.5,
                  nudge_y = -15,
                  force = 1,
                  hjust = 0,
                  direction="y",
                  na.rm = F) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/counts_survival.pdf", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/counts_survival.png", width = 8, height = 4)
Warning: Removed 187 rows containing missing values or values outside the scale range
(`geom_line()`).

Robustness checks

Show code
all_constructs_over_time <- records_wide %>% select(subdiscipline_1, DOI, TestYear, ConstructList) %>% 
    unnest(ConstructList) %>% 
    rowwise() %>% 
    mutate(construct = unlist(ConstructList)) %>% 
  select(-ConstructList)

cumsum_all_constructs <- all_constructs_over_time %>% 
  arrange(TestYear) %>% 
  distinct(construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(construct)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs)) 


cumsum_construct <- records_wide %>% 
  arrange(TestYear) %>% 
  distinct(first_construct, .keep_all = T) %>% 
  group_by(TestYear) %>% 
  summarise(constructs = n_distinct(first_construct)) %>% 
  arrange(TestYear) %>% 
  mutate(constructs = cumsum(constructs))

cumsums <- bind_rows(
  "first constructs" = cumsum_construct,
  "all constructs" = cumsum_all_constructs,
  .id = "origin"
  ) %>% 
  rename(Year = TestYear) %>% 
  filter(Year <= 2022)


ggplot(cumsums, aes(Year, constructs, color = origin)) + 
  geom_line() +
  scale_y_continuous("Cumulative number of constructs") +
  scale_x_continuous("Publication year in APA PsycTests",
                     limits = c(1993, 2027), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = cumsums %>% drop_na() %>% group_by(origin) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ", origin, "\n (n = ", constructs, ")")),
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  nudge_x = 1.2,
                  hjust = 0,
                  na.rm = TRUE)
drop_na: no rows removed
Warning: Removed 170 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsum_all_vs_first.pdf", width = 8, height = 4)
Warning: Removed 170 rows containing missing values or values outside the scale range
(`geom_line()`).
Show code
ggsave("figures/cumsum_all_vs_first.png", width = 8, height = 4)
Warning: Removed 170 rows containing missing values or values outside the scale range
(`geom_line()`).

All or first constructs

For some 20% of tests, two or more constructs were coded. In most plots, we simply use the first construct for each test.
Show code
records_wide <- records_wide %>% 
  rowwise() %>% 
  mutate(constructs_n = length(ConstructList)) %>% 
  ungroup()

table(records_wide$constructs_n)

    1     2     3     4     5     6     7     8     9    10    12 
56645 12421  2134   326    85    37    13    14     3     3     3 
   15    16    17    21    22    23    25 
    1     1     1     1     1     1     2 
Show code
round(prop.table(table(records_wide$constructs_n)),2)

   1    2    3    4    5    6    7    8    9   10   12   15   16   17 
0.79 0.17 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
  21   22   23   25 
0.00 0.00 0.00 0.00 
Show code
ggplot(records_wide, aes(constructs_n)) + 
  geom_bar()

Expanding the entropy calculation to all coded constructs makes little difference.

Show code
all_constructs_over_time <- psyctests_info %>% select(subdiscipline_1, DOI, TestYear, Year, ConstructList, usage_count) %>% 
    unnest(ConstructList) %>% 
    rowwise() %>% 
    mutate(construct = unlist(ConstructList)) %>% 
  select(-ConstructList)

entropy_all_constructs <- all_constructs_over_time %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(construct) %>% 
  group_by(Year, construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(construct)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'n_tests'. You can override using the
`.groups` argument.
Show code
byconstruct_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(first_construct) %>% 
  group_by(Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(entropy = entropy(n),,
            norm_entropy = calc_norm_entropy(n),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.`summarise()` has grouped output by 'n_tests'. You can override using the `.groups` argument.
Show code
entropy_by_year <- bind_rows(# "all tests" = all_entropy_by_year,
                             "all constructs" = entropy_all_constructs,
                             # "by name base" = bybase_entropy_by_year,
                             "first constructs" = byconstruct_entropy_by_year,
                             .id = "version")

entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = version)) +
  geom_line(size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2027), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +

  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",version)), # "\n (n = ", n_tests, ")"
                  segment.curvature = -0.5,
                  segment.square = TRUE,
                  segment.color = 'grey', 
                  xlim = c(2023, 2030),
                  nudge_x = 1.14,
                  lineheight = .9,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
   theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
Show code
ggsave("figures/entropy_all_vs_first.pdf", width = 8, height = 4)
ggsave("figures/entropy_all_vs_first.png", width = 8, height = 4)

Unbiased estimators of Shannon entropy

Does not make much of a difference.
Show code
byorig_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
  drop_na(Test) %>% 
  group_by(Year, Test) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(norm_entropy = calc_norm_entropy(n),
            norm_entropy_MM = entropy(n, method = "MM") / log(n()),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: removed 45,635 rows (22%), 158,814 rows remaining
`summarise()` has grouped output by 'Year'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'n_tests'. You can override using the
`.groups` argument.
Show code
1 - (byorig_entropy_by_year$n_tests[1]/ records_wide %>% filter(between(TestYear, 1993, 2022)) %>% 
       mutate(Test = if_else(test_type == "Original", DOI, original_test_DOI)) %>% 
       summarise(n_distinct(Test)))
  n_distinct(Test)
1        0.5517793
Show code
byconstruct_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(first_construct) %>% 
  group_by(Year, Test = first_construct) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(norm_entropy = calc_norm_entropy(n),
            norm_entropy_MM = entropy(n, method = "MM") / log(n()),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.`summarise()` has grouped output by 'n_tests'. You can override using the `.groups` argument.
Show code
all_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  drop_na(DOI) %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(norm_entropy = calc_norm_entropy(n),
            norm_entropy_MM = entropy(n, method = "MM") / log(n()),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
drop_na: no rows removed
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.`summarise()` has grouped output by 'n_tests'. You can override using the `.groups` argument.
Show code
original_entropy_by_year <- psyctests_info %>% 
  filter(between(Year, 1993, 2022)) %>% 
  filter(test_type == "Original") %>% 
  group_by(Year, Test = DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T)) %>% 
  ungroup() %>% 
  mutate(n_tests = n_distinct(Test)) %>% 
  group_by(n_tests, Year) %>% 
  filter(n > 0) %>% 
  summarise(norm_entropy = calc_norm_entropy(n),
            norm_entropy_MM = entropy(n, method = "MM") / log(n()),
            n = sum(n),
            diff_tests = n()) %>% 
  ungroup()
`summarise()` has grouped output by 'Year'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'n_tests'. You can override using the
`.groups` argument.
Show code
entropy_by_year <- bind_rows(# "all tests" = all_entropy_by_year,
  "measures" = original_entropy_by_year,
  "with translations\n and revisions" = byorig_entropy_by_year,
  # "by name base" = bybase_entropy_by_year,
  "constructs" = byconstruct_entropy_by_year,
  .id = "version")


plot_entropy <- entropy_by_year %>% 
  ggplot(., aes(Year, norm_entropy, color = version)) +
  geom_line(size = 0.7, linetype = "dashed") +
  geom_line(aes(y = norm_entropy_MM), size = 0.7) +
  scale_y_continuous("Normalized Shannon Entropy", limits = c(0, 1), labels = scales::percent) +
  # geom_line(aes(y = log(n)), color = 'red') +
  scale_x_continuous("Usage year as coded in APA PsycInfo", limits = c(1993, 2027), 
                     breaks = seq(1993, 2022, by = 1),
                     labels = c(1993, "", "", "", "", 1998, "", "", "", "", 2003, "", "", "", "", 2008, "", "", "", "", 2013, "", "", "", "", 2018, "", "", "", "2022")) +
  # ggtitle(str_c(n_distinct(tests_by_year$Test), " measures tracked in PsycInfo")) +
  # annotate("text", x = 1993, y = 1, label = "- each used once", 
  #       size = 3.3, vjust = 0.3, hjust = 0.05) +
  # annotate("text", x = 1993, y = 0, label = "- all used one", 
  #       size = 3.3,  vjust = 0.3, hjust = 0.05) +
  
  scale_color_brewer(type = "qual", guide = "none", palette = 2) +
  # ggrepel::geom_text_repel(
  #   aes(label = str_replace(subdiscipline, " Psychology", "")), data = entropy_by_year %>% filter(Year == max(Year, na.rm = T)),
  #   size = 4, hjust = 1,
  #   ) + 
  geom_text_repel(aes(label = str_replace_all(version, "[a-z=0-9/() ]+", " ")), # This will force the correct position of the link's right end.
                  data = entropy_by_year %>% drop_na() %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  lineheight = .9,
                  segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.15,
                  nudge_y = 0.03,
                  force = 0.9,
                  hjust = 0,
                  direction="y",
                  size = 3.3,
                  na.rm = TRUE) +
  geom_text_repel(data = entropy_by_year %>% drop_na() %>% group_by(version) %>% filter(Year == max(Year, na.rm = T)),
                  aes(label = paste0(" ",version)), # "\n (n = ", n_tests, ")"
                  segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1.15,
                  nudge_y = 0.0,
                  lineheight = .9,
                  force = 0.9,
                  size = 3.3,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title.position = "plot",
        plot.title = element_text(face="bold"),
        legend.position = "none")
drop_na: no rows removed
drop_na: no rows removed
Show code
plot_entropy

Entropy by classificaiton

Show code
entropy_by_class <- psyctests_info %>% 
  group_by(classification_1, DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T),
            parent = case_when(
                    n > 50 ~ "",
                    n > 20 ~ "used 21-50 times",
                    n > 5 ~ "used 6-20 times",
                    TRUE ~ "used 1-5 times")) %>% 
  group_by(classification_1) %>% 
  # filter(n > 0) %>% 
  summarise(
    entropy = entropy(n),
    norm_entropy = calc_norm_entropy(n)) %>% 
  arrange(norm_entropy)
`summarise()` has grouped output by 'classification_1'. You can override using
the `.groups` argument.
Show code
kable(entropy_by_class)
classification_1 entropy norm_entropy
Military Personnel, Adjustment, and Training 2.383022 0.4815393
Anxiety and Depression 3.193979 0.4847495
Aptitude and Achievement 2.316471 0.5135321
Neuropsychological Assessment 3.279933 0.5288051
Intelligence 2.257675 0.5350566
General Assessment Tools 2.473627 0.5371412
Functional Status and Adaptive Behavior 3.530552 0.5868450
Trauma, Stress, and Coping 4.183437 0.5982308
Emotional States, Emotional Responses, and Motivation 4.248094 0.6077965
Mental Health/Illness Related Assessment 4.840355 0.6092446
Addiction, Gambling, and Substance Abuse/Use 4.368824 0.6220855
Cognitive Processes, Memory, and Decision Making 4.430633 0.6357767
Sports, Recreation, and Leisure 3.819379 0.6452859
Treatment, Rehabilitation, and Therapeutic Processes 5.157328 0.6514703
Legal and Forensic Evaluation 3.839622 0.6557771
Personality 5.270944 0.6880603
Development and Aging 5.170306 0.7010938
Physical Health/Illness Related Assessment 5.519613 0.7074794
Perceptual, Motor, and Sensory Processing 4.027230 0.7094247
Attitudes, Interests, Values, and Expectancies 4.964110 0.7109935
Communication, Language, and Verbal Processing 4.017755 0.7210416
Family Relationships and Parenting 5.284498 0.7324078
Sex, Gender Roles, and Sexual Behavior 4.922535 0.7417984
Social, Group, and Interpersonal Relationships 5.558756 0.7492815
Religious and Political Beliefs 4.667158 0.7656319
Consumer Behavior, Marketing, and Advertising 4.587740 0.7679755
Culture, Racial, and Ethnic Identity 4.787330 0.7693485
Human Factors and Environmental Engineering 4.212094 0.7776987
Human-Computer Interaction 4.726558 0.7843782
Organizational, Occupational, and Career Development 6.103051 0.7895549
Education, Teaching, and Student Characteristics 6.281884 0.7986947

Entropy by instrument type

Show code
psyctests_info %>% 
  group_by(instrument_type_broad, DOI) %>% 
  summarise(n = sum(usage_count, na.rm = T),
            parent = case_when(
                    n > 50 ~ "",
                    n > 20 ~ "used 21-50 times",
                    n > 5 ~ "used 6-20 times",
                    TRUE ~ "used 1-5 times")) %>% 
  group_by(instrument_type_broad) %>% 
  filter(n > 0) %>% 
  summarise(
    entropy = entropy(n),
    norm_entropy = calc_norm_entropy(n)) %>% 
  arrange(norm_entropy) %>% 
  kable()
`summarise()` has grouped output by 'instrument_type_broad'. You can override
using the `.groups` argument.
instrument_type_broad entropy norm_entropy
other-rating 4.454314 0.6201748
task 4.041814 0.6228520
NA 3.717207 0.6283096
test 4.704202 0.6359652
questionnaire 7.207948 0.7060498

Tests by exact same name

Show code
same_name_tests <- records_wide %>% group_by(name_psycinfo) %>% filter(n_distinct(DOI) > 1) %>% summarise(n = n())
nrow(same_name_tests)
[1] 1230
Show code
mean(same_name_tests$n)
[1] 2.44878
Show code
same_name_tests %>% arrange(desc(n)) %>% head(20)
# A tibble: 20 × 2
   name_psycinfo                                n
   <chr>                                    <int>
 1 theory of planned behavior questionnaire    18
 2 job satisfaction scale                      15
 3 self-efficacy scale                         11
 4 self-control scale                          10
 5 behavioral intentions measure                9
 6 parental involvement scale                   9
 7 procedural justice scale                     9
 8 social distance scale                        9
 9 marital satisfaction measure                 8
10 perceived behavioral control measure         8
11 religiosity measure                          8
12 religiosity scale                            8
13 social capital measure                       8
14 social support scale                         8
15 victimization measure                        8
16 attribution questionnaire                    7
17 delinquency measure                          7
18 fear of crime scale                          7
19 food frequency questionnaire                 7
20 life satisfaction scale                      7