Salivary and serum immunoassay comparison: Multivariate Imputation of Hormones

Here, I rerun the models documented in Arslan et al. (2023). Not within spitting distance: Salivary immunoassays of estradiol have subpar validity for predicting cycle phase. The changes are: - switched models to be multivariate with residual correlations between outcomes. - dropped one outcome (fertile_lh) which was a side interest and led to many rows being dropped because of missingness - added an outcome (log LH, luteinising hormone). It wasn’t part of the original set and since I needed it for a different project, I added it here.

The imputed values changed only very slightly. I’ve documented the mistake of not using mutually adjusted multivariate predictors here.

library(tidyverse)

Warning: package 'dplyr' was built under R version 4.5.2

── Attaching core tidyverse packages ────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(codebook)
library(brms)

Loading required package: Rcpp
Loading 'brms' package (version 2.23.0). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

Attaching package: 'brms'

The following object is masked from 'package:stats':

    ar

library(rio)

plot_curve <- function(model, method = "fitted", resp = NULL) {
  cond <- tibble(.rows = 1)
  cond[[names(model$data)[2]]] <- "none"
  if (!is.null(resp)) {
    plot(conditional_effects(model, method = method,
                  conditions = cond, 
                  spaghetti = TRUE, nsamples = 100, resp = resp), 
                points = TRUE, 
                point_args = list(alpha = 0.05),
                spaghetti_args = list(alpha = 0.01, colour = '#FF000007'), 
                line_args = list(size = 0), 
                plot = FALSE)[[1]]
  } else {
    plot(conditional_effects(model, method = method,
                  conditions = cond, 
                  spaghetti = TRUE, nsamples = 100), 
                points = TRUE, 
                point_args = list(alpha = 0.05),
                spaghetti_args = list(alpha = 0.01, colour = '#FF000007'), 
                line_args = list(size = 0), 
                plot = FALSE)[[1]]
  }
}

rmse <- function(error) { sqrt(mean(error^2, na.rm = TRUE)) }
rmse_brms <- function(model) {
  resid <- residuals(model, summary = F)
  rmse_samples <- apply(resid, 1, rmse)
  rmse_ci <- rstantools::posterior_interval(as.matrix(rmse_samples), prob = 0.95)
  sprintf("%.2f [%.2f;%.2f]", mean(rmse_samples), rmse_ci[1], rmse_ci[2]) 
}
mae <- function(error) { mean(abs(error), na.rm = TRUE) }
mae_brms <- function(model) {
  resid <- residuals(model, summary = F)
  mae_samples <- apply(resid, 1, mae)
  mae_ci <- rstantools::posterior_interval(as.matrix(mae_samples), prob = 0.95)
  sprintf("%.2f [%.2f;%.2f]", mean(mae_samples), mae_ci[1], mae_ci[2]) 
}

loo_ci <- function(model) {
  loo_R <- loo_R2(model, re_formula = NA)
  loo_R <- sqrt(loo_R)
  sprintf("%.2f [%.2f;%.2f]", loo_R[,"Estimate"], loo_R[,"Q2.5"], loo_R[,"Q97.5"]) 
}

biocycle <- readRDS("biocycle.rds")
options(mc.cores = parallel::detectCores(), brms.backend = "cmdstanr", brms.file_refit = "on_change")
rstan::rstan_options(auto_write = TRUE)

biocycle <- biocycle %>% filter(between(cycle_length, 20, 35))
biocycle$logE_P <- log(biocycle$E_P)
biocycle$logE <- log(biocycle$total_estradiol)
biocycle$logP <- log(biocycle$progesterone)
biocycle$logT <- log(biocycle$Testo)
biocycle$logFE <- log(biocycle$estradiol)
biocycle$logFT <- log(biocycle$`Free T (ng/dL)`)
biocycle$logLH <- log(biocycle$LH)
biocycle$fertile_lh <- NULL

Days

bc_days <- data.frame(
  bc_day = c(-28:-1, -29:-40),
  prc_stirn_bc = c(.01, .01, .02, .03, .05, .09, .16, .27, .38, .48, .56, .58, .55, .48, .38, .28, .20, .14, .10, .07, .06, .04, .03, .02, .01, .01, .01, .01, rep(.01, times = 12)),
#                   rep(.01, times = 70)), # gangestad uses .01 here, but I think such cases are better thrown than kept, since we might simply have missed a mens
  prc_wcx_bc = c(.000, .000, .001, .002, .004, .009, .018, .032, .050, .069, .085, .094, .093, .085, .073, .059, .047, .036, .028, .021, .016, .013, .010, .008, .007, .006, .005, .005, rep(.005, times = 12))
) %>% arrange(bc_day)

fc_days <- data.frame(
  fc_day = c(0:39),
  prc_stirn_fc = c(.01, .01, .02, .03, .05, .09, .16, .27, .38, .48, .56, .58, .55, .48, .38, .28, .20, .14, .10, .07, .06, .04, .03, .02, .01, .01, .01, .01, rep(.01, times = 12)),
#                   rep(.01, times = 70)), # gangestad uses .01 here, but I think such cases are better thrown than kept, since we might simply have missed a mens
  prc_wcx_fc = c(.000, .000, .001, .002, .004, .009, .018, .032, .050, .069, .085, .094, .093, .085, .073, .059, .047, .036, .028, .021, .016, .013, .010, .008, .007, .006, .005, .005, rep(.005, times = 12))
)

lh_days <- tibble(
    conception_risk_lh = c(rep(0,8), 0.00, 0.01, 0.02, 0.06, 0.16, 0.20, 0.25, 0.24, 0.10, 0.02, 0.02, rep(0,12)),
    lh_day = -15:15
  ) %>% 
    mutate(fertile_lh = conception_risk_lh/max(conception_risk_lh)*0.25/0.31)
biocycle <- biocycle %>% left_join(lh_days %>% select(lh_day, fertile_lh), by = "lh_day")

Multivariate Imputation

Backward counting

bf_e_bc <- bf(logE ~ s(bc_day) + (1 | id) + (1|id:cycle))
bf_fe_bc <- bf(logFE ~ s(bc_day) + (1 | id) + (1|id:cycle))
bf_p_bc <- bf(logP ~ s(bc_day) + (1 | id) + (1|id:cycle))
bf_t_bc <- bf(logT ~ s(bc_day) + (1 | id) + (1|id:cycle))
bf_ft_bc <- bf(logFT ~ s(bc_day) + (1 | id) + (1|id:cycle))
bf_lh_bc <- bf(logLH ~ s(bc_day) + (1 | id) + (1|id:cycle))

mod_mv_bc <- brm(
  bf_e_bc + bf_fe_bc + bf_p_bc + bf_t_bc + bf_ft_bc + bf_lh_bc + set_rescor(TRUE),
  data = biocycle,
  control = list(adapt_delta = 0.95, max_treedepth = 20),
  file = 'models/mod_mv_bc'
)

Warning: Rows containing NAs were excluded from the model.

mod_mv_bc

 Family: MV(gaussian, gaussian, gaussian, gaussian, gaussian, gaussian) 
  Links: mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity 
Formula: logE ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
         logFE ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
         logP ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
         logT ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
         logFT ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
         logLH ~ s(bc_day) + (1 | id) + (1 | id:cycle) 
   Data: biocycle (Number of observations: 3394) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Smoothing Spline Hyperparameters:
                     Estimate Est.Error l-95% CI u-95% CI Rhat
sds(logE_sbc_day_1)      5.05      1.28     3.20     8.11 1.00
sds(logFE_sbc_day_1)     5.44      1.33     3.51     8.55 1.01
sds(logP_sbc_day_1)      5.80      1.47     3.78     9.53 1.00
sds(logT_sbc_day_1)      1.03      0.34     0.59     1.89 1.00
sds(logFT_sbc_day_1)     1.10      0.36     0.60     1.97 1.00
sds(logLH_sbc_day_1)     5.90      1.48     3.74     9.47 1.00
                     Bulk_ESS Tail_ESS
sds(logE_sbc_day_1)      1013     1307
sds(logFE_sbc_day_1)      798     1374
sds(logP_sbc_day_1)       901     1286
sds(logT_sbc_day_1)       754     1424
sds(logFT_sbc_day_1)     1044     1693
sds(logLH_sbc_day_1)      930     1561

Multilevel Hyperparameters:
~id (Number of levels: 246) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.21      0.01     0.18     0.23 1.00
sd(logFE_Intercept)     0.20      0.02     0.17     0.23 1.00
sd(logP_Intercept)      0.42      0.03     0.37     0.47 1.00
sd(logT_Intercept)      0.34      0.02     0.31     0.38 1.01
sd(logFT_Intercept)     0.39      0.02     0.36     0.43 1.03
sd(logLH_Intercept)     0.29      0.02     0.25     0.33 1.01
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       904     1168
sd(logFE_Intercept)      825     1023
sd(logP_Intercept)      1169     1888
sd(logT_Intercept)       292      649
sd(logFT_Intercept)      165      270
sd(logLH_Intercept)      937     2557

~id:cycle (Number of levels: 448) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.01      0.00     0.00     0.02 1.01
sd(logFE_Intercept)     0.01      0.01     0.00     0.02 1.03
sd(logP_Intercept)      0.16      0.03     0.11     0.21 1.01
sd(logT_Intercept)      0.01      0.01     0.00     0.02 1.01
sd(logFT_Intercept)     0.03      0.00     0.02     0.04 1.02
sd(logLH_Intercept)     0.12      0.03     0.05     0.17 1.02
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       185      429
sd(logFE_Intercept)      136      433
sd(logP_Intercept)       688     1041
sd(logT_Intercept)       221      698
sd(logFT_Intercept)      303      636
sd(logLH_Intercept)      468      573

Regression Coefficients:
                Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
logE_Intercept      4.46      0.02     4.43     4.49 1.01     1126
logFE_Intercept     0.43      0.02     0.40     0.47 1.00     1603
logP_Intercept      7.30      0.03     7.24     7.36 1.01      977
logT_Intercept      3.31      0.02     3.26     3.35 1.02      145
logFT_Intercept    -0.88      0.03    -0.93    -0.83 1.02      166
logLH_Intercept     1.83      0.02     1.79     1.88 1.00     1509
logE_sbc_day_1    -11.70      1.56   -14.82    -8.63 1.00      908
logFE_sbc_day_1   -12.73      1.76   -16.24    -9.33 1.00      920
logP_sbc_day_1    -19.59      2.25   -24.08   -15.34 1.00     2294
logT_sbc_day_1     -0.95      0.51    -1.96     0.03 1.00     1117
logFT_sbc_day_1    -0.84      0.56    -1.92     0.21 1.00     1131
logLH_sbc_day_1     3.74      2.15    -0.34     7.95 1.00     1565
                Tail_ESS
logE_Intercept      1898
logFE_Intercept     2357
logP_Intercept      1699
logT_Intercept       309
logFT_Intercept      293
logLH_Intercept     2376
logE_sbc_day_1      1671
logFE_sbc_day_1     1430
logP_sbc_day_1      2651
logT_sbc_day_1      2162
logFT_sbc_day_1     2079
logLH_sbc_day_1     2497

Further Distributional Parameters:
            Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sigma_logE      0.50      0.01     0.49     0.51 1.00     1691
sigma_logFE     0.56      0.01     0.55     0.58 1.00     1716
sigma_logP      0.64      0.01     0.62     0.65 1.00     4315
sigma_logT      0.18      0.00     0.17     0.18 1.00     2678
sigma_logFT     0.20      0.00     0.19     0.20 1.00     1734
sigma_logLH     0.65      0.01     0.63     0.66 1.00     4606
            Tail_ESS
sigma_logE      2336
sigma_logFE     2409
sigma_logP      2912
sigma_logT      2504
sigma_logFT     2784
sigma_logLH     2765

Residual Correlations: 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
rescor(logE,logFE)      0.99      0.00     0.99     0.99 1.01
rescor(logE,logP)      -0.13      0.02    -0.16    -0.09 1.00
rescor(logFE,logP)     -0.14      0.02    -0.18    -0.11 1.00
rescor(logE,logT)       0.24      0.02     0.21     0.28 1.00
rescor(logFE,logT)      0.23      0.02     0.19     0.26 1.00
rescor(logP,logT)      -0.03      0.02    -0.06     0.01 1.00
rescor(logE,logFT)      0.25      0.02     0.22     0.29 1.00
rescor(logFE,logFT)     0.30      0.02     0.26     0.33 1.00
rescor(logP,logFT)     -0.09      0.02    -0.13    -0.06 1.00
rescor(logT,logFT)      0.85      0.01     0.84     0.86 1.00
rescor(logE,logLH)      0.35      0.02     0.32     0.38 1.00
rescor(logFE,logLH)     0.35      0.02     0.32     0.39 1.00
rescor(logP,logLH)     -0.15      0.02    -0.18    -0.12 1.00
rescor(logT,logLH)      0.33      0.02     0.29     0.36 1.00
rescor(logFT,logLH)     0.33      0.02     0.30     0.36 1.00
                    Bulk_ESS Tail_ESS
rescor(logE,logFE)       699     1602
rescor(logE,logP)       7153     3481
rescor(logFE,logP)      7223     3393
rescor(logE,logT)       2872     2924
rescor(logFE,logT)      2784     3042
rescor(logP,logT)       3699     2826
rescor(logE,logFT)      2730     2989
rescor(logFE,logFT)     2831     3015
rescor(logP,logFT)      3737     3035
rescor(logT,logFT)      1431     2459
rescor(logE,logLH)      3565     3019
rescor(logFE,logLH)     3495     2921
rescor(logP,logLH)      5476     3126
rescor(logT,logLH)      4368     3190
rescor(logFT,logLH)     4660     3217

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

# Predict
preds_bc <- predict(mod_mv_bc, newdata = bc_days, re_formula = NA)

# Assign to bc_days using named list elements for safety
bc_days$mv_est_estradiol_bc <- round(preds_bc[, "Estimate", "logE"], 2)
bc_days$mv_est_free_estradiol_bc <- round(preds_bc[, "Estimate", "logFE"], 2)
bc_days$mv_est_progesterone_bc <- round(preds_bc[, "Estimate", "logP"], 2)
bc_days$mv_est_testosterone_bc <- round(preds_bc[, "Estimate", "logT"], 2)
bc_days$mv_est_free_testosterone_bc <- round(preds_bc[, "Estimate", "logFT"], 2)
bc_days$mv_est_luteinising_bc <- round(preds_bc[, "Estimate", "logLH"], 2)
rio::export(bc_days ,"merge_files/bc_days_mv.rds")
rio::export(bc_days ,"merge_files/bc_days_mv.tsv")
rio::export(bc_days ,"merge_files/bc_days_mv.sav")

Forward counting

bf_e_fc <- bf(logE ~ s(fc_day) + (1 | id) + (1|id:cycle))
bf_fe_fc <- bf(logFE ~ s(fc_day) + (1 | id) + (1|id:cycle))
bf_p_fc <- bf(logP ~ s(fc_day) + (1 | id) + (1|id:cycle))
bf_t_fc <- bf(logT ~ s(fc_day) + (1 | id) + (1|id:cycle))
bf_ft_fc <- bf(logFT ~ s(fc_day) + (1 | id) + (1|id:cycle))
bf_lh_fc <- bf(logLH ~ s(fc_day) + (1 | id) + (1|id:cycle))

mod_mv_fc <- brm(
  bf_e_fc + bf_fe_fc + bf_p_fc + bf_t_fc + bf_ft_fc + bf_lh_fc + set_rescor(TRUE),
  data = biocycle,
  control = list(adapt_delta = 0.95, max_treedepth = 20),
  file = 'models/mod_mv_fc'
)

Warning: Rows containing NAs were excluded from the model.

mod_mv_fc

 Family: MV(gaussian, gaussian, gaussian, gaussian, gaussian, gaussian) 
  Links: mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity 
Formula: logE ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
         logFE ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
         logP ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
         logT ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
         logFT ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
         logLH ~ s(fc_day) + (1 | id) + (1 | id:cycle) 
   Data: biocycle (Number of observations: 3394) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Smoothing Spline Hyperparameters:
                     Estimate Est.Error l-95% CI u-95% CI Rhat
sds(logE_sfc_day_1)      1.82      0.57     1.03     3.21 1.00
sds(logFE_sfc_day_1)     2.03      0.62     1.14     3.51 1.00
sds(logP_sfc_day_1)      3.84      1.10     2.35     6.47 1.00
sds(logT_sfc_day_1)      0.60      0.20     0.33     1.09 1.00
sds(logFT_sfc_day_1)     0.71      0.24     0.39     1.28 1.00
sds(logLH_sfc_day_1)     3.22      0.91     1.94     5.42 1.00
                     Bulk_ESS Tail_ESS
sds(logE_sfc_day_1)      1648     2089
sds(logFE_sfc_day_1)     1817     2336
sds(logP_sfc_day_1)      1418     2690
sds(logT_sfc_day_1)      1730     2478
sds(logFT_sfc_day_1)     1741     2648
sds(logLH_sfc_day_1)     1636     2430

Multilevel Hyperparameters:
~id (Number of levels: 246) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.16      0.01     0.14     0.19 1.01
sd(logFE_Intercept)     0.15      0.01     0.12     0.18 1.01
sd(logP_Intercept)      0.38      0.03     0.32     0.44 1.00
sd(logT_Intercept)      0.34      0.02     0.31     0.38 1.01
sd(logFT_Intercept)     0.39      0.02     0.36     0.43 1.01
sd(logLH_Intercept)     0.30      0.02     0.26     0.35 1.00
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       641     1521
sd(logFE_Intercept)      486     1572
sd(logP_Intercept)      1074     2253
sd(logT_Intercept)       327      680
sd(logFT_Intercept)      354      558
sd(logLH_Intercept)     1440     2111

~id:cycle (Number of levels: 448) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.01      0.00     0.00     0.02 1.01
sd(logFE_Intercept)     0.01      0.01     0.00     0.02 1.01
sd(logP_Intercept)      0.20      0.03     0.13     0.26 1.01
sd(logT_Intercept)      0.01      0.00     0.00     0.02 1.00
sd(logFT_Intercept)     0.03      0.00     0.02     0.04 1.01
sd(logLH_Intercept)     0.11      0.03     0.03     0.16 1.01
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       274      537
sd(logFE_Intercept)      217      616
sd(logP_Intercept)       619      994
sd(logT_Intercept)       500      980
sd(logFT_Intercept)      397      923
sd(logLH_Intercept)      407      424

Regression Coefficients:
                Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
logE_Intercept      4.46      0.01     4.43     4.49 1.00     2211
logFE_Intercept     0.44      0.01     0.41     0.46 1.00     3284
logP_Intercept      7.30      0.03     7.24     7.36 1.00     1734
logT_Intercept      3.31      0.02     3.27     3.36 1.01      167
logFT_Intercept    -0.88      0.02    -0.93    -0.83 1.01      187
logLH_Intercept     1.83      0.02     1.79     1.88 1.00     2073
logE_sfc_day_1     -0.72      1.44    -3.45     2.20 1.00     1078
logFE_sfc_day_1    -0.81      1.61    -3.91     2.43 1.00     1078
logP_sfc_day_1    -10.49      2.78   -15.86    -5.23 1.00     2593
logT_sfc_day_1      0.35      0.50    -0.59     1.33 1.00     1786
logFT_sfc_day_1     0.50      0.55    -0.60     1.59 1.00     1806
logLH_sfc_day_1     1.59      2.24    -2.76     6.09 1.00     2341
                Tail_ESS
logE_Intercept      2543
logFE_Intercept     3057
logP_Intercept      1851
logT_Intercept       397
logFT_Intercept      388
logLH_Intercept     2578
logE_sfc_day_1      1587
logFE_sfc_day_1     1738
logP_sfc_day_1      3263
logT_sfc_day_1      2476
logFT_sfc_day_1     2779
logLH_sfc_day_1     2446

Further Distributional Parameters:
            Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sigma_logE      0.58      0.01     0.57     0.60 1.00     1481
sigma_logFE     0.66      0.01     0.64     0.67 1.00     1471
sigma_logP      0.81      0.01     0.79     0.83 1.00     5363
sigma_logT      0.18      0.00     0.18     0.19 1.00     3504
sigma_logFT     0.20      0.00     0.20     0.21 1.00     2900
sigma_logLH     0.70      0.01     0.68     0.71 1.00     4020
            Tail_ESS
sigma_logE      2196
sigma_logFE     2637
sigma_logP      3016
sigma_logT      2737
sigma_logFT     2922
sigma_logLH     2934

Residual Correlations: 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
rescor(logE,logFE)      0.99      0.00     0.99     0.99 1.00
rescor(logE,logP)      -0.02      0.02    -0.05     0.02 1.00
rescor(logFE,logP)     -0.04      0.02    -0.07    -0.00 1.00
rescor(logE,logT)       0.26      0.02     0.23     0.30 1.00
rescor(logFE,logT)      0.25      0.02     0.21     0.28 1.00
rescor(logP,logT)      -0.03      0.02    -0.07     0.00 1.00
rescor(logE,logFT)      0.27      0.02     0.24     0.31 1.00
rescor(logFE,logFT)     0.31      0.02     0.28     0.34 1.00
rescor(logP,logFT)     -0.12      0.02    -0.16    -0.09 1.00
rescor(logT,logFT)      0.86      0.01     0.85     0.87 1.00
rescor(logE,logLH)      0.36      0.02     0.33     0.39 1.00
rescor(logFE,logLH)     0.37      0.02     0.34     0.40 1.00
rescor(logP,logLH)     -0.17      0.02    -0.21    -0.14 1.00
rescor(logT,logLH)      0.39      0.02     0.36     0.42 1.00
rescor(logFT,logLH)     0.39      0.02     0.36     0.42 1.00
                    Bulk_ESS Tail_ESS
rescor(logE,logFE)      1025     1817
rescor(logE,logP)       7752     3024
rescor(logFE,logP)      7813     3029
rescor(logE,logT)       3810     2963
rescor(logFE,logT)      3771     3114
rescor(logP,logT)       3677     3120
rescor(logE,logFT)      3562     3055
rescor(logFE,logFT)     3789     3265
rescor(logP,logFT)      4093     2973
rescor(logT,logFT)      1439     2328
rescor(logE,logLH)      3658     3174
rescor(logFE,logLH)     3718     3154
rescor(logP,logLH)      6344     2936
rescor(logT,logLH)      4363     2922
rescor(logFT,logLH)     4040     2807

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

preds_fc <- predict(mod_mv_fc, newdata = fc_days, re_formula = NA)

fc_days$mv_est_estradiol_fc <- round(preds_fc[, "Estimate", "logE"], 2)
fc_days$mv_est_free_estradiol_fc <- round(preds_fc[, "Estimate", "logFE"], 2)
fc_days$mv_est_progesterone_fc <- round(preds_fc[, "Estimate", "logP"], 2)
fc_days$mv_est_testosterone_fc <- round(preds_fc[, "Estimate", "logT"], 2)
fc_days$mv_est_free_testosterone_fc <- round(preds_fc[, "Estimate", "logFT"], 2)
fc_days$mv_est_luteinising_fc <- round(preds_fc[, "Estimate", "logLH"], 2)
rio::export(fc_days ,"merge_files/fc_days_mv.rds")
rio::export(fc_days ,"merge_files/fc_days_mv.tsv")
rio::export(fc_days ,"merge_files/fc_days_mv.sav")

Relative to LH surge

# Filter data
biocycle_lh <- biocycle %>% filter(between(lh_day, -20, 14))

bf_e_lh <- bf(logE ~ s(lh_day) + (1 | id) + (1|id:cycle))
bf_fe_lh <- bf(logFE ~ s(lh_day) + (1 | id) + (1|id:cycle))
bf_p_lh <- bf(logP ~ s(lh_day) + (1 | id) + (1|id:cycle))
bf_t_lh <- bf(logT ~ s(lh_day) + (1 | id) + (1|id:cycle))
bf_ft_lh <- bf(logFT ~ s(lh_day) + (1 | id) + (1|id:cycle))
bf_lh_lh <- bf(logLH ~ s(lh_day) + (1 | id) + (1|id:cycle))

# PBFW not modeled in LH model

mod_mv_lh <- brm(
  bf_e_lh + bf_fe_lh + bf_p_lh + bf_t_lh + bf_ft_lh + bf_lh_lh +  set_rescor(TRUE),
  data = biocycle_lh,
  control = list(adapt_delta = 0.95, max_treedepth = 20),
  file = 'models/mod_mv_lh'
)

Warning: Rows containing NAs were excluded from the model.

mod_mv_lh

 Family: MV(gaussian, gaussian, gaussian, gaussian, gaussian, gaussian) 
  Links: mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity
         mu = identity 
Formula: logE ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
         logFE ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
         logP ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
         logT ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
         logFT ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
         logLH ~ s(lh_day) + (1 | id) + (1 | id:cycle) 
   Data: biocycle_lh (Number of observations: 2565) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Smoothing Spline Hyperparameters:
                     Estimate Est.Error l-95% CI u-95% CI Rhat
sds(logE_slh_day_1)      8.98      2.12     5.81    14.10 1.00
sds(logFE_slh_day_1)    10.25      2.59     6.65    16.61 1.00
sds(logP_slh_day_1)      6.59      1.69     4.19    10.79 1.00
sds(logT_slh_day_1)      1.65      0.49     0.96     2.85 1.00
sds(logFT_slh_day_1)     1.64      0.50     0.96     2.87 1.00
sds(logLH_slh_day_1)    11.85      2.94     7.63    18.95 1.00
                     Bulk_ESS Tail_ESS
sds(logE_slh_day_1)       838     1178
sds(logFE_slh_day_1)      715     1572
sds(logP_slh_day_1)       617     1059
sds(logT_slh_day_1)       929     1370
sds(logFT_slh_day_1)      908     1748
sds(logLH_slh_day_1)      791     1663

Multilevel Hyperparameters:
~id (Number of levels: 214) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.18      0.01     0.16     0.21 1.00
sd(logFE_Intercept)     0.18      0.02     0.15     0.21 1.00
sd(logP_Intercept)      0.24      0.03     0.18     0.30 1.00
sd(logT_Intercept)      0.34      0.02     0.31     0.37 1.02
sd(logFT_Intercept)     0.38      0.02     0.34     0.41 1.01
sd(logLH_Intercept)     0.29      0.02     0.25     0.33 1.00
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       762     1725
sd(logFE_Intercept)      692     1779
sd(logP_Intercept)       455      754
sd(logT_Intercept)       215      461
sd(logFT_Intercept)      358      760
sd(logLH_Intercept)     1129     2296

~id:cycle (Number of levels: 340) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
sd(logE_Intercept)      0.01      0.01     0.00     0.02 1.02
sd(logFE_Intercept)     0.01      0.01     0.00     0.02 1.01
sd(logP_Intercept)      0.25      0.03     0.20     0.31 1.00
sd(logT_Intercept)      0.01      0.01     0.00     0.02 1.01
sd(logFT_Intercept)     0.03      0.01     0.02     0.04 1.02
sd(logLH_Intercept)     0.06      0.04     0.00     0.13 1.01
                    Bulk_ESS Tail_ESS
sd(logE_Intercept)       201      566
sd(logFE_Intercept)      177      392
sd(logP_Intercept)       583     1059
sd(logT_Intercept)       282      730
sd(logFT_Intercept)      239      749
sd(logLH_Intercept)      272      671

Regression Coefficients:
                Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
logE_Intercept      4.47      0.02     4.44     4.50 1.00     1400
logFE_Intercept     0.44      0.02     0.41     0.47 1.00     1639
logP_Intercept      7.30      0.03     7.25     7.34 1.00     2047
logT_Intercept      3.30      0.02     3.26     3.34 1.02      178
logFT_Intercept    -0.91      0.03    -0.96    -0.85 1.02      143
logLH_Intercept     1.86      0.02     1.81     1.90 1.00     1067
logE_slh_day_1     -1.23      2.14    -5.40     3.03 1.00      787
logFE_slh_day_1    -1.45      2.40    -6.09     3.23 1.00      766
logP_slh_day_1    -12.75      2.64   -18.02    -7.67 1.00     1705
logT_slh_day_1      0.72      0.75    -0.76     2.20 1.00      973
logFT_slh_day_1     0.50      0.82    -1.07     2.13 1.00      905
logLH_slh_day_1    18.52      2.85    13.01    24.03 1.00     1228
                Tail_ESS
logE_Intercept      2232
logFE_Intercept     2194
logP_Intercept      2557
logT_Intercept       453
logFT_Intercept      369
logLH_Intercept     1935
logE_slh_day_1      1184
logFE_slh_day_1     1197
logP_slh_day_1      2499
logT_slh_day_1      1858
logFT_slh_day_1     1531
logLH_slh_day_1     2238

Further Distributional Parameters:
            Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
sigma_logE      0.47      0.01     0.46     0.49 1.00     1050
sigma_logFE     0.53      0.01     0.51     0.55 1.00     1027
sigma_logP      0.57      0.01     0.55     0.59 1.00     4517
sigma_logT      0.18      0.00     0.17     0.18 1.00     2035
sigma_logFT     0.20      0.00     0.19     0.20 1.00     1478
sigma_logLH     0.60      0.01     0.58     0.61 1.00     3977
            Tail_ESS
sigma_logE      2193
sigma_logFE     2244
sigma_logP      3040
sigma_logT      2475
sigma_logFT     2186
sigma_logLH     3156

Residual Correlations: 
                    Estimate Est.Error l-95% CI u-95% CI Rhat
rescor(logE,logFE)      0.99      0.00     0.99     0.99 1.00
rescor(logE,logP)       0.14      0.02     0.10     0.18 1.00
rescor(logFE,logP)      0.12      0.02     0.08     0.17 1.00
rescor(logE,logT)       0.30      0.02     0.26     0.34 1.00
rescor(logFE,logT)      0.28      0.02     0.24     0.32 1.00
rescor(logP,logT)       0.11      0.02     0.07     0.15 1.00
rescor(logE,logFT)      0.28      0.02     0.24     0.32 1.00
rescor(logFE,logFT)     0.32      0.02     0.29     0.36 1.00
rescor(logP,logFT)      0.06      0.02     0.02     0.11 1.00
rescor(logT,logFT)      0.86      0.01     0.85     0.87 1.00
rescor(logE,logLH)      0.36      0.02     0.32     0.39 1.00
rescor(logFE,logLH)     0.36      0.02     0.32     0.40 1.00
rescor(logP,logLH)      0.01      0.02    -0.03     0.04 1.00
rescor(logT,logLH)      0.27      0.02     0.24     0.31 1.00
rescor(logFT,logLH)     0.28      0.02     0.24     0.31 1.00
                    Bulk_ESS Tail_ESS
rescor(logE,logFE)       683     1786
rescor(logE,logP)       4468     2932
rescor(logFE,logP)      4463     2971
rescor(logE,logT)       2462     2750
rescor(logFE,logT)      2364     2834
rescor(logP,logT)       2319     2464
rescor(logE,logFT)      2062     2643
rescor(logFE,logFT)     2376     2743
rescor(logP,logFT)      2461     2721
rescor(logT,logFT)      1233     2737
rescor(logE,logLH)      3006     2232
rescor(logFE,logLH)     2945     2256
rescor(logP,logLH)      3790     2828
rescor(logT,logLH)      4379     3194
rescor(logFT,logLH)     3991     2789

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

preds_lh <- predict(mod_mv_lh, newdata = lh_days, re_formula = NA)

lh_days$mv_est_estradiol_lh <- round(preds_lh[, "Estimate", "logE"], 2)
lh_days$mv_est_free_estradiol_lh <- round(preds_lh[, "Estimate", "logFE"], 2)
lh_days$mv_est_progesterone_lh <- round(preds_lh[, "Estimate", "logP"], 2)
lh_days$mv_est_testosterone_lh <- round(preds_lh[, "Estimate", "logT"], 2)
lh_days$mv_est_luteinising_lh <- round(preds_lh[, "Estimate", "logLH"], 2)
lh_days$mv_est_free_testosterone_lh <- round(preds_lh[, "Estimate", "logFT"], 2)
rio::export(lh_days ,"merge_files/lh_days_mv.rds")
rio::export(lh_days ,"merge_files/lh_days_mv.tsv")
rio::export(lh_days ,"merge_files/lh_days_mv.sav")

Comparison

# Load old files
bc_days_old <- rio::import("merge_files/bc_days.rds")

Warning: Missing `trust` will be set to FALSE by default for RDS in
2.0.0.

fc_days_old <- rio::import("merge_files/fc_days.rds")

Warning: Missing `trust` will be set to FALSE by default for RDS in
2.0.0.

lh_days_old <- rio::import("merge_files/lh_days.rds")

Warning: Missing `trust` will be set to FALSE by default for RDS in
2.0.0.

# Join and correlate
# BC
bc_joined <- left_join(bc_days, bc_days_old, by = "bc_day", suffix = c("_new", "_old"))

cor_bc <- bc_joined %>% 
  summarise(
    cor_e = cor(mv_est_estradiol_bc, est_estradiol_bc, use = "complete.obs"),
    cor_fe = cor(mv_est_free_estradiol_bc, est_free_estradiol_bc, use = "complete.obs"),
    cor_p = cor(mv_est_progesterone_bc, est_progesterone_bc, use = "complete.obs"),
    cor_t = cor(mv_est_testosterone_bc, est_testosterone_bc, use = "complete.obs"),
    cor_ft = cor(mv_est_free_testosterone_bc, est_free_testosterone_bc, use = "complete.obs")
  )
print("Correlations for BC models:")

[1] "Correlations for BC models:"

print(cor_bc)

      cor_e    cor_fe     cor_p     cor_t    cor_ft
1 0.9892037 0.9871843 0.9997668 0.9989413 0.9986248

# FC
fc_joined <- left_join(fc_days, fc_days_old, by = "fc_day", suffix = c("_new", "_old"))

cor_fc <- fc_joined %>% 
  summarise(
    cor_e = cor(mv_est_estradiol_fc, est_estradiol_fc, use = "complete.obs"),
    cor_fe = cor(mv_est_free_estradiol_fc, est_free_estradiol_fc, use = "complete.obs"),
    cor_p = cor(mv_est_progesterone_fc, est_progesterone_fc, use = "complete.obs"),
    cor_t = cor(mv_est_testosterone_fc, est_testosterone_fc, use = "complete.obs"),
    cor_ft = cor(mv_est_free_testosterone_fc, est_free_testosterone_fc, use = "complete.obs")
  )
print("Correlations for FC models:")

[1] "Correlations for FC models:"

print(cor_fc)

      cor_e    cor_fe     cor_p     cor_t    cor_ft
1 0.9740625 0.9961721 0.9986579 0.9786549 0.9942325

# LH
lh_joined <- left_join(lh_days, lh_days_old, by = "lh_day", suffix = c("_new", "_old"))

cor_lh <- lh_joined %>% 
  summarise(
    cor_e = cor(mv_est_estradiol_lh, est_estradiol_lh, use = "complete.obs"),
    cor_fe = cor(mv_est_free_estradiol_lh, est_free_estradiol_lh, use = "complete.obs"),
    cor_p = cor(mv_est_progesterone_lh, est_progesterone_lh, use = "complete.obs"),
    cor_t = cor(mv_est_testosterone_lh, est_testosterone_lh, use = "complete.obs"),
    cor_ft = cor(mv_est_free_testosterone_lh, est_free_testosterone_lh, use = "complete.obs")
  )
print("Correlations for LH models:")

[1] "Correlations for LH models:"

print(cor_lh)

# A tibble: 1 × 5
  cor_e cor_fe cor_p cor_t cor_ft
  <dbl>  <dbl> <dbl> <dbl>  <dbl>
1 1.000  1.000 1.000 0.999  0.999

Bias Investigation: Inter-hormone Correlations

This section checks if the univariate imputation (Old) underestimated the correlation between different hormones compared to the multivariate imputation (New).

# Function to get lower triangle of correlation matrix
get_cor_summary <- function(df, pattern) {
  # Select columns matching pattern
  cols <- df %>% select(matches(pattern))
  # Calculate correlation
  cormat <- cor(cols, use = "complete.obs")
  # Return readable format
  return(cormat)
}

print("--- Backward Counting: Inter-hormone correlations ---")

[1] "--- Backward Counting: Inter-hormone correlations ---"

# Old Correlations (Univariate)
print("OLD (Univariate) Inter-hormone Correlations:")

[1] "OLD (Univariate) Inter-hormone Correlations:"

old_vars_bc <- c("est_estradiol_bc", "est_progesterone_bc", "est_testosterone_bc", "est_free_estradiol_bc")
print(cor(bc_joined[, old_vars_bc], use = "complete.obs"))

                      est_estradiol_bc est_progesterone_bc
est_estradiol_bc             1.0000000           0.6215797
est_progesterone_bc          0.6215797           1.0000000
est_testosterone_bc          0.5675967           0.2021296
est_free_estradiol_bc        0.9994729           0.6025274
                      est_testosterone_bc est_free_estradiol_bc
est_estradiol_bc                0.5675967             0.9994729
est_progesterone_bc             0.2021296             0.6025274
est_testosterone_bc             1.0000000             0.5634114
est_free_estradiol_bc           0.5634114             1.0000000

# New Correlations (Multivariate)
print("NEW (Multivariate) Inter-hormone Correlations:")

[1] "NEW (Multivariate) Inter-hormone Correlations:"

new_vars_bc <- c("mv_est_estradiol_bc", "mv_est_progesterone_bc", "mv_est_testosterone_bc", "mv_est_free_estradiol_bc")
print(cor(bc_joined[, new_vars_bc], use = "complete.obs"))

                         mv_est_estradiol_bc mv_est_progesterone_bc
mv_est_estradiol_bc                1.0000000              0.6278980
mv_est_progesterone_bc             0.6278980              1.0000000
mv_est_testosterone_bc             0.6672501              0.1963934
mv_est_free_estradiol_bc           0.9995365              0.6049229
                         mv_est_testosterone_bc
mv_est_estradiol_bc                   0.6672501
mv_est_progesterone_bc                0.1963934
mv_est_testosterone_bc                1.0000000
mv_est_free_estradiol_bc              0.6751168
                         mv_est_free_estradiol_bc
mv_est_estradiol_bc                     0.9995365
mv_est_progesterone_bc                  0.6049229
mv_est_testosterone_bc                  0.6751168
mv_est_free_estradiol_bc                1.0000000

print("--- Difference (New - Old) ---")

[1] "--- Difference (New - Old) ---"

# Calculate difference matrix (matching names first)
c_old <- cor(bc_joined[, old_vars_bc], use = "complete.obs")
c_new <- cor(bc_joined[, new_vars_bc], use = "complete.obs")
dimnames(c_new) <- dimnames(c_old) # Ensure names match for subtraction
print(round(c_new - c_old, 3))

                      est_estradiol_bc est_progesterone_bc
est_estradiol_bc                 0.000               0.006
est_progesterone_bc              0.006               0.000
est_testosterone_bc              0.100              -0.006
est_free_estradiol_bc            0.000               0.002
                      est_testosterone_bc est_free_estradiol_bc
est_estradiol_bc                    0.100                 0.000
est_progesterone_bc                -0.006                 0.002
est_testosterone_bc                 0.000                 0.112
est_free_estradiol_bc               0.112                 0.000

Summary of Findings

Based on the comparison between the old univariate imputations and the new multivariate imputations (with residual correlations modeled):

Backward Counting (BC):
- Negligible Bias: The correlations between old and new estimates are >0.98.
- Conclusion: The univariate method performed nearly identically to the multivariate method.
Forward Counting (FC):
- Negligible Bias: The correlations between old and new estimates are >0.97.
- Conclusion: The univariate method performed nearly identically to the multivariate method.
LH-centered Method (LH):
- No Bias: The correlation between old and new estimates is ~1.00.
- Conclusion: The LH-centered method is robust. The cycle phases are so well-aligned by the LH surge that the mean curve explains almost all variance, and multivariate information adds virtually nothing.