Helper functions used throughout

documentation on the functions is interspersed through code comments

set some options

# library = function (...) suppressMessages(base::library(...))
# This should always be the default. Google for reason if not clear.
options(stringsAsFactors = FALSE)
options(digits = 4)

library(rmarkdown); library(formatR); library(foreign); library(Hmisc); library(car); library(psych); library(QuantPsyc); library(formr); library(ggplot2); library(lubridate); library(stringr); library(stringi); library(reshape2); library(plyr); library(knitr); library(data.table); library(dplyr); library(dtplyr); library(broom)
library(pander)

# I'm tired of data table messing up my rmarkdown. Never want to print this in a live report anyway, I'm using pander for that.
options(datatable.print.nrows=0)

always use these settings

analysis_theme = theme_minimal(base_size = 24)
theme_set(analysis_theme)

Auto-adjust the table column alignment depending on data type.

alignment = function (...) UseMethod('alignment')
alignment.default = function (...) 'left'
alignment.integer = function (...) 'right'
alignment.numeric = function (...) 'right'
panderOptions("table.split.table", Inf)

two simple functions to compute coefficients of variation

binary vars

cva_bin = function(x) {
    p = mean(x,na.rm=T)
    sqrt(p*(1-p)) /
        p
}

continuous vars

cva = function(x) {
    sd(x, na.rm = T) /
        mean(x, na.rm = T)
}

shorthand for centering predictors

center = function(x) { scale(x, scale = FALSE) }

shorthand for finding mode

Mode <- function(x) { # from https://stackoverflow.com/questions/2547402/standard-library-function-in-r-for-finding-the-mode
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
}

small helper function to sample at level 2 (families, not individuals)

sample_level2 = function(data, level2_var, size, replace = F, prob = NULL) {
    if(is.data.table(data)) {
        some = data[, level2_var, with = F][[1]]
    } else {
        some = data[ ,level2_var]
    }
    some_u = unique(some)
    sample_some = sample(some_u, size, replace, prob)
    take = some %in% sample_some
    data[take,]
}

pander tables think the output from missingness_patterns are markdown

pander_escape = function(x, ...) {
    x$Pattern = stringr::str_replace_all(x$Pattern,"_","\\\\_")
    x
}

formula.tools got a bit too picky

my_as.character.formula = function(x)   Reduce( paste, deparse(x) )

we use this function to automatically get nice tables

pander_handler = function(x, ...) {
    anyS3method = function(x) {
        classes = class(x)
        any(sapply(classes, FUN = function(classes) { !is.null(getS3method('pander',classes,TRUE)) } ))
    }
    if ("knit_asis" %in% class(x)) {
        x
    } else if (is.data.table(x)) {
        ""
        # don't ever print stupid data tables
    } else if (anyS3method(x)) {
        pander(x, row.names = F, ...)
    } else if (isS4(x)) {
        show(x)
    } else {
        print(x)
    }
}

diagnostic function for me to compare number of zeros in data to a poisson process with similar amount of overdispersion

plot_zero_infl = function(variable) {
    rpois.od<-function (n, lambda,d=1) {
        if (d<=1)
            rpois(n, lambda)
        else if (d>1)
            rnbinom(n, size=(lambda/(d-1)), mu=lambda)
    }

    if( var(variable,na.rm=T) > mean(variable,na.rm=T)) { lab = paste0("Overdispersed (d=",round(var(variable,na.rm=T)/mean(variable,na.rm=T),2),") poisson generated using distribution mean and variance via neg.bin")
    } else { lab = "Poisson generated with distribution mean"}

    qplot(rpois.od(n=NROW(variable),mean(variable,na.rm=T), var(variable,na.rm=T)/mean(variable,na.rm=T)), binwidth=1, fill=I("tomato3"), alpha=I(0.5)) + ggtitle(paste0("Slate=Real; Tomato=Generated\n",lab)) +
        geom_histogram(aes(x=variable),fill="darkslategray4",alpha=0.5,binwidth=1) + xlab("")
}

this function is just a shorthand to quickly find the objects I generated

lstype = function(types=c('lmerMod','glmerMod','bglmerMod','blmerMod'), envir = parent.frame(), ...){
    inlist<-ls(envir = envir, ...)
    ifexistsgetclass = function(x, envir) {
        if(exists(x, envir = envir, inherits = FALSE)) {
            bla = get(x, envir = envir, inherits = FALSE)
            class(bla)
        } else {
            NULL
        }
    }
    classlist<- sapply(inlist,function(x) ifexistsgetclass(x, envir = envir))
    if(length(classlist) > 0 ) {
        names(classlist[
            sapply(classlist,function(x){ !!length(intersect(x, types) ) }) ## take any who have a class that is in our list
            ])
    } else character(0)
}

a helper function to get a data frame with estimates and CIs

fortify_mine = function(fit,confint="Wald", modelname = NULL, mcmclist = NULL) {
    if(is.null(modelname)) modelname = deparse(substitute(fit))
    info = summary(fit)
    link = "identity"
    family = "gaussian"
    call = ""

    if(inherits(fit, what = "MCMCglmm")) {
        library(MCMCglmm)
        if(is.null(mcmclist)) {
            conf <- as.data.frame(info$solutions[, c('post.mean','l-95% CI','u-95% CI')])
            names(conf) = c("Estimate","Lower","Upper")
        } else {
            conf = as.data.frame(summary(mcmclist, quantiles = c(0.05,0.5,0.95))$quantiles)
            names(conf) = c("Lower","Estimate","Upper")
            conf = conf[,c("Estimate","Lower","Upper")]
        }
        call = my_as.character.formula(fit$Fixed$formula)
        conf$Coefficient = rownames(conf)
        N_obs = fit$Residual$nrl
        N_fam = fit$Random$nrl
        from_class = "MCMCglmm"
        Outcome = as.character(fit$Fixed$formula[2])
        family = head(unique(fit$family),1)
        if(family != 'gaussian') link = "log"
    } else if (inherits(fit, what="glmmPQL")) {
        summ = summary(fit)
        conf = data.frame(summ$tTable)
        conf$Coefficient = rownames(conf)
        conf$Lower = conf$Value - 1.96 * conf$Std.Error
        conf$Upper = conf$Value + 1.96 * conf$Std.Error
        conf$Estimate = conf$Value
        N_obs = nrow(summ$groups)
        N_fam = nrow(unique(summ$groups))
        link = fit$family$link
        family = fit$family$family
        from_class = "glmmPQL"
        Outcome = names(fit$data)[1]
    } else if(inherits(fit,what='merMod') | inherits(fit,what='lm')| inherits(fit,what='coxme'))
    {
        if(confint=="boot"){
            #           conf <- confint(fit,method= confint, parallel = "multicore", ncpus = 256)
            conf <- confint(fit,method= confint, parallel = "multicore", ncpus = parallel::detectCores())
        } else
        {
            conf <- confint(fit,method= confint )
        }
        # citation? https://stat.ethz.ch/pipermail/r-sig-mixed-models/2013q4/021197.html
        conf = na.omit(as.data.frame(conf))
        conf$Coefficient = rownames(conf)
        names(conf) = c('Lower','Upper','Coefficient')
        if(inherits(fit,what='merMod')) {
            library(lme4)
            conf$Estimate = fixef(fit)
            call = my_as.character.formula(fit@call)
            N_obs = as.vector(info$devcomp$dims["N"])
            N_fam = as.vector(info$ngrps)
            if(length(N_fam)>1) N_fam = as.vector(info$ngrps['idParents'])
            from_class = "merMod"
            Outcome = as.character(fit@call$formula[2])
            if(!is.null(info$link)) link = info$link
            if(!is.null(info$family)) family = info$family
        } else if(inherits(fit, what="coxme")) {
            conf[,1] = fit$coefficients-1.96*sqrt(diag(vcov(fit)))
            conf[,2] = fit$coefficients+1.96*sqrt(diag(vcov(fit)))
            conf$Estimate = coef(fit)
            call = my_as.character.formula(fit$call)
            N_obs = tail(fit$n,1)
            N_fam = head(fit$n,1)
            from_class = "coxme"
            Outcome = my_as.character.formula(fit$call)[2]
            link = "log"
            family = "survival"
        } else {
            call = my_as.character.formula(fit$call)
            conf$Estimate = coef(fit)
            N_obs = nrow(fit$model)
            N_fam = nrow(fit$model)
            from_class = "lm"
            Outcome = names(fit$model)[1]
            if(!is.null(fit$family$link)) link = fit$family$link
            if(!is.null(fit$family$family)) family = fit$family$family
        }
    } else
    {
        stop("Unknown object")
    }

    coefs = within(conf, {
        N = head(N_obs,1)
        Families = head(N_fam,1)
        Outcome = Outcome
        Model = modelname
        Method = from_class
        Link = link
        Family = family
        Call = call
    })

    if(link %in% c("log","logit")) {
        # transform logits into Odds Ratios (ORs) and logs (poisson-link) into Incidence Rate Ratios
        coefs[,c("Estimate","Lower","Upper")] = exp(coefs[,c("Estimate","Lower","Upper")])
    }
    class(coefs) = c( class(coefs), 'confint.coefs')
    class(coefs) = c( class(coefs), paste0('confint.',from_class))
    coefs
}

plotting the output from the above function

plot_fortified_mer = function(coefs,ylab = "Effect on outcome",suppress_intercept=T,colour = "")
{
    modelname = coefs$Model[1]
    #   coefs = fortify.mine(fit, 'Wald', modelname)
    #   info = summary(fit)
    #   list_of_coefs[[ modelname ]] <<- list(coefs, info, fit)

    plot_title = str_c(paste0(unique(coefs[,"N"]), collapse="/"),' individuals in ',paste0(unique(coefs[,"Families"]), collapse="/"), ' families')

    if(suppress_intercept) {
        intercept = coefs[which(coefs$Coefficient == "(Intercept)"),]
        plot_title = str_c(plot_title,"\nIntercept = ",as.character(round(intercept['Estimate'],2)), ", [",as.character(round(intercept["Lower"],2)),";",as.character(round(intercept["Upper"],2)),"]")
        coefs = coefs[which(coefs$Coefficient != "(Intercept)"),]
    }

    if(coefs$Link[1] %in% c("logit","log")) {
        null.point = 1;  scaleY = scale_y_log10(ylab)
    } else {
        null.point = 0;  scaleY = scale_y_continuous(ylab)
    }

    coefs$Coefficient = car::Recode(coefs$Coefficient,"'paternalage.diff'='Paternal age';'paternalage.mean'='Paternal age\\nfamily average'")
    coefs$Coefficient = str_replace(coefs$Coefficient, "paternalage.factor", "Paternal age ")
    order = sort(unique(coefs$Coefficient))
    main = c('Paternal age\nfamily average','Paternal age')
    order = c(setdiff(order, main), intersect(main, order))
    coefs$Coefficient = factor(coefs$Coefficient,levels = order)

    if(colour != "") {
        aesthetics = aes_string(x = "Coefficient", y = "Estimate", ymin = "Lower", ymax = "Upper", colour = colour)
    } else {
        aesthetics = aes_string(x = "Coefficient", y = "Estimate", ymin = "Lower", ymax = "Upper")
    }
    ggplot(coefs, aesthetics) +
        geom_pointrange(position=position_dodge(width=0.8), size = 1) +
        coord_flip() +
        ggtitle(plot_title) +
        scaleY +
        geom_text(aes(label=round(Estimate,2)), vjust = 2, position=position_dodge(width=0.8)) +
        scale_x_discrete("Predictors") +
        geom_hline(yintercept = null.point, linetype='dotted')  +
        analysis_theme + theme(panel.grid.minor.y = element_blank(),panel.grid.major.y =element_blank(),panel.grid.major.x = element_line(colour="#eeeeee"))
}

grand mean center paternal age (or anything) and center it within groups

recenter.pat = function(data, what = 'paternalage', among_who = 'idParents', breaks = 'c(0 ,25, 30, 35, 40, 45, 50, 55, 90)') {
    makecenter = parse(text= str_c(what,'_c := meanCenter(',what,')') )
    makemean = parse(text= str_c(what,'.mean := ave(',what,',',among_who,', FUN= function(x) { mean(x,na.rm=T) })'))
    makediff = parse(text= str_c(what,'.diff := ',what,' -',what,'.mean'))
    make_factors = parse(text= str_c(what,'.factor := cut((10*',what,'),breaks = ',breaks,', include.lowest = T, right = T)'))

    data[, eval(makecenter)]
    data[, eval(makemean)]
    data[, eval(makediff)]
    data[, eval(make_factors)]
    data
}

count the number of siblings who compete for food/attention in first five years of life (and are younger than 10)

dependent_sibs_f5y = function(survive1y, byear, dyear) {
    sibs = length(byear)
    other_dependent_sibs_f5y = integer(length=sibs)
    for(i in 1:sibs) {
        other_births = byear[-i]
        other_deaths = dyear[-i]
        other_made1y = survive1y[-i]
        my_sibs = sibs - 1 - # minus self
            sum(
                other_births > (byear[i] + 5) | # born more than 5y later
                (other_births + 5) < byear[i] | # finished infancy before birth
                other_deaths <= byear[i] | # died before birth
                other_made1y == 0, # if they died right away, don't count
            na.rm=T)  # if dyear missing assume they lived
        other_dependent_sibs_f5y[i] = my_sibs
    }
    other_dependent_sibs_f5y
}

only younger sibs

younger_sibs_alive_and_dependent = function(survive5y, byear, dyear) {
    made5y = sum(survive5y==1 | is.na(survive5y)) # number of siblings who turned at least 5
    sibs = length(survive5y)
    younger_sibs_alive_and_dependent_in_first_5y = integer(length=sibs)
    for(i in 1:sibs) {
        younger_sibs = byear >= byear[i] # not using < because of twins
        younger_sibs[i] = F # minus self
        my_sibs = sum(younger_sibs, na.rm = T) # minus self
        if(my_sibs > 0) {
            sib_births = byear[ which(younger_sibs) ]
            sib_deaths = dyear[ which(younger_sibs) ]
            my_sibs = my_sibs -
                sum(
                    sib_births > (byear[i] + 5) |  # born more than 5y later
                        (sib_deaths <= sib_births + 1) # died within less than a year
                ,na.rm=T)
            younger_sibs_alive_and_dependent_in_first_5y[i] = my_sibs
        }
    }
    younger_sibs_alive_and_dependent_in_first_5y
}

only older sibs

older_sibs_alive_and_dependent = function(survive5y, byear, dyear) {
    made5y = sum(survive5y,na.rm = T) # number of siblings who turned at least 5
    sibs = length(survive5y)
    older_sibs_alive_and_dependent_in_first_5y = integer(length=sibs)
    for(i in 1:sibs) {
        older_sibs = byear <= byear[i] # not using < because of twins
        older_sibs[i] = F # minus self
        my_sibs = sum(older_sibs,na.rm = T) # minus self
        if(my_sibs > 0) {
            sib_births = byear[ which(older_sibs) ]
            sib_deaths = dyear[ which(older_sibs) ]
            my_sibs = my_sibs -
                sum(
                    sib_births < (byear[i] - 5) | # others born more than 5y earlier than me  # 10 seconds of 17
                        (sib_deaths <= sib_births + 1) | # died within less than a year  # 7 seconds of 17
                        (sib_deaths <= byear[i]) # died before my birth
                ,na.rm=T)
            older_sibs_alive_and_dependent_in_first_5y[i] = my_sibs
        }
    }
    older_sibs_alive_and_dependent_in_first_5y
}

verify whether a binned linear variable has a linear effect

this function takes a coefficient df which contains a cut-up/binned factor predictor and makes a plot to verify the linearity of the predictor, optionally accompanied by an mcmclist and plots the OR for the paternal age factor

mod_coefs_factor = function(fixefs, factor_base = "paternalage.factor")
{
    if(!inherits(fixefs, what = "confint.coefs")) stop("Need a confint.coefs object.")
    fixefs = as.data.table(fixefs)
    fixefs = fixefs[ Coefficient %contains% factor_base, ]

    if(nrow(fixefs) > 0) {
        fixefs[, Trait := str_match(Coefficient, "(.+):")[,2] ]
        escaped_fb = Hmisc::escapeRegex(as.character(factor_base))
        fixefs[ , Coefficient := str_match(Coefficient,paste0(escaped_fb, "(.+)"))[,2] ]
        fixefs[ , Coefficient := str_match(Coefficient, "\\(([0-9.,]+)\\]")[,2]]
        fixefs[ , Coefficient := str_replace(Coefficient, "\\.",replacement="")]
        fixefs[ , Coefficient := str_replace(Coefficient, ",",replacement="-")]
        reference = ifelse( fixefs$Link[1] %in% c("log","logit"), 1, 0)
        fixefs = rbind(fixefs, fixefs[nrow(fixefs), ])
        fixefs[nrow(fixefs), Lower := reference]
        fixefs[nrow(fixefs), Upper := reference]
        fixefs[nrow(fixefs), Estimate := reference]
        fixefs[nrow(fixefs), Coefficient := "14-25"]
        fixefs
    }
}

plot the factor ORs to see if it’s linear

plot_factor_linearity = function(fixefs, factor_base = "paternalage.factor")
{
    if(!inherits(fixefs, what = "confint.coefs")) stop("Need a confint.coefs object.")
    fixefs = as.data.table(fixefs)
    fixefs = fixefs[ Coefficient %contains% factor_base, ]

    if(nrow(fixefs) > 0) {
        fixefs[, Trait := str_match(Coefficient, "(.+):")[,2] ]
        escaped_fb = Hmisc::escapeRegex(as.character(factor_base))
        fixefs[ , Coefficient := str_match(Coefficient,paste0(escaped_fb, "(.+)"))[,2] ]
        fixefs[ , Coefficient := str_match(Coefficient, "\\(([0-9.,]+)\\]")[,2]]
        fixefs[ , Coefficient := str_replace(Coefficient, "\\.",replacement="")]
        fixefs[ , Coefficient := str_replace(Coefficient, ",",replacement="-")]
        reference = ifelse( fixefs$Link[1] %in% c("log","logit"), 1, 0)
        just_coefs = rbind(data.table( Lower = reference, Upper = reference, Estimate = reference, Coefficient = "14-25", Trait = unique(fixefs$Trait)[1]), fixefs[,list(Lower, Upper, Estimate, Coefficient, Trait)], use.names = T)
        if(length(unique(fixefs$Trait))>1) just_coefs = rbind(data.table(Lower = reference, Upper = reference, Estimate = reference, Coefficient = "14-25", Trait = unique(fixefs$Trait)[2]), just_coefs)

        just_coefs[is.na(Trait), Trait := ""]
        qplot(data=just_coefs, x = Coefficient, y = Estimate, geom = c("pointrange","line"),ymin = Lower, ymax = Upper, group  =1 ) + ggtitle(fixefs$Model[1]) + analysis_theme + xlab("Paternal age") + scale_y_log10(paste("Effect on",fixefs$Outcome[1]),breaks=c(0.25,0.5,0.75,1)) + geom_smooth(aes(weight = ifelse(Upper-Lower != 0, 1/(Upper-Lower),1)) ,method = "lm", se = F, lty = "dashed") + facet_wrap(~ Trait)
#   qplot(data=just_coefs, x = Coefficient, y = Estimate, geom = c("pointrange","line"),ymin = Lower, ymax = Upper, group  =1 ) + ggtitle(fixefs$Model[1]) + analysis_theme + xlab("Paternal age") + ylab(paste("Effect on",fixefs$Outcome[1])) + geom_smooth(method = "lm", se = F, lty = "dashed") + facet_wrap(~ Trait)
    }
}

make_newdata = function(pred, olddata) {
    newdata = pred
    newdata$idParents = rep(1:2, length.out=nrow(newdata))

    left = setdiff(names(olddata), names(newdata))
    for(i in seq_along(left)) {
        barename = str_match(left[i],"(.+\\()?([^()]+)\\)?")[,3] # bust var out of function term
        if(is.numeric(olddata[, left[i]]) & length(unique(na.omit(olddata[, left[i]]))) == 2) {
            newdata[, barename] = mean(olddata[, left[i]],na.rm=T)
        } else if(is.logical((olddata[, left[i]]))) {
            newdata[, barename] = FALSE
        } else if(is.numeric(olddata[, left[i]])) {
            newdata[, barename] = median(olddata[, left[i]],na.rm=T)
        } else if(is.factor(olddata[, left[i] ])) {
            fact = unique(olddata[,left[i] ]) ## choose reference for factors
            newdata[, barename] = fact[fact==levels(fact)[1]]
        } else {
            newdata[, barename] = Mode(olddata[,left[i] ])
        }
    }
    newdata = newdata[order(pred[,1]),]
    rownames(newdata) = 1:nrow(newdata)
    newdata
}

compare two model’s coefficients

(input are the result of fortify_mine)

compare_coefs = function(x, y, colour = "Model")
{
    if(is(x, 'confint.coefs') & is(y, 'confint.coefs')) {
        if(unique(x$Model)==unique(y$Model)) {
            x$Model = deparse(substitute(x))
            y$Model = deparse(substitute(y))
        }
        x$CI = str_c( sprintf("%1.2f", x$Lower), ";", sprintf("%1.2f", x$Upper) )
        y$CI = str_c( sprintf("%1.2f", y$Lower), ";", sprintf("%1.2f", y$Upper) )
        coefs = merge(x[, c('Coefficient','Estimate','CI')], y[, c('Coefficient','Estimate','CI')], by = "Coefficient")
        coefs = coefs[, c("Coefficient", setdiff(sort(names(coefs)),"Coefficient"))]
        print(plot_fortified_mer(rbind(x,y), colour = colour))
        kable(coefs,digits = 2, row.names=FALSE)
    } else {
        stop("Need confint.coefs")
    }
}

shorthand for comparing models with few/many controls

compare_controlled_to_uncontrolled = function(ucon, con) {
    ucon$Controls = "few"
    con$Controls = "many"
    compare_coefs(ucon, con, colour = "Controls")
}

shorthand for sample comparison

compare_krmh_to_rpqa = function(krmh_coefs, rpqa_coefs) {
    krmh_coefs$Population = "Krummhörn"
    rpqa_coefs$Population = "Québec"
    rpqa_coefs=rpqa_coefs[!rpqa_coefs$Coefficient %contains% "birth.cohort", ]
    krmh_coefs=krmh_coefs[!krmh_coefs$Coefficient %contains% "birth.cohort", ]
    compare_coefs(krmh_coefs, rpqa_coefs, colour = "Population")
}

putting in a named list to name the origin of the coefficients

merge_coefs = function(...) {
    dots <- list(...)
    if(length(dots) == 0) return(NULL)
    for(i in 1:length(dots)) {
        dots[[i]]$Population = names(dots)[i]
    }
    coefs = rbindlist(dots, fill = T)
    coefs = data.table(coefs)
    class(coefs) = c( class(coefs), 'confint.coefs')
    coefs
}

another comparison plot

factor_plot = function(coefs) {
    coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))
    ggplot(coefs,
                 aes(colour = Population, x = Coefficient, y = Estimate, ymin = Lower, ymax = Upper,weight = ifelse(Upper-Lower != 0, 1/(Upper-Lower),1), group = Population)) +
        geom_pointrange(position = position_dodge(width = 0.4), size = 1) +
        #   geom_line(position = position_dodge(width = 0.4))+
        geom_smooth(position = position_dodge(width = 0.4),method = "lm", se = F, linetype = "dashed", size = 0.2) +
        xlab("Paternal age") +
        scale_fill_brewer(palette = "Set2") +
        scale_colour_brewer(palette = "Set2") +
        scale_y_log10(breaks = c(0.4,0.6,0.8,1,1.2))+
        geom_hline(y = 1, colour = "black", linetype = "dotted") +
        analysis_theme
}

another comparison plot

factor_boot_plot = function(coefs) {
    coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))

    coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))

    ggplot(coefs,  aes(x = paternalage.factor, y = value, fill = Population)) +
        geom_violin(colour = "transparent", alpha = 0.3, position = position_identity(), scale = "width")  +
        geom_point(aes(colour = Population),stat = "summary", fun.data = "mean_sdl", position = position_identity())+
        geom_smooth(aes(colour = Population, weight = 1/sd(value, na.rm = T),group = Population),method = "lm", se = F, lty = "dashed", position = position_identity()) +
        ggtitle("Children of older fathers have lower evolutionary fitness\nfurther controls") +
        scale_fill_brewer(palette = "Set2") +
        scale_colour_brewer(palette = "Set2") +
        xlab("Paternal age") +
        analysis_theme +
        theme(legend.position = c(1,1),
                    legend.justification = c(1,1))
}

count eggs

produce a (weighted) count of offspring and merge it back on the individual

count_and_merge = function(df, what, wt_var) {
    if(!is.data.table(df)) df = as.data.table(df)
    setnames(df, wt_var, ".tmp_wt")
    counted.dad = df %>%
        dplyr::group_by(idPere) %>%
        dplyr::summarise(.tmp = sum(.tmp_wt, na.rm = T)) %>%
        dplyr::rename(idIndividu = idPere)
    counted.mom = df %>% dplyr::group_by(idMere) %>%
        dplyr::summarise(.tmp = sum(.tmp_wt, na.rm = T)) %>%
        dplyr::rename(idIndividu = idMere)
    counted = rbind(counted.dad,counted.mom)
    df = data.table(merge(df,counted,by='idIndividu',all.x=T))
    df[is.na(.tmp), .tmp := 0]
    setnames(df, old = c(".tmp",".tmp_wt"), new = c(what,wt_var))
    df
}

translate file name into human readable title

human_readable_filename = function(file) {
    file %>%
        basename %>%
        tools::file_path_sans_ext() %>%
        str_sub(3) %>%
        str_replace_all("_+"," ") %>%
        str_replace("peer" ,"Peerage") %>%
        str_replace("ddb" ,"historical Sweden") %>%
        str_replace("swed" ,"modern Sweden") %>%
        str_replace("rpqa" ,"Québec") %>%
        str_replace("krmh" ,"Krummhörn") %>%
        str_replace("massage" ,"Data Wrangling") %>%
        stri_trans_totitle()
}

human readable paternal age factor

human_readable_factor = function(factor) {
    tfactor = as.character(factor)
    tfactor = str_match(tfactor,"\\(?([0-9.,]+)\\]")[,2]

    if(any(is.na(tfactor))) {
        return(factor)
    }
    tfactor %>%
        str_replace(.,"\\.",replacement="") %>%
        str_replace(.,",",replacement="-") %>%
        str_replace(.,"0-25",replacement="10-25")
}

describeBin = function(df) {
    df2 = colwise(function(x){ if(is.factor(x)){ ifelse(x=="TRUE",1,0)} else{ x} })(df)
    df2 %>%
        data.frame() %>%
        melt(.,id.vars=c()) %>%
        group_by(variable) %>%
        summarise(n = sum(!is.na(value)),
                            mean = mean(value, na.rm=T),
                            sd = mean(value, na.rm=T) * (1-mean(value, na.rm=T))) %>%
        data.frame() -> descriptives
    rownames(descriptives) = descriptives$variable
    descriptives$variable = NULL
    round(descriptives,2)
}

aggDemoTrends = function(df) {
    df[, list(byear, paternalage, maternalage, maternalage_at_1st_sib, maternalage_at_last_sib,paternalage_at_1st_sib,paternalage_at_last_sib)] %>%
        mutate(maternalage_at_1st_sib = ifelse(maternalage_at_1st_sib==maternalage, maternalage_at_1st_sib, NA), # don't count sibs twice
                     paternalage_at_1st_sib = ifelse(paternalage_at_1st_sib==paternalage, paternalage_at_1st_sib, NA),
                     maternalage_at_last_sib = ifelse(maternalage_at_last_sib==maternalage, maternalage_at_last_sib, NA),
                     paternalage_at_last_sib = ifelse(paternalage_at_last_sib==paternalage, paternalage_at_last_sib, NA)) %>%
        melt(id.vars=c("byear")) %>%
        mutate(Parent = ifelse(variable %begins_with% "paternal","Father","Mother"),
                    variable = ifelse(variable %contains% "1st", "first", ifelse(variable %contains% "last", "last", "all"))
        ) %>%
        group_by(byear, Parent, variable) %>%
        mutate(value = value*10) %>%
        dcast(byear + Parent ~ variable, fun.aggregate = function(x){mean(x,na.rm=T)}) %>%
        rename(Year =  byear) %>%
        data.table()
}

#
# colour_values = c(
#   "Krummhörn" = "#542437",
#   "Québec" = "#53777A",
#   "Modern Sweden" = "#D95B43",
#   "Historical Sweden" = "#C02942"
# )
# colour_values = c(
#   "Krummhörn" = "#FF9C5B",
#   "Québec" = "#3B8183",
#   "Modern Sweden" = "#ED303C",
#   "Historical Sweden" = "#F5634A"
# )
# colour_values = c(
#   "Krummhörn" = "#5E8C6A",
#   "Québec" = "#8C2318",
#   "Modern Sweden" = "#88A65E",
#   "Historical Sweden" = "#BFB35A"
# )
# colour_values = c(
#   "Krummhörn" = "#89E0B9",
#   "Québec" = "#068981",
#   "Modern Sweden" = "#535659",
#   "Historical Sweden" = "#F4F9CB"
# )
# source: http://www.colourlovers.com/palette/3735149/Some_Elegance
colour_values = c(
    "Krummhörn" = "#E2A674",
    "Québec" = "#9F3B2B",
    "20th-century Sweden" = "#8A846C",
    "Historical Sweden" = "#21523B"
)
pop_colour = scale_colour_manual(values = colour_values)
pop_fill = scale_fill_manual(values = colour_values)

Bin years

simple way to get bins by year

year_bins = function(x, bin = 5) {
    ifelse(is.na(x), NA_character_,
        paste0(plyr::round_any(x, bin, floor),"-",plyr::round_any(x, bin, floor) + bin)
    )
}

Compute effect of 10 years of paternal age

paternal_age_10y_effect = function(model, predictor_name = "paternalage") {
    nam = rownames(fixef(model))
    pred_name = nam[nam %contains% predictor_name & ! nam %contains% paste0(predictor_name, ".mean")]
    if (length(pred_name) > 0 ) {
        pred_name = pred_name[1]
        predictor = data.frame(paternalage = c(2.5, 3.5))
        probs = c(0.025, 0.1, 0.5, 0.9, 0.975)
        if (pred_name %in% names(model$data)) {
            names(predictor) = pred_name[1]
        }
        # brms:::prepare_conditions
        newd = make_newdata(predictor, olddata = model$data)
        newd$response = newd$trait = NULL
        fixefs = round(exp(fixef(model, estimate = "quantile", probs = probs)[pred_name,]), 2)
        prediction = fitted(model, newdata = newd, re_formula = NA, summary = F)
        estlow = round(quantile(prediction[,1], probs = probs),2)
        esthigh = round(quantile(prediction[,2], probs = probs),2)
        if (model$family$family == "bernoulli") {
            pct_change = prediction[,2] - prediction[,1]
            pct_decrease = round(quantile(pct_change, probs = probs) * 100,2)
        } else {
            pct_change = prediction[,2] / prediction[,1] - 1
            pct_decrease = round(quantile(pct_change, probs = probs) * 100, 2)
        }
        estimates = data.frame(
            effect = c("estimate father 25y", "estimate father 35y","percentage change", "OR/IRR"),
            median_estimate = c(as.character(estlow['50%']),
                                                    as.character(esthigh['50%']),
                                                    as.character(pct_decrease['50%']),
                                                    as.character(fixefs['50%'])),
            ci_95 = c(
                paste0('[',estlow['2.5%'],';',estlow['97.5%'],']'),
                paste0('[',esthigh['2.5%'],';',esthigh['97.5%'],']'),
                paste0('[',pct_decrease['2.5%'],';',pct_decrease['97.5%'],']'),
                paste0('[',fixefs['2.5%'],';',fixefs['97.5%'],']')
            ),
            ci_80 = c(
                paste0('[',estlow['10%'],';',estlow['90%'],']'),
                paste0('[',esthigh['10%'],';',esthigh['90%'],']'),
                paste0('[',pct_decrease['10%'],';',pct_decrease['90%'],']'),
                paste0('[',fixefs['10%'],';',fixefs['90%'],']')
            ))
        
        if (model$family$family == "hurdle_poisson") {
            fixefs2 = round(exp(fixef(model, estimate = "quantile", probs = probs)[paste0("hu_",pred_name),]), 2)
            estimates2 = data.frame(
                effect = c("OR hurdle"),
                median_estimate = as.character(fixefs2['50%']),
                ci_95 = c(paste0('[',fixefs2['2.5%'],';',fixefs2['97.5%'],']')),
                ci_80 = c(paste0('[',fixefs2['10%'],';',fixefs2['90%'],']')))
            estimates = bind_rows(estimates, estimates2)
        }
        estimates
    } else {
        " "
    }
}

Compute effect of 30 years of paternal age

effect_of_avg_nr_of_mutations = function(model, predictor_name = "paternalage") {
    nam = rownames(fixef(model))
    pred_name = nam[nam %contains% predictor_name & ! nam %contains% paste0(predictor_name, ".mean")]
    if (length(pred_name) > 0 ) {
        pred_name = pred_name[1]
        predictor = data.frame(paternalage = c(0, 3))
        probs = c(0.025, 0.1, 0.5, 0.9, 0.975)
        if (pred_name %in% names(model$data)) {
            names(predictor) = pred_name[1]
        }
        # brms:::prepare_conditions
        newd = make_newdata(predictor, olddata = model$data)
        newd$response = newd$trait = NULL
        fixefs = round(exp(fixef(model, estimate = "quantile", probs = probs)[pred_name,]), 2)
        prediction = fitted(model, newdata = newd, re_formula = NA, summary = F)
        estlow = round(quantile(prediction[,1], probs = probs),2)
        esthigh = round(quantile(prediction[,2], probs = probs),2)
        if (model$family$family == "bernoulli") {
            pct_change = (prediction[,2] - prediction[,1])
            pct_decrease = round(quantile(pct_change, probs = probs) * 100,2)
        } else {
            pct_change = (prediction[,2] / prediction[,1] - 1)
            pct_decrease = round(quantile(pct_change, probs = probs) * 100, 2)
        }
        estimates = data.frame(
            effect = c("estimate at 0 mutations", "estimate at 60 mutations","percentage change", "OR/IRR"),
            median_estimate = c(as.character(estlow['50%']),
                                                    as.character(esthigh['50%']),
                                                    as.character(pct_decrease['50%']),
                                                    as.character(fixefs['50%'])),
            ci_95 = c(
                paste0('[',estlow['2.5%'],';',estlow['97.5%'],']'),
                paste0('[',esthigh['2.5%'],';',esthigh['97.5%'],']'),
                paste0('[',pct_decrease['2.5%'],';',pct_decrease['97.5%'],']'),
                paste0('[',fixefs['2.5%'],';',fixefs['97.5%'],']')
            ),
            ci_80 = c(
                paste0('[',estlow['10%'],';',estlow['90%'],']'),
                paste0('[',esthigh['10%'],';',esthigh['90%'],']'),
                paste0('[',pct_decrease['10%'],';',pct_decrease['90%'],']'),
                paste0('[',fixefs['10%'],';',fixefs['90%'],']')
            ))
        
        if (model$family$family == "hurdle_poisson") {
            fixefs2 = round(exp(fixef(model, estimate = "quantile", probs = probs)[paste0("hu_",pred_name),]), 2)
            estimates2 = data.frame(
                effect = c("OR hurdle"),
                median_estimate = as.character(fixefs2['50%']),
                ci_95 = c(paste0('[',fixefs2['2.5%'],';',fixefs2['97.5%'],']')),
                ci_80 = c(paste0('[',fixefs2['10%'],';',fixefs2['90%'],']')))
            estimates = bind_rows(estimates, estimates2)
        }
        estimates
    } else {
        " "
    }
}

the rendering function for Rmds and md

    render_web = function(file, auto_tab = T, include_top = "library/general_header.html", output_dir = "../paternal_age_fitness/") {
    title = human_readable_filename(file)
    pops = c("krmh","ddb","rpqa","swed")
    pops = pops[file %contains% pops]

    if(length(pops) == 1) include_top = paste0("library/",pops,"_header.html")
    pandoc_args = list(paste0("--metadata=title:\"",title,"\""))
    if(auto_tab) pandoc_args[[2]] = "--metadata=autotab:1"
    output_format = html_document(
        self_contained = FALSE,
        template =  "../paternal_age_fitness/library/website_theme.html",
        lib_dir = paste0(output_dir, "library/"),
        mathjax = NULL,
        keep_md = TRUE,
        includes = includes(before_body = include_top),
        pandoc_args = pandoc_args,
        theme = "paper"
    )
    output_format$knitr$opts_chunk$error = TRUE

    render(input = file,
                 output_format = output_format,
                 envir = new.env(),
                 output_dir = output_dir
    )
}

Plot two credibility levels for brms marginal effect plots

plot.brmsMarginalEffects_shades <- function(x, y, ncol = NULL, points = FALSE,
                                                                         rug = FALSE, theme = ggplot2::theme(),
                                                                         ask = TRUE, do_plot = TRUE, ...) {
    for(i in seq_along(x)) {
        old_attr = attributes(x[[i]])
        x[[i]] = x[[i]] %>% dplyr::bind_cols(y[[i]]  %>% dplyr::select(lowerCI, upperCI) %>% dplyr::rename(lowerCI80 = lowerCI, upperCI80 = upperCI))
        new_col_names = attributes(x[[i]])$names
        attributes(x[[i]]) = old_attr
        attributes(x[[i]])$names = new_col_names
    }
    if("paternalage" %in% names(x)) {
        order_x = names(x)
        order_x = c("paternalage", order_x[order_x != "paternalage"])
        x = x[order_x]
    }

    # Compute marginal effects plots using ggplot2
    # Returns:
    #   A list of ggplot objects
    if (do_plot) {
        default_ask <- devAskNewPage()
        on.exit(devAskNewPage(default_ask))
        devAskNewPage(ask = FALSE)
    }
    plots <- setNames(vector("list", length(x)), names(x))
    for (i in seq_along(x)) {
        response <- attributes(x[[i]])$response
        effects <- attributes(x[[i]])$effects
        plots[[i]] <- ggplot(data = x[[i]]) +
            aes_string(x = effects, y = "Estimate", ymin = "lowerCI",
                                 ymax = "upperCI") + ylab(response) + theme
        nCond <- length(unique(x[[i]]$MargCond))
        if (nCond > 1) {
            # one plot per row of marginal_data
            if (is.null(ncol)) ncol <- max(floor(sqrt(nCond)), 3)
            plots[[i]] <- plots[[i]] +
                facet_wrap("MargCond", ncol = ncol)
        }
        if (length(effects) == 2) {
            # differentiate by colour in case of interaction effects
            plots[[i]] <- plots[[i]] +
                aes_string(colour = effects[2], fill = effects[2])
        }
        if (is.numeric(x[[i]][, effects[1]])) {
            # smooth plots for numeric predictors
            plots[[i]] <- plots[[i]] +
                geom_ribbon(alpha = 0.2, color = NA) +
                geom_ribbon(aes_string(ymin = "lowerCI80", ymax = "upperCI80"), alpha = 0.2, color = NA) +
                geom_line()
            if (rug) {
                plots[[i]] <- plots[[i]] +
                    geom_rug(aes_string(x = effects[1]), sides = "b",
                                     data = attr(x[[i]], "points"), inherit.aes = FALSE)
            }
        } else {
            # points and errorbars for factors
            plots[[i]] <- plots[[i]] +
                geom_linerange(aes_string(ymin = "lowerCI80", ymax = "upperCI80"), size = 1.7,
                                             position = position_dodge(width = 0.4)) +
                geom_linerange(aes_string(ymin = "lowerCI", ymax = "upperCI"), size = 0.7,
                                             position = position_dodge(width = 0.4)) +
                geom_point(position = position_dodge(width = 0.4),
                                     size = 4 / nCond^0.25)
        }
        if (points) {
            # show the data as points in the plot
            plots[[i]] <- plots[[i]] +
                geom_point(aes_string(x = effects[1], y = ".RESP"), shape = 1,
                                     size = 4 / nCond^0.25, data = attr(x[[i]], "points"),
                                     inherit.aes = FALSE)
        }
        if (do_plot) {
            plot(plots[[i]])
            if (i == 1) devAskNewPage(ask = ask)
        }
    }
    invisible(plots)
}

Get the path to the model-file fitted on the cluster

get_coefficient_path = function(file, population) {
    paste0("coefs/", population, "/", file, ".rds")
}

Echo knit child documents as asis_output

asis_knit_child = function(file) {
    asis_output(knit_child(file, quiet = T))
}

Summarise a brms model using a knitr-child document

summarise_model = function() {
    asis_knit_child('0__model_summary.Rmd')
}

get effect of 10y paternal age increase as pctage change

get_10y_effect = function(model_name, population) {
    paternal_age_10y_effect(readRDS(get_coefficient_path(model_name, population)))[3,]
}

Comparison plot

comp_plot = function(model_name) {
    bind_rows(
        "Krummhörn" = get_10y_effect(model_name, "krmh"),
        "Québec" = get_10y_effect(model_name, "rpqa"),
        "20th-century Sweden" =  get_10y_effect(model_name, "swed"),
        "Historical Sweden" = get_10y_effect(model_name, "ddb"),
        .id = "Population"
    ) %>% mutate(
        median_estimate = as.numeric(median_estimate),
        lower95 = as.numeric(str_match(ci_95, "\\[(-?[0-9.]+);")[,2]),
        upper95 = as.numeric(str_match(ci_95, ";(-?[0-9.]+)]")[,2]),
        lower80 = as.numeric(str_match(ci_80, "\\[(-?[0-9.]+);")[,2]),
        upper80 = as.numeric(str_match(ci_80, ";(-?[0-9.]+)]")[,2])
    ) -> ests
    print(  ggplot(ests, aes(x = factor(Population), y = median_estimate, ymin = lower95, ymax = upper95)) + 
        geom_hline(yintercept = 0, linetype = 'dashed') +
        geom_linerange(size = 0.7) +
        geom_pointrange(aes(ymin = lower80, ymax = upper80), size = 1.7) +
        scale_x_discrete("Population") +
        scale_y_continuous("Paternal age effect within family") +
        coord_flip()
    )
    ests %>% select(Population, median_estimate, ci_95, ci_80)
}