documentation on the functions is interspersed through code comments
# library = function (...) suppressMessages(base::library(...))
# This should always be the default. Google for reason if not clear.
options(stringsAsFactors = FALSE)
options(digits = 4)
library(rmarkdown); library(formatR); library(foreign); library(Hmisc); library(car); library(psych); library(QuantPsyc); library(formr); library(ggplot2); library(lubridate); library(stringr); library(stringi); library(reshape2); library(plyr); library(knitr); library(data.table); library(dplyr); library(dtplyr); library(broom)
library(pander)
# I'm tired of data table messing up my rmarkdown. Never want to print this in a live report anyway, I'm using pander for that.
options(datatable.print.nrows=0)
analysis_theme = theme_minimal(base_size = 24)
theme_set(analysis_theme)
Auto-adjust the table column alignment depending on data type.
alignment = function (...) UseMethod('alignment')
alignment.default = function (...) 'left'
alignment.integer = function (...) 'right'
alignment.numeric = function (...) 'right'
panderOptions("table.split.table", Inf)
cva_bin = function(x) {
p = mean(x,na.rm=T)
sqrt(p*(1-p)) /
p
}
cva = function(x) {
sd(x, na.rm = T) /
mean(x, na.rm = T)
}
center = function(x) { scale(x, scale = FALSE) }
Mode <- function(x) { # from https://stackoverflow.com/questions/2547402/standard-library-function-in-r-for-finding-the-mode
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
sample_level2 = function(data, level2_var, size, replace = F, prob = NULL) {
if(is.data.table(data)) {
some = data[, level2_var, with = F][[1]]
} else {
some = data[ ,level2_var]
}
some_u = unique(some)
sample_some = sample(some_u, size, replace, prob)
take = some %in% sample_some
data[take,]
}
pander_escape = function(x, ...) {
x$Pattern = stringr::str_replace_all(x$Pattern,"_","\\\\_")
x
}
formula.tools got a bit too picky
my_as.character.formula = function(x) Reduce( paste, deparse(x) )
pander_handler = function(x, ...) {
anyS3method = function(x) {
classes = class(x)
any(sapply(classes, FUN = function(classes) { !is.null(getS3method('pander',classes,TRUE)) } ))
}
if ("knit_asis" %in% class(x)) {
x
} else if (is.data.table(x)) {
""
# don't ever print stupid data tables
} else if (anyS3method(x)) {
pander(x, row.names = F, ...)
} else if (isS4(x)) {
show(x)
} else {
print(x)
}
}
plot_zero_infl = function(variable) {
rpois.od<-function (n, lambda,d=1) {
if (d<=1)
rpois(n, lambda)
else if (d>1)
rnbinom(n, size=(lambda/(d-1)), mu=lambda)
}
if( var(variable,na.rm=T) > mean(variable,na.rm=T)) { lab = paste0("Overdispersed (d=",round(var(variable,na.rm=T)/mean(variable,na.rm=T),2),") poisson generated using distribution mean and variance via neg.bin")
} else { lab = "Poisson generated with distribution mean"}
qplot(rpois.od(n=NROW(variable),mean(variable,na.rm=T), var(variable,na.rm=T)/mean(variable,na.rm=T)), binwidth=1, fill=I("tomato3"), alpha=I(0.5)) + ggtitle(paste0("Slate=Real; Tomato=Generated\n",lab)) +
geom_histogram(aes(x=variable),fill="darkslategray4",alpha=0.5,binwidth=1) + xlab("")
}
lstype = function(types=c('lmerMod','glmerMod','bglmerMod','blmerMod'), envir = parent.frame(), ...){
inlist<-ls(envir = envir, ...)
ifexistsgetclass = function(x, envir) {
if(exists(x, envir = envir, inherits = FALSE)) {
bla = get(x, envir = envir, inherits = FALSE)
class(bla)
} else {
NULL
}
}
classlist<- sapply(inlist,function(x) ifexistsgetclass(x, envir = envir))
if(length(classlist) > 0 ) {
names(classlist[
sapply(classlist,function(x){ !!length(intersect(x, types) ) }) ## take any who have a class that is in our list
])
} else character(0)
}
fortify_mine = function(fit,confint="Wald", modelname = NULL, mcmclist = NULL) {
if(is.null(modelname)) modelname = deparse(substitute(fit))
info = summary(fit)
link = "identity"
family = "gaussian"
call = ""
if(inherits(fit, what = "MCMCglmm")) {
library(MCMCglmm)
if(is.null(mcmclist)) {
conf <- as.data.frame(info$solutions[, c('post.mean','l-95% CI','u-95% CI')])
names(conf) = c("Estimate","Lower","Upper")
} else {
conf = as.data.frame(summary(mcmclist, quantiles = c(0.05,0.5,0.95))$quantiles)
names(conf) = c("Lower","Estimate","Upper")
conf = conf[,c("Estimate","Lower","Upper")]
}
call = my_as.character.formula(fit$Fixed$formula)
conf$Coefficient = rownames(conf)
N_obs = fit$Residual$nrl
N_fam = fit$Random$nrl
from_class = "MCMCglmm"
Outcome = as.character(fit$Fixed$formula[2])
family = head(unique(fit$family),1)
if(family != 'gaussian') link = "log"
} else if (inherits(fit, what="glmmPQL")) {
summ = summary(fit)
conf = data.frame(summ$tTable)
conf$Coefficient = rownames(conf)
conf$Lower = conf$Value - 1.96 * conf$Std.Error
conf$Upper = conf$Value + 1.96 * conf$Std.Error
conf$Estimate = conf$Value
N_obs = nrow(summ$groups)
N_fam = nrow(unique(summ$groups))
link = fit$family$link
family = fit$family$family
from_class = "glmmPQL"
Outcome = names(fit$data)[1]
} else if(inherits(fit,what='merMod') | inherits(fit,what='lm')| inherits(fit,what='coxme'))
{
if(confint=="boot"){
# conf <- confint(fit,method= confint, parallel = "multicore", ncpus = 256)
conf <- confint(fit,method= confint, parallel = "multicore", ncpus = parallel::detectCores())
} else
{
conf <- confint(fit,method= confint )
}
# citation? https://stat.ethz.ch/pipermail/r-sig-mixed-models/2013q4/021197.html
conf = na.omit(as.data.frame(conf))
conf$Coefficient = rownames(conf)
names(conf) = c('Lower','Upper','Coefficient')
if(inherits(fit,what='merMod')) {
library(lme4)
conf$Estimate = fixef(fit)
call = my_as.character.formula(fit@call)
N_obs = as.vector(info$devcomp$dims["N"])
N_fam = as.vector(info$ngrps)
if(length(N_fam)>1) N_fam = as.vector(info$ngrps['idParents'])
from_class = "merMod"
Outcome = as.character(fit@call$formula[2])
if(!is.null(info$link)) link = info$link
if(!is.null(info$family)) family = info$family
} else if(inherits(fit, what="coxme")) {
conf[,1] = fit$coefficients-1.96*sqrt(diag(vcov(fit)))
conf[,2] = fit$coefficients+1.96*sqrt(diag(vcov(fit)))
conf$Estimate = coef(fit)
call = my_as.character.formula(fit$call)
N_obs = tail(fit$n,1)
N_fam = head(fit$n,1)
from_class = "coxme"
Outcome = my_as.character.formula(fit$call)[2]
link = "log"
family = "survival"
} else {
call = my_as.character.formula(fit$call)
conf$Estimate = coef(fit)
N_obs = nrow(fit$model)
N_fam = nrow(fit$model)
from_class = "lm"
Outcome = names(fit$model)[1]
if(!is.null(fit$family$link)) link = fit$family$link
if(!is.null(fit$family$family)) family = fit$family$family
}
} else
{
stop("Unknown object")
}
coefs = within(conf, {
N = head(N_obs,1)
Families = head(N_fam,1)
Outcome = Outcome
Model = modelname
Method = from_class
Link = link
Family = family
Call = call
})
if(link %in% c("log","logit")) {
# transform logits into Odds Ratios (ORs) and logs (poisson-link) into Incidence Rate Ratios
coefs[,c("Estimate","Lower","Upper")] = exp(coefs[,c("Estimate","Lower","Upper")])
}
class(coefs) = c( class(coefs), 'confint.coefs')
class(coefs) = c( class(coefs), paste0('confint.',from_class))
coefs
}
plot_fortified_mer = function(coefs,ylab = "Effect on outcome",suppress_intercept=T,colour = "")
{
modelname = coefs$Model[1]
# coefs = fortify.mine(fit, 'Wald', modelname)
# info = summary(fit)
# list_of_coefs[[ modelname ]] <<- list(coefs, info, fit)
plot_title = str_c(paste0(unique(coefs[,"N"]), collapse="/"),' individuals in ',paste0(unique(coefs[,"Families"]), collapse="/"), ' families')
if(suppress_intercept) {
intercept = coefs[which(coefs$Coefficient == "(Intercept)"),]
plot_title = str_c(plot_title,"\nIntercept = ",as.character(round(intercept['Estimate'],2)), ", [",as.character(round(intercept["Lower"],2)),";",as.character(round(intercept["Upper"],2)),"]")
coefs = coefs[which(coefs$Coefficient != "(Intercept)"),]
}
if(coefs$Link[1] %in% c("logit","log")) {
null.point = 1; scaleY = scale_y_log10(ylab)
} else {
null.point = 0; scaleY = scale_y_continuous(ylab)
}
coefs$Coefficient = car::Recode(coefs$Coefficient,"'paternalage.diff'='Paternal age';'paternalage.mean'='Paternal age\\nfamily average'")
coefs$Coefficient = str_replace(coefs$Coefficient, "paternalage.factor", "Paternal age ")
order = sort(unique(coefs$Coefficient))
main = c('Paternal age\nfamily average','Paternal age')
order = c(setdiff(order, main), intersect(main, order))
coefs$Coefficient = factor(coefs$Coefficient,levels = order)
if(colour != "") {
aesthetics = aes_string(x = "Coefficient", y = "Estimate", ymin = "Lower", ymax = "Upper", colour = colour)
} else {
aesthetics = aes_string(x = "Coefficient", y = "Estimate", ymin = "Lower", ymax = "Upper")
}
ggplot(coefs, aesthetics) +
geom_pointrange(position=position_dodge(width=0.8), size = 1) +
coord_flip() +
ggtitle(plot_title) +
scaleY +
geom_text(aes(label=round(Estimate,2)), vjust = 2, position=position_dodge(width=0.8)) +
scale_x_discrete("Predictors") +
geom_hline(yintercept = null.point, linetype='dotted') +
analysis_theme + theme(panel.grid.minor.y = element_blank(),panel.grid.major.y =element_blank(),panel.grid.major.x = element_line(colour="#eeeeee"))
}
recenter.pat = function(data, what = 'paternalage', among_who = 'idParents', breaks = 'c(0 ,25, 30, 35, 40, 45, 50, 55, 90)') {
makecenter = parse(text= str_c(what,'_c := meanCenter(',what,')') )
makemean = parse(text= str_c(what,'.mean := ave(',what,',',among_who,', FUN= function(x) { mean(x,na.rm=T) })'))
makediff = parse(text= str_c(what,'.diff := ',what,' -',what,'.mean'))
make_factors = parse(text= str_c(what,'.factor := cut((10*',what,'),breaks = ',breaks,', include.lowest = T, right = T)'))
data[, eval(makecenter)]
data[, eval(makemean)]
data[, eval(makediff)]
data[, eval(make_factors)]
data
}
dependent_sibs_f5y = function(survive1y, byear, dyear) {
sibs = length(byear)
other_dependent_sibs_f5y = integer(length=sibs)
for(i in 1:sibs) {
other_births = byear[-i]
other_deaths = dyear[-i]
other_made1y = survive1y[-i]
my_sibs = sibs - 1 - # minus self
sum(
other_births > (byear[i] + 5) | # born more than 5y later
(other_births + 5) < byear[i] | # finished infancy before birth
other_deaths <= byear[i] | # died before birth
other_made1y == 0, # if they died right away, don't count
na.rm=T) # if dyear missing assume they lived
other_dependent_sibs_f5y[i] = my_sibs
}
other_dependent_sibs_f5y
}
younger_sibs_alive_and_dependent = function(survive5y, byear, dyear) {
made5y = sum(survive5y==1 | is.na(survive5y)) # number of siblings who turned at least 5
sibs = length(survive5y)
younger_sibs_alive_and_dependent_in_first_5y = integer(length=sibs)
for(i in 1:sibs) {
younger_sibs = byear >= byear[i] # not using < because of twins
younger_sibs[i] = F # minus self
my_sibs = sum(younger_sibs, na.rm = T) # minus self
if(my_sibs > 0) {
sib_births = byear[ which(younger_sibs) ]
sib_deaths = dyear[ which(younger_sibs) ]
my_sibs = my_sibs -
sum(
sib_births > (byear[i] + 5) | # born more than 5y later
(sib_deaths <= sib_births + 1) # died within less than a year
,na.rm=T)
younger_sibs_alive_and_dependent_in_first_5y[i] = my_sibs
}
}
younger_sibs_alive_and_dependent_in_first_5y
}
older_sibs_alive_and_dependent = function(survive5y, byear, dyear) {
made5y = sum(survive5y,na.rm = T) # number of siblings who turned at least 5
sibs = length(survive5y)
older_sibs_alive_and_dependent_in_first_5y = integer(length=sibs)
for(i in 1:sibs) {
older_sibs = byear <= byear[i] # not using < because of twins
older_sibs[i] = F # minus self
my_sibs = sum(older_sibs,na.rm = T) # minus self
if(my_sibs > 0) {
sib_births = byear[ which(older_sibs) ]
sib_deaths = dyear[ which(older_sibs) ]
my_sibs = my_sibs -
sum(
sib_births < (byear[i] - 5) | # others born more than 5y earlier than me # 10 seconds of 17
(sib_deaths <= sib_births + 1) | # died within less than a year # 7 seconds of 17
(sib_deaths <= byear[i]) # died before my birth
,na.rm=T)
older_sibs_alive_and_dependent_in_first_5y[i] = my_sibs
}
}
older_sibs_alive_and_dependent_in_first_5y
}
this function takes a coefficient df which contains a cut-up/binned factor predictor and makes a plot to verify the linearity of the predictor, optionally accompanied by an mcmclist and plots the OR for the paternal age factor
mod_coefs_factor = function(fixefs, factor_base = "paternalage.factor")
{
if(!inherits(fixefs, what = "confint.coefs")) stop("Need a confint.coefs object.")
fixefs = as.data.table(fixefs)
fixefs = fixefs[ Coefficient %contains% factor_base, ]
if(nrow(fixefs) > 0) {
fixefs[, Trait := str_match(Coefficient, "(.+):")[,2] ]
escaped_fb = Hmisc::escapeRegex(as.character(factor_base))
fixefs[ , Coefficient := str_match(Coefficient,paste0(escaped_fb, "(.+)"))[,2] ]
fixefs[ , Coefficient := str_match(Coefficient, "\\(([0-9.,]+)\\]")[,2]]
fixefs[ , Coefficient := str_replace(Coefficient, "\\.",replacement="")]
fixefs[ , Coefficient := str_replace(Coefficient, ",",replacement="-")]
reference = ifelse( fixefs$Link[1] %in% c("log","logit"), 1, 0)
fixefs = rbind(fixefs, fixefs[nrow(fixefs), ])
fixefs[nrow(fixefs), Lower := reference]
fixefs[nrow(fixefs), Upper := reference]
fixefs[nrow(fixefs), Estimate := reference]
fixefs[nrow(fixefs), Coefficient := "14-25"]
fixefs
}
}
plot the factor ORs to see if it’s linear
plot_factor_linearity = function(fixefs, factor_base = "paternalage.factor")
{
if(!inherits(fixefs, what = "confint.coefs")) stop("Need a confint.coefs object.")
fixefs = as.data.table(fixefs)
fixefs = fixefs[ Coefficient %contains% factor_base, ]
if(nrow(fixefs) > 0) {
fixefs[, Trait := str_match(Coefficient, "(.+):")[,2] ]
escaped_fb = Hmisc::escapeRegex(as.character(factor_base))
fixefs[ , Coefficient := str_match(Coefficient,paste0(escaped_fb, "(.+)"))[,2] ]
fixefs[ , Coefficient := str_match(Coefficient, "\\(([0-9.,]+)\\]")[,2]]
fixefs[ , Coefficient := str_replace(Coefficient, "\\.",replacement="")]
fixefs[ , Coefficient := str_replace(Coefficient, ",",replacement="-")]
reference = ifelse( fixefs$Link[1] %in% c("log","logit"), 1, 0)
just_coefs = rbind(data.table( Lower = reference, Upper = reference, Estimate = reference, Coefficient = "14-25", Trait = unique(fixefs$Trait)[1]), fixefs[,list(Lower, Upper, Estimate, Coefficient, Trait)], use.names = T)
if(length(unique(fixefs$Trait))>1) just_coefs = rbind(data.table(Lower = reference, Upper = reference, Estimate = reference, Coefficient = "14-25", Trait = unique(fixefs$Trait)[2]), just_coefs)
just_coefs[is.na(Trait), Trait := ""]
qplot(data=just_coefs, x = Coefficient, y = Estimate, geom = c("pointrange","line"),ymin = Lower, ymax = Upper, group =1 ) + ggtitle(fixefs$Model[1]) + analysis_theme + xlab("Paternal age") + scale_y_log10(paste("Effect on",fixefs$Outcome[1]),breaks=c(0.25,0.5,0.75,1)) + geom_smooth(aes(weight = ifelse(Upper-Lower != 0, 1/(Upper-Lower),1)) ,method = "lm", se = F, lty = "dashed") + facet_wrap(~ Trait)
# qplot(data=just_coefs, x = Coefficient, y = Estimate, geom = c("pointrange","line"),ymin = Lower, ymax = Upper, group =1 ) + ggtitle(fixefs$Model[1]) + analysis_theme + xlab("Paternal age") + ylab(paste("Effect on",fixefs$Outcome[1])) + geom_smooth(method = "lm", se = F, lty = "dashed") + facet_wrap(~ Trait)
}
}
make_newdata = function(pred, olddata) {
newdata = pred
newdata$idParents = rep(1:2, length.out=nrow(newdata))
left = setdiff(names(olddata), names(newdata))
for(i in seq_along(left)) {
barename = str_match(left[i],"(.+\\()?([^()]+)\\)?")[,3] # bust var out of function term
if(is.numeric(olddata[, left[i]]) & length(unique(na.omit(olddata[, left[i]]))) == 2) {
newdata[, barename] = mean(olddata[, left[i]],na.rm=T)
} else if(is.logical((olddata[, left[i]]))) {
newdata[, barename] = FALSE
} else if(is.numeric(olddata[, left[i]])) {
newdata[, barename] = median(olddata[, left[i]],na.rm=T)
} else if(is.factor(olddata[, left[i] ])) {
fact = unique(olddata[,left[i] ]) ## choose reference for factors
newdata[, barename] = fact[fact==levels(fact)[1]]
} else {
newdata[, barename] = Mode(olddata[,left[i] ])
}
}
newdata = newdata[order(pred[,1]),]
rownames(newdata) = 1:nrow(newdata)
newdata
}
(input are the result of fortify_mine)
compare_coefs = function(x, y, colour = "Model")
{
if(is(x, 'confint.coefs') & is(y, 'confint.coefs')) {
if(unique(x$Model)==unique(y$Model)) {
x$Model = deparse(substitute(x))
y$Model = deparse(substitute(y))
}
x$CI = str_c( sprintf("%1.2f", x$Lower), ";", sprintf("%1.2f", x$Upper) )
y$CI = str_c( sprintf("%1.2f", y$Lower), ";", sprintf("%1.2f", y$Upper) )
coefs = merge(x[, c('Coefficient','Estimate','CI')], y[, c('Coefficient','Estimate','CI')], by = "Coefficient")
coefs = coefs[, c("Coefficient", setdiff(sort(names(coefs)),"Coefficient"))]
print(plot_fortified_mer(rbind(x,y), colour = colour))
kable(coefs,digits = 2, row.names=FALSE)
} else {
stop("Need confint.coefs")
}
}
shorthand for comparing models with few/many controls
compare_controlled_to_uncontrolled = function(ucon, con) {
ucon$Controls = "few"
con$Controls = "many"
compare_coefs(ucon, con, colour = "Controls")
}
shorthand for sample comparison
compare_krmh_to_rpqa = function(krmh_coefs, rpqa_coefs) {
krmh_coefs$Population = "Krummhörn"
rpqa_coefs$Population = "Québec"
rpqa_coefs=rpqa_coefs[!rpqa_coefs$Coefficient %contains% "birth.cohort", ]
krmh_coefs=krmh_coefs[!krmh_coefs$Coefficient %contains% "birth.cohort", ]
compare_coefs(krmh_coefs, rpqa_coefs, colour = "Population")
}
putting in a named list to name the origin of the coefficients
merge_coefs = function(...) {
dots <- list(...)
if(length(dots) == 0) return(NULL)
for(i in 1:length(dots)) {
dots[[i]]$Population = names(dots)[i]
}
coefs = rbindlist(dots, fill = T)
coefs = data.table(coefs)
class(coefs) = c( class(coefs), 'confint.coefs')
coefs
}
another comparison plot
factor_plot = function(coefs) {
coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))
ggplot(coefs,
aes(colour = Population, x = Coefficient, y = Estimate, ymin = Lower, ymax = Upper,weight = ifelse(Upper-Lower != 0, 1/(Upper-Lower),1), group = Population)) +
geom_pointrange(position = position_dodge(width = 0.4), size = 1) +
# geom_line(position = position_dodge(width = 0.4))+
geom_smooth(position = position_dodge(width = 0.4),method = "lm", se = F, linetype = "dashed", size = 0.2) +
xlab("Paternal age") +
scale_fill_brewer(palette = "Set2") +
scale_colour_brewer(palette = "Set2") +
scale_y_log10(breaks = c(0.4,0.6,0.8,1,1.2))+
geom_hline(y = 1, colour = "black", linetype = "dotted") +
analysis_theme
}
another comparison plot
factor_boot_plot = function(coefs) {
coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))
coefs$Population = factor(coefs$Population, c("Québec","Modern Sweden","Historical Sweden","Krummhörn"))
ggplot(coefs, aes(x = paternalage.factor, y = value, fill = Population)) +
geom_violin(colour = "transparent", alpha = 0.3, position = position_identity(), scale = "width") +
geom_point(aes(colour = Population),stat = "summary", fun.data = "mean_sdl", position = position_identity())+
geom_smooth(aes(colour = Population, weight = 1/sd(value, na.rm = T),group = Population),method = "lm", se = F, lty = "dashed", position = position_identity()) +
ggtitle("Children of older fathers have lower evolutionary fitness\nfurther controls") +
scale_fill_brewer(palette = "Set2") +
scale_colour_brewer(palette = "Set2") +
xlab("Paternal age") +
analysis_theme +
theme(legend.position = c(1,1),
legend.justification = c(1,1))
}
produce a (weighted) count of offspring and merge it back on the individual
count_and_merge = function(df, what, wt_var) {
if(!is.data.table(df)) df = as.data.table(df)
setnames(df, wt_var, ".tmp_wt")
counted.dad = df %>%
dplyr::group_by(idPere) %>%
dplyr::summarise(.tmp = sum(.tmp_wt, na.rm = T)) %>%
dplyr::rename(idIndividu = idPere)
counted.mom = df %>% dplyr::group_by(idMere) %>%
dplyr::summarise(.tmp = sum(.tmp_wt, na.rm = T)) %>%
dplyr::rename(idIndividu = idMere)
counted = rbind(counted.dad,counted.mom)
df = data.table(merge(df,counted,by='idIndividu',all.x=T))
df[is.na(.tmp), .tmp := 0]
setnames(df, old = c(".tmp",".tmp_wt"), new = c(what,wt_var))
df
}
human_readable_filename = function(file) {
file %>%
basename %>%
tools::file_path_sans_ext() %>%
str_sub(3) %>%
str_replace_all("_+"," ") %>%
str_replace("peer" ,"Peerage") %>%
str_replace("ddb" ,"historical Sweden") %>%
str_replace("swed" ,"modern Sweden") %>%
str_replace("rpqa" ,"Québec") %>%
str_replace("krmh" ,"Krummhörn") %>%
str_replace("massage" ,"Data Wrangling") %>%
stri_trans_totitle()
}
human_readable_factor = function(factor) {
tfactor = as.character(factor)
tfactor = str_match(tfactor,"\\(?([0-9.,]+)\\]")[,2]
if(any(is.na(tfactor))) {
return(factor)
}
tfactor %>%
str_replace(.,"\\.",replacement="") %>%
str_replace(.,",",replacement="-") %>%
str_replace(.,"0-25",replacement="10-25")
}
describeBin = function(df) {
df2 = colwise(function(x){ if(is.factor(x)){ ifelse(x=="TRUE",1,0)} else{ x} })(df)
df2 %>%
data.frame() %>%
melt(.,id.vars=c()) %>%
group_by(variable) %>%
summarise(n = sum(!is.na(value)),
mean = mean(value, na.rm=T),
sd = mean(value, na.rm=T) * (1-mean(value, na.rm=T))) %>%
data.frame() -> descriptives
rownames(descriptives) = descriptives$variable
descriptives$variable = NULL
round(descriptives,2)
}
aggDemoTrends = function(df) {
df[, list(byear, paternalage, maternalage, maternalage_at_1st_sib, maternalage_at_last_sib,paternalage_at_1st_sib,paternalage_at_last_sib)] %>%
mutate(maternalage_at_1st_sib = ifelse(maternalage_at_1st_sib==maternalage, maternalage_at_1st_sib, NA), # don't count sibs twice
paternalage_at_1st_sib = ifelse(paternalage_at_1st_sib==paternalage, paternalage_at_1st_sib, NA),
maternalage_at_last_sib = ifelse(maternalage_at_last_sib==maternalage, maternalage_at_last_sib, NA),
paternalage_at_last_sib = ifelse(paternalage_at_last_sib==paternalage, paternalage_at_last_sib, NA)) %>%
melt(id.vars=c("byear")) %>%
mutate(Parent = ifelse(variable %begins_with% "paternal","Father","Mother"),
variable = ifelse(variable %contains% "1st", "first", ifelse(variable %contains% "last", "last", "all"))
) %>%
group_by(byear, Parent, variable) %>%
mutate(value = value*10) %>%
dcast(byear + Parent ~ variable, fun.aggregate = function(x){mean(x,na.rm=T)}) %>%
rename(Year = byear) %>%
data.table()
}
#
# colour_values = c(
# "Krummhörn" = "#542437",
# "Québec" = "#53777A",
# "Modern Sweden" = "#D95B43",
# "Historical Sweden" = "#C02942"
# )
# colour_values = c(
# "Krummhörn" = "#FF9C5B",
# "Québec" = "#3B8183",
# "Modern Sweden" = "#ED303C",
# "Historical Sweden" = "#F5634A"
# )
# colour_values = c(
# "Krummhörn" = "#5E8C6A",
# "Québec" = "#8C2318",
# "Modern Sweden" = "#88A65E",
# "Historical Sweden" = "#BFB35A"
# )
# colour_values = c(
# "Krummhörn" = "#89E0B9",
# "Québec" = "#068981",
# "Modern Sweden" = "#535659",
# "Historical Sweden" = "#F4F9CB"
# )
# source: http://www.colourlovers.com/palette/3735149/Some_Elegance
colour_values = c(
"Krummhörn" = "#E2A674",
"Québec" = "#9F3B2B",
"20th-century Sweden" = "#8A846C",
"Historical Sweden" = "#21523B"
)
pop_colour = scale_colour_manual(values = colour_values)
pop_fill = scale_fill_manual(values = colour_values)
simple way to get bins by year
year_bins = function(x, bin = 5) {
ifelse(is.na(x), NA_character_,
paste0(plyr::round_any(x, bin, floor),"-",plyr::round_any(x, bin, floor) + bin)
)
}
paternal_age_10y_effect = function(model, predictor_name = "paternalage") {
nam = rownames(fixef(model))
pred_name = nam[nam %contains% predictor_name & ! nam %contains% paste0(predictor_name, ".mean")]
if (length(pred_name) > 0 ) {
pred_name = pred_name[1]
predictor = data.frame(paternalage = c(2.5, 3.5))
probs = c(0.025, 0.1, 0.5, 0.9, 0.975)
if (pred_name %in% names(model$data)) {
names(predictor) = pred_name[1]
}
# brms:::prepare_conditions
newd = make_newdata(predictor, olddata = model$data)
newd$response = newd$trait = NULL
fixefs = round(exp(fixef(model, estimate = "quantile", probs = probs)[pred_name,]), 2)
prediction = fitted(model, newdata = newd, re_formula = NA, summary = F)
estlow = round(quantile(prediction[,1], probs = probs),2)
esthigh = round(quantile(prediction[,2], probs = probs),2)
if (model$family$family == "bernoulli") {
pct_change = prediction[,2] - prediction[,1]
pct_decrease = round(quantile(pct_change, probs = probs) * 100,2)
} else {
pct_change = prediction[,2] / prediction[,1] - 1
pct_decrease = round(quantile(pct_change, probs = probs) * 100, 2)
}
estimates = data.frame(
effect = c("estimate father 25y", "estimate father 35y","percentage change", "OR/IRR"),
median_estimate = c(as.character(estlow['50%']),
as.character(esthigh['50%']),
as.character(pct_decrease['50%']),
as.character(fixefs['50%'])),
ci_95 = c(
paste0('[',estlow['2.5%'],';',estlow['97.5%'],']'),
paste0('[',esthigh['2.5%'],';',esthigh['97.5%'],']'),
paste0('[',pct_decrease['2.5%'],';',pct_decrease['97.5%'],']'),
paste0('[',fixefs['2.5%'],';',fixefs['97.5%'],']')
),
ci_80 = c(
paste0('[',estlow['10%'],';',estlow['90%'],']'),
paste0('[',esthigh['10%'],';',esthigh['90%'],']'),
paste0('[',pct_decrease['10%'],';',pct_decrease['90%'],']'),
paste0('[',fixefs['10%'],';',fixefs['90%'],']')
))
if (model$family$family == "hurdle_poisson") {
fixefs2 = round(exp(fixef(model, estimate = "quantile", probs = probs)[paste0("hu_",pred_name),]), 2)
estimates2 = data.frame(
effect = c("OR hurdle"),
median_estimate = as.character(fixefs2['50%']),
ci_95 = c(paste0('[',fixefs2['2.5%'],';',fixefs2['97.5%'],']')),
ci_80 = c(paste0('[',fixefs2['10%'],';',fixefs2['90%'],']')))
estimates = bind_rows(estimates, estimates2)
}
estimates
} else {
" "
}
}
effect_of_avg_nr_of_mutations = function(model, predictor_name = "paternalage") {
nam = rownames(fixef(model))
pred_name = nam[nam %contains% predictor_name & ! nam %contains% paste0(predictor_name, ".mean")]
if (length(pred_name) > 0 ) {
pred_name = pred_name[1]
predictor = data.frame(paternalage = c(0, 3))
probs = c(0.025, 0.1, 0.5, 0.9, 0.975)
if (pred_name %in% names(model$data)) {
names(predictor) = pred_name[1]
}
# brms:::prepare_conditions
newd = make_newdata(predictor, olddata = model$data)
newd$response = newd$trait = NULL
fixefs = round(exp(fixef(model, estimate = "quantile", probs = probs)[pred_name,]), 2)
prediction = fitted(model, newdata = newd, re_formula = NA, summary = F)
estlow = round(quantile(prediction[,1], probs = probs),2)
esthigh = round(quantile(prediction[,2], probs = probs),2)
if (model$family$family == "bernoulli") {
pct_change = (prediction[,2] - prediction[,1])
pct_decrease = round(quantile(pct_change, probs = probs) * 100,2)
} else {
pct_change = (prediction[,2] / prediction[,1] - 1)
pct_decrease = round(quantile(pct_change, probs = probs) * 100, 2)
}
estimates = data.frame(
effect = c("estimate at 0 mutations", "estimate at 60 mutations","percentage change", "OR/IRR"),
median_estimate = c(as.character(estlow['50%']),
as.character(esthigh['50%']),
as.character(pct_decrease['50%']),
as.character(fixefs['50%'])),
ci_95 = c(
paste0('[',estlow['2.5%'],';',estlow['97.5%'],']'),
paste0('[',esthigh['2.5%'],';',esthigh['97.5%'],']'),
paste0('[',pct_decrease['2.5%'],';',pct_decrease['97.5%'],']'),
paste0('[',fixefs['2.5%'],';',fixefs['97.5%'],']')
),
ci_80 = c(
paste0('[',estlow['10%'],';',estlow['90%'],']'),
paste0('[',esthigh['10%'],';',esthigh['90%'],']'),
paste0('[',pct_decrease['10%'],';',pct_decrease['90%'],']'),
paste0('[',fixefs['10%'],';',fixefs['90%'],']')
))
if (model$family$family == "hurdle_poisson") {
fixefs2 = round(exp(fixef(model, estimate = "quantile", probs = probs)[paste0("hu_",pred_name),]), 2)
estimates2 = data.frame(
effect = c("OR hurdle"),
median_estimate = as.character(fixefs2['50%']),
ci_95 = c(paste0('[',fixefs2['2.5%'],';',fixefs2['97.5%'],']')),
ci_80 = c(paste0('[',fixefs2['10%'],';',fixefs2['90%'],']')))
estimates = bind_rows(estimates, estimates2)
}
estimates
} else {
" "
}
}
render_web = function(file, auto_tab = T, include_top = "library/general_header.html", output_dir = "../paternal_age_fitness/") {
title = human_readable_filename(file)
pops = c("krmh","ddb","rpqa","swed")
pops = pops[file %contains% pops]
if(length(pops) == 1) include_top = paste0("library/",pops,"_header.html")
pandoc_args = list(paste0("--metadata=title:\"",title,"\""))
if(auto_tab) pandoc_args[[2]] = "--metadata=autotab:1"
output_format = html_document(
self_contained = FALSE,
template = "../paternal_age_fitness/library/website_theme.html",
lib_dir = paste0(output_dir, "library/"),
mathjax = NULL,
keep_md = TRUE,
includes = includes(before_body = include_top),
pandoc_args = pandoc_args,
theme = "paper"
)
output_format$knitr$opts_chunk$error = TRUE
render(input = file,
output_format = output_format,
envir = new.env(),
output_dir = output_dir
)
}
plot.brmsMarginalEffects_shades <- function(x, y, ncol = NULL, points = FALSE,
rug = FALSE, theme = ggplot2::theme(),
ask = TRUE, do_plot = TRUE, ...) {
for(i in seq_along(x)) {
old_attr = attributes(x[[i]])
x[[i]] = x[[i]] %>% dplyr::bind_cols(y[[i]] %>% dplyr::select(lowerCI, upperCI) %>% dplyr::rename(lowerCI80 = lowerCI, upperCI80 = upperCI))
new_col_names = attributes(x[[i]])$names
attributes(x[[i]]) = old_attr
attributes(x[[i]])$names = new_col_names
}
if("paternalage" %in% names(x)) {
order_x = names(x)
order_x = c("paternalage", order_x[order_x != "paternalage"])
x = x[order_x]
}
# Compute marginal effects plots using ggplot2
# Returns:
# A list of ggplot objects
if (do_plot) {
default_ask <- devAskNewPage()
on.exit(devAskNewPage(default_ask))
devAskNewPage(ask = FALSE)
}
plots <- setNames(vector("list", length(x)), names(x))
for (i in seq_along(x)) {
response <- attributes(x[[i]])$response
effects <- attributes(x[[i]])$effects
plots[[i]] <- ggplot(data = x[[i]]) +
aes_string(x = effects, y = "Estimate", ymin = "lowerCI",
ymax = "upperCI") + ylab(response) + theme
nCond <- length(unique(x[[i]]$MargCond))
if (nCond > 1) {
# one plot per row of marginal_data
if (is.null(ncol)) ncol <- max(floor(sqrt(nCond)), 3)
plots[[i]] <- plots[[i]] +
facet_wrap("MargCond", ncol = ncol)
}
if (length(effects) == 2) {
# differentiate by colour in case of interaction effects
plots[[i]] <- plots[[i]] +
aes_string(colour = effects[2], fill = effects[2])
}
if (is.numeric(x[[i]][, effects[1]])) {
# smooth plots for numeric predictors
plots[[i]] <- plots[[i]] +
geom_ribbon(alpha = 0.2, color = NA) +
geom_ribbon(aes_string(ymin = "lowerCI80", ymax = "upperCI80"), alpha = 0.2, color = NA) +
geom_line()
if (rug) {
plots[[i]] <- plots[[i]] +
geom_rug(aes_string(x = effects[1]), sides = "b",
data = attr(x[[i]], "points"), inherit.aes = FALSE)
}
} else {
# points and errorbars for factors
plots[[i]] <- plots[[i]] +
geom_linerange(aes_string(ymin = "lowerCI80", ymax = "upperCI80"), size = 1.7,
position = position_dodge(width = 0.4)) +
geom_linerange(aes_string(ymin = "lowerCI", ymax = "upperCI"), size = 0.7,
position = position_dodge(width = 0.4)) +
geom_point(position = position_dodge(width = 0.4),
size = 4 / nCond^0.25)
}
if (points) {
# show the data as points in the plot
plots[[i]] <- plots[[i]] +
geom_point(aes_string(x = effects[1], y = ".RESP"), shape = 1,
size = 4 / nCond^0.25, data = attr(x[[i]], "points"),
inherit.aes = FALSE)
}
if (do_plot) {
plot(plots[[i]])
if (i == 1) devAskNewPage(ask = ask)
}
}
invisible(plots)
}
get_coefficient_path = function(file, population) {
paste0("coefs/", population, "/", file, ".rds")
}
asis_knit_child = function(file) {
asis_output(knit_child(file, quiet = T))
}
summarise_model = function() {
asis_knit_child('0__model_summary.Rmd')
}
get_10y_effect = function(model_name, population) {
paternal_age_10y_effect(readRDS(get_coefficient_path(model_name, population)))[3,]
}
comp_plot = function(model_name) {
bind_rows(
"Krummhörn" = get_10y_effect(model_name, "krmh"),
"Québec" = get_10y_effect(model_name, "rpqa"),
"20th-century Sweden" = get_10y_effect(model_name, "swed"),
"Historical Sweden" = get_10y_effect(model_name, "ddb"),
.id = "Population"
) %>% mutate(
median_estimate = as.numeric(median_estimate),
lower95 = as.numeric(str_match(ci_95, "\\[(-?[0-9.]+);")[,2]),
upper95 = as.numeric(str_match(ci_95, ";(-?[0-9.]+)]")[,2]),
lower80 = as.numeric(str_match(ci_80, "\\[(-?[0-9.]+);")[,2]),
upper80 = as.numeric(str_match(ci_80, ";(-?[0-9.]+)]")[,2])
) -> ests
print( ggplot(ests, aes(x = factor(Population), y = median_estimate, ymin = lower95, ymax = upper95)) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_linerange(size = 0.7) +
geom_pointrange(aes(ymin = lower80, ymax = upper80), size = 1.7) +
scale_x_discrete("Population") +
scale_y_continuous("Paternal age effect within family") +
coord_flip()
)
ests %>% select(Population, median_estimate, ci_95, ci_80)
}