Krummhörn Data wrangling

description of data

The Krummhörn area in Eastern Frisia had around 15000 inhabitants in the early modern period. This population size was comparatively stable, because the settling of this relatively fertile marsh land concluded early.

This dataset resulted from a continuous research effort that started at the Anthropological Institute at the University of Göttingen, moved to the University College London and is now at the Centre for Philosophy and Scientific Basics at the University of Gießen. The focus of the investigation lies at the intersection of genetic and cultural reproduction.

The data is based on the life histories contained in church registers. These were collected systematically and combined genealogically according to the method of family reconstitution. The main aim is the identification of persons and the reconstitution of their life histories. There are additional data on their social background that could be gleaned from tax records and other social history sources.

The dataset was constructed according to the purposes of the biological inquiry, which results in the following aspects: Families that are mobile are only found in the file at one place, independent of the number of church communities that have records on them. The basic unit of analysis is the core family. Its cycle typically begins with a wedding and ends with death or the children’s weddings.

The file is not final; it is continuously being corrected, amended and extended by including further church records.

Loading details


Transforming data

# Read the individuals file and the unions file (Stata format).
krmh.individuals <- read.dta("data/kh_24Nov2014/famkind3.dta")
krmh.unions <- read.dta("data/kh_24Nov2014/KHfam-4.dta")

# Copy the identifier columns to the names used in the rest of this script.
krmh.individuals$idIndividu <- krmh.individuals$idk
krmh.unions$idPere <- krmh.unions$idm_new
krmh.unions$idMere <- krmh.unions$idf_new

# Left join on the union id `doc`: every individual is kept and receives the
# idPere / idMere of the matching union (NA when no union matches).
krmh.individuals <- merge(
    krmh.individuals,
    krmh.unions[, c("doc", "idPere", "idMere")],
    by = "doc",
    all.x = TRUE
)

##     0     1     2     3     4     5  <NA> 
## 26563 24290  7503  9261  7255  5936     0
# Convert both tables to data.table so the `[, col := ...]` syntax below works.
krmh.individuals = data.table(krmh.individuals)
krmh.unions = data.table(krmh.unions)

# make male var.
# `props` is not defined in this section; presumably it prints a frequency
# table for the formula (verify against its definition).
props(~ krmh.individuals$sex)
# male = 1 for sex 'M', 0 for sex 'W', NA for anything else.
krmh.individuals[, male := Recode(sex,"'M'=1;'W'=0;else=NA")]
props(~ krmh.individuals$male)
# The famstatus variable contains the following info
# married: 1 (if famnrk empty)
# single (but over 15): 2 (if over 5 but famnrk empty)
# died under 15: 3
# stillbirth: 4
# NOTE(review): "over 15" vs. "over 5" is inconsistent in the original note; confirm.
props(~ krmh.individuals$famstatus)
##### Get it into the form we're used to
length(unique(krmh.unions$doc)) # unique union identifier
### parent IDs

# idParents is a copy of the union id `doc`.
krmh.unions[, idParents := doc]
# Number of union ids that occur in both tables.
length(intersect(krmh.unions$doc, krmh.individuals$doc))
## [1] 28250
# Build one long `spouses` table by stacking the husband columns (suffix
# "_id_m") and the wife columns (suffix "_id_f") of the unions table.
# NOTE(review): this chunk does not parse as shown. The right-hand side of
# `krmh.u =` is missing, and the row filters `!$auswahl_id_f)` /
# `!$auswahl_id_m)` have lost text (presumably `!is.na(krmh.u$auswahl_id_*)`;
# confirm against the original script).
# NOTE(review): `str_sub(names, 1, -6)` drops the last five characters of every
# column name, so 'idm_new' / 'idf_new' would become 'id', while later code
# selects `idIndividu` from `spouses`. A renaming step seems to be missing.
# `%ends_with%` is not defined in this section.
krmh.u =
moms = krmh.u[!$auswahl_id_f),c('idf_new',names(krmh.u)[names(krmh.u) %ends_with% "_id_f"])]
dads = krmh.u[!$auswahl_id_m),c('idm_new',names(krmh.u)[names(krmh.u) %ends_with% "_id_m"])]
names(dads) = str_sub(names(dads),1,-6)
names(moms) = str_sub(names(moms),1,-6)
# Put the husband columns in the same order as the wife columns before stacking.
dads = dads[, names(moms)]
spouses = data.table(rbind(dads,moms))

join files

# In both tables, recode the single-blank string ' ' in ehemges / ehefges to 1
# and convert the columns to numeric.
krmh.individuals[, ehemges := as.numeric(Recode(ehemges,"' '=1"))]
krmh.unions[, ehemges := as.numeric(Recode(ehemges,"' '=1"))]
krmh.individuals[, ehefges := as.numeric(Recode(ehefges,"' '=1"))]
krmh.unions[, ehefges := as.numeric(Recode(ehefges,"' '=1"))]

# Visual check: plot the values of every numeric column, one facet per variable.
qplot(data=melt(numcolwise(identity)(krmh.individuals)),value) + facet_wrap(~ variable,scales='free')
# qplot(data=melt(catcolwise(identity)(krmh.individuals)),value) + facet_wrap(~ variable,scales='free')
# Columns that exist only in `spouses` (printed because of the outer parentheses).
(uniq_unions = setdiff(names(spouses), names(krmh.individuals)))
# Left join: every individual is kept; the listed per-spouse summary columns are
# NA for individuals that do not appear in `spouses`.
krmh = merge(krmh.individuals, spouses[, list(idIndividu, Nbirths, NinfantD, NchildD, NadultD, ageFfirstbirth, ageFlastbirth, ageMfirstbirth, ageMlastbirth, ageHfirst, ageHlast)], by = "idIndividu", all.x =T)
count kids

# idParents is a copy of the union id `doc`; it is the grouping key for the
# sibling-level computations further down.
krmh[,idParents := doc]

# Add to `df` a column named `what` holding, per individual, the sum of
# `wt_var` over all rows of `df2` in which the individual is husband (idPere)
# or wife (idMere). With a constant weight of 1 this is the number of unions.
#
# df      table with an `idIndividu` column (one row per individual)
# df2     unions table with `idPere`, `idMere` and the column named by `wt_var`
# what    name (string) of the count column to create in `df`
# wt_var  name (string) of the column in `df2` that is summed
#
# Returns `df` with the extra column; individuals without any union get 0.
#
# NOTE(review): the return statement and closing brace were missing from this
# definition. The result is returned as a data.table because the caller applies
# `:=` to it afterwards; confirm against the original script.
count_spouses <- function(df, df2, what,  wt_var) {
    df <- data.frame(df)
    df2 <- data.frame(df2)
    # One row per husband id / wife id with the summed weight.
    counted.husband <- dcast(data = df2[, c('idPere', wt_var)], formula = idPere ~ ., fun.aggregate = sum, na.rm = TRUE, value.var = wt_var)
    counted.wive <- dcast(data = df2[, c('idMere', wt_var)], formula = idMere ~ ., fun.aggregate = sum, na.rm = TRUE, value.var = wt_var)
    names(counted.husband) <- names(counted.wive) <- c('idIndividu', what)
    counted <- rbind(counted.husband, counted.wive)
    # Left join keeps every individual; those without a union come back as NA ...
    df <- merge(df, counted, by = 'idIndividu', all.x = TRUE)
    # ... and are recoded to 0.
    df[, what] <- Recode(df[, what], 'NA=0')
    data.table(df)
}
# Constant weight of 1 on every row, so that summing `born` counts rows.
krmh$born = 1; krmh.unions$born = 1
# spouses[spouses$idIndividu==1,]
# krmh.unions[idPere==2,]
# krmh.individuals[idIndividu==2,]
# Adds krmh$spouses: summed `born` over the unions in which the individual appears.
krmh = count_spouses(krmh,krmh.unions, 'spouses', "born")

# qplot(krmh$spouses,ifelse(krmh$male==T, krmh$ehemges,krmh$ehefges), geom = "jitter")
# xtabs(~ krmh$spouses + ifelse(krmh$male==T, krmh$ehemges,krmh$ehefges), exclude =NULL, na.action=na.pass)

# 0/1 survival and death flags at several thresholds. Each flag is NA wherever
# the underlying age variable is NA (ifelse propagates NA).
krmh$survive1d = ifelse(krmh$ageKtod_days > 1, 1, 0)
krmh$survive1m = ifelse(krmh$ageKtod_days > 28, 1, 0)
krmh$dead1m = ifelse(krmh$ageKtod_days > 28, 0, 1)
krmh$dead1y = ifelse(krmh$ageK1 >= 1, 0, 1)
krmh$dead5y = ifelse(krmh$ageK15 > 5, 0, 1)
krmh$deadR = ifelse(krmh$ageK15 >= 15, 0, 1)
krmh$survive5y = ifelse(krmh$ageK15 > 5, 1, 0)
krmh$survive1y = ifelse(krmh$ageK1 >= 1, 1, 0)
krmh$surviveR = ifelse(krmh$ageK15 >= 15, 1, 0)

krmh$born =1 

# famstatus 4 is a stillbirth (see the famstatus note earlier in this file).
krmh[, not_stillborn := ifelse(famstatus != 4,1,0)]
# This overwrites the age-based survive1d computed above.
krmh[, survive1d := ifelse(famstatus != 4,1,0)] # small discrepancy of 7 with gebkk, but dead the first day, maybe not stillborn
# krmh[, survive1d := ifelse(gebkk!='Todgeburt' |,1, 0) ] # age is always missing in these ~1600 cases
krmh[survive1d == 0, age := 0 ] # i dont want age to be missing here
krmh[survive1d == 0, age.days := 0 ] # i dont want age to be missing here

krmh[, ever_married :=  ifelse(famstatus==1,1,0) ] # famstatus does not map directly
# Parental ages divided by 10 (presumably years -> decades; confirm the unit
# of ageMgebK / ageFgebK).
krmh[,paternalage := ageMgebK/10]
# Paternal ages below 1 on the rescaled variable are treated as missing.
krmh[paternalage < 1 ,paternalage := NA]
krmh[,maternalage := ageFgebK/10]

# Marriage order per father: sort unions by father id and dat4, then number the
# rows within each father. The same is done per mother below.
krmh.unions = krmh.unions[order(krmh.unions$idPere,krmh.unions$dat4),]
krmh.unions$marriage.order.Father = ave(rep(NA, nrow(krmh.unions)), krmh.unions$idPere, FUN = seq_along)
krmh.unions = krmh.unions[order(krmh.unions$idMere,krmh.unions$dat4),]
krmh.unions$marriage.order.Mother = ave(rep(NA, nrow(krmh.unions)), krmh.unions$idMere, FUN = seq_along)

# Attach the parents' marriage orders and the union dates dat4 / dat8 to each child.
krmh = merge(krmh, krmh.unions[,list(idParents,marriage.order.Mother,marriage.order.Father,dat4,dat8),],by="idParents",all.x=T)
# TRUE only when both orders are 1, i.e. the union is the first for both parents.
krmh$first.marriage = (krmh$marriage.order.Mother + krmh$marriage.order.Father) == 2
Known families

We apply the following criteria to consider a family “known”, which means we can make some further assumptions. If we know the bride’s and groom’s end of marriage, that is, we have an upper-bound death date (todk8) for at least one spouse and a lower-bound death date for the other spouse that exceeds this date, we know that this family spent its time in the Krummhörn. Thus we can conclude that children who have missing death dates did not die but emigrated (otherwise we’d have dates because the family is on-the-record).

# we consider a family "known" if we know that one spouse survived the other and we have the marriage date. if we have missing death dates for such children, we can assume that they emigrated (thus made 15), we'd know if they'd died in the KH
krmh$known_family <- krmh$ehebekannt
# `count_and_merge` is defined outside this section; from its use here it adds
# a column (2nd argument) built from the children's `wt_var` values
# (presumably summed per parent; verify against its definition).
krmh <- count_and_merge(krmh, 'children', wt_var = "born")
krmh$children.per.spouse <- krmh$children/krmh$spouses
# Division by zero spouses is meaningless; store NA instead of Inf/NaN.
krmh$children.per.spouse[which(krmh$spouses==0)] <- NA
# Replace NA by 1 in every column of a data frame.
# NOTE(review): the `is.na()` tests in this chunk were missing from the text
# and have been restored; confirm against the original script.
changeNAto1 <- function(x) { colwise(function(x) { ifelse(is.na(x), 1, x)})(x) }
# Anyone who had children or a spouse must have survived to these thresholds,
# so a missing survival flag is filled with 1 for them.
krmh[children>0 | spouses>0, surviveR := ifelse(is.na(surviveR), 1, surviveR)]
krmh[children>0 | spouses>0, survive1d := ifelse(is.na(survive1d), 1, survive1d)]
krmh[children>0 | spouses>0, survive1y := ifelse(is.na(survive1y), 1, survive1y)]
# Per-parent counts of children by survival / death status.
krmh <- count_and_merge(krmh, 'children.surviving1d', wt_var = 'survive1d')
krmh <- count_and_merge(krmh, 'children.surviving1m', wt_var = 'survive1m')
krmh <- count_and_merge(krmh, 'children.surviving1y', wt_var = 'survive1y')
krmh <- count_and_merge(krmh, 'children.surviving5y', wt_var = 'survive5y')
krmh <- count_and_merge(krmh, 'children.survivingR', wt_var = 'surviveR')
krmh <- count_and_merge(krmh, 'children.dead1m', wt_var = 'dead1m')
krmh <- count_and_merge(krmh, 'children.dead1y', wt_var = 'dead1y')
# Replace every NA in the vector `x` by 0 and return the result.
# The `is.na(x)` test was missing from the text and has been restored.
changeNAto0 <- function(x) { ifelse(is.na(x), 0, x) }
# Births that are not accounted for by a recorded infant, child or adult death.
krmh$children.unknown_fate = krmh$Nbirths - krmh$NchildD - krmh$NinfantD - krmh$NadultD
# `count_and_merge` is defined outside this section. Here it is fed the
# children's own per-person counts, which yields grandchild-level columns
# (presumably sums over each person's children; verify against its definition).
krmh = count_and_merge(krmh, 'children.spouses', wt_var = 'spouses')
krmh = count_and_merge(krmh, 'grandchildren.per.spouse', wt_var = 'children.per.spouse')

krmh = count_and_merge(krmh, 'grandchildren',wt_var='children')
krmh = count_and_merge(krmh, 'grandchildren.surviving1d', wt_var = 'children.surviving1d')
krmh = count_and_merge(krmh, 'grandchildren.surviving1m', wt_var = 'children.surviving1m')
krmh = count_and_merge(krmh, 'grandchildren.surviving1y', wt_var = 'children.surviving1y')
krmh = count_and_merge(krmh, 'grandchildren.survivingR', wt_var = 'children.survivingR')
krmh = count_and_merge(krmh, 'grandchildren.dead1m', wt_var = 'children.dead1m')

pre-calculate some predictors

# Sort by parent pair and birth date (gebk4) so that the row position within
# each idParents group is the birth order.
krmh = krmh[order(krmh$idParents,krmh$gebk4), ]
krmh <- transform(krmh, birthorder = ave(rep(NA, nrow(krmh)), krmh$idParents, FUN = seq_along)) # old trick to get birth order, don't know what this does to those with missings for father though
# Mean birth order within the sibship and each child's deviation from it.
krmh$birthorder.mean = ave(krmh$birthorder,krmh$idParents,FUN= function(x) { mean(x,na.rm=T) } )
krmh$birthorder.diff = krmh$birthorder - krmh$birthorder.mean

krmh[, byear := year(gebk4)]

# NOTE(review): in the next two statements the anonymous function body is only
# `x[] = 0`, so it ends on an assignment; part of the body (an NA test and
# the actual return value) appears to be missing. Confirm against the original.
# NOTE(review): the first statement (a preview on rows 1:40) groups by
# idParents, the second by idPere; confirm which grouping is intended.
transform(krmh[1:40,list(idParents,byear,birthorder,surviveR)], min15.birthorder = ave(surviveR, idParents, FUN =function(x) { x[] = 0
} )) # NAs propagate problematically...
krmh <- transform(krmh, min15.birthorder = ave(surviveR, idPere, FUN =function(x) { x[] = 0
} ))
krmh$min15.birthorder.mean = ave(krmh$min15.birthorder,krmh$idParents,FUN= function(x) { mean(x,na.rm=T) } )
krmh$min15.birthorder.diff = krmh$min15.birthorder - krmh$min15.birthorder.mean

# Rows sharing the same `doc`, minus the child itself.
krmh[, nr.siblings := ave(born,doc,FUN= function(x) { sum(x,na.rm=T) } ) - 1] 

krmh$nr.dead.siblings1m = ave(krmh$dead1m,krmh$idParents,FUN= function(x) { sum(x,na.rm=T) } ) - krmh$dead1m # don't count self! dont't control for outcome
# Share of siblings who died in the first month (NaN/Inf when nr.siblings is 0).
krmh$infant.death.cluster = krmh$nr.dead.siblings1m/krmh$nr.siblings
# Shift `x` one position towards the end: element i of the result is x[i - 1]
# and the first element is 0. Used with ave() to look up the previous sibling.
# Returns a vector of the same length as `x` (all 0 for length 0 or 1).
lag.0 <- function(x) {
    # Guard short inputs; the original `1:(length(x)-1)` index fails on empty input.
    if (length(x) <= 1) {
        return(rep(0, length(x)))
    }
    c(0, x[-length(x)])
}
# Shift `x` one position towards the front: element i of the result is
# x[i + 1] and the last element is 0. Used with ave() to look up the next sibling.
# Returns a vector of the same length as `x` (all 0 for length 0 or 1).
inv.lag.0 <- function(x) {
    # Guard short inputs so that an empty input returns an empty result.
    if (length(x) <= 1) {
        return(rep(0, length(x)))
    }
    c(x[-1], 0)
}
# surviveR of the previous / next row within each idParents group (0 at the
# group edge). Which sibling is "previous" depends on the row order of krmh,
# i.e. on the sort by idParents and gebk4 done earlier.
krmh = transform(krmh, older.sib.made.15y = ave(surviveR, idParents, FUN = lag.0))
krmh = transform(krmh, younger.sib.made.15y = ave(surviveR, idParents, FUN = inv.lag.0))

Get grandparents

# Self-join: each person's own parents and parental ages become the
# grandparent columns of that person's children.
grandparents = krmh[, list(idIndividu,idPere,idMere, paternalage, maternalage)]
# First treat every person as a potential mother: the join key is idMere.
names(grandparents) = c('idMere', 'idMaternalGrandfather', 'idMaternalGrandmother', 'maternal.grandpaternalage',  'maternal.grandmaternalage')
krmh = merge(krmh, grandparents, by = "idMere", all.x =T)
# Then reuse the same table as potential fathers: the join key is idPere.
names(grandparents) = c('idPere', 'idPaternalGrandfather', 'idPaternalGrandmother', 'paternal.grandpaternalage',  'paternal.grandmaternalage')
krmh = merge(krmh, grandparents, by = "idPere", all.x =T)

high-level predictors

krmh$born = NULL # was just an aid
## Warning in alloc.col(x): Attempt to reduce allocation from 299 to 298
## ignored. Can only increase allocation via shallow copy.
# Rebuild the data.table after the column removal above.
krmh = data.table(krmh)
# Aliases for the child's birth/death dates and the parents' death dates.
krmh[, bdate := gebk4]
krmh[, ddate := todk4]
krmh[, ddate.Mother := todf4]
krmh[, ddate.Father := todm4]
# Calendar years of birth and death.
krmh$byear.years = year(krmh$bdate)
krmh$dyear.years = year(krmh$ddate)
krmh[, dyear := dyear.years]
# Per sibship (idParents), derive sibling-dependency counts with helper
# functions that are defined outside this section.
# NOTE(review): the `mutate(` opener was missing from the text and has been
# restored; the named arguments and the closing `)` imply it.
# NOTE(review): the third call passes byear / dyear while the first two pass
# byear.years / dyear.years; left as found, confirm this is intended.
krmh <- krmh %>%
    tbl_df %>%
    group_by(idParents) %>%
    mutate(
        younger_sibs_ad_5y = younger_sibs_alive_and_dependent(survive5y=survive5y, byear=byear.years, dyear=dyear.years),
        older_sibs_ad_5y = older_sibs_alive_and_dependent(survive5y=survive5y, byear=byear.years, dyear=dyear.years),
        dependent_sibs_f5y = dependent_sibs_f5y(survive1y=survive1y, byear=byear, dyear=dyear)
    ) %>% data.table()
# Coerce all birth/death date columns to Date.
krmh$ddate = as.Date(krmh$ddate)
krmh$ddate.Mother = as.Date(krmh$ddate.Mother)
krmh$ddate.Father = as.Date(krmh$ddate.Father)
krmh$bdate = as.Date(krmh$bdate)
krmh$bdate.Mother = as.Date(krmh$gebf4)
krmh$bdate.Father = as.Date(krmh$gebm4)
# Time from the child's birth to the father's death, in days divided by 365.
krmh$paternal_alive = as.numeric(krmh$ddate.Father-krmh$bdate)/365
# Loss flags: the parent's death date lies before the child's birth date plus
# 5 years (0/1) or plus 35 years (TRUE/FALSE). NA when a date is missing.
krmh$paternalloss = ifelse(as.POSIXct(krmh$bdate) + dyears(5) > as.POSIXct(krmh$ddate.Father),1,0)
krmh$paternalloss_by_35 = as.POSIXct(krmh$bdate) + dyears(35) > as.POSIXct(krmh$ddate.Father)
krmh$maternalloss = ifelse(as.POSIXct(krmh$bdate) + dyears(5) > as.POSIXct(krmh$ddate.Mother),1,0)
krmh$maternalloss_by_35 = as.POSIXct(krmh$bdate) + dyears(35) > as.POSIXct(krmh$ddate.Mother)
# Flag children with a parent whose death date is less than 30 days after the
# child's birth date.
krmh$parentalloss_might_be_disease = FALSE
krmh[as.POSIXct(bdate) + ddays(30) > as.POSIXct(ddate.Father),]$parentalloss_might_be_disease = TRUE
krmh[as.POSIXct(bdate) + ddays(30) > as.POSIXct(ddate.Mother),]$parentalloss_might_be_disease = TRUE
krmh = data.table(krmh) 
# Land ownership bands from `grasen`.
# NOTE(review): adjacent ranges share their endpoints (1, 25, 140); which label
# a boundary value gets depends on how Recode resolves overlaps. Confirm.
krmh$landownership = Recode(krmh$grasen,"0:1='0 landless';1:25='1-25 grasen';25:140='25-140 grasen';140:hi='140+ grasen'")
krmh$landless = Recode(krmh$grasen,"0:1=T;1:hi=F")
# Age capped at 5.
krmh[, ageK5 := ifelse(age > 5, 5, age)]
# A dat8_fam1 date that precedes the child's birth is set to missing.
krmh[krmh$dat8_fam1< krmh$bdate, dat8_fam1:=NA]
# Categorise the timing of parental loss relative to the child's birth date,
# the child's (capped) age and the date dat8_fam1.
# NOTE(review): this chunk does not parse as shown. Several `ifelse(` calls have
# an empty first argument (an NA test appears to be missing), and the two
# `*_k1` assignments stop mid-expression, so their yes/no labels are missing.
# Code below expects a level "no_early_maternal_loss" in maternalloss_k1.
# Restore from the original script before running.
krmh[, paternalloss_m := (ifelse(bdate + days(ifelse(, 5, round(ageK5*365))) <= ddate.Father,
                                                                    ifelse( | ddate.Father < dat8_fam1, 'dead_b_m', 'dead_a_m'), 'dead_5y'))]
krmh[, maternalloss_m := (ifelse(bdate + days(ifelse(, 5, round(ageK5*365))) <= ddate.Mother,
                                                                    ifelse( | ddate.Mother < dat8_fam1, 'dead_b_m', 'dead_a_m'), 'dead_5y'))]
krmh[, paternalloss_k1 := ifelse(bdate + days(ifelse(, 1, round(ageK1*365))) <= ddate.Father,
krmh[, maternalloss_k1 := ifelse(
    bdate + days(ifelse(, # if the birth date is larger 
            ifelse(,1,365), round(ageK1*365))) <= ddate.Mother,
# NOTE(review): each of the next three statements assigns the whole column, so
# as written only the last one (0 if survive1y == 1, otherwise NA) takes effect.
# They may originally have been row-restricted updates; confirm.
krmh[, mother_child_die_within_7d := ifelse(abs(ddate - ddate.Mother) <= 7, 1, 0)]
krmh[, mother_child_die_within_7d := ifelse(nr.siblings > birthorder, 0, NA)]
krmh[, mother_child_die_within_7d := ifelse(survive1y==1, 0, NA)]
krmh[, early_maternalloss := maternalloss_k1 != "no_early_maternal_loss" ]
# Make "no_early_maternal_loss" the reference level of the factor.
krmh[, maternalloss_k1 := relevel(factor(maternalloss_k1), ref = "no_early_maternal_loss")]

# Minimum of `x` ignoring NAs. Returns NA (without the warning and Inf that a
# bare min(..., na.rm = TRUE) gives) when `x` has no non-missing value.
# The `is.na(x)` test was missing from the text and has been restored.
min_na <- function(x) {
    if (all(is.na(x))) NA else min(x, na.rm = TRUE)
}
# Maximum of `x` ignoring NAs; NA when `x` has no non-missing value.
max_na <- function(x) {
    if (all(is.na(x))) NA else max(x, na.rm = TRUE)
}
# Youngest / oldest parental age across all children of the same father
# (idPere) or mother (idMere); NA when no age is known for that parent.
krmh[, paternalage_at_1st_sib := ave(paternalage, idPere, FUN = min_na)]
krmh[, paternalage_at_last_sib := ave(paternalage, idPere, FUN = max_na)]
krmh[, maternalage_at_1st_sib := ave(maternalage, idMere, FUN = min_na)]
krmh[, maternalage_at_last_sib := ave(maternalage, idMere, FUN = max_na)]
# One row per father / per mother, renamed so the parent's id is `idIndividu`.
fathers = krmh[!duplicated(idPere), list(idPere, paternalage_at_1st_sib, paternalage_at_last_sib)]
names(fathers) = c("idIndividu","age_at_1st_child", "age_at_last_child")
mothers = krmh[!duplicated(idMere), list(idMere, maternalage_at_1st_sib, maternalage_at_last_sib)]
names(mothers) = c("idIndividu","age_at_1st_child", "age_at_last_child")
parents = rbind(fathers, mothers) 
# Attach each person's own age at first / last child (NA for non-parents).
krmh = merge(krmh, parents, by = "idIndividu", all.x = T)
krmh$dyear.Mother = year(krmh$ddate.Mother)
krmh$dyear.Father = year(krmh$ddate.Father)
# Child's age (in calendar years) at the mother's / father's death, binned
# into a factor:
#   - a difference in [-1, 0) is treated as 0,
#   - 0..45 is cut into the intervals listed in `levels`,
#   - 45 or more becomes "later",
#   - missing or negative differences become "unclear".
# NOTE(review): the `is.na()` tests in the "unclear" lines and the assignment
# target of the pipeline were missing from the text and have been restored;
# confirm against the original script.
krmh <- krmh %>%
    mutate(
        maternal_loss_age = dyear.Mother - byear,
        maternal_loss_age = as.numeric(ifelse(maternal_loss_age >= -1 & maternal_loss_age < 0, 0, maternal_loss_age)),
        maternal_loss = as.character(cut(maternal_loss_age, breaks = c(0,1,5,10,15,20,25,30,35,40,45), include.lowest = T )),
        maternal_loss = ifelse(maternal_loss_age >= 45, "later", maternal_loss),
        maternal_loss = ifelse(is.na(maternal_loss_age) | maternal_loss_age < 0, "unclear", maternal_loss),
        maternal_loss = factor(maternal_loss, levels = c("later","[0,1]", "(1,5]", "(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",  "(35,40]", "(40,45]", "unclear")),
        paternal_loss_age = dyear.Father - byear,
        paternal_loss_age = as.numeric(ifelse(paternal_loss_age >= -1 & paternal_loss_age < 0, 0, paternal_loss_age)),
        paternal_loss = as.character(cut(paternal_loss_age, breaks = c(0,1,5,10,15,20,25,30,35,40,45), include.lowest = T )),
        paternal_loss = ifelse(paternal_loss_age >= 45, "later", paternal_loss),
        paternal_loss = ifelse(is.na(paternal_loss_age) | paternal_loss_age < 0, "unclear", paternal_loss),
        paternal_loss = factor(paternal_loss, levels = c("later","[0,1]", "(1,5]",  "(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",  "(35,40]", "(40,45]", "unclear"))
    ) %>%
    data.table()
# Birth cohort label from `year_bins` (defined outside this section), then
# selected year ranges are overwritten with wider, hand-picked cohort labels.
krmh[, birth.cohort := year_bins(byear)]
krmh[byear < 1670, birth.cohort := "1570-1670"]
krmh[byear >= 1670 & byear < 1700, birth.cohort := "1670-1700"]
krmh[byear >= 1700 & byear < 1720, birth.cohort := "1700-1720"]
krmh[byear >= 1720 & byear < 1760, birth.cohort := "1720-1760"]
krmh[byear >= 1900, birth.cohort := "1900-1935"] 
# Number of older siblings, top-coded at "5+".
krmh$older_siblings = factor(ifelse((krmh$birthorder - 1) > 4,"5+", krmh$birthorder - 1))
# NOTE(review): nr.siblings excludes the child itself, so the last of n children
# has birthorder n but nr.siblings n - 1; this comparison may be off by one.
krmh$last_born = ifelse(krmh$birthorder == krmh$nr.siblings, 1, 0)

# Re-sort by parent pair and birth date, then recompute sibship size (minus
# self) and birth order from the row position within each idParents group.
krmh = krmh[order(krmh$idParents,krmh$bdate), ] 
krmh <- transform(krmh, siblings = ave(rep(NA, nrow(krmh)), krmh$idParents, FUN = length)-1) # sibling count
krmh <- transform(krmh, birthorder = ave(rep(NA, nrow(krmh)), krmh$idParents, FUN = seq_along)) # old trick to get birth order, don't know what this does to those with missings for father though
krmh$younger_siblings = krmh$siblings + 1 - krmh$birthorder
# Apply `recenter.pat` (defined outside this section) twice within idParents:
# first with its default `what`, then with what = "maternalage".
# Returns the result of the second call.
recenter_all = function(x) {
    recentered_default = recenter.pat(x, among_who = "idParents")
    recenter.pat(recentered_default, what = "maternalage", among_who = "idParents")
}

# For rows with fall "P" or "C", a blank `ehem` is set to "1".
krmh[fall %in% c("P","C") & ehem==" ", ehem := "1"]

# Store the grouping variables as factors.
krmh$birth_cohort = factor(krmh$birth.cohort)
krmh$male = factor(krmh$male)
krmh$last_born = factor(krmh$last_born) 

krmh = recenter_all(krmh)
krmh[, any_surviving_children := ifelse(children.survivingR > 0, 1, 0)]
# Children whose first-year outcome (died or survived) is known.
krmh[, children.wddate := children.dead1y + children.surviving1y] 
# Maternal age is stored divided by 10, so multiply by 10 before binning.
krmh = krmh[, maternalage.factor := cut((10*maternalage), breaks = c(14, 20, 35, 50))]
krmh$maternalage.factor = relevel(krmh$maternalage.factor, ref = "(20,35]")
# NOTE(review): the next statement and the `krmh.1` assignment do not parse as
# shown. The condition after `!` and the names of the intermediate objects are
# missing (presumably a filter on non-missing paternal age, then a second
# subset). Restore from the original script.
krmh.with.paternalage = subset(krmh, subset = ! ) = subset(krmh.with.paternalage, subset = ehebekannt==TRUE & paternal_loss != "unclear" & maternal_loss != "unclear") # contains only those were we know how the marriage ended

# Restrict to births from 1720 up to (not including) 1835.
krmh.1 =[geburtsjahrK>=1720 & geburtsjahrK< 1835, ]
# save(krmh,krmh.1,krmh_pedigree,file="krmh.rdata")