Anonymising the data

Load data

library(knitr)
opts_chunk$set(fig.width = 12, fig.height = 12, cache = F, warning = F, message = F)
source("0_helpers.R")
load("full_data.rdata")

set some defaults

diary = diary %>% 
  mutate(
  included = included_all,
  fertile = if_else(is.na(prc_stirn_b_squished), prc_stirn_b_backward_inferred, prc_stirn_b_squished),
  contraceptive_methods = factor(contraceptive_method, levels = 
  c("barrier_or_abstinence", "fertility_awareness", "none",  "hormonal")),
  relationship_status_clean = factor(relationship_status_clean),
  cohabitation = factor(cohabitation),
  certainty_menstruation = as.numeric(as.character(certainty_menstruation))
)  %>% group_by(person) %>% 
  mutate(
      fertile_mean = mean(fertile, na.rm = T)
  )

cleaning

These fields were stored as text, but can be simplified.

diary = diary %>% mutate(
  want_more_info = as.integer(want_more_info),
  trying_to_get_pregnant = if_else(trying_to_get_pregnant == 'yes', 1, 0),
  had_sex_with_partner_yet = if_else(had_sex_with_partner_yet == 'yes', 1, 0),
  breast_feeding_in_last_3_months = if_else(breast_feeding_in_last_3_months == 'yes', 1, 0),
  hormonal_medication_in_last_3_months = if_else(hormonal_medication_in_last_3_months == 'yes', 1, 0),
  pill_in_last_3_months = if_else(pill_in_last_3_months == 'yes', 1, 0)
)

remove free-text fields

Free text fields often have unique values and can contain things like IDs, addresses, free-text responses to questions, etc.

# first I look only at character columns (they tend to have most unique values)
diary %>% select_if(is.character) %>% 
  gather(variable, value) %>%  # putting this in long format allows me to easily 
  group_by(variable) %>% 
  summarise(n = n_nonmissing(value), n_dist = n_distinct(value)) %>% # summarise frequencies of values
  pander() # and display them
variable n n_dist
change_contraception_2 1057 25
children 30956 10
children_broad_categories 30956 2
children_narrow_categories 30956 5
contraception 30952 73
contraceptive_method 30956 5
contraceptive_method_by_pearl 30956 5
contraceptives_broad_categories 30952 11
contraceptives_categories_natural 29911 6
feedback_for_us 6114 176
gestagen_type 16178 12
hormonal_2 1716 9
hypothesis_guessed 30956 6
illness_2 9284 167
income_partner 30956 5
living_situation 30956 4
meaning_study 14437 414
medicament_2 11349 278
method_meeting 16257 10
method_meeting_clean 16222 7
name_hormonal 195 7
occupation 30956 14
occupation_clean 30850 7
other_pill_name 1531 43
pills 16212 72
postal_code 30956 403
pregnant_in_last_3_months 30956 2
relationship_status 30956 15
religion 30956 18
religion_clean 30881 6
sess_day 30924 30925
session 30956 1208
session_id 30924 30925
session_id.abschluss 26913 785
session_id.vorab 30956 1208
sex_orientation 30956 5
short 30956 1208
short_session 30924 1177
special_days 16985 403
special_events 2836 2010
# of course manual inspection (for me, using View() ) is important too

# same thing for factors shows no rare unique values 
diary %>% select_if(is.factor) %>% 
  gather(variable, value) %>% 
  group_by(variable) %>% 
  summarise(n = n_nonmissing(value), n_dist = n_distinct(value)) %>%
  arrange(n_dist) %>% 
  pander()
variable n n_dist
weekend 30956 2
cohabitation 30956 3
cycle_regularity 30956 3
days_with_partner 30956 3
fertile_window 11682 3
fertile_window_backward_inferred 17013 3
fertile_window_forward_counted 16201 3
fertile_window_squished 11877 3
included 28493 3
included_all 28493 3
included_conservative 14147 3
included_lax 17960 3
included_strict 7682 3
menstruation 30956 3
nights_with_partner 30956 3
relationship_status_clean 30956 3
cycle_length_groups 30956 4
estrogen_categories 26785 4
menstruation_strength 30956 4
contraceptive_methods 30357 5
included_levels 28493 5
age_group 30956 6
days_with_partner_per_month 9538 6
income 13229 6
week_number 30924 7
distance_to_partner_hours 9538 8
weekday 30924 8
gestagens 1497 10
# diary %>% select_if(is.character) %>% names() %>% cat(sep=", -")

# here I remove all columns that have text fields which might betray someone's identity
diary = diary %>% select(-session, -session_id, -session_id.vorab, -short, -sess_day, -session_id.abschluss, -short_session, -vpn, -session_id.nachbe_other_hormonal,
          -pills,-other_pill_name, -religion, -relationship_status, -postal_code, -children, -method_meeting,  -meaning_study, -medicament_2, -special_days, -illness_2, -change_contraception_2, -feedback_for_us, -name_hormonal, -hypothesis_guessed, -children_narrow_categories,  -special_events)

Dates and times

diary %>% select_if(is.instant) %>% 
  gather(variable, value) %>% 
  group_by(variable) %>% 
  summarise(n = n_nonmissing(value), n_dist = n_distinct(value)) %>%
  arrange(n_dist) %>% 
  pander()
variable n n_dist
ended.nachbe_other_hormonal 1716 52
created.nachbe_other_hormonal 1804 55
modified.nachbe_other_hormonal 1804 55
first_day 30924 209
first_diary_date 30924 209
cyklus_1 30956 306
last_menstruation 27362 308
next_menstrual_onset 18614 330
menstrual_onset_date_inferred 4068 363
last_diary_date 30924 371
menstruation_dates 4805 437
next_menstrual_onset_inferred 30924 450
last_menstrual_onset 30924 453
created_date 30924 532
ended.abschluss 26432 770
created.abschluss 26913 785
modified.abschluss 26913 785
created.vorab 30956 1206
modified.vorab 30956 1206
ended.vorab 30956 1207
ended 30924 30582
modified 30924 30795
created 30924 30797
diary = diary %>% 
  # remove all datetime we don't need
  select(-starts_with("created")) %>% 
  select(-starts_with("modified")) %>% 
  select(-starts_with("expired")) %>% 
  select(-first_day, -first_diary_date, -cyklus_1, -last_menstruation, -next_menstrual_onset, -menstrual_onset_date_inferred, -last_diary_date, -menstruation_dates, -next_menstrual_onset_inferred, -last_menstrual_onset) %>% 

    mutate( # we do want to know whether they finished a survey, but can't know when
    ended = !is.na(ended),
    ended.vorab = !is.na(ended.vorab),
    ended.abschluss = !is.na(ended.abschluss),
    ended.nachbe_other_hormonal = !is.na(ended.nachbe_other_hormonal)
    )

a similar problem exists for intervals, but we might need these and I can easily round them. Here I unfortunately didn’t store them as type interval, so I have to look through all numeric types

# diary %>% select_if(is.interval) %>% 
#   gather(variable, value) %>% 
#   group_by(variable) %>% 
#   summarise(n = n_nonmissing(value), n_dist = n_distinct(value)) %>%
#   arrange(n_dist) %>% 
#   pander()

The Hmisc::cut2 function allows me to group variables into g groups and try to make no group contain fewer than m values.

diary = diary %>% 
  mutate(
    time_for_response = Hmisc::cut2(time_for_response, g = 10, m = 30),
    time_of_response = Hmisc::cut2(time_of_response, g = 10, m = 30),
    time_since_last_response = Hmisc::cut2(time_since_last_response, cuts = 10, m = 30),
    biggest_diff = Hmisc::cut2(biggest_diff, g = 10, m = 30),
    avg_diff = Hmisc::cut2(avg_diff, g = 10, m = 300),
    days_responded_percentage = Hmisc::cut2(days_responded_percentage, g = 10, m = 300)
    )

finding rare values

Although rare values won’t allow anyone to identify many participants, it is important to protect privacy for all participants. In our case especially, a potential attacker might know that e.g. his girlfriend uses a pill with a rare gestagen and identify her like this.

To this end, I wrote the function below. It gives (depending on the last argument) the rarest value and how frequent it is. I included the group variable, because a lot of demographic data is duplicated in my diary data.

commonness_rarest_value = function(x, group, give_value = FALSE) {
  df = data.frame(x = x, group = group) %>% unique()
  counts = table(df$x)
  ret = counts[which.min(counts)]
  if (give_value) names(ret) else ret
}

diary %>% gather(variable, value, -person) %>% 
  group_by(variable) %>% 
  summarise(n = n_nonmissing(value), n_dist = n_distinct(value), lc_val = commonness_rarest_value(value, person, T), lc_freq = commonness_rarest_value(value, person, F)) %>% 
  arrange(desc(lc_freq/(n_dist+lc_freq))) %>% 
  pander()
variable n n_dist lc_val lc_freq
ended.vorab 30956 1 TRUE 1208
gender 30956 1 1 1208
hetero_relationship 30956 1 1 1208
weekend 30956 2 TRUE 1122
time_since_last_response 23275 2 [10,31] 1097
menstruated_at_all 30956 2 TRUE 910
in_pair_public_intimacy 29902 3 0 1081
mate_retention_1 29902 3 2 960
spent_night_with_partner 29902 3 0 960
fertile_broad_backward_inferred 17013 3 0.0533333333333333 959
had_sexual_intercourse 29902 3 1 945
premenstrual_phase_backward_inferred 27143 3 TRUE 929
premenstrual_phase_fab 27143 3 TRUE 929
fertile_broad_forward_counted 16201 3 0.0533333333333333 928
fertile_window_forward_counted 16201 3 broad 919
fertile_window_backward_inferred 17013 3 broad 915
menstruation_1 29901 3 1 910
fertile_narrow_forward_counted 12407 3 0.51 908
fertile_narrow_backward_inferred 13345 3 0.51 905
had_petting 29901 3 1 904
pill_contraception 30956 2 0 598
premenstrual_phase_forward_counted 27380 3 TRUE 873
contraception_meeting_partner 30956 2 2 581
pill_in_last_3_months 30956 2 0 566
pille_control 30956 2 2 566
menstruation 30956 3 pre 819
premenstrual_phase 17887 3 TRUE 819
premenstrual_phase_squished 17944 3 TRUE 807
sufficient_diary_coverage 30956 2 FALSE 518
partner_initiated_sexual_intercourse 6512 3 0 772
sexual_intercourse_3 6512 3 1 772
hormonal_contraception 30956 2 0 514
menstrual_onset 18614 3 TRUE 726
fertile_broad_squished 11877 3 0.44 697
fertile_broad 11682 3 0.44 694
fertile_window_squished 11877 3 broad 683
fertile_window 11682 3 broad 675
fertile_narrow 9648 3 0.51 670
fertile_narrow_squished 9702 3 0.51 667
ended.abschluss 30956 2 FALSE 439
hormonal_contra 30952 3 TRUE 602
hormonal_all 30952 3 FALSE 585
mate_retention_2 29902 4 1 758
dodgy_data 30924 3 TRUE 566
any_RCD 30956 2 FALSE 368
SJS_5 30956 2 2 357
SJS_5R 30956 2 1 357
hormonal 30952 3 FALSE 514
SJS_6 30956 2 2 312
ever_menstruated 30956 2 FALSE 298
had_any_period 30956 2 FALSE 298
sexual_intercourse_2 29902 4 2 595
SJS_4 30956 2 2 286
sexual_intercourse_5 29901 4 2 567
include_all 30952 3 TRUE 421
included 28493 3 cycling 421
included_all 28493 3 cycling 421
antibiotics 30956 2 1 279
hormonal_medication_in_last_3_months 30956 2 1 279
weekday 30924 8 Saturday 1034
hormonal_lax 30952 3 TRUE 374
week_number 30924 7 (28,35] 833
stress 26606 3 1 355
days_with_partner 30956 3 3-5 days 346
cohabitation 30956 3 Live in same city 334
long_distance_relationship 30956 3 2 334
sexual_intercourse_1 29902 6 1 663
sexual_intercourse_1_6scale 29902 6 1.2 663
medicament_1 26606 3 1 330
nights_with_partner 30956 3 3-5 nights 329
attention_2 29874 7 6 766
SJS_3 30956 2 2 213
SJS_3R 30956 2 1 213
male_attention_1 29874 7 2 739
situation_of_living 18107 3 1 294
hormonal_conservative 30940 3 TRUE 293
menstruation_3 4794 4 3 390
menstruation_strength 30956 4 3 390
days_since_menstrual_onset 4068 6 4 571
SJS_1 30956 2 2 188
illness_1 26606 3 1 271
children_broad_categories 30956 2 children 180
mate_retention_3 29876 7 6 594
desirability_1 29881 7 1 562
choice_of_clothing_3 29887 7 1 554
communication_partner_1 29902 4 1 311
choice_of_clothing_6 29883 7 6 541
desirability_partner 29879 7 1 537
time_of_response 30924 11 [17.0,17.3) 835
any_fertile_days_known 30956 2 FALSE 147
attention_1 29874 7 2 510
jealousy_1 29877 7 6 463
extra_pair_3 29873 7 6 451
cycle_regularity 30956 3 irregular, more than 5 days off 192
cyklus_5 30956 3 3 192
extra_pair_6 29873 7 2 445
male_mate_retention_1 29876 7 6 444
choice_of_clothing_1 29896 7 6 431
mate_retention_4 29876 7 2 427
male_mate_retention_2 29876 7 2 418
mate_retention_6 29875 7 6 417
flat_share 10737 3 1 173
extra_pair_12 29873 7 6 389
hormonal_strict 30940 3 TRUE 161
SJS_2 30956 2 2 107
SJS_2R 30956 2 1 107
menstruation_2 4794 7 6 368
extra_pair_5 29873 7 2 348
choice_of_clothing_4 29884 7 6 343
time_for_response 29143 11 [0.10, 2.15) 531
extra_pair_2 29874 7 6 336
we_know_fertile_days 30956 2 FALSE 96
include_lax 30952 3 TRUE 143
included_lax 17960 3 cycling 143
self_esteem_1 29881 7 1 309
BFI_consc_9 30956 5 3 219
BFI_consc_9R 30956 5 3 219
living_situation 30956 4 living in all-female flatshare 173
choice_of_clothing_8 29882 7 6 291
showy_clothes 29882 7 6 291
extra_pair_1 29867 3 1 124
extra_pair_intimacy 29867 3 1 124
NARQ_admiration_3 29877 7 6 289
estrogen_categories 26785 4 (300,600] 153
mate_retention_5 29876 7 6 265
include_conservative 30952 3 TRUE 112
included_conservative 14147 3 cycling 112
SOI_R_6 30956 5 2 183
SOI_R_6R 30956 5 4 183
prc_stirn_b_forward_counted 27380 20 0.14 687
fertile_fab 27143 20 0.05 679
fertile_forward_and_backward 27143 20 0.05 679
prc_stirn_b_backward_inferred 27143 20 0.05 679
SOI_R_5 30956 5 3 169
extra_pair_13 29873 7 6 234
extra_pair_sexual_fantasies 29873 7 6 234
choice_of_clothing_2 29892 7 1 233
MV_2 30956 5 1 162
attention 29874 12 1.5 382
choice_of_clothing_7 29881 7 6 222
fertile 27828 20 0.16 619
attractiveness_finance_2 30956 5 5 152
income_partner 30956 5 > 3000€ 152
NARQ_admiration_2 29877 7 6 211
SOI_R_4 30956 5 1 148
NARQ_admiration_1 29877 7 6 197
BFI_open_10 30956 5 1 136
choice_of_clothing_5 29885 7 6 186
extra_pair_7 29873 7 6 186
had_sex_with_partner_yet 30956 2 0 53
male_mate_retention 29876 12 6 314
relationship_satisfaction_1 29902 8 4.5 208
extra_pair_9 29873 7 6 179
ended.nachbe_other_hormonal 30956 2 TRUE 51
extra_pair_10 29873 7 6 178
BFI_agree_6 30956 5 1 126
BFI_agree_6R 30956 5 5 126
BFI_extra_7 30956 5 5 126
BFI_extra_7R 30956 5 1 126
trying_to_get_pregnant 30956 2 1 50
prc_wcx_b_forward_counted 27380 26 0.006 624
CJS_6 30956 5 5 119
prc_wcx_b_backward_inferred 27143 26 0.001 603
BFI_agree_4 30956 5 1 113
NARQ_1 30956 5 5 112
BFI_consc_4 30956 5 1 111
BFI_consc_4R 30956 5 5 111
included_levels 28493 5 lax 111
SGSE_5 30956 5 5 111
BFI_neuro_7 30956 5 5 108
RJS_1 30956 5 5 108
CJS_3 30956 5 1 106
MV_P_2 30956 5 1 105
SGSE_4 30956 5 5 104
SGSE_4R 30956 5 1 104
extra_pair_going_out 29873 12 5.5 242
male_jealousy_1 29876 7 6 141
male_jealousy_2 29877 7 6 138
BFI_neuro_8 30956 5 1 98
NARQ_6 30956 5 5 98
SOI_R_3 30956 5 5 97
cyklus_4 30956 5 1 95
include_strict 30952 3 TRUE 57
included_bool 7682 3 TRUE 57
included_strict 7682 3 cycling 57
extra_pair_1b 399 3 1 54
extra_pair_sex 30924 3 1 54
fertile_cont 17887 20 0.05 360
prc_stirn_b 17887 20 0.05 360
extra_pair_8 29873 7 6 125
RJS_3 30956 5 5 89
BFI_neuro_6 30956 5 1 87
BFI_neuro_6R 30956 5 5 87
relationship_status_clean 30956 3 Verlobt 52
BFI_agree_1 30956 5 5 86
BFI_agree_1R 30956 5 1 86
BFI_consc_8 30956 5 1 85
BFI_consc_8R 30956 5 5 85
BFI_neuro_5 30956 5 5 85
BFI_neuro_5R 30956 5 1 85
BFI_extra_2 30956 5 1 84
BFI_extra_2R 30956 5 5 84
MV_P_5 30956 5 1 84
MV_P_5R 30956 5 5 84
extra_pair_4 29873 7 6 116
BFI_agree_8 30956 5 1 82
BFI_agree_8R 30956 5 5 82
BFI_neuro_3 30956 5 1 81
SGSE_1 30956 5 5 81
ended 30956 2 FALSE 32
attractiveness_stp_self 30956 5 1 79
CJS_2 30956 5 5 79
attractiveness_finance_1 30956 5 5 78
extra_pair_compliments 29873 12 6 187
male_jealousy_3 29876 7 6 109
SGSE_3 30956 5 5 76
SGSE_3R 30956 5 1 76
attractiveness_stp 30956 5 1 75
NARQ_7 30956 5 1 75
extra_pair_11 29873 7 6 102
NARQ_rivalry_1 29876 7 6 100
MV_5 30956 5 1 71
MV_5R 30956 5 5 71
NARQ_3 30956 5 5 71
SOI_R_7 30956 5 5 70
NARQ_rivalry_2 29877 7 6 97
BFI_neuro_2 30956 5 1 69
BFI_neuro_2R 30956 5 5 69
prc_stirn_b_squished 17944 20 0.16 271
has_not_had_sex_yet 30956 2 1 27
BFI_neuro_1 30956 5 5 67
cyklus_2 30956 5 2 66
pregnant 30956 2 1 26
pregnant_in_last_3_months 30956 2 yes 26
NARQ_15 30956 5 5 64
NARQ_16 30956 5 5 62
cycle_length_groups 30956 4 (35,41] 49
BFI_extra_8 30956 5 1 61
contraceptive_methods 30357 5 fertility_awareness 61
SOI_R_2 30956 5 5 60
RJS_5 30956 5 5 59
BFI_open_7 30956 5 1 58
BFI_open_7R 30956 5 5 58
RJS_6 30956 5 5 58
BFI_open_5 30956 5 1 56
ZIP_7 30956 5 1 56
ZIP_7R 30956 5 5 56
NARQ_11 30956 5 1 55
want_more_info 26606 3 0 33
prc_wcx_b 17887 26 0 277
prc_wcx_b_squished 17944 26 0.018 271
change_contraception_1 26606 3 1 31
BFI_open_9 30956 5 1 51
BFI_open_9R 30956 5 5 51
CJS_1 30956 5 5 51
attractiveness_relativ_1 30956 4 4 40
breast_feeding 30956 2 1 20
breast_feeding_in_last_3_months 30956 2 1 20
SGSE_2 30956 5 5 49
BFI_open_1 30956 5 1 47
age_group 30956 6 (45,70] 56
BFI_extra_6 30956 5 1 45
BFI_open_6 30956 5 1 44
biggest_diff 30956 10 [ 2.04, 2.13) 88
NARQ_18 30956 5 1 44
NARQ_9 30956 5 5 44
BFI_consc_2 30956 5 1 43
BFI_consc_2R 30956 5 5 43
NARQ_5 30956 5 1 43
MV_P_1 30956 5 5 42
attractiveness_finance_self 13229 6 5 49
income 13229 6 > 3000€ 49
MV_4 30956 5 1 40
avg_diff 30885 11 [1.055, 1.09) 84
BFI_extra_5 30956 5 1 38
BFI_extra_5R 30956 5 5 38
sexy_clothes 30924 18 6 133
breakup 26606 3 1 22
communication_partner_2 28857 8 6 58
BFI_extra_4 30956 5 1 36
BFI_extra_3 30956 5 1 34
BFI_neuro_4 30956 5 1 34
NARQ_rivalry_3 29876 7 6 46
BFI_agree_5 30956 5 1 32
SOI_R_8 30956 5 5 32
religiosity 30956 5 5 31
RJS_4 30956 5 5 31
days_responded_percentage 30885 11 [0.95122,0.976) 67
CJS_5 30956 5 5 30
NARQ_8 30956 5 5 30
RJS_2 30956 5 5 30
BFI_consc_5 30956 5 1 29
certainty_menstruation 30956 5 1 29
cyklus_6 30956 5 2 29
SOI_R_9 30956 5 5 29
attractiveness_occupation 30956 5 1 28
contraceptive_method 30956 5 other 27
MV_P_4 30956 5 1 27
NARQ_admiration 29877 17 6 88
SOI_uv 30956 13 1.33333333333333 67
SJS 30956 7 2 36
BFI_open_8 30956 5 1 25
sexual_intercourse_satisfaction 6511 8 1 38
BFI_consc_6 30956 5 1 23
contraceptive_method_by_pearl 30956 5 fertility_awareness 23
attractiveness_finance 30956 9 5 41
CJS_4 30956 5 5 22
MV_1 30956 5 5 22
BFI_extra_1 30956 5 1 21
NARQ_4 30956 5 5 21
BFI_agree_9 30956 5 1 20
BFI_open_4 30956 5 1 20
NARQ_10 30956 5 5 19
ZIP_5 30956 5 1 19
BFI_agree_3 30956 5 1 18
BFI_agree_3R 30956 5 5 18
BFI_consc_7 30956 5 1 18
no_variation 29891 3 TRUE 10
BFI_consc_1 30956 5 1 16
MV_3 30956 5 1 15
NARQ_12 30956 5 5 15
NARQ_13 30956 5 5 15
ZIP_2 30956 5 1 15
ZIP_4 30956 5 1 14
ZIP_4R 30956 5 5 14
new_relationship 26606 3 1 8
BFI_agree_2 30956 5 1 13
MV_P_3 30956 5 1 13
satisfaction_sexual_intercourse 29562 6 1 15
days_with_partner_per_month 9538 6 > 14 days 14
duration_pill 16212 6 1 14
long_distance_relationship_2 9538 6 5 14
BFI_consc_3 30956 5 1 11
ZIP_3 30956 5 1 11
BFI_agree_7 30956 5 1 10
occupation_clean 30850 7 Nicht berufstätig 13
NARQ_2 30956 5 5 9
SOI_R_1 30956 5 5 9
sexy 30956 5 1 8
attractiveness_face 30956 5 1 7
attractiveness_overall 30956 5 1 7
BFI_open_2 30956 5 1 7
BFI_open_3 30956 5 1 7
NARQ_17 30956 5 5 7
ZIP_1 30956 5 1 7
ZIP_6 30956 5 1 7
partner_mate_retention 29874 22 6 30
number_of_cycles 30924 6 5 8
RCD_squished 18018 31 0 41
attractiveness_ltp 30956 5 1 6
partner_attractiveness_longterm 30956 5 -3.97818492731748 6
cycle_nr 30924 6 5 7
duration_relationship_months 30956 13 12 15
attractiveness_ltp_self 30956 5 1 5
contraceptives_broad_categories 30952 11 partner_sterilised 11
distance_partner 9538 8 6 8
distance_to_partner_hours 9538 8 9-12h 8
partner_attractiveness_money 30956 13 1 12
attractiveness_body 30956 5 1 4
SOI_im 30956 13 5 9
religion_clean 30881 6 Judentum 4
MV_short 30956 13 1 8
NARQ_14 30956 5 5 3
gestagen_ug 2 2 250 1
male_jealousy 29876 17 5.33333333333333 8
NARQ_rivalry 29876 17 5.66666666666667 8
method_meeting_clean 16222 7 Depotspritze 3
extra_pair_flirting 29873 17 5.66666666666667 7
MV 30956 17 1.8 7
SOI_be 30956 13 4.66666666666667 5
SGSE 30956 21 4.8 8
contraceptives_categories_natural 29911 6 infertile 2
estrogen 195 3 30 1
estrogen_ug 1497 6 15 2
estrogen_ug_merged 2157 6 15 2
female_jealousy 30924 18 5.66666666666667 6
gestagen 195 3 13 1
hormonal_1 1716 6 4 2
extra_pair_desire 29873 27 5.8 7
estrogen_ug_other 1315 4 420 1
partner_attractiveness_shortterm 30956 16 -3.34512824374663 4
MV_P_short 30956 13 1 3
partner_attractiveness_physical 30956 9 2 2
sex_orientation 30956 5 pansexuell 1
RJS 30956 25 4.83333333333333 4
hormonal_2 1716 9 circlet 1
gestagens 1497 10 NGT 1
gestagens_ug 1497 10 125 1
estrogen_ug_all 26785 11 360 1
estrogen_ug_cycle 17120 12 360 1
gestagen_type 16178 12 NES 1
gestagen_ug_merged 15610 12 1500 1
menarche 13225 12 19 1
in_pair_desire 29874 56 4.4 4
occupation 30956 14 Beamtin 1
BFI_neuro 30956 32 1.125 2
gestagen_cycle 14567 16 1.44 1
intensive_sports 26606 16 14 1
SOI_R 30956 35 4.22222222222222 2
MV_P 30956 18 4.8 1
cigarettes 26606 19 11 1
choice_of_clothing 29881 41 5.5 2
cycle_length 30956 21 39 1
cyklus_3 30956 21 39 1
day_count 30956 42 42 2
days 30956 42 25 2
fertile_days_known_backward 30956 42 41 2
first_time 29943 21 10 1
MV_diff 30956 24 -3.66666666666667 1
CJS 30956 25 5 1
ZIP 30956 27 1.28571428571429 1
partner_attractiveness_global 30956 29 -2.95278253338198 1
female_mate_retention 30924 30 3.33333333333333 1
BFI_extra 30956 31 1.125 1
BFI_agree 30956 32 1.33333333333333 1
NARQ_K_total 29876 32 5.83333333333333 1
BFI_open 30956 33 1.5 1
BFI_consc 30956 34 1.22222222222222 1
age 30956 42 54 1
fertile_days_known_backward_inferred 30956 43 42 1
fertile_days_known_forward 30956 43 42 1
n_days 30956 43 42 1
height 30956 46 116 1
number_sexual_partner 30956 47 100 1
NARQ 30956 56 1 1
extra_pair 29873 62 5.75 1
partner_attractiveness_rel_to_self 30956 65 -0.162454954632523 1
weight_post 26606 68 102 1
contraception 30952 73 coitus_interruptus, temperature_billings 1
weight 30956 79 101 1
last_lag 27362 95 107 1
cycle_length_diary 18018 111 101 1
median_cycle_length_diary 26728 127 10 1
estrogen_ug_per_kg 17120 143 10.2083333333333 1
attractiveness_income_age_corrected 30956 148 -0.049531448419045 1
mean_cycle_length_diary 30924 155 10 1
timespan 30924 181 101 1
took_days 30924 181 101 1
minimum_cycle_length_diary 30924 197 109 1
duration_relationship_total 30820 206 110 1
duration_relationship_years 30820 206 10.25 1
menstrual_onset_days_until 18614 219 -105 1
RCD 18614 219 -105 1
RCD_for_merge 18614 219 106 1
RCD_rel_to_ovulation 18614 219 -101 1
FCD 30924 270 123 1
menstrual_onset_days_since 30924 270 122 1
day_number 30924 314 159 1
RCD_inferred 30924 446 -106 1
BMI 26606 568 10.8108108108108 1
fertile_mean 30956 888 0.0111111111111111 1
# drop
diary = diary %>% 
  # durations redundant with total duration, use age group, sex orientation has overly rare categories
  select(-duration_relationship_months,-duration_relationship_years, -age, -sex_orientation
)

## round/group
diary = diary %>% mutate(
  # height = round_any(height, 5), # rounding would be nice but leaves extreme values out there
  # weight = round_any(weight, 5),
  # BMI = round_any(BMI, 2),
  duration_relationship_total = round_any(duration_relationship_total/12, 0.3)
  # first_time = round_any(first_time, 3)
) %>% 
  select(
    -estrogen_ug_per_kg # derived factors from weight can allow people to recompute weight
    )

## cap
na_if_lt = function(x, lt) {
  x = as.numeric(x)
  if_else(x < lt, NA_real_, x)
}
na_if_gt = function(x, gt) {
  x = as.numeric(x)
  if_else(x > gt, NA_real_, x)
}
diary = diary %>% mutate(
  RCD = na_if_lt(RCD, -40),
  RCD_rel_to_ovulation = na_if_lt(RCD_rel_to_ovulation, -20),
  RCD_inferred = na_if_gt(na_if_lt(RCD_inferred, -40), 30),
  RCD_for_merge = na_if_gt(RCD_for_merge, 50),
  FCD = na_if_gt(FCD, 40),
  day_number = na_if_gt(day_number, 40),
  menstrual_onset_days_since = na_if_gt(menstrual_onset_days_since, 40),
  menstrual_onset_days_until = na_if_lt(menstrual_onset_days_until, -40),
  minimum_cycle_length_diary = na_if_gt(minimum_cycle_length_diary, 40),
  took_days = na_if_gt(took_days, 40),
  timespan = na_if_gt(timespan, 40),
  mean_cycle_length_diary = na_if_gt(mean_cycle_length_diary, 40),
  median_cycle_length_diary = na_if_gt(mean_cycle_length_diary, 40),
  cycle_length_diary = na_if_gt(mean_cycle_length_diary, 40),
  last_lag = na_if_gt(last_lag, 40),
  number_sexual_partner = na_if_gt(number_sexual_partner, 30)
)

check if we did a good job

diary %>% gather(variable, value, -person) %>% 
  group_by(variable) %>% 
  summarise(n = n_nonmissing(value), n_dist = n_distinct(value), lc_val = commonness_rarest_value(value, person, T), lc_freq = commonness_rarest_value(value, person, F)) %>% 
  arrange(desc(lc_freq/(n_dist+lc_freq))) %>% 
  pander()
variable n n_dist lc_val lc_freq
ended.vorab 30956 1 TRUE 1208
gender 30956 1 1 1208
hetero_relationship 30956 1 1 1208
weekend 30956 2 TRUE 1122
time_since_last_response 23275 2 [10,31] 1097
menstruated_at_all 30956 2 TRUE 910
in_pair_public_intimacy 29902 3 0 1081
mate_retention_1 29902 3 2 960
spent_night_with_partner 29902 3 0 960
fertile_broad_backward_inferred 17013 3 0.0533333333333333 959
had_sexual_intercourse 29902 3 1 945
premenstrual_phase_backward_inferred 27143 3 TRUE 929
premenstrual_phase_fab 27143 3 TRUE 929
fertile_broad_forward_counted 16201 3 0.0533333333333333 928
fertile_window_forward_counted 16201 3 broad 919
fertile_window_backward_inferred 17013 3 broad 915
menstruation_1 29901 3 1 910
fertile_narrow_forward_counted 12407 3 0.51 908
fertile_narrow_backward_inferred 13345 3 0.51 905
had_petting 29901 3 1 904
pill_contraception 30956 2 0 598
premenstrual_phase_forward_counted 27380 3 TRUE 873
contraception_meeting_partner 30956 2 2 581
pill_in_last_3_months 30956 2 0 566
pille_control 30956 2 2 566
menstruation 30956 3 pre 819
premenstrual_phase 17887 3 TRUE 819
premenstrual_phase_squished 17944 3 TRUE 807
sufficient_diary_coverage 30956 2 FALSE 518
partner_initiated_sexual_intercourse 6512 3 0 772
sexual_intercourse_3 6512 3 1 772
hormonal_contraception 30956 2 0 514
menstrual_onset 18614 3 TRUE 726
fertile_broad_squished 11877 3 0.44 697
fertile_broad 11682 3 0.44 694
fertile_window_squished 11877 3 broad 683
fertile_window 11682 3 broad 675
fertile_narrow 9648 3 0.51 670
fertile_narrow_squished 9702 3 0.51 667
ended.abschluss 30956 2 FALSE 439
hormonal_contra 30952 3 TRUE 602
hormonal_all 30952 3 FALSE 585
mate_retention_2 29902 4 1 758
dodgy_data 30924 3 TRUE 566
any_RCD 30956 2 FALSE 368
SJS_5 30956 2 2 357
SJS_5R 30956 2 1 357
hormonal 30952 3 FALSE 514
SJS_6 30956 2 2 312
ever_menstruated 30956 2 FALSE 298
had_any_period 30956 2 FALSE 298
sexual_intercourse_2 29902 4 2 595
SJS_4 30956 2 2 286
sexual_intercourse_5 29901 4 2 567
include_all 30952 3 TRUE 421
included 28493 3 cycling 421
included_all 28493 3 cycling 421
antibiotics 30956 2 1 279
hormonal_medication_in_last_3_months 30956 2 1 279
weekday 30924 8 Saturday 1034
hormonal_lax 30952 3 TRUE 374
week_number 30924 7 (28,35] 833
stress 26606 3 1 355
days_with_partner 30956 3 3-5 days 346
cohabitation 30956 3 Live in same city 334
long_distance_relationship 30956 3 2 334
sexual_intercourse_1 29902 6 1 663
sexual_intercourse_1_6scale 29902 6 1.2 663
medicament_1 26606 3 1 330
nights_with_partner 30956 3 3-5 nights 329
attention_2 29874 7 6 766
SJS_3 30956 2 2 213
SJS_3R 30956 2 1 213
male_attention_1 29874 7 2 739
situation_of_living 18107 3 1 294
hormonal_conservative 30940 3 TRUE 293
menstruation_3 4794 4 3 390
menstruation_strength 30956 4 3 390
days_since_menstrual_onset 4068 6 4 571
SJS_1 30956 2 2 188
illness_1 26606 3 1 271
children_broad_categories 30956 2 children 180
mate_retention_3 29876 7 6 594
desirability_1 29881 7 1 562
choice_of_clothing_3 29887 7 1 554
communication_partner_1 29902 4 1 311
choice_of_clothing_6 29883 7 6 541
desirability_partner 29879 7 1 537
time_of_response 30924 11 [17.0,17.3) 835
any_fertile_days_known 30956 2 FALSE 147
attention_1 29874 7 2 510
jealousy_1 29877 7 6 463
extra_pair_3 29873 7 6 451
cycle_regularity 30956 3 irregular, more than 5 days off 192
cyklus_5 30956 3 3 192
extra_pair_6 29873 7 2 445
male_mate_retention_1 29876 7 6 444
choice_of_clothing_1 29896 7 6 431
mate_retention_4 29876 7 2 427
male_mate_retention_2 29876 7 2 418
mate_retention_6 29875 7 6 417
flat_share 10737 3 1 173
extra_pair_12 29873 7 6 389
hormonal_strict 30940 3 TRUE 161
SJS_2 30956 2 2 107
SJS_2R 30956 2 1 107
menstruation_2 4794 7 6 368
extra_pair_5 29873 7 2 348
choice_of_clothing_4 29884 7 6 343
time_for_response 29143 11 [0.10, 2.15) 531
extra_pair_2 29874 7 6 336
we_know_fertile_days 30956 2 FALSE 96
include_lax 30952 3 TRUE 143
included_lax 17960 3 cycling 143
self_esteem_1 29881 7 1 309
BFI_consc_9 30956 5 3 219
BFI_consc_9R 30956 5 3 219
living_situation 30956 4 living in all-female flatshare 173
choice_of_clothing_8 29882 7 6 291
showy_clothes 29882 7 6 291
extra_pair_1 29867 3 1 124
extra_pair_intimacy 29867 3 1 124
NARQ_admiration_3 29877 7 6 289
estrogen_categories 26785 4 (300,600] 153
mate_retention_5 29876 7 6 265
include_conservative 30952 3 TRUE 112
included_conservative 14147 3 cycling 112
SOI_R_6 30956 5 2 183
SOI_R_6R 30956 5 4 183
prc_stirn_b_forward_counted 27380 20 0.14 687
fertile_fab 27143 20 0.05 679
fertile_forward_and_backward 27143 20 0.05 679
prc_stirn_b_backward_inferred 27143 20 0.05 679
SOI_R_5 30956 5 3 169
extra_pair_13 29873 7 6 234
extra_pair_sexual_fantasies 29873 7 6 234
choice_of_clothing_2 29892 7 1 233
MV_2 30956 5 1 162
attention 29874 12 1.5 382
choice_of_clothing_7 29881 7 6 222
fertile 27828 20 0.16 619
attractiveness_finance_2 30956 5 5 152
income_partner 30956 5 > 3000€ 152
NARQ_admiration_2 29877 7 6 211
SOI_R_4 30956 5 1 148
NARQ_admiration_1 29877 7 6 197
BFI_open_10 30956 5 1 136
choice_of_clothing_5 29885 7 6 186
extra_pair_7 29873 7 6 186
had_sex_with_partner_yet 30956 2 0 53
male_mate_retention 29876 12 6 314
relationship_satisfaction_1 29902 8 4.5 208
extra_pair_9 29873 7 6 179
ended.nachbe_other_hormonal 30956 2 TRUE 51
extra_pair_10 29873 7 6 178
BFI_agree_6 30956 5 1 126
BFI_agree_6R 30956 5 5 126
BFI_extra_7 30956 5 5 126
BFI_extra_7R 30956 5 1 126
trying_to_get_pregnant 30956 2 1 50
prc_wcx_b_forward_counted 27380 26 0.006 624
CJS_6 30956 5 5 119
prc_wcx_b_backward_inferred 27143 26 0.001 603
BFI_agree_4 30956 5 1 113
NARQ_1 30956 5 5 112
BFI_consc_4 30956 5 1 111
BFI_consc_4R 30956 5 5 111
included_levels 28493 5 lax 111
SGSE_5 30956 5 5 111
BFI_neuro_7 30956 5 5 108
RJS_1 30956 5 5 108
CJS_3 30956 5 1 106
MV_P_2 30956 5 1 105
SGSE_4 30956 5 5 104
SGSE_4R 30956 5 1 104
extra_pair_going_out 29873 12 5.5 242
male_jealousy_1 29876 7 6 141
male_jealousy_2 29877 7 6 138
BFI_neuro_8 30956 5 1 98
NARQ_6 30956 5 5 98
SOI_R_3 30956 5 5 97
cyklus_4 30956 5 1 95
include_strict 30952 3 TRUE 57
included_bool 7682 3 TRUE 57
included_strict 7682 3 cycling 57
extra_pair_1b 399 3 1 54
extra_pair_sex 30924 3 1 54
fertile_cont 17887 20 0.05 360
prc_stirn_b 17887 20 0.05 360
extra_pair_8 29873 7 6 125
RJS_3 30956 5 5 89
BFI_neuro_6 30956 5 1 87
BFI_neuro_6R 30956 5 5 87
relationship_status_clean 30956 3 Verlobt 52
BFI_agree_1 30956 5 5 86
BFI_agree_1R 30956 5 1 86
BFI_consc_8 30956 5 1 85
BFI_consc_8R 30956 5 5 85
BFI_neuro_5 30956 5 5 85
BFI_neuro_5R 30956 5 1 85
BFI_extra_2 30956 5 1 84
BFI_extra_2R 30956 5 5 84
MV_P_5 30956 5 1 84
MV_P_5R 30956 5 5 84
extra_pair_4 29873 7 6 116
BFI_agree_8 30956 5 1 82
BFI_agree_8R 30956 5 5 82
BFI_neuro_3 30956 5 1 81
SGSE_1 30956 5 5 81
ended 30956 2 FALSE 32
attractiveness_stp_self 30956 5 1 79
CJS_2 30956 5 5 79
attractiveness_finance_1 30956 5 5 78
extra_pair_compliments 29873 12 6 187
male_jealousy_3 29876 7 6 109
SGSE_3 30956 5 5 76
SGSE_3R 30956 5 1 76
attractiveness_stp 30956 5 1 75
NARQ_7 30956 5 1 75
extra_pair_11 29873 7 6 102
NARQ_rivalry_1 29876 7 6 100
MV_5 30956 5 1 71
MV_5R 30956 5 5 71
NARQ_3 30956 5 5 71
SOI_R_7 30956 5 5 70
NARQ_rivalry_2 29877 7 6 97
BFI_neuro_2 30956 5 1 69
BFI_neuro_2R 30956 5 5 69
prc_stirn_b_squished 17944 20 0.16 271
has_not_had_sex_yet 30956 2 1 27
BFI_neuro_1 30956 5 5 67
cyklus_2 30956 5 2 66
pregnant 30956 2 1 26
pregnant_in_last_3_months 30956 2 yes 26
NARQ_15 30956 5 5 64
day_number 28729 42 35 534
NARQ_16 30956 5 5 62
cycle_length_groups 30956 4 (35,41] 49
BFI_extra_8 30956 5 1 61
contraceptive_methods 30357 5 fertility_awareness 61
SOI_R_2 30956 5 5 60
RJS_5 30956 5 5 59
BFI_open_7 30956 5 1 58
BFI_open_7R 30956 5 5 58
RJS_6 30956 5 5 58
BFI_open_5 30956 5 1 56
ZIP_7 30956 5 1 56
ZIP_7R 30956 5 5 56
NARQ_11 30956 5 1 55
want_more_info 26606 3 0 33
prc_wcx_b 17887 26 0 277
prc_wcx_b_squished 17944 26 0.018 271
change_contraception_1 26606 3 1 31
BFI_open_9 30956 5 1 51
BFI_open_9R 30956 5 5 51
CJS_1 30956 5 5 51
attractiveness_relativ_1 30956 4 4 40
breast_feeding 30956 2 1 20
breast_feeding_in_last_3_months 30956 2 1 20
SGSE_2 30956 5 5 49
BFI_open_1 30956 5 1 47
age_group 30956 6 (45,70] 56
BFI_extra_6 30956 5 1 45
BFI_open_6 30956 5 1 44
biggest_diff 30956 10 [ 2.04, 2.13) 88
NARQ_18 30956 5 1 44
NARQ_9 30956 5 5 44
BFI_consc_2 30956 5 1 43
BFI_consc_2R 30956 5 5 43
NARQ_5 30956 5 1 43
MV_P_1 30956 5 5 42
attractiveness_finance_self 13229 6 5 49
income 13229 6 > 3000€ 49
MV_4 30956 5 1 40
avg_diff 30885 11 [1.055, 1.09) 84
BFI_extra_5 30956 5 1 38
BFI_extra_5R 30956 5 5 38
sexy_clothes 30924 18 6 133
breakup 26606 3 1 22
communication_partner_2 28857 8 6 58
BFI_extra_4 30956 5 1 36
BFI_extra_3 30956 5 1 34
BFI_neuro_4 30956 5 1 34
NARQ_rivalry_3 29876 7 6 46
BFI_agree_5 30956 5 1 32
SOI_R_8 30956 5 5 32
religiosity 30956 5 5 31
RJS_4 30956 5 5 31
days_responded_percentage 30885 11 [0.95122,0.976) 67
CJS_5 30956 5 5 30
NARQ_8 30956 5 5 30
RJS_2 30956 5 5 30
BFI_consc_5 30956 5 1 29
certainty_menstruation 30956 5 1 29
cyklus_6 30956 5 2 29
SOI_R_9 30956 5 5 29
attractiveness_occupation 30956 5 1 28
contraceptive_method 30956 5 other 27
MV_P_4 30956 5 1 27
NARQ_admiration 29877 17 6 88
SOI_uv 30956 13 1.33333333333333 67
SJS 30956 7 2 36
BFI_open_8 30956 5 1 25
sexual_intercourse_satisfaction 6511 8 1 38
BFI_consc_6 30956 5 1 23
contraceptive_method_by_pearl 30956 5 fertility_awareness 23
attractiveness_finance 30956 9 5 41
CJS_4 30956 5 5 22
MV_1 30956 5 5 22
BFI_extra_1 30956 5 1 21
NARQ_4 30956 5 5 21
BFI_agree_9 30956 5 1 20
BFI_open_4 30956 5 1 20
NARQ_10 30956 5 5 19
ZIP_5 30956 5 1 19
BFI_agree_3 30956 5 1 18
BFI_agree_3R 30956 5 5 18
BFI_consc_7 30956 5 1 18
no_variation 29891 3 TRUE 10
BFI_consc_1 30956 5 1 16
MV_3 30956 5 1 15
NARQ_12 30956 5 5 15
NARQ_13 30956 5 5 15
ZIP_2 30956 5 1 15
ZIP_4 30956 5 1 14
ZIP_4R 30956 5 5 14
new_relationship 26606 3 1 8
BFI_agree_2 30956 5 1 13
MV_P_3 30956 5 1 13
satisfaction_sexual_intercourse 29562 6 1 15
days_with_partner_per_month 9538 6 > 14 days 14
duration_pill 16212 6 1 14
long_distance_relationship_2 9538 6 5 14
BFI_consc_3 30956 5 1 11
ZIP_3 30956 5 1 11
FCD 27288 41 40 89
menstrual_onset_days_since 27380 42 39 89
BFI_agree_7 30956 5 1 10
occupation_clean 30850 7 Nicht berufstätig 13
NARQ_2 30956 5 5 9
SOI_R_1 30956 5 5 9
sexy 30956 5 1 8
attractiveness_face 30956 5 1 7
attractiveness_overall 30956 5 1 7
BFI_open_2 30956 5 1 7
BFI_open_3 30956 5 1 7
NARQ_17 30956 5 5 7
ZIP_1 30956 5 1 7
ZIP_6 30956 5 1 7
partner_mate_retention 29874 22 6 30
number_of_cycles 30924 6 5 8
RCD_squished 18018 31 0 41
attractiveness_ltp 30956 5 1 6
partner_attractiveness_longterm 30956 5 -3.97818492731748 6
RCD_rel_to_ovulation 17731 37 -20 44
cycle_nr 30924 6 5 7
attractiveness_ltp_self 30956 5 1 5
contraceptives_broad_categories 30952 11 partner_sterilised 11
distance_partner 9538 8 6 8
distance_to_partner_hours 9538 8 9-12h 8
partner_attractiveness_money 30956 13 1 12
attractiveness_body 30956 5 1 4
SOI_im 30956 13 5 9
religion_clean 30881 6 Judentum 4
MV_short 30956 13 1 8
NARQ_14 30956 5 5 3
menstrual_onset_days_until 17887 42 -38 24
RCD 17887 42 -38 24
gestagen_ug 2 2 250 1
male_jealousy 29876 17 5.33333333333333 8
NARQ_rivalry 29876 17 5.66666666666667 8
method_meeting_clean 16222 7 Depotspritze 3
extra_pair_flirting 29873 17 5.66666666666667 7
MV 30956 17 1.8 7
SOI_be 30956 13 4.66666666666667 5
SGSE 30956 21 4.8 8
contraceptives_categories_natural 29911 6 infertile 2
estrogen 195 3 30 1
estrogen_ug 1497 6 15 2
estrogen_ug_merged 2157 6 15 2
female_jealousy 30924 18 5.66666666666667 6
gestagen 195 3 13 1
hormonal_1 1716 6 4 2
RCD_inferred 28104 72 -27 22
RCD_for_merge 18097 51 49 14
extra_pair_desire 29873 27 5.8 7
estrogen_ug_other 1315 4 420 1
partner_attractiveness_shortterm 30956 16 -3.34512824374663 4
MV_P_short 30956 13 1 3
partner_attractiveness_physical 30956 9 2 2
minimum_cycle_length_diary 23767 41 40 8
RJS 30956 25 4.83333333333333 4
hormonal_2 1716 9 circlet 1
gestagens 1497 10 NGT 1
gestagens_ug 1497 10 125 1
estrogen_ug_all 26785 11 360 1
estrogen_ug_cycle 17120 12 360 1
gestagen_type 16178 12 NES 1
gestagen_ug_merged 15610 12 1500 1
menarche 13225 12 19 1
in_pair_desire 29874 56 4.4 4
occupation 30956 14 Beamtin 1
BFI_neuro 30956 32 1.125 2
gestagen_cycle 14567 16 1.44 1
intensive_sports 26606 16 14 1
SOI_R 30956 35 4.22222222222222 2
MV_P 30956 18 4.8 1
cigarettes 26606 19 11 1
choice_of_clothing 29881 41 5.5 2
cycle_length 30956 21 39 1
cyklus_3 30956 21 39 1
day_count 30956 42 42 2
days 30956 42 25 2
fertile_days_known_backward 30956 42 41 2
first_time 29943 21 10 1
MV_diff 30956 24 -3.66666666666667 1
CJS 30956 25 5 1
ZIP 30956 27 1.28571428571429 1
number_sexual_partner 30232 28 26 1
partner_attractiveness_global 30956 29 -2.95278253338198 1
female_mate_retention 30924 30 3.33333333333333 1
BFI_extra 30956 31 1.125 1
BFI_agree 30956 32 1.33333333333333 1
NARQ_K_total 29876 32 5.83333333333333 1
BFI_open 30956 33 1.5 1
BFI_consc 30956 34 1.22222222222222 1
last_lag 25783 42 27 1
timespan 18527 42 26 1
took_days 18602 42 26 1
fertile_days_known_backward_inferred 30956 43 42 1
fertile_days_known_forward 30956 43 42 1
n_days 30956 43 42 1
height 30956 46 116 1
NARQ 30956 56 1 1
extra_pair 29873 62 5.75 1
partner_attractiveness_rel_to_self 30956 65 -0.162454954632523 1
weight_post 26606 68 102 1
cycle_length_diary 22767 69 10 1
mean_cycle_length_diary 22767 69 10 1
median_cycle_length_diary 22767 69 10 1
contraception 30952 73 coitus_interruptus, temperature_billings 1
weight 30956 79 101 1
duration_relationship_total 30820 85 11.1 1
attractiveness_income_age_corrected 30956 148 -0.049531448419045 1
BMI 26606 568 10.8108108108108 1
fertile_mean 30956 888 0.0111111111111111 1

finding rare combinations

Ideally, no participant should be uniquely identified, but this is not realistic. Instead, I look for unique combinations of demographic variables.

cut_to_number = function(x, group = NULL, m = 40) { 
  if(!is.null(group)) {
    df = data.frame(x = x, group = group) %>% unique()
  } else {
    df = data.frame(x = x)
  }
    cuts = Hmisc::cut2(df$x, m = m, onlycuts = T) # cut on person level
    as.numeric(as.character(Hmisc::cut2(x, cuts = cuts, levels.mean = T)))
  }

unique_combos = diary %>% 
  group_by(age_group, height, weight, religion_clean, duration_relationship_total) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  arrange(desc(n)) %>% 
  filter(n < 7)

nrow(unique_combos)
## [1] 198
diary = diary %>% 
  mutate(
    religion_clean = recode(religion_clean, "Judentum" = "other", "Islam" = "other", "Buddhismus" = "other", .missing = "other"),
    weight = cut_to_number(weight,person, m = 50),
    height = cut_to_number(height,person, m = 50),
    first_time = cut_to_number(first_time, person),
    duration_relationship_total = cut_to_number(duration_relationship_total, person, m = 50)
  )

unique_combos = diary %>% 
  group_by(age_group, height, weight, religion_clean, duration_relationship_total) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  arrange(desc(n)) %>% 
  filter(n < 7)

nrow(unique_combos)
## [1] 186
# unique_combos %>%
#   pander()


diary = diary %>% left_join(unique_combos %>% select(-n) %>% mutate(too_unique = T))

diary = diary %>% mutate(
  too_unique = if_else(is.na(too_unique), F, too_unique),
  age_group = if_else(!too_unique, age_group, NA_integer_),
  height = if_else(too_unique, NA_real_, height),
  weight = if_else(too_unique, NA_real_, weight),
  religion_clean = if_else(!too_unique, religion_clean, NA_character_),
  duration_relationship_total = if_else(too_unique, NA_real_, duration_relationship_total)
)

Make one version with the less sensitive data

diary_noep = diary %>% select(-starts_with("SOI")) %>% select(-starts_with("extra_pair")) %>% select(-number_sexual_partner, -trying_to_get_pregnant, -has_not_had_sex_yet, -had_sex_with_partner_yet, -first_time)

Save

save(diary_noep, file = "diary_restricted_anonymised.rdata")
haven::write_sav(diary_noep, path = "diary_restricted_anonymised.sav")
## Error in if (any(bad_lengths)) {: missing value where TRUE/FALSE needed
save(diary, file = "diary_anonymised.rdata")
haven::write_sav(diary, path = "diary_anonymised.sav")
## Error in if (any(bad_lengths)) {: missing value where TRUE/FALSE needed