library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidylog)
##
## Attaching package: 'tidylog'
## The following objects are masked from 'package:dplyr':
##
## add_count, add_tally, anti_join, count, distinct, distinct_all,
## distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
## full_join, group_by, group_by_all, group_by_at, group_by_if,
## inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
## relocate, rename, rename_all, rename_at, rename_if, rename_with,
## right_join, sample_frac, sample_n, select, select_all, select_at,
## select_if, semi_join, slice, slice_head, slice_max, slice_min,
## slice_sample, slice_tail, summarise, summarise_all, summarise_at,
## summarise_if, summarize, summarize_all, summarize_at, summarize_if,
## tally, top_frac, top_n, transmute, transmute_all, transmute_at,
## transmute_if, ungroup
## The following objects are masked from 'package:tidyr':
##
## drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
## spread, uncount
## The following object is masked from 'package:stats':
##
## filter
load("~/research/gocd2/data/cleaned_selected.rdata")
#
# diary <- diary %>%
# group_by(person, cycle_nr) %>%
# mutate(menses_length = round(sum(menstruation, na.rm = T)),
# RCD_std = case_when(
# round(menstruation)==1 ~ ((RCD + 24 + menses_length)/menses_length * 5) - 24,
# RCD > -24 ~ RCD,
# TRUE ~ RCD/cycle_length*29
# )
# )
# diary %>% filter(RCD_std < -30) %>% select(RCD_std, RCD, cycle_length, menses_length, menstruation)
# ggplot(diary, aes(RCD, RCD_std, color = cycle_length)) + geom_point(alpha = 0.1)
# only singles
diary <- diary %>%
mutate(reasons_for_exclusion = str_c(reasons_for_exclusion,
if_else(hetero_relationship == 1 | relationship_status != 1, "not_single, ", "", "")
)
)
## mutate: changed 51,899 values (67%) of 'reasons_for_exclusion' (0 new NA)
# no women who live with their parents
diary <- diary %>%
mutate(reasons_for_exclusion = str_c(reasons_for_exclusion,
if_else(abode_flat_share == 2, "living_with_parents, ", "", "")
)
)
## mutate: changed 6,587 values (9%) of 'reasons_for_exclusion' (0 new NA)
all_surveys <- all_surveys %>%
mutate(reasons_for_exclusion = str_c(reasons_for_exclusion,
if_else(abode_flat_share == 2, "living_with_parents, ", "", "")
)
)
## mutate: changed 145 values (9%) of 'reasons_for_exclusion' (0 new NA)
diary_social <- diary_social %>% mutate(
person_is_related_woman = if_else(person_relationship_to_anchor == "biological_relative" & person_sex == 1, 1, 0),
person_is_unrelated_woman = if_else(person_relationship_to_anchor != "biological_relative" & person_sex == 1, 1, 0),
person_is_related_man_inferred_seen = if_else(person_is_related_man_inferred == 1 & person_seen == 1, 1, 0),
person_is_related_man_seen = if_else(person_is_related_man == 1 & person_seen == 1, 1, 0),
person_is_related_man_thoughts = if_else(person_is_related_man == 1 & person_thought_about == 1, 1, 0)
)
## mutate: new variable 'person_is_related_woman' (double) with 3 unique values and 74% NA
## new variable 'person_is_unrelated_woman' (double) with 3 unique values and 74% NA
## new variable 'person_is_related_man_inferred_seen' (double) with 3 unique values and 64% NA
## new variable 'person_is_related_man_seen' (double) with 3 unique values and 74% NA
## new variable 'person_is_related_man_thoughts' (double) with 3 unique values and 75% NA
diary_persons <- diary %>% left_join(diary_social %>% select(session, created_date, starts_with("person_is_related"), starts_with("person_is_unrelated")) %>% group_by(session, created_date) %>% summarise_all(~sum(., na.rm = T)))
## select: dropped 718 variables (created_diary, modified_diary, ended_diary, expired_diary, browser, …)
## group_by: 2 grouping variables (session, created_date)
## summarise_all: now 64,185 rows and 12 columns, one group variable remaining (session)
## Joining, by = c("session", "created_date")
## left_join: added 10 columns (person_is_related_inferred, person_is_related_man, person_is_related_man_inferred, person_is_related_woman, person_is_related_man_inferred_seen, …)
## > rows only in x 13,506
## > rows only in y ( 327)
## > matched rows 63,858
## > ========
## > rows total 77,364
n_distinct(diary$person)
## [1] 1373
mean(diary_persons$person_is_related_man)
## [1] NA
var(diary_persons$person_is_related_man)
## [1] NA
mean(diary_persons$person_is_unrelated_man)
## [1] NA
var(diary_persons$person_is_unrelated_man)
## [1] NA
table(diary_persons$person_is_related_man, exclude = NULL)
##
## 0 1 2 3 <NA>
## 62567 1042 202 47 13506
table(diary_persons$person_is_unrelated_man, exclude = NULL)
##
## 0 1 2 3 4 <NA>
## 57110 5169 1337 226 16 13506
diary_persons$premenstrual_phase_fab = factor(diary_persons$premenstrual_phase_fab)
diary_persons$hormonal_contraception = factor(diary_persons$hormonal_contraception)
saveRDS(diary_persons, "diary_persons.rds")
saveRDS(all_surveys, "persons.rds")