1_subset_data.utf8

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidylog)

## 
## Attaching package: 'tidylog'

## The following objects are masked from 'package:dplyr':
## 
##     add_count, add_tally, anti_join, count, distinct, distinct_all,
##     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
##     full_join, group_by, group_by_all, group_by_at, group_by_if,
##     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
##     relocate, rename, rename_all, rename_at, rename_if, rename_with,
##     right_join, sample_frac, sample_n, select, select_all, select_at,
##     select_if, semi_join, slice, slice_head, slice_max, slice_min,
##     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
##     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
##     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
##     transmute_if, ungroup

## The following objects are masked from 'package:tidyr':
## 
##     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
##     spread, uncount

## The following object is masked from 'package:stats':
## 
##     filter

# Restore the objects stored in cleaned_selected.rdata into the workspace.
# NOTE(review): presumably this provides diary, diary_social and all_surveys,
# which are used below without being created in this script — confirm.
load(file = "~/research/gocd2/data/cleaned_selected.rdata")
# 
# diary <- diary %>% 
#   group_by(person, cycle_nr) %>% 
#   mutate(menses_length = round(sum(menstruation, na.rm = T)),
#     RCD_std = case_when(
#     round(menstruation)==1 ~ ((RCD + 24 + menses_length)/menses_length * 5) - 24,
#     RCD > -24 ~ RCD,
#     TRUE ~ RCD/cycle_length*29
#   )
# )
# diary %>% filter(RCD_std < -30) %>% select(RCD_std, RCD, cycle_length, menses_length, menstruation)
# ggplot(diary, aes(RCD, RCD_std, color = cycle_length)) + geom_point(alpha = 0.1)

# only singles:
# append "not_single, " to reasons_for_exclusion for every diary row where
# hetero_relationship == 1 or relationship_status != 1. Rows where that
# condition evaluates to NA get an empty string appended (i.e. no new reason).
diary <- diary %>%
  mutate(
    reasons_for_exclusion = str_c(
      reasons_for_exclusion,
      if_else(
        hetero_relationship == 1 | relationship_status != 1,
        true = "not_single, ",
        false = "",
        missing = ""
      )
    )
  )

## mutate: changed 51,899 values (67%) of 'reasons_for_exclusion' (0 new NA)

# no women who live with their parents:
# append "living_with_parents, " to reasons_for_exclusion for diary rows with
# abode_flat_share == 2; rows where abode_flat_share is NA are left unflagged.
diary <- diary %>%
  mutate(
    reasons_for_exclusion = str_c(
      reasons_for_exclusion,
      if_else(
        abode_flat_share == 2,
        true = "living_with_parents, ",
        false = "",
        missing = ""
      )
    )
  )

## mutate: changed 6,587 values (9%) of 'reasons_for_exclusion' (0 new NA)

# Same "living_with_parents, " flag, applied to the all_surveys table:
# rows with abode_flat_share == 2 get the reason appended, NA rows do not.
all_surveys <- all_surveys %>%
  mutate(
    reasons_for_exclusion = str_c(
      reasons_for_exclusion,
      if_else(
        abode_flat_share == 2,
        true = "living_with_parents, ",
        false = "",
        missing = ""
      )
    )
  )

## mutate: changed 145 values (9%) of 'reasons_for_exclusion' (0 new NA)

# Derive indicator columns on diary_social. Each indicator is the numeric form
# of a logical condition: 1 where it holds, 0 where it does not, and NA where
# it cannot be evaluated because the inputs are missing.
diary_social <- diary_social %>%
  mutate(
    # NOTE(review): person_sex == 1 presumably codes "woman" — confirm
    person_is_related_woman = as.numeric(
      person_relationship_to_anchor == "biological_relative" & person_sex == 1
    ),
    person_is_unrelated_woman = as.numeric(
      person_relationship_to_anchor != "biological_relative" & person_sex == 1
    ),
    # Related men (inferred / non-inferred variant) who were also seen
    person_is_related_man_inferred_seen = as.numeric(
      person_is_related_man_inferred == 1 & person_seen == 1
    ),
    person_is_related_man_seen = as.numeric(
      person_is_related_man == 1 & person_seen == 1
    ),
    # Related men who were also thought about
    person_is_related_man_thoughts = as.numeric(
      person_is_related_man == 1 & person_thought_about == 1
    )
  )

## mutate: new variable 'person_is_related_woman' (double) with 3 unique values and 74% NA

##         new variable 'person_is_unrelated_woman' (double) with 3 unique values and 74% NA

##         new variable 'person_is_related_man_inferred_seen' (double) with 3 unique values and 64% NA

##         new variable 'person_is_related_man_seen' (double) with 3 unique values and 74% NA

##         new variable 'person_is_related_man_thoughts' (double) with 3 unique values and 75% NA

# Sum the person_is_related* / person_is_unrelated* columns of diary_social
# within each (session, created_date) group and attach those sums to the
# diary rows.
# The join is given no explicit `by`, so it matches on the column names the
# two tables share (session and created_date, per the join message logged
# below). Diary rows without a match in diary_social end up with NA in the
# added columns.
diary_persons <- diary %>%
  left_join(
    diary_social %>%
      select(
        session, created_date,
        starts_with("person_is_related"),
        starts_with("person_is_unrelated")
      ) %>%
      group_by(session, created_date) %>%
      # TRUE instead of T: T is an ordinary variable that can be reassigned
      summarise_all(~ sum(., na.rm = TRUE))
  )

## select: dropped 718 variables (created_diary, modified_diary, ended_diary, expired_diary, browser, …)

## group_by: 2 grouping variables (session, created_date)

## summarise_all: now 64,185 rows and 12 columns, one group variable remaining (session)

## Joining, by = c("session", "created_date")

## left_join: added 10 columns (person_is_related_inferred, person_is_related_man, person_is_related_man_inferred, person_is_related_woman, person_is_related_man_inferred_seen, …)

##            > rows only in x   13,506

##            > rows only in y  (   327)

##            > matched rows     63,858

##            >                 ========

##            > rows total       77,364

# Number of distinct values of `person` in the diary data
length(unique(diary$person))

## [1] 1373

# Mean and variance of the summed related/unrelated-man columns per diary row.
# Diary rows with no match in diary_social carry NA in these columns, so the
# NAs must be dropped explicitly; without na.rm = TRUE every statistic below
# comes back as NA. The values are therefore based on the non-missing rows
# only (the same rows tabulated further down).
mean(diary_persons$person_is_related_man, na.rm = TRUE)

## [1] 0.02485202

var(diary_persons$person_is_related_man, na.rm = TRUE)

## [1] 0.03497753

mean(diary_persons$person_is_unrelated_man, na.rm = TRUE)

## [1] 0.1344389

var(diary_persons$person_is_unrelated_man, na.rm = TRUE)

## [1] 0.1824834

# Frequency table of person_is_related_man; missing values are shown as
# their own <NA> category when present.
table(diary_persons$person_is_related_man, useNA = "ifany")

## 
##     0     1     2     3  <NA> 
## 62567  1042   202    47 13506

# Frequency table of person_is_unrelated_man; missing values are shown as
# their own <NA> category when present.
table(diary_persons$person_is_unrelated_man, useNA = "ifany")

## 
##     0     1     2     3     4  <NA> 
## 57110  5169  1337   226    16 13506

# Convert premenstrual_phase_fab and hormonal_contraception to factors.
# Uses `<-` for assignment (the conventional R assignment operator) rather
# than `=`.
diary_persons$premenstrual_phase_fab <- factor(diary_persons$premenstrual_phase_fab)
diary_persons$hormonal_contraception <- factor(diary_persons$hormonal_contraception)

# Write both tables as .rds files; the relative paths resolve against the
# current working directory.
saveRDS(diary_persons, file = "diary_persons.rds")
saveRDS(all_surveys, file = "persons.rds")