Aggregate data

Load individual datasets

##############################################################################################
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

## Combine everything into one dataset
# Read all of the merged raw-data workbooks. Each table is kept under its
# original name (daten1 ... daten11) for the per-dataset steps further down,
# and all of them are then stacked row-wise in numeric order.

raw_paths <- c(
  daten1 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten1.xlsx",
  daten2 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten2.xlsx",
  daten3 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten3.xlsx",
  daten4 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten4.xlsx",
  daten5 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten5.xlsx",
  daten6 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten6.xlsx",
  daten7 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten7.xlsx",
  daten8 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten8.xlsx",
  daten9 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten9.xlsx",
  daten10 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten10_new.xlsx",
  daten11 = "Rohdaten_und_einzelne_Datensaetze/Daten_PSK_Stimme_zusammen_roh/daten11.xlsx"
)
raw_tables <- lapply(raw_paths, rio::import)

daten1 <- raw_tables[["daten1"]]
daten2 <- raw_tables[["daten2"]]
daten3 <- raw_tables[["daten3"]]
daten4 <- raw_tables[["daten4"]]
daten5 <- raw_tables[["daten5"]]
daten6 <- raw_tables[["daten6"]]
daten7 <- raw_tables[["daten7"]]
daten8 <- raw_tables[["daten8"]]
daten9 <- raw_tables[["daten9"]]
daten10 <- raw_tables[["daten10"]]
daten11 <- raw_tables[["daten11"]]

# Names are dropped so rbind() receives the tables as plain positional arguments.
data_complete_all_raw <- do.call(rbind, unname(raw_tables))

Export raw

# Write the stacked raw table once per target file; rio picks the output
# format from each file extension.
export_targets <- c(
  "data_complete_untrans.rds",
  "datarelease/data_personality_voices_untransformed.rds",
  "datarelease/data_personality_voices_untransformed.sav",
  "datarelease/data_personality_voices_untransformed.xlsx",
  "datarelease/data_personality_voices_untransformed.csv"
)
for (target in export_targets) {
  rio::export(data_complete_all_raw, target)
}

Compute POMP (percentage of maximum possible)

Add min/max for each scale

############################################################################################
# Bring all personality data onto a -2 to +2 range (as recommended by Ruben)
# Formula: (x-5)/10*5 for a 9-point scale. Otherwise always replace the 9 with
# the length of the scale and the first 5 in the parentheses with the scale midpoint
# NOTE(review): the section heading above says POMP, and the code below only
# records per-scale minimum/maximum columns (*_minp / *_maxp) - confirm whether
# this -2..+2 note is still current.

# Daten1: SOI-R 1-5
# Print descriptive statistics to inspect the observed range of every column.
psych::describe(daten1)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##           vars   n    mean     sd  median trimmed    mad     min     max
## voice_id     1 339  170.00  98.01  170.00  170.00 126.02    1.00  339.00
## ID           2 339 1545.37 170.02 1562.00 1559.52 176.43 1006.00 1798.00
## dataset      3 339    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## sex          4 339   -1.00   0.00   -1.00   -1.00   0.00   -1.00   -1.00
## age          5 339   20.68   3.21   20.00   20.03   1.48   18.00   35.00
## f0           6 339  211.00  22.25  209.75  210.08  20.25  145.50  303.37
## f1           7 339  470.63  56.24  460.44  466.45  48.99  320.83  718.77
## f2           8 339 1678.81 188.45 1670.17 1671.33 190.60 1229.63 2479.51
## f3           9 339 2913.68 127.94 2903.64 2911.40 121.26 2574.70 3245.74
## f4          10 339 3993.69 177.61 4011.59 4001.69 172.20 3492.43 4509.54
## pf          11   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## neuro       12   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## extra       13   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## openn       14   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## agree       15   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## consc       16   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## dominance   17   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## behavior    18 339    2.10   0.92    2.00    2.02   0.99    1.00    5.00
## attitude    19 339    3.32   1.13    3.33    3.38   0.99    1.00    5.00
## desire      20 339    2.72   0.98    2.67    2.70   0.99    1.00    5.00
## soir_full   21 339    2.71   0.79    2.78    2.71   0.82    1.00    4.78
##             range  skew kurtosis    se
## voice_id   338.00  0.00    -1.21  5.32
## ID         792.00 -0.70     0.12  9.23
## dataset      0.00   NaN      NaN  0.00
## sex          0.00   NaN      NaN  0.00
## age         17.00  2.05     4.35  0.17
## f0         157.88  0.49     0.81  1.21
## f1         397.94  0.82     1.24  3.05
## f2        1249.88  0.49     0.61 10.24
## f3         671.04  0.16    -0.17  6.95
## f4        1017.11 -0.34    -0.20  9.65
## pf           -Inf    NA       NA    NA
## neuro        -Inf    NA       NA    NA
## extra        -Inf    NA       NA    NA
## openn        -Inf    NA       NA    NA
## agree        -Inf    NA       NA    NA
## consc        -Inf    NA       NA    NA
## dominance    -Inf    NA       NA    NA
## behavior     4.00  0.58    -0.49  0.05
## attitude     4.00 -0.39    -0.69  0.06
## desire       4.00  0.17    -0.90  0.05
## soir_full    3.78  0.01    -0.58  0.04

# List the column names of daten1.
colnames(daten1)

##  [1] "voice_id"  "ID"        "dataset"   "sex"       "age"       "f0"       
##  [7] "f1"        "f2"        "f3"        "f4"        "pf"        "neuro"    
## [13] "extra"     "openn"     "agree"     "consc"     "dominance" "behavior" 
## [19] "attitude"  "desire"    "soir_full"

# Record the SOI-R response-scale bounds for daten1 (1 to 5) as constant columns.
daten1[c("soi_minp", "soi_maxp")] <- list(1, 5)

##################################################################
# Daten2: SOI-R 1-5
# dominance also 1-5?
# BFI also 1-5
# Print descriptive statistics to compare the observed ranges with these bounds.
psych::describe(daten2)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##           vars   n       mean        sd     median    trimmed       mad
## voice_id     1 383     531.00    110.71     531.00     531.00    142.33
## ID           2 383 1645586.39 506570.17 2020902.00 1647804.94 356135.35
## dataset      3 383       2.00      0.00       2.00       2.00      0.00
## sex          4 383      -0.01      1.00      -1.00      -0.01      0.00
## age          5 382      32.76      7.36      32.00      32.32      8.90
## f0           6 383     158.61     51.93     158.15     157.16     72.63
## f1           7 383     488.74     82.27     479.87     481.04     80.00
## f2           8 383    1675.95    142.67    1683.39    1673.36    178.04
## f3           9 383    2785.36    243.38    2749.97    2775.83    317.79
## f4          10 383    3847.34    343.85    3783.67    3823.67    411.55
## pf          11   0        NaN        NA         NA        NaN        NA
## neuro       12 382       2.62      0.72       2.58       2.59      0.74
## extra       13 382       3.51      0.57       3.50       3.53      0.62
## openn       14 382       3.83      0.51       3.88       3.84      0.56
## agree       15 382       3.81      0.52       3.83       3.83      0.50
## consc       16 382       3.82      0.61       3.83       3.84      0.62
## dominance   17 381       3.20      0.51       3.25       3.22      0.49
## behavior    18 381       2.67      1.13       2.67       2.64      1.48
## attitude    19 382       3.16      1.13       3.33       3.18      1.48
## desire      20 382       3.07      0.98       3.00       3.04      0.99
## soir_full   21 381       2.97      0.83       3.00       2.96      0.83
##                  min        max      range  skew kurtosis       se
## voice_id      340.00     722.00     382.00  0.00    -1.21     5.66
## ID        1020901.00 2261113.00 1240212.00 -0.02    -1.93 25884.53
## dataset         2.00       2.00       0.00   NaN      NaN     0.00
## sex            -1.00       1.00       2.00  0.02    -2.00     0.05
## age            18.00      54.00      36.00  0.48    -0.39     0.38
## f0             83.51     265.56     182.04  0.13    -1.61     2.65
## f1            341.17     847.28     506.11  1.03     1.67     4.20
## f2           1348.88    2087.72     738.84  0.14    -0.83     7.29
## f3           2387.15    3460.33    1073.18  0.25    -1.22    12.44
## f4           3305.17    4631.96    1326.78  0.37    -1.19    17.57
## pf               Inf       -Inf       -Inf    NA       NA       NA
## neuro           1.08       4.58       3.50  0.37    -0.34     0.04
## extra           1.83       4.75       2.92 -0.29    -0.33     0.03
## openn           1.92       4.92       3.00 -0.43     0.21     0.03
## agree           2.00       4.92       2.92 -0.40    -0.13     0.03
## consc           1.92       5.00       3.08 -0.40    -0.09     0.03
## dominance       1.58       4.58       3.00 -0.27     0.07     0.03
## behavior        1.00       5.00       4.00  0.13    -1.03     0.06
## attitude        1.00       5.00       4.00 -0.11    -1.05     0.06
## desire          1.00       5.00       4.00  0.17    -0.86     0.05
## soir_full       1.00       5.00       4.00  0.06    -0.65     0.04

# Record the response-scale bounds for daten2 as constant columns:
# SOI-R, dominance and the Big Five are all entered as 1 to 5.
daten2[c("soi_minp", "soi_maxp")] <- list(1, 5)
daten2[c("dominance_minp", "dominance_maxp")] <- list(1, 5)
daten2[c("big5_minp", "big5_maxp")] <- list(1, 5)


##################################################################
# Daten3: SOI-R (the original header said 1-5, but the bounds assigned for
# daten3 further down are 1 and 9)
# dominance -3 to +3?
# Count how many different dominance values occur (a missing value counts as one).
length(unique(daten3$dominance))

## [1] 37

# Assumed number of items averaged into the dominance score.
items <- 8
# If dominance is the mean of `items` whole-number responses, multiplying it
# by `items` must give a whole number. The comparison uses a small tolerance
# instead of `==` so floating-point error cannot fail the check, and TRUE is
# spelled out because T can be reassigned.
dominance_sums <- daten3$dominance * items
all(abs(dominance_sums - round(dominance_sums)) < 1e-8, na.rm = TRUE)

## [1] TRUE

# 8 items. The closest whole numbers to the observed min/max are -3/3
# BFI also 1-5
psych::describe(daten3)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##           vars   n    mean     sd  median trimmed    mad     min     max
## voice_id     1 285  865.00  82.42  865.00  865.00 105.26  723.00 1007.00
## ID           2 285  260.65 171.17  245.00  250.56 204.60   11.00  643.00
## dataset      3 285    3.00   0.00    3.00    3.00   0.00    3.00    3.00
## sex          4 285   -0.01   1.00   -1.00   -0.01   0.00   -1.00    1.00
## age          5 284   23.73   2.73   23.00   23.59   2.97   19.00   30.00
## f0           6 285  166.12  50.18  170.92  165.63  70.24   84.93  265.02
## f1           7 285  511.82  47.33  507.58  508.05  42.18  409.34  745.47
## f2           8 285 1588.13 145.70 1566.75 1581.00 167.71 1286.97 2022.02
## f3           9 285 2698.48 219.04 2695.78 2700.03 297.24 2216.26 3095.68
## f4          10 285 3805.49 284.59 3810.24 3807.16 378.50 3057.45 4313.86
## pf          11   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## neuro       12 284   20.02   5.17   20.00   19.95   5.93    9.00   33.00
## extra       13 284   28.57   5.69   29.00   28.72   5.93   12.00   39.00
## openn       14 284   38.36   6.18   39.00   38.73   5.93   16.00   50.00
## agree       15 284   26.80   4.40   27.00   26.86   4.45   14.00   39.00
## consc       16 284   31.08   5.60   31.00   31.01   5.93   15.00   45.00
## dominance   17 283    0.67   0.86    0.62    0.67   0.74   -2.62    2.75
## behavior    18 284    2.57   1.38    2.20    2.37   1.19    1.00    8.20
## attitude    19 284    6.19   2.04    6.60    6.38   1.93    1.00    9.00
## desire      20 284    4.33   1.90    4.40    4.28   2.37    1.00    9.00
## soir_full   21 284    4.36   1.37    4.40    4.39   1.38    1.07    8.40
##             range  skew kurtosis    se
## voice_id   284.00  0.00    -1.21  4.88
## ID         632.00  0.38    -0.92 10.14
## dataset      0.00   NaN      NaN  0.00
## sex          2.00  0.02    -2.01  0.06
## age         11.00  0.36    -0.92  0.16
## f0         180.09  0.03    -1.59  2.97
## f1         336.13  1.09     2.45  2.80
## f2         735.05  0.41    -0.57  8.63
## f3         879.41 -0.03    -1.38 12.97
## f4        1256.41 -0.04    -1.39 16.86
## pf           -Inf    NA       NA    NA
## neuro       24.00  0.15    -0.54  0.31
## extra       27.00 -0.23    -0.34  0.34
## openn       34.00 -0.70     0.86  0.37
## agree       25.00 -0.14    -0.15  0.26
## consc       30.00  0.03    -0.06  0.33
## dominance    5.38 -0.09     0.25  0.05
## behavior     7.20  1.31     1.73  0.08
## attitude     8.00 -0.68    -0.31  0.12
## desire       8.00  0.20    -0.90  0.11
## soir_full    7.33 -0.08    -0.21  0.08

# Record the response-scale bounds for daten3 as constant columns.
daten3[c("soi_minp", "soi_maxp")] <- list(1, 9)
daten3[c("dominance_minp", "dominance_maxp")] <- list(-3, 3)
daten3[c("big5_minp", "big5_maxp")] <- list(1, 5)



# Turn the Big Five sum scores into means by dividing each scale by its divisor.
# NOTE(review): the divisors are carried over unchanged; confirm they match the
# item counts of the questionnaire version used for daten3.
big5_divisors <- c(neuro = 7, extra = 8, openn = 10, agree = 8, consc = 9)
for (scale_name in names(big5_divisors)) {
  daten3[[scale_name]] <- daten3[[scale_name]] / big5_divisors[[scale_name]]
}

# Show only the observed minimum and maximum of every column after rescaling.
psych::describe(daten3)[, c("min", "max")]

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                    min     max
## voice_id        723.00 1007.00
## ID               11.00  643.00
## dataset           3.00    3.00
## sex              -1.00    1.00
## age              19.00   30.00
## f0               84.93  265.02
## f1              409.34  745.47
## f2             1286.97 2022.02
## f3             2216.26 3095.68
## f4             3057.45 4313.86
## pf                 Inf    -Inf
## neuro             1.29    4.71
## extra             1.50    4.88
## openn             1.60    5.00
## agree             1.75    4.88
## consc             1.67    5.00
## dominance        -2.62    2.75
## behavior          1.00    8.20
## attitude          1.00    9.00
## desire            1.00    9.00
## soir_full         1.07    8.40
## soi_minp          1.00    1.00
## soi_maxp          9.00    9.00
## dominance_minp   -3.00   -3.00
## dominance_maxp    3.00    3.00
## big5_minp         1.00    1.00
## big5_maxp         5.00    5.00

##################################################################
# Daten4: SOI-R 1-5
# BFI also 1-5
# Show only the observed minimum and maximum of every column.
psych::describe(daten4)[,c("min", "max")]

## Warning in psych::describe(daten4): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##               min     max
## voice_id  1008.00 1272.00
## ID*           Inf    -Inf
## dataset      4.00    4.00
## sex         -1.00   -1.00
## age         18.00   35.00
## f0         155.91  272.93
## f1         305.37  478.59
## f2        1451.97 2005.61
## f3        2553.46 3074.81
## f4        3682.98 4408.68
## pf            Inf    -Inf
## neuro        1.25    4.75
## extra        1.50    4.88
## openn        1.70    4.90
## agree        1.78    4.89
## consc        1.67    5.00
## dominance     Inf    -Inf
## behavior     1.00    5.00
## attitude     1.00    5.00
## desire       1.00    5.00
## soir_full    1.00    4.67

# Record the response-scale bounds for daten4 as constant columns:
# SOI-R and the Big Five are both entered as 1 to 5.
daten4[c("soi_minp", "soi_maxp")] <- list(1, 5)
daten4[c("big5_minp", "big5_maxp")] <- list(1, 5)

##################################################################
# Daten5: SOI-R
# NOTE(review): this header originally said "1-5", but the bounds assigned for
# daten5 further down are 1 and 9 - the "1-5" looks like a copy-paste leftover; confirm.
# Show only the observed minimum and maximum of every column.
psych::describe(daten5)[,c("min", "max")]

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##               min     max
## voice_id  1273.00 1459.00
## ID        1017.00 2210.00
## dataset      5.00    5.00
## sex         -1.00    1.00
## age         18.00   27.00
## f0          82.84  245.95
## f1         354.38  589.63
## f2        1331.52 1969.18
## f3        2276.41 3080.25
## f4        3115.03 4234.61
## pf            Inf    -Inf
## neuro         Inf    -Inf
## extra         Inf    -Inf
## openn         Inf    -Inf
## agree         Inf    -Inf
## consc         Inf    -Inf
## dominance     Inf    -Inf
## behavior     1.00    6.00
## attitude     1.00    8.67
## desire       1.00    8.67
## soir_full    1.00    6.89

# List the column names of daten5.
colnames(daten5)

##  [1] "voice_id"  "ID"        "dataset"   "sex"       "age"       "f0"       
##  [7] "f1"        "f2"        "f3"        "f4"        "pf"        "neuro"    
## [13] "extra"     "openn"     "agree"     "consc"     "dominance" "behavior" 
## [19] "attitude"  "desire"    "soir_full"

# Record the SOI-R response-scale bounds for daten5 (1 to 9) as constant columns.
daten5[c("soi_minp", "soi_maxp")] <- list(1, 9)

##################################################################
# Daten6: SOI-R
# NOTE(review): this header originally said "1-5", but the bounds assigned for
# daten6 further down are 1 and 9 - the "1-5" looks like a copy-paste leftover; confirm.
# Show only the observed minimum and maximum of every column.
psych::describe(daten6)[,c("min", "max")]

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##               min     max
## voice_id  1460.00 1643.00
## ID           1.00  186.00
## dataset      6.00    6.00
## sex          1.00    1.00
## age         18.00   56.00
## f0          86.96  177.71
## f1         295.49  440.08
## f2        1278.61 1681.44
## f3        2209.98 2709.71
## f4        3026.11 3996.97
## pf            Inf    -Inf
## neuro         Inf    -Inf
## extra         Inf    -Inf
## openn         Inf    -Inf
## agree         Inf    -Inf
## consc         Inf    -Inf
## dominance     Inf    -Inf
## behavior     1.00    9.00
## attitude     1.00    9.00
## desire       1.00    9.00
## soir_full    1.11    8.56

# List the column names of daten6.
colnames(daten6)

##  [1] "voice_id"  "ID"        "dataset"   "sex"       "age"       "f0"       
##  [7] "f1"        "f2"        "f3"        "f4"        "pf"        "neuro"    
## [13] "extra"     "openn"     "agree"     "consc"     "dominance" "behavior" 
## [19] "attitude"  "desire"    "soir_full"

# Record the SOI-R response-scale bounds for daten6 (1 to 9) as constant columns.
daten6[c("soi_minp", "soi_maxp")] <- list(1, 9)

##################################################################
# Daten7: SOI-R 1-9
# BFI also 1-5
# dominance 1-5
daten7[c("soi_minp", "soi_maxp")] <- list(1, 9)
daten7[c("dominance_minp", "dominance_maxp")] <- list(1, 5)
daten7[c("big5_minp", "big5_maxp")] <- list(1, 5)
# Descriptive statistics, now including the bound columns added just above.
psych::describe(daten7)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                vars   n    mean     sd  median trimmed    mad     min     max
## voice_id          1 164 1725.50  47.49 1725.50 1725.50  60.79 1644.00 1807.00
## ID                2 164   83.74  48.31   84.50   83.80  62.27    1.00  166.00
## dataset           3 164    7.00   0.00    7.00    7.00   0.00    7.00    7.00
## sex               4 164    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## age               5 164   24.30   3.25   24.00   24.08   2.97   18.00   34.00
## f0                6 164  118.73  15.27  117.30  118.03  14.31   87.46  166.08
## f1                7 164  481.04  34.85  479.66  480.90  32.52  390.69  584.62
## f2                8 164 1396.66  82.01 1394.13 1393.93  76.76 1133.20 1652.04
## f3                9 164 2444.48  94.80 2450.25 2444.33  84.44 2208.07 2777.29
## f4               10 164 3429.84 150.81 3441.89 3430.10 113.10 2968.93 3902.35
## pf               11   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## neuro            12 164    2.67   0.71    2.64    2.65   0.74    1.00    4.57
## extra            13 164    3.51   0.69    3.62    3.53   0.74    1.62    5.00
## openn            14 164    3.76   0.57    3.80    3.76   0.59    2.30    5.00
## agree            15 164    3.44   0.59    3.50    3.45   0.56    1.88    4.62
## consc            16 164    3.26   0.67    3.22    3.27   0.82    1.22    4.89
## dominance        17 164    3.48   0.59    3.60    3.51   0.59    1.60    4.80
## behavior         18 164    3.13   1.91    2.33    2.90   1.48    1.00    9.00
## attitude         19 164    6.43   2.07    6.67    6.63   2.47    1.00    9.00
## desire           20 164    5.17   1.98    5.33    5.19   2.47    1.00    9.00
## soir_full        21 164    4.91   1.48    4.78    4.89   1.32    1.11    8.67
## soi_minp         22 164    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## soi_maxp         23 164    9.00   0.00    9.00    9.00   0.00    9.00    9.00
## dominance_minp   24 164    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## dominance_maxp   25 164    5.00   0.00    5.00    5.00   0.00    5.00    5.00
## big5_minp        26 164    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## big5_maxp        27 164    5.00   0.00    5.00    5.00   0.00    5.00    5.00
##                 range  skew kurtosis    se
## voice_id       163.00  0.00    -1.22  3.71
## ID             165.00 -0.01    -1.24  3.77
## dataset          0.00   NaN      NaN  0.00
## sex              0.00   NaN      NaN  0.00
## age             16.00  0.61     0.09  0.25
## f0              78.62  0.50     0.30  1.19
## f1             193.93  0.01     0.21  2.72
## f2             518.84  0.23     0.50  6.40
## f3             569.22  0.16     0.50  7.40
## f4             933.43 -0.05     0.69 11.78
## pf               -Inf    NA       NA    NA
## neuro            3.57  0.26     0.04  0.06
## extra            3.38 -0.26    -0.42  0.05
## openn            2.70 -0.03    -0.60  0.04
## agree            2.75 -0.30    -0.43  0.05
## consc            3.67 -0.14    -0.50  0.05
## dominance        3.20 -0.47     0.03  0.05
## behavior         8.00  0.89    -0.13  0.15
## attitude         8.00 -0.67    -0.21  0.16
## desire           8.00 -0.12    -1.01  0.15
## soir_full        7.56  0.11    -0.16  0.12
## soi_minp         0.00   NaN      NaN  0.00
## soi_maxp         0.00   NaN      NaN  0.00
## dominance_minp   0.00   NaN      NaN  0.00
## dominance_maxp   0.00   NaN      NaN  0.00
## big5_minp        0.00   NaN      NaN  0.00
## big5_maxp        0.00   NaN      NaN  0.00

################
##################################################################
# Daten8: SOI-R 1-5
# BFI also 1-5
# dominance -2:2
daten8$soi_minp <- 1
# The upper SOI-R bound is 5, matching the 1-5 scale named in the header of
# this section. It was previously entered as 9, which does not fit a 1-5 scale
# and looks like a leftover from the daten7 section.
daten8$soi_maxp <- 5
daten8$dominance_minp <- -2
daten8$dominance_maxp <- 2
daten8$big5_minp <- 1
daten8$big5_maxp <- 5

# Descriptive statistics, now including the bound columns added just above.
psych::describe(daten8)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                vars   n    mean     sd  median trimmed    mad     min     max
## voice_id          1 157 1886.00  45.47 1886.00 1886.00  57.82 1808.00 1964.00
## ID                2 157  191.35  50.81  193.00  191.70  63.75  100.00  279.00
## dataset           3 157    8.00   0.00    8.00    8.00   0.00    8.00    8.00
## sex               4 157   -1.00   0.00   -1.00   -1.00   0.00   -1.00   -1.00
## age               5 157   23.06   3.44   23.00   22.86   2.97   18.00   34.00
## f0                6 157  211.83  17.73  212.60  211.51  17.34  158.41  264.89
## f1                7 157  446.89  69.85  438.42  444.98  72.94  285.37  600.67
## f2                8 157 1706.03  79.48 1713.90 1710.34  72.90 1389.02 1885.53
## f3                9 157 2845.21 138.94 2829.02 2839.73 143.02 2594.67 3255.57
## f4               10 157 4009.92 131.96 4007.99 4012.65 136.17 3629.25 4317.39
## pf               11   0     NaN     NA      NA     NaN     NA     Inf    -Inf
## neuro            12 142    2.89   0.66    2.88    2.89   0.74    1.25    4.50
## extra            13 142    3.57   0.70    3.62    3.61   0.74    1.62    4.88
## openn            14 142    3.71   0.64    3.80    3.75   0.74    2.10    4.90
## agree            15 142    3.68   0.61    3.72    3.71   0.58    2.00    4.89
## consc            16 142    3.46   0.66    3.44    3.46   0.66    2.00    4.78
## dominance        17 157    0.52   0.60    0.62    0.52   0.74   -1.12    1.88
## behavior         18 142    2.32   1.08    2.33    2.25   1.48    1.00    5.00
## attitude         19 142    3.26   1.03    3.33    3.29   0.99    1.00    5.00
## desire           20 142    2.81   0.90    2.67    2.80   0.99    1.00    5.00
## soir_full        21 142    2.80   0.76    2.78    2.78   0.82    1.22    4.89
## soi_minp         22 157    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## soi_maxp         23 157    9.00   0.00    9.00    9.00   0.00    9.00    9.00
## dominance_minp   24 157   -2.00   0.00   -2.00   -2.00   0.00   -2.00   -2.00
## dominance_maxp   25 157    2.00   0.00    2.00    2.00   0.00    2.00    2.00
## big5_minp        26 157    1.00   0.00    1.00    1.00   0.00    1.00    1.00
## big5_maxp        27 157    5.00   0.00    5.00    5.00   0.00    5.00    5.00
##                 range  skew kurtosis    se
## voice_id       156.00  0.00    -1.22  3.63
## ID             179.00 -0.06    -1.16  4.06
## dataset          0.00   NaN      NaN  0.00
## sex              0.00   NaN      NaN  0.00
## age             16.00  0.54    -0.37  0.27
## f0             106.47  0.16     0.47  1.41
## f1             315.30  0.24    -0.65  5.58
## f2             496.51 -0.64     0.93  6.34
## f3             660.90  0.37    -0.45 11.09
## f4             688.13 -0.17    -0.32 10.53
## pf               -Inf    NA       NA    NA
## neuro            3.25  0.01    -0.40  0.06
## extra            3.25 -0.51    -0.38  0.06
## openn            2.80 -0.43    -0.43  0.05
## agree            2.89 -0.45    -0.14  0.05
## consc            2.78 -0.05    -0.76  0.06
## dominance        3.00 -0.11    -0.50  0.05
## behavior         4.00  0.39    -1.03  0.09
## attitude         4.00 -0.23    -0.75  0.09
## desire           4.00  0.17    -0.52  0.08
## soir_full        3.67  0.24    -0.49  0.06
## soi_minp         0.00   NaN      NaN  0.00
## soi_maxp         0.00   NaN      NaN  0.00
## dominance_minp   0.00   NaN      NaN  0.00
## dominance_maxp   0.00   NaN      NaN  0.00
## big5_minp        0.00   NaN      NaN  0.00
## big5_maxp        0.00   NaN      NaN  0.00

###############
##################################################################
# Daten9:
# BFI also 1-7
# NOTE(review): the bounds assigned for daten9 further down are 0 and 7, not
# 1 and 7 - confirm the intended lower bound.

# Descriptive statistics for the five Big Five columns only, in this order.
big5_columns <- c("extra", "consc", "neuro", "agree", "openn")
psych::describe(daten9[big5_columns])

##       vars   n mean   sd median trimmed  mad  min  max range  skew kurtosis
## extra    1 113 3.50 1.24    3.5    3.48 1.48 0.50 6.50   6.0  0.10    -0.47
## consc    2 114 4.01 0.99    4.0    3.98 0.74 1.00 6.00   5.0  0.02     0.12
## neuro    3 114 4.02 1.39    4.0    4.05 1.48 1.00 6.50   5.5 -0.19    -0.66
## agree    4 114 4.27 1.11    4.5    4.31 1.48 1.00 6.50   5.5 -0.31    -0.17
## openn    5 114 4.82 0.93    5.0    4.85 0.99 2.67 6.67   4.0 -0.22    -0.82
##         se
## extra 0.12
## consc 0.09
## neuro 0.13
## agree 0.10
## openn 0.09

# Record the Big Five response-scale bounds for daten9 (0 to 7) as constant columns.
# NOTE(review): the section header says 1-7 while the lower bound entered here
# is 0 (the descriptives above show extra as low as 0.50) - confirm which is intended.
daten9[c("big5_minp", "big5_maxp")] <- list(0, 7)


##################################################################
# Daten10:
# BFI also 1-5

# Print descriptive statistics to inspect the observed range of every column.
psych::describe(daten10)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##           vars  n    mean     sd  median trimmed    mad     min     max  range
## voice_id     1 88 2141.50  25.55 2141.50 2141.50  32.62 2098.00 2185.00  87.00
## ID           2  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## dataset      3 88   10.00   0.00   10.00   10.00   0.00   10.00   10.00   0.00
## sex          4 88    1.00   0.00    1.00    1.00   0.00    1.00    1.00   0.00
## age          5 88   24.14   2.74   24.00   23.99   1.48   19.00   31.00  12.00
## f0           6 88  121.51  17.95  118.72  120.62  17.49   91.28  175.82  84.54
## f1           7 88  408.73  85.85  386.51  395.38  40.87  300.05  947.63 647.58
## f2           8 88 1770.63 123.72 1779.76 1770.59 118.65 1435.43 2064.01 628.58
## f3           9 88 2655.71 141.03 2647.06 2651.44 159.63 2363.84 3064.42 700.57
## f4          10 88 3551.65 158.42 3543.07 3532.19 133.58 3330.14 4266.06 935.92
## pf          11  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## neuro       12 88    2.59   0.75    2.56    2.56   0.65    1.12    4.50   3.38
## extra       13 88    3.44   0.88    3.50    3.48   0.93    1.25    5.00   3.75
## openn       14 88    3.76   0.60    3.85    3.79   0.52    1.40    4.90   3.50
## agree       15 88    3.56   0.61    3.56    3.57   0.66    1.78    4.89   3.11
## consc       16 88    3.32   0.73    3.44    3.33   0.82    1.33    4.78   3.44
## dominance   17  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## behavior    18  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## attitude    19  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## desire      20  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## soir_full   21  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
##            skew kurtosis    se
## voice_id   0.00    -1.24  2.72
## ID           NA       NA    NA
## dataset     NaN      NaN  0.00
## sex         NaN      NaN  0.00
## age        0.50    -0.08  0.29
## f0         0.54    -0.10  1.91
## f1         3.36    16.37  9.15
## f2        -0.06    -0.24 13.19
## f3         0.27    -0.40 15.03
## f4         1.78     4.78 16.89
## pf           NA       NA    NA
## neuro      0.30    -0.43  0.08
## extra     -0.32    -0.65  0.09
## openn     -0.79     1.51  0.06
## agree     -0.18     0.00  0.07
## consc     -0.26    -0.41  0.08
## dominance    NA       NA    NA
## behavior     NA       NA    NA
## attitude     NA       NA    NA
## desire       NA       NA    NA
## soir_full    NA       NA    NA

# Record the Big Five response-scale bounds for daten10 (1 to 5) as constant columns.
daten10[c("big5_minp", "big5_maxp")] <- list(1, 5)


##################################################################
# Dataset 11: SOI-R scales, scored from 1 to 9 (see the bounds set for daten11).
# Print descriptive statistics for every column to inspect the observed ranges.

psych::describe(daten11)

## Warning in psych::describe(daten11): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##           vars  n    mean     sd  median trimmed    mad     min     max  range
## voice_id     1 56 2213.50  16.31 2213.50 2213.50  20.76 2186.00 2241.00  55.00
## ID*          2 56     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## dataset      3 56   11.00   0.00   11.00   11.00   0.00   11.00   11.00   0.00
## sex          4 56    1.00   0.00    1.00    1.00   0.00    1.00    1.00   0.00
## age          5 56   19.96   1.21   20.00   19.93   1.48   18.00   23.00   5.00
## f0           6 56  105.92  11.44  102.55  105.13   8.95   88.94  133.76  44.82
## f1           7 56  410.40  33.04  418.16  412.39  28.13  338.15  483.39 145.24
## f2           8 56 1491.34  56.39 1493.32 1491.80  68.56 1381.04 1603.63 222.59
## f3           9 56 2485.94  78.16 2484.18 2482.57  71.33 2311.55 2680.83 369.28
## f4          10 56 3433.50 109.01 3433.66 3431.88 112.39 3192.24 3700.46 508.22
## pf          11  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## neuro       12  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## extra       13  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## openn       14  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## agree       15  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## consc       16  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## dominance   17  0     NaN     NA      NA     NaN     NA     Inf    -Inf   -Inf
## behavior    18 56    3.99   1.62    3.83    3.92   1.73    1.33    7.67   6.33
## attitude    19 56    7.37   1.45    7.67    7.57   1.24    2.67    9.00   6.33
## desire      20 56    6.49   1.38    6.67    6.57   1.48    3.00    9.00   6.00
## soir_full   21 56    5.95   1.11    5.89    6.00   1.15    3.00    7.78   4.78
##            skew kurtosis    se
## voice_id   0.00    -1.26  2.18
## ID*          NA       NA    NA
## dataset     NaN      NaN  0.00
## sex         NaN      NaN  0.00
## age        0.19    -0.68  0.16
## f0         0.65    -0.30  1.53
## f1        -0.53    -0.27  4.41
## f2        -0.08    -0.97  7.53
## f3         0.34    -0.08 10.44
## f4         0.13    -0.03 14.57
## pf           NA       NA    NA
## neuro        NA       NA    NA
## extra        NA       NA    NA
## openn        NA       NA    NA
## agree        NA       NA    NA
## consc        NA       NA    NA
## dominance    NA       NA    NA
## behavior   0.31    -0.82  0.22
## attitude  -1.23     1.27  0.19
## desire    -0.55    -0.05  0.18
## soir_full -0.33    -0.18  0.15

# Show the column names of dataset 11.
names(daten11)

##  [1] "voice_id"  "ID"        "dataset"   "sex"       "age"       "f0"       
##  [7] "f1"        "f2"        "f3"        "f4"        "pf"        "neuro"    
## [13] "extra"     "openn"     "agree"     "consc"     "dominance" "behavior" 
## [19] "attitude"  "desire"    "soir_full"

# Record the SOI-R scale bounds (1 to 9) for dataset 11; these columns are
# read later by the range checks and the POMP transformation.
daten11[["soi_minp"]] <- 1
daten11[["soi_maxp"]] <- 9

Test that all values are between min/max

##############################################################################################
# Combine all datasets again so that they can be z-transformed together.
# ID is coerced to character in every dataset before binding
# (presumably because the ID column type differs between files -- confirm).

data_complete_all <- list(
  daten1, daten2, daten3, daten4, daten5, daten6,
  daten7, daten8, daten9, daten10, daten11
) %>%
  map(function(d) mutate(d, ID = as.character(ID))) %>%
  bind_rows() %>%
  as_tibble()

# Flag values that cannot be confirmed to lie inside [min, max].
#
# vec: vector of scores.
# min, max: lower and upper bounds, recycled against `vec`.
# Returns a logical vector: TRUE where a score is below `min` or above `max`,
# FALSE where it is inside the bounds, and NA where the score itself is
# missing. A present score whose bounds are missing is also flagged TRUE:
# it cannot be validated, and would otherwise be silently dropped by
# `filter()` and turned into NA by the POMP transformation.
outside_range <- function(vec, min, max) {
  out <- vec < min | vec > max
  # `out` is NA for a present score only when a bound is missing.
  out[is.na(out) & !is.na(vec)] <- TRUE
  out
}
# TRUE when `df` has no rows, i.e. no record matched the preceding filter.
none <- function(df) {
  n_rows <- nrow(df)
  n_rows == 0
}
# No extraversion score may fall outside its dataset's Big Five bounds.
filter(data_complete_all, outside_range(extra, big5_minp, big5_maxp)) %>%
  none() %>%
  testthat::expect_true()

## Error in get(genname, envir = envir) : object 'testthat_print' not found

# Remaining Big Five scales must lie within the Big Five bounds.
filter(data_complete_all, outside_range(agree, big5_minp, big5_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(neuro, big5_minp, big5_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(openn, big5_minp, big5_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(consc, big5_minp, big5_maxp)) %>%
  none() %>%
  testthat::expect_true()

# SOI-R facets and the full score must lie within the SOI bounds.
filter(data_complete_all, outside_range(behavior, soi_minp, soi_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(attitude, soi_minp, soi_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(desire, soi_minp, soi_maxp)) %>%
  none() %>%
  testthat::expect_true()
filter(data_complete_all, outside_range(soir_full, soi_minp, soi_maxp)) %>%
  none() %>%
  testthat::expect_true()

# Dominance must lie within the dominance bounds.
filter(
  data_complete_all,
  outside_range(dominance, dominance_minp, dominance_maxp)
) %>%
  none() %>%
  testthat::expect_true()

Use min/max to compute POMP

# Rescale raw scores relative to their scale bounds.
#
# raw: vector of raw scores.
# min, max: lowest and highest possible score on the scale.
# Returns (raw - min) / (max - min), i.e. 0 at `min` and 1 at `max`.
pomp <- function(raw, min, max) {
  offset <- raw - min
  span <- max - min
  offset / span
}

# Convert every questionnaire score to POMP using the bounds stored in the
# same row: Big Five scales use the big5 bounds, SOI-R scales the soi bounds,
# and dominance its own bounds. `across()` replaces the superseded
# `mutate_at()`.
data_complete_all <- data_complete_all %>%
  mutate(
    across(
      c(extra, agree, neuro, openn, consc),
      ~ pomp(.x, big5_minp, big5_maxp)
    ),
    across(
      c(behavior, attitude, desire, soir_full),
      ~ pomp(.x, soi_minp, soi_maxp)
    ),
    across(dominance, ~ pomp(.x, dominance_minp, dominance_maxp))
  )

# Histogram of each POMP-scored scale, one facet per variable.
# `pivot_longer()` replaces the superseded `gather()`.
data_complete_all %>%
  select(
    extra, agree, neuro, openn, consc,
    behavior, attitude, desire, soir_full, dominance
  ) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap(~ variable)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 6229 rows containing non-finite values (stat_bin).

Calculate pf

# Build pf: the mean of the four formants (f1-f4), each z-standardised
# across the pooled data first.
data_complete_all$pf <- with(
  data_complete_all,
  (scale(f1)[, 1] + scale(f2)[, 1] + scale(f3)[, 1] + scale(f4)[, 1]) / 4
)

Add labels

library(labelled)

# Attach human-readable variable labels in one call.
var_label(data_complete_all) <- list(
  dominance = "Dominance",
  neuro = "Neuroticism",
  agree = "Agreeableness",
  extra = "Extraversion",
  openn = "Openness",
  consc = "Conscientiousness",
  soir_full = "Unrestricted sociosexuality",
  f0 = "Voice pitch",
  pf = "Formants",
  age = "Age"
)

# Keep the original numeric sex code in sex_c, then turn sex into a
# male/female factor with Helmert contrasts.
data_complete_all$sex_c <- data_complete_all$sex
data_complete_all$sex <- factor(
  if_else(data_complete_all$sex == 1, "male", "female")
)
contrasts(data_complete_all$sex) <- contr.helmert(2)

Save pre-standardisation

# Keep a snapshot of the POMP-scored data before z-scoring and write it out.
data_complete_all_unstd <- data_complete_all
rio::export(
  x = data_complete_all_unstd,
  file = "data_complete_2021_unstd_pomp.rds"
)

Compute z-scores

# z-standardise all questionnaire and voice measures across the pooled data.
# `scale(.)[, 1]` returns a plain numeric vector instead of a one-column
# matrix. `across()` replaces the superseded `mutate_at()`.
data_complete_all <- data_complete_all %>%
  mutate(
    across(
      c(
        extra, agree, neuro, openn, consc,
        behavior, attitude, desire, soir_full, dominance,
        f0, f1, f2, f3, f4, pf
      ),
      ~ scale(.x)[, 1]
    )
  )

# Histogram of each z-scored variable, one facet per variable.
# `pivot_longer()` replaces the superseded `gather()`.
data_complete_all %>%
  select(
    extra, agree, neuro, openn, consc,
    behavior, attitude, desire, soir_full, dominance,
    f0, f1, f2, f3, f4, pf
  ) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap(~ variable)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 6256 rows containing non-finite values (stat_bin).

Save data

# Working copy of the pooled z-scored data.
rio::export(x = data_complete_all, file = "data_complete_2021_zscored.rds")

# Release copies of the same data in several file formats.
rio::export(
  x = data_complete_all,
  file = "datarelease/data_personality_voices_zscored.rds"
)
rio::export(
  x = data_complete_all,
  file = "datarelease/data_personality_voices_zscored.sav"
)
rio::export(
  x = data_complete_all,
  file = "datarelease/data_personality_voices_zscored.xlsx"
)
rio::export(
  x = data_complete_all,
  file = "datarelease/data_personality_voices_zscored.csv"
)

Standardize within dataset, re-add sex differences

# Start from the pre-standardisation data and z-standardise only the
# questionnaire scales across the pooled data; the voice measures are handled
# separately below. `across()` replaces the superseded `mutate_at()`.
vcs <- data_complete_all_unstd %>%
  mutate(
    across(
      c(
        extra, agree, neuro, openn, consc,
        behavior, attitude, desire, soir_full, dominance
      ),
      ~ scale(.x)[, 1]
    )
  )

# vcs10c <- rio::import("daten10_formants.rds")
# vcs10 <- vcs %>% filter(dataset == 10)
# vcs10 <- vcs10 %>% select(-(f0:f4)) %>% left_join(vcs10c)
# vcs <- vcs %>% filter(dataset != 10) %>% 
#   bind_rows(vcs10)

we first standardise within datasets

# z-standardise pitch (f0) and the formants (f1-f4) separately within each
# dataset. The result stays grouped by dataset.
svcs <- vcs %>%
  group_by(dataset) %>%
  mutate(across(f0:f4, ~ scale(.x)[, 1]))

we use the HQ datasets with both genders to estimate mean diff of m/f and restricted variance

# Pooled sex difference (t-test estimate) per voice measure in the datasets
# that contain both sexes.
svcs %>%
  ungroup() %>%
  filter(dataset %in% c(2, 3, 5, 9)) %>%
  summarise_at(vars(f0:f4), ~ broom::tidy(t.test(. ~ sex))$estimate)

## # A tibble: 1 x 5
##      f0    f1    f2    f3    f4
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1.86  1.21  1.55  1.76  1.77

# The same sex difference, estimated separately within each mixed-sex dataset.
svcs %>%
  group_by(dataset) %>%
  filter(dataset %in% c(2, 3, 5, 9)) %>%
  summarise_at(vars(f0:f4), ~ broom::tidy(t.test(. ~ sex))$estimate)

## # A tibble: 4 x 6
##   dataset    f0    f1    f2    f3    f4
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1       2  1.88  1.26  1.64  1.81  1.80
## 2       3  1.88  1.10  1.36  1.80  1.84
## 3       5  1.98  1.33  1.84  1.88  1.86
## 4       9  2.03  1.39  1.65  1.73  1.78

# Mean and SD of each voice measure by dataset and sex (mixed-sex datasets).
# Uses TRUE rather than the reassignable shorthand T.
svcs %>%
  filter(dataset %in% c(2, 3, 5, 9)) %>%
  group_by(dataset, sex) %>%
  summarise_at(
    vars(f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )

## # A tibble: 8 x 12
## # Groups:   dataset [4]
##   dataset sex   f0_mean f1_mean f2_mean f3_mean f4_mean f0_sd f1_sd f2_sd f3_sd
##     <dbl> <fct>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1       2 fema…   0.934   0.627   0.814   0.899   0.895 0.398 0.943 0.598 0.514
## 2       2 male   -0.949  -0.637  -0.826  -0.913  -0.909 0.251 0.556 0.544 0.301
## 3       3 fema…   0.931   0.542   0.671   0.892   0.909 0.359 1.03  0.846 0.432
## 4       3 male   -0.951  -0.554  -0.686  -0.911  -0.928 0.309 0.575 0.600 0.428
## 5       5 fema…   0.651   0.435   0.605   0.615   0.611 0.369 0.843 0.527 0.519
## 6       5 male   -1.33   -0.892  -1.24   -1.26   -1.25  0.332 0.640 0.433 0.352
## 7       9 fema…   0.590   0.428   0.508   0.534   0.548 0.404 0.805 0.658 0.569
## 8       9 male   -1.44   -0.963  -1.14   -1.20   -1.23  0.324 0.676 0.622 0.658
## # … with 1 more variable: f4_sd <dbl>

# The sex difference varies little across datasets, so we pool the mixed-sex
# datasets and keep one mean and SD per sex for each voice measure.
# Uses TRUE rather than the reassignable shorthand T.

sex_diffs <- svcs %>%
  filter(dataset %in% c(2, 3, 5, 9)) %>%
  group_by(sex) %>%
  summarise_at(
    vars(f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )
sex_diffs

## # A tibble: 2 x 11
##   sex    f0_mean f1_mean f2_mean f3_mean f4_mean f0_sd f1_sd f2_sd f3_sd f4_sd
##   <fct>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female   0.813   0.529   0.679   0.773   0.778 0.411 0.926 0.675 0.527 0.531
## 2 male    -1.05   -0.676  -0.868  -0.988  -0.994 0.340 0.601 0.588 0.418 0.384

we fudge the data in the single-gender datasets by rescaling them to the pooled sex-specific distribution (x * sd_gender + gender_mean)

# Mean and SD by dataset and sex in the single-sex datasets, before the
# sex differences are re-added. Uses TRUE rather than the shorthand T.
svcs %>%
  filter(!dataset %in% c(2, 3, 5, 9)) %>%
  group_by(dataset, sex) %>%
  summarise_at(
    vars(f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )

## # A tibble: 7 x 12
## # Groups:   dataset [7]
##   dataset sex     f0_mean   f1_mean   f2_mean   f3_mean   f4_mean f0_sd f1_sd
##     <dbl> <fct>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl> <dbl> <dbl>
## 1       1 fema… -2.10e-16 -2.35e-16  5.83e-16  1.43e-15 -2.80e-16  1     1.00
## 2       4 fema… -7.82e-16 -3.22e-16 -1.08e-15  1.68e-15 -1.42e-15  1     1   
## 3       6 male   3.05e-16  8.55e-16  7.23e-16  1.36e-15 -9.86e-16  1.00  1   
## 4       7 male  -2.80e-16  2.43e-16 -3.06e-16 -6.45e-16 -8.05e-16  1.00  1.00
## 5       8 fema… -5.26e-16 -3.84e-16 -7.00e-16  7.74e-16  1.79e-16  1.00  1   
## 6      10 male   2.51e-16 -2.87e-16  4.40e-16  3.95e-16 -1.41e-15  1     1   
## 7      11 male   4.69e-16 -2.23e-16 -7.39e-17 -3.06e-16  2.02e-15  1.00  1   
## # … with 3 more variables: f2_sd <dbl>, f3_sd <dbl>, f4_sd <dbl>

# Attach the pooled per-sex mean and SD to every row. Rows from the mixed-sex
# datasets (2, 3, 5, 9) keep their within-dataset z-scores; rows from all
# other datasets are rescaled to x * sd + mean of their sex.
# The joined *_mean and *_sd helper columns remain in svcs.
svcs <- svcs %>%
  left_join(sex_diffs, by = "sex") %>%
  mutate(
    f0 = if_else(dataset %in% c(2, 3, 5, 9), f0, f0 * f0_sd + f0_mean),
    f1 = if_else(dataset %in% c(2, 3, 5, 9), f1, f1 * f1_sd + f1_mean),
    f2 = if_else(dataset %in% c(2, 3, 5, 9), f2, f2 * f2_sd + f2_mean),
    f3 = if_else(dataset %in% c(2, 3, 5, 9), f3, f3 * f3_sd + f3_mean),
    f4 = if_else(dataset %in% c(2, 3, 5, 9), f4, f4 * f4_sd + f4_mean)
  )

# Mean and SD by dataset and sex in the single-sex datasets, after the
# sex differences were re-added. Uses TRUE rather than the shorthand T.
svcs %>%
  filter(!dataset %in% c(2, 3, 5, 9)) %>%
  group_by(dataset, sex) %>%
  summarise_at(
    vars(f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )

## # A tibble: 7 x 12
## # Groups:   dataset [7]
##   dataset sex   f0_mean f1_mean f2_mean f3_mean f4_mean f0_sd f1_sd f2_sd f3_sd
##     <dbl> <fct>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1       1 fema…   0.813   0.529   0.679   0.773   0.778 0.411 0.926 0.675 0.527
## 2       4 fema…   0.813   0.529   0.679   0.773   0.778 0.411 0.926 0.675 0.527
## 3       6 male   -1.05   -0.676  -0.868  -0.988  -0.994 0.340 0.601 0.588 0.418
## 4       7 male   -1.05   -0.676  -0.868  -0.988  -0.994 0.340 0.601 0.588 0.418
## 5       8 fema…   0.813   0.529   0.679   0.773   0.778 0.411 0.926 0.675 0.527
## 6      10 male   -1.05   -0.676  -0.868  -0.988  -0.994 0.340 0.601 0.588 0.418
## 7      11 male   -1.05   -0.676  -0.868  -0.988  -0.994 0.340 0.601 0.588 0.418
## # … with 1 more variable: f4_sd <dbl>

# Pooled sex difference (t-test estimate) per voice measure across the
# single-sex datasets.
svcs %>%
  ungroup() %>%
  filter(!dataset %in% c(2, 3, 5, 9)) %>%
  summarise_at(vars(f0:f4), ~ broom::tidy(t.test(. ~ sex))$estimate)

## # A tibble: 1 x 5
##      f0    f1    f2    f3    f4
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1.86  1.21  1.55  1.76  1.77

# Pooled sex difference in the mixed-sex datasets, for comparison.
svcs %>%
  ungroup() %>%
  filter(dataset %in% c(2, 3, 5, 9)) %>%
  summarise_at(vars(f0:f4), ~ broom::tidy(t.test(. ~ sex))$estimate)

## # A tibble: 1 x 5
##      f0    f1    f2    f3    f4
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1.86  1.21  1.55  1.76  1.77

then we average across f1 to f4 and standardise pf again

# Overall mean and SD of each voice measure across all rows.
# Uses TRUE rather than the reassignable shorthand T.
svcs %>%
  ungroup() %>%
  summarise_at(
    vars(f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )

## # A tibble: 1 x 10
##   f0_mean f1_mean f2_mean f3_mean f4_mean f0_sd f1_sd f2_sd f3_sd f4_sd
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  0.0466  0.0313  0.0402  0.0457  0.0460 0.992  1.00 0.995 0.993 0.994

# Drop the grouping, then rebuild pf as the z-standardised sum of f1-f4.
# After scaling, the sum is equivalent to the mean.
svcs <- mutate(
  ungroup(svcs),
  pf = scale(f1 + f2 + f3 + f4)[, 1]
)

# Mean and SD of pf and the voice measures by dataset and sex in the
# single-sex datasets. Uses TRUE rather than the shorthand T.
svcs %>%
  filter(!dataset %in% c(2, 3, 5, 9)) %>%
  group_by(dataset, sex) %>%
  summarise_at(
    vars(pf, f0:f4),
    list(mean = ~ mean(., na.rm = TRUE), sd = ~ sd(., na.rm = TRUE))
  )

## # A tibble: 7 x 14
## # Groups:   dataset [7]
##   dataset sex   pf_mean f0_mean f1_mean f2_mean f3_mean f4_mean pf_sd f0_sd
##     <dbl> <fct>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl> <dbl>
## 1       1 fema…   0.746   0.813   0.529   0.679   0.773   0.778 0.462 0.411
## 2       4 fema…   0.746   0.813   0.529   0.679   0.773   0.778 0.494 0.411
## 3       6 male   -1.06   -1.05   -0.676  -0.868  -0.988  -0.994 0.318 0.340
## 4       7 male   -1.06   -1.05   -0.676  -0.868  -0.988  -0.994 0.322 0.340
## 5       8 fema…   0.746   0.813   0.529   0.679   0.773   0.778 0.569 0.411
## 6      10 male   -1.06   -1.05   -0.676  -0.868  -0.988  -0.994 0.488 0.340
## 7      11 male   -1.06   -1.05   -0.676  -0.868  -0.988  -0.994 0.303 0.340
## # … with 4 more variables: f1_sd <dbl>, f2_sd <dbl>, f3_sd <dbl>, f4_sd <dbl>

let’s check intercorrelations in each dataset after this transformation

# Per-dataset correlations among f0-f4 in vcs (voice measures not yet
# standardised within dataset). Within each dataset every measure is first
# residualised on sex_c; na.exclude keeps the residual vector aligned with
# the rows. The correlation matrix is reduced to one triangle and reshaped to
# long format, then each dataset number is drawn at its correlation r, with
# one facet per variable pair.
vcs %>% select(dataset, f0:f4, sex_c) %>%
  group_by(dataset) %>% 
  mutate_at(vars(f0:f4),  ~resid(lm(. ~ sex_c, na.action = na.exclude))) %>% 
  summarise(corrr::correlate(across(f0:f4)) %>% corrr::shave() %>% corrr::stretch()) %>% 
  unite(vars, x, y) %>% 
  drop_na() %>% 
  ggplot(aes(dataset, r)) +
  geom_text(aes(label = dataset)) + 
  coord_flip() +
  facet_wrap(~ vars)

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## `summarise()` regrouping output by 'dataset' (override with `.groups` argument)

# The same per-dataset correlation plot, now for svcs (voice measures
# standardised within dataset with sex differences re-added). Each measure is
# residualised on sex_c within dataset before the pairwise correlations among
# f0-f4 are computed; each dataset number is drawn at its correlation r, with
# one facet per variable pair.
svcs %>% select(dataset, f0:f4, sex_c) %>% 
  group_by(dataset) %>% 
  mutate_at(vars(f0:f4),  ~resid(lm(. ~ sex_c, na.action = na.exclude))) %>% 
  summarise(corrr::correlate(across(f0:f4)) %>% corrr::shave() %>% corrr::stretch()) %>% 
  unite(vars, x, y) %>% 
  drop_na() %>% 
  ggplot(aes(dataset, r)) +
  geom_text(aes(label = dataset)) + 
  coord_flip() +
  facet_wrap(~ vars)

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## 
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## `summarise()` regrouping output by 'dataset' (override with `.groups` argument)

# Pooled (ungrouped) correlations among f0-f4 after residualising on sex_c,
# computed once for svcs ("stdised") and once for vcs ("unstdised"). The two
# long-format results are stacked with a "process" id column, and each
# correlation is printed to two decimals, with one facet per variable pair.
bind_rows(stdised = svcs %>% select(dataset, f0:f4, sex_c) %>% 
  ungroup() %>% 
  mutate_at(vars(f0:f4),  ~resid(lm(. ~ sex_c, na.action = na.exclude))) %>% 
  summarise(corrr::correlate(across(f0:f4)) %>% corrr::shave() %>% corrr::stretch()) %>% 
  unite(vars, x, y) %>% 
  drop_na(),
  
  unstdised = vcs %>% select(dataset, f0:f4, sex_c) %>%
  ungroup() %>% 
  mutate_at(vars(f0:f4),  ~resid(lm(. ~ sex_c, na.action = na.exclude))) %>% 
  summarise(corrr::correlate(across(f0:f4)) %>% corrr::shave() %>% corrr::stretch()) %>% 
  unite(vars, x, y) %>% 
  drop_na(), .id = "process") %>% 
  ggplot(aes(process, r)) +
  geom_text(aes(label = sprintf("%.2f", r))) + 
  coord_flip() +
  facet_wrap(~ vars)

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

let’s check how this looks compared to the simple approach

# Pair each voice's within-dataset scores ("_within") with its pooled
# z-scores ("_std") by voice_id, then pass the paired measures to
# rcamisc::mtmm().
svcs <- ungroup(svcs)
vcs_2 <- inner_join(
  svcs %>% select(voice_id, f0:f4, pf),
  data_complete_all %>% select(voice_id, f0:f4, pf),
  by = "voice_id",
  suffix = c("_within", "_std")
)
rcamisc::mtmm(vcs_2 %>% ungroup() %>% select(-voice_id))

Distributions by dataset

# Rebuild the sex factor from the numeric code and set Helmert contrasts.
svcs <- svcs %>%
  mutate(sex = factor(if_else(sex_c == 1, "male", "female")))
contrasts(svcs$sex) <- contr.helmert(2)

# Overlaid f0 histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = f0, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 7 rows containing non-finite values (stat_bin).

# Overlaid f1 histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = f1, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Overlaid f2 histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = f2, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Overlaid f3 histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = f3, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Overlaid f4 histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = f4, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Overlaid pf histograms per dataset, one panel per sex.
svcs %>%
  ggplot(aes(x = pf, fill = factor(dataset))) +
  geom_histogram(alpha = 0.4, position = "identity") +
  scale_fill_viridis_d() +
  facet_wrap(~ sex, scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 4 rows containing non-finite values (stat_bin).

Save data

# Write the within-dataset standardised data to disk.
rio::export(x = svcs, file = "data_complete_2021_within_zscored.rds")

Outlier removal

# Outlier thresholds for pf and f0: median +/- 2.5 * MAD, computed over the
# pooled z-scored data and stored as constant columns. Computing them per sex
# (group_by(sex)) is deliberately left disabled.
# Uses TRUE rather than the reassignable shorthand T.
svcs <- data_complete_all %>% # group_by(sex) %>%
  mutate(
    pf_threshold_hi = median(pf, na.rm = TRUE) + 2.5 * mad(pf, na.rm = TRUE),
    pf_threshold_lo = median(pf, na.rm = TRUE) - 2.5 * mad(pf, na.rm = TRUE),
    f0_threshold_hi = median(f0, na.rm = TRUE) + 2.5 * mad(f0, na.rm = TRUE),
    f0_threshold_lo = median(f0, na.rm = TRUE) - 2.5 * mad(f0, na.rm = TRUE)
  )

# Rows whose pf lies strictly beyond either pf threshold, and their count.
pf_outliers <- filter(svcs, pf > pf_threshold_hi | pf < pf_threshold_lo)
nrow(pf_outliers)

## [1] 3

# Cross-tabulate the pf outliers by dataset and sex.
xtabs(~ dataset + sex, data = pf_outliers)

##        sex
## dataset female male
##      2       2    0
##      10      0    1

# Rows whose f0 lies strictly beyond either f0 threshold, and their count.
f0_outliers <- filter(svcs, f0 > f0_threshold_hi | f0 < f0_threshold_lo)
nrow(f0_outliers)

## [1] 0

# Cross-tabulate the f0 outliers by dataset and sex.
xtabs(~ dataset + sex, data = f0_outliers)

## < table of extent 0 x 2 >

# Keep the rows that are not outliers on pf or f0. Outliers are defined above
# as lying strictly beyond a threshold, so a value exactly on a threshold is
# kept here (inclusive comparisons). Rows with a missing pf or f0 are dropped
# as well, because their comparisons evaluate to NA.
vcs <- svcs %>%
  filter(
    pf >= pf_threshold_lo,
    pf <= pf_threshold_hi,
    f0 >= f0_threshold_lo,
    f0 <= f0_threshold_hi
  )

# Descriptive statistics of sex_c, f0-f4 and pf per dataset after outlier
# removal.
psych::describeBy(
  select(ungroup(vcs), sex_c, f0:f4, pf),
  group = vcs$dataset
)

## 
##  Descriptive statistics by group 
## group: 1
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 339 -1.00 0.00  -1.00   -1.00 0.00 -1.00 -1.00  0.00   NaN      NaN
## f0       2 339  0.77 0.44   0.74    0.75 0.40 -0.53  2.60  3.13  0.49     0.81
## f1       3 339  0.19 0.77   0.05    0.14 0.67 -1.86  3.60  5.46  0.82     1.24
## f2       4 339  0.31 1.15   0.25    0.26 1.17 -2.44  5.20  7.64  0.49     0.61
## f3       5 339  0.82 0.57   0.78    0.81 0.54 -0.68  2.29  2.97  0.16    -0.17
## f4       6 339  0.61 0.58   0.67    0.64 0.56 -1.01  2.29  3.30 -0.34    -0.20
## pf       7 339  0.60 0.60   0.56    0.58 0.58 -1.02  2.84  3.86  0.39     0.19
##         se
## sex_c 0.00
## f0    0.02
## f1    0.04
## f2    0.06
## f3    0.03
## f4    0.03
## pf    0.03
## ------------------------------------------------------------ 
## group: 2
##       vars   n  mean   sd median trimmed  mad   min  max range skew kurtosis
## sex_c    1 381  0.00 1.00  -1.00    0.00 0.00 -1.00 1.00  2.00 0.01    -2.01
## f0       2 381 -0.28 1.03  -0.36   -0.31 1.43 -1.76 1.68  3.44 0.12    -1.63
## f1       3 381  0.42 1.08   0.32    0.33 1.10 -1.58 4.46  6.05 0.79     0.62
## f2       4 381  0.28 0.86   0.33    0.27 1.09 -1.71 2.47  4.18 0.10    -0.92
## f3       5 381  0.24 1.06  -0.05    0.20 1.31 -1.51 2.78  4.29 0.22    -1.30
## f4       6 381  0.13 1.11  -0.16    0.05 1.31 -1.62 2.69  4.31 0.36    -1.21
## pf       7 381  0.33 1.16   0.17    0.27 1.50 -1.42 3.05  4.47 0.28    -1.33
##         se
## sex_c 0.05
## f0    0.05
## f1    0.06
## f2    0.04
## f3    0.05
## f4    0.06
## pf    0.06
## ------------------------------------------------------------ 
## group: 3
##       vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## sex_c    1 285 -0.01 1.00  -1.00   -0.01 0.00 -1.00 1.00  2.00  0.02    -2.01
## f0       2 285 -0.12 0.99  -0.03   -0.13 1.39 -1.73 1.84  3.57  0.03    -1.59
## f1       3 285  0.76 0.65   0.70    0.71 0.58 -0.65 3.96  4.61  1.09     2.45
## f2       4 285 -0.25 0.89  -0.38   -0.29 1.03 -2.09 2.41  4.50  0.41    -0.57
## f3       5 285 -0.13 0.97  -0.14   -0.13 1.32 -2.27 1.63  3.89 -0.03    -1.38
## f4       6 285  0.00 0.92   0.02    0.01 1.23 -2.43 1.65  4.08 -0.04    -1.39
## pf       7 285  0.12 0.93  -0.04    0.10 1.21 -1.76 2.38  4.14  0.15    -1.27
##         se
## sex_c 0.06
## f0    0.06
## f1    0.04
## f2    0.05
## f3    0.06
## f4    0.05
## pf    0.06
## ------------------------------------------------------------ 
## group: 4
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 265 -1.00 0.00  -1.00   -1.00 0.00 -1.00 -1.00  0.00   NaN      NaN
## f0       2 265  0.79 0.32   0.77    0.78 0.29 -0.33  1.99  2.32  0.18     0.88
## f1       3 265 -0.92 0.44  -0.91   -0.93 0.42 -2.07  0.30  2.38  0.05     0.07
## f2       4 265  0.36 0.55   0.30    0.35 0.59 -1.08  2.31  3.39  0.15    -0.09
## f3       5 265  0.40 0.47   0.43    0.41 0.52 -0.77  1.53  2.31 -0.11    -0.44
## f4       6 265  0.74 0.42   0.76    0.75 0.40 -0.40  1.96  2.36 -0.09    -0.03
## pf       7 265  0.18 0.39   0.15    0.18 0.40 -0.84  1.27  2.11  0.13    -0.41
##         se
## sex_c 0.00
## f0    0.02
## f1    0.03
## f2    0.03
## f3    0.03
## f4    0.03
## pf    0.02
## ------------------------------------------------------------ 
## group: 5
##       vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## sex_c    1 186 -0.34 0.94  -1.00   -0.43 0.00 -1.00 1.00  2.00  0.73    -1.48
## f0       2 186  0.02 0.94   0.41    0.07 0.74 -1.77 1.46  3.23 -0.54    -1.23
## f1       3 186  0.20 0.62   0.17    0.18 0.61 -1.40 1.83  3.23  0.30    -0.15
## f2       4 186  0.11 0.88   0.43    0.14 0.83 -1.82 2.08  3.90 -0.39    -1.00
## f3       5 186 -0.01 0.91   0.22    0.01 1.00 -2.00 1.56  3.56 -0.34    -1.16
## f4       6 186 -0.23 0.91   0.01   -0.20 0.94 -2.24 1.40  3.64 -0.40    -1.06
## pf       7 186  0.02 0.93   0.33    0.05 0.88 -1.85 1.71  3.56 -0.40    -1.28
##         se
## sex_c 0.07
## f0    0.07
## f1    0.05
## f2    0.06
## f3    0.07
## f4    0.07
## pf    0.07
## ------------------------------------------------------------ 
## group: 6
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 184  1.00 0.00   1.00    1.00 0.00  1.00  1.00  0.00   NaN      NaN
## f0       2 184 -1.02 0.31  -1.08   -1.04 0.28 -1.69  0.11  1.80  0.67     0.54
## f1       3 184 -1.29 0.38  -1.29   -1.28 0.39 -2.21 -0.23  1.98 -0.16    -0.18
## f2       4 184 -0.95 0.45  -0.96   -0.96 0.44 -2.14  0.32  2.46  0.12    -0.16
## f3       5 184 -1.16 0.44  -1.16   -1.14 0.45 -2.30 -0.08  2.21 -0.19    -0.24
## f4       6 184 -0.95 0.47  -0.93   -0.95 0.44 -2.53  0.62  3.15  0.11     0.65
## pf       7 184 -1.36 0.31  -1.36   -1.35 0.32 -2.10 -0.42  1.67  0.02    -0.12
##         se
## sex_c 0.00
## f0    0.02
## f1    0.03
## f2    0.03
## f3    0.03
## f4    0.03
## pf    0.02
## ------------------------------------------------------------ 
## group: 7
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 164  1.00 0.00   1.00    1.00 0.00  1.00  1.00  0.00   NaN      NaN
## f0       2 164 -1.06 0.30  -1.09   -1.08 0.28 -1.68 -0.12  1.56  0.50     0.30
## f1       3 164  0.34 0.48   0.32    0.33 0.45 -0.90  1.76  2.66  0.01     0.21
## f2       4 164 -1.42 0.50  -1.43   -1.43 0.47 -3.03  0.14  3.17  0.23     0.50
## f3       5 164 -1.26 0.42  -1.23   -1.26 0.37 -2.30  0.22  2.52  0.16     0.50
## f4       6 164 -1.22 0.49  -1.18   -1.22 0.37 -2.71  0.32  3.03 -0.05     0.69
## pf       7 164 -1.11 0.34  -1.13   -1.11 0.37 -2.01 -0.17  1.84  0.08    -0.08
##         se
## sex_c 0.00
## f0    0.02
## f1    0.04
## f2    0.04
## f3    0.03
## f4    0.04
## pf    0.03
## ------------------------------------------------------------ 
## group: 8
##       vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 157 -1.00 0.00  -1.00   -1.00 0.00 -1.00 -1.00  0.00   NaN      NaN
## f0       2 157  0.78 0.35   0.80    0.78 0.34 -0.28  1.83  2.11  0.16     0.47
## f1       3 157 -0.13 0.96  -0.25   -0.16 1.00 -2.35  1.98  4.33  0.24    -0.65
## f2       4 157  0.47 0.49   0.52    0.50 0.45 -1.46  1.57  3.04 -0.64     0.93
## f3       5 157  0.52 0.62   0.45    0.49 0.63 -0.59  2.33  2.93  0.37    -0.45
## f4       6 157  0.67 0.43   0.66    0.67 0.44 -0.57  1.66  2.23 -0.17    -0.32
## pf       7 157  0.48 0.58   0.48    0.48 0.59 -1.02  1.79  2.82 -0.06    -0.41
##         se
## sex_c 0.00
## f0    0.03
## f1    0.08
## f2    0.04
## f3    0.05
## f4    0.03
## pf    0.05
## ------------------------------------------------------------ 
## group: 9
##       vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## sex_c    1 127 -0.42 0.91  -1.00   -0.51 0.00 -1.00 1.00  2.00  0.91    -1.19
## f0       2 127  0.54 1.02   0.91    0.60 0.65 -1.66 2.33  3.99 -0.63    -0.95
## f1       3 127  0.42 0.75   0.46    0.41 0.78 -1.06 2.76  3.82  0.17    -0.23
## f2       4 127  0.35 0.91   0.50    0.38 0.93 -2.12 2.35  4.47 -0.31    -0.44
## f3       5 127 -0.10 0.96   0.07   -0.06 0.94 -2.28 1.79  4.07 -0.46    -0.72
## f4       6 127 -0.03 1.09   0.24    0.03 1.12 -2.78 1.73  4.51 -0.51    -0.72
## pf       7 127  0.20 1.01   0.50    0.25 0.87 -1.76 2.25  4.01 -0.53    -0.95
##         se
## sex_c 0.08
## f0    0.09
## f1    0.07
## f2    0.08
## f3    0.09
## f4    0.10
## pf    0.09
## ------------------------------------------------------------ 
## group: 10
##       vars  n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## sex_c    1 87  1.00 0.00   1.00    1.00 0.00  1.00 1.00  0.00   NaN      NaN
## f0       2 87 -1.00 0.36  -1.06   -1.02 0.34 -1.61 0.07  1.68  0.53    -0.09
## f1       3 87 -0.74 0.87  -0.97   -0.86 0.53 -2.15 2.55  4.70  1.66     3.41
## f2       4 87  0.85 0.74   0.92    0.86 0.72 -1.18 2.45  3.63 -0.15    -0.31
## f3       5 87 -0.34 0.60  -0.37   -0.35 0.69 -1.61 1.04  2.66  0.09    -0.81
## f4       6 87 -0.85 0.45  -0.85   -0.89 0.43 -1.54 0.86  2.40  1.27     2.51
## pf       7 87 -0.34 0.69  -0.43   -0.39 0.62 -1.56 2.14  3.70  0.95     1.59
##         se
## sex_c 0.00
## f0    0.04
## f1    0.09
## f2    0.08
## f3    0.06
## f4    0.05
## pf    0.07
## ------------------------------------------------------------ 
## group: 11
##       vars  n  mean   sd median trimmed  mad   min   max range  skew kurtosis
## sex_c    1 56  1.00 0.00   1.00    1.00 0.00  1.00  1.00  0.00   NaN      NaN
## f0       2 56 -1.32 0.23  -1.38   -1.33 0.18 -1.65 -0.76  0.89  0.65    -0.30
## f1       3 56 -0.63 0.45  -0.53   -0.61 0.39 -1.62  0.37  1.99 -0.53    -0.27
## f2       4 56 -0.84 0.34  -0.83   -0.84 0.42 -1.51 -0.15  1.36 -0.08    -0.97
## f3       5 56 -1.07 0.35  -1.08   -1.09 0.32 -1.85 -0.21  1.63  0.34    -0.08
## f4       6 56 -1.21 0.35  -1.21   -1.21 0.36 -1.99 -0.34  1.65  0.13    -0.03
## pf       7 56 -1.17 0.24  -1.16   -1.17 0.18 -1.71 -0.58  1.12  0.17     0.21
##         se
## sex_c 0.00
## f0    0.03
## f1    0.06
## f2    0.05
## f3    0.05
## f4    0.05
## pf    0.03

Save data

# Write the outlier-free z-scored data to disk.
rio::export(x = vcs, file = "data_complete_2021_zscored_no_outliers.rds")

Aggregate data

Ruben Arslan

4/9/2021

Load individual datasets

Export raw

Compute POMP (percentage of maximum possible)

Add min/max for each scale

Test that all values are between min/max

Use min/max to compute POMP

Calculate pf

Add labels

Save pre-standardisation

Compute z-scores

Save data

Standardize within dataset, re-add sex differences

Distributions by dataset

Save data

Outlier removal

Save data