Open
Description
Researchers often want to create summary statistics for different groups. In tables, each column tends to be a group and each row the summary statistic for that group/variable. It would be nice if skimr made it easier to reshape the stats for each group into a list.
I had an initial go at this function, but I'm not 100% confident that I'm not missing something. Here's a reprex with the `split_by_group` function:
`
library(skimr)
library(tidyverse)
# Work out which skimmer columns apply to each skim type, adding any columns
# of `data` that the recorded skimmers do not account for.
#
# data:   object whose column names and `skimmers_used()` result are read.
# groups: accepted as part of the signature; not read in this body.
# base:   column names treated as already accounted for (the caller in this
#         file passes the result of `base_skimmers()`).
#
# Returns the `skimmers_used(data)` list. When unaccounted-for columns exist,
# each skim type is merged with those extra columns that hold at least one
# non-missing value for that type.
reconcile_skimmers <- function(data, groups, base) {
  used <- skimmers_used(data)
  known_cols <- c(
    "skim_variable",
    "skim_type",
    base,
    collapse_skimmers(used)
  )
  extra_cols <- dplyr::setdiff(names(data), known_cols)

  # Every column is accounted for: the recorded skimmers are complete.
  if (length(extra_cols) == 0) {
    return(used)
  }

  # One row per skim type; TRUE where an extra column has any non-NA value.
  has_values <- dplyr::summarize_at(
    dplyr::group_by(data, .data$skim_type),
    dplyr::vars(extra_cols),
    ~ !all(is.na(.x))
  )

  # Turn each row of flags into the names of the columns that were flagged,
  # keyed by the skim type of that row.
  cols_by_type <- rlang::set_names(
    purrr::pmap(has_values, get_complete_columns, names = extra_cols),
    has_values$skim_type
  )

  purrr::list_merge(used, !!!cols_by_type)
}
# Flatten a list of skimmer names into a single character vector, prefixing
# every entry with the name of the list element it came from, e.g.
# list(numeric = c("mean", "sd")) becomes c("numeric.mean", "numeric.sd").
collapse_skimmers <- function(skimmers_used) {
  prefix_with_type <- function(funs, type) {
    paste(type, funs, sep = ".")
  }
  purrr::flatten_chr(purrr::imap(skimmers_used, prefix_with_type))
}
# Pick the entries of `names` selected by the values passed through `...`.
#
# skim_type: first argument; accepted but not used in this body.
# ...:       values combined with c() and used to index `names` (the caller
#            in this file supplies one logical flag per entry of `names`).
# names:     vector to subset.
#
# Returns the subset of `names` selected by the combined `...` values.
get_complete_columns <- function(skim_type, ..., names) {
  keep <- c(...)
  names[keep]
}
# Split a skim data frame into a named list with one skim data frame per
# combination of grouping values.
#
# data: object that must pass `assert_is_skim_df()`; its grouping variables
#       are read with `group_names()`.
#
# Returns a list of class c("skim_list", "list"). Each element has
# "skim_df" as its first class and is named after its group values joined
# with " - " (e.g. "4 cyl - 3 gears").
split_by_group <- function(data) {
  assert_is_skim_df(data)
  groups <- group_names(data)

  # Group once and reuse the result, so the keys used for naming and the
  # pieces produced by the split come from the same grouping. This also avoids
  # passing grouping variables through the `...` of group_keys()/group_split().
  # NOTE(review): assumes `groups` can be spliced into group_by() (e.g. a list
  # of symbols) -- confirm against what group_names() returns.
  grouped <- dplyr::group_by(data, !!!groups)

  # One label per group: the key columns pasted together row by row.
  keys <- dplyr::group_keys(grouped)
  labels <- tidyr::unite(keys, group_name, sep = " - ")[["group_name"]]

  pieces <- stats::setNames(dplyr::group_split(grouped), labels)

  # Mark every piece as a skim_df; union() keeps "skim_df" first without
  # duplicating it when the piece already carries that class.
  pieces <- lapply(pieces, function(piece) {
    class(piece) <- union("skim_df", class(piece))
    piece
  })

  class(pieces) <- c("skim_list", "list")
  pieces
}
# Reprex driver: turn the cylinder and gear counts of mtcars into labelled
# factors, skim the data grouped by both, then split the result per group.
data <- mtcars %>%
  mutate(
    cyl = factor(
      cyl,
      levels = c(4, 6, 8),
      labels = c("4 cyl", "6 cyl", "8 cyl")
    ),
    gear = factor(
      gear,
      levels = c(3, 4, 5),
      labels = c("3 gears", "4 gears", "5 gears")
    )
  ) %>%
  group_by(cyl, gear) %>%
  skim()

split_by_group(data)
`
This outputs:
`
$"4 cyl - 3 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 21.5 NA 21.5 21.5 21.5 21.5 21.5 ▁▁▇▁▁ 4 cyl 3 gears
2 disp 0 1 120. NA 120. 120. 120. 120. 120. ▁▁▇▁▁ 4 cyl 3 gears
3 hp 0 1 97 NA 97 97 97 97 97 ▁▁▇▁▁ 4 cyl 3 gears
4 drat 0 1 3.7 NA 3.7 3.7 3.7 3.7 3.7 ▁▁▇▁▁ 4 cyl 3 gears
5 wt 0 1 2.46 NA 2.46 2.46 2.46 2.46 2.46 ▁▁▇▁▁ 4 cyl 3 gears
6 qsec 0 1 20.0 NA 20.0 20.0 20.0 20.0 20.0 ▁▁▇▁▁ 4 cyl 3 gears
7 vs 0 1 1 NA 1 1 1 1 1 ▁▁▇▁▁ 4 cyl 3 gears
8 am 0 1 0 NA 0 0 0 0 0 ▁▁▇▁▁ 4 cyl 3 gears
9 carb 0 1 1 NA 1 1 1 1 1 ▁▁▇▁▁ 4 cyl 3 gears
$"4 cyl - 4 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 26.9 4.81 21.4 22.8 25.8 30.9 33.9 ▇▂▂▂▅ 4 cyl 4 gears
2 disp 0 1 103. 30.7 71.1 78.0 93.5 126. 147. ▇▁▂▂▃ 4 cyl 4 gears
3 hp 0 1 76 20.1 52 64.2 66 93.5 109 ▅▇▁▅▂ 4 cyl 4 gears
4 drat 0 1 4.11 0.372 3.69 3.90 4.08 4.14 4.93 ▇▇▂▁▂ 4 cyl 4 gears
5 wt 0 1 2.38 0.601 1.62 1.91 2.26 2.87 3.19 ▇▇▃▃▇ 4 cyl 4 gears
6 qsec 0 1 19.6 1.45 18.5 18.6 19.2 19.9 22.9 ▇▆▁▁▂ 4 cyl 4 gears
7 vs 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁ 4 cyl 4 gears
8 am 0 1 0.75 0.463 0 0.75 1 1 1 ▂▁▁▁▇ 4 cyl 4 gears
9 carb 0 1 1.5 0.535 1 1 1.5 2 2 ▇▁▁▁▇ 4 cyl 4 gears
$"4 cyl - 5 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 28.2 3.11 26 27.1 28.2 29.3 30.4 ▇▁▁▁▇ 4 cyl 5 gears
2 disp 0 1 108. 17.8 95.1 101. 108. 114 120. ▇▁▁▁▇ 4 cyl 5 gears
3 hp 0 1 102 15.6 91 96.5 102 108. 113 ▇▁▁▁▇ 4 cyl 5 gears
4 drat 0 1 4.1 0.467 3.77 3.94 4.1 4.26 4.43 ▇▁▁▁▇ 4 cyl 5 gears
5 wt 0 1 1.83 0.443 1.51 1.67 1.83 1.98 2.14 ▇▁▁▁▇ 4 cyl 5 gears
6 qsec 0 1 16.8 0.141 16.7 16.8 16.8 16.8 16.9 ▇▁▁▁▇ 4 cyl 5 gears
7 vs 0 1 0.5 0.707 0 0.25 0.5 0.75 1 ▇▁▁▁▇ 4 cyl 5 gears
8 am 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁ 4 cyl 5 gears
9 carb 0 1 2 0 2 2 2 2 2 ▁▁▇▁▁ 4 cyl 5 gears
$"6 cyl - 3 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 19.8 2.33 18.1 18.9 19.8 20.6 21.4 ▇▁▁▁▇ 6 cyl 3 gears
2 disp 0 1 242. 23.3 225 233. 242. 250. 258 ▇▁▁▁▇ 6 cyl 3 gears
3 hp 0 1 108. 3.54 105 106. 108. 109. 110 ▇▁▁▁▇ 6 cyl 3 gears
4 drat 0 1 2.92 0.226 2.76 2.84 2.92 3 3.08 ▇▁▁▁▇ 6 cyl 3 gears
5 wt 0 1 3.34 0.173 3.22 3.28 3.34 3.40 3.46 ▇▁▁▁▇ 6 cyl 3 gears
6 qsec 0 1 19.8 0.552 19.4 19.6 19.8 20.0 20.2 ▇▁▁▁▇ 6 cyl 3 gears
7 vs 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁ 6 cyl 3 gears
8 am 0 1 0 0 0 0 0 0 0 ▁▁▇▁▁ 6 cyl 3 gears
9 carb 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁ 6 cyl 3 gears
$"6 cyl - 4 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 19.8 1.55 17.8 18.8 20.1 21 21 ▃▁▃▁▇ 6 cyl 4 gears
2 disp 0 1 164. 4.39 160 160 164. 168. 168. ▇▁▁▁▇ 6 cyl 4 gears
3 hp 0 1 116. 7.51 110 110 116. 123 123 ▇▁▁▁▇ 6 cyl 4 gears
4 drat 0 1 3.91 0.0115 3.9 3.9 3.91 3.92 3.92 ▇▁▁▁▇ 6 cyl 4 gears
5 wt 0 1 3.09 0.413 2.62 2.81 3.16 3.44 3.44 ▃▃▁▁▇ 6 cyl 4 gears
6 qsec 0 1 17.7 1.12 16.5 16.9 17.7 18.5 18.9 ▇▇▁▇▇ 6 cyl 4 gears
7 vs 0 1 0.5 0.577 0 0 0.5 1 1 ▇▁▁▁▇ 6 cyl 4 gears
8 am 0 1 0.5 0.577 0 0 0.5 1 1 ▇▁▁▁▇ 6 cyl 4 gears
9 carb 0 1 4 0 4 4 4 4 4 ▁▁▇▁▁ 6 cyl 4 gears
$"6 cyl - 5 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 19.7 NA 19.7 19.7 19.7 19.7 19.7 ▁▁▇▁▁ 6 cyl 5 gears
2 disp 0 1 145 NA 145 145 145 145 145 ▁▁▇▁▁ 6 cyl 5 gears
3 hp 0 1 175 NA 175 175 175 175 175 ▁▁▇▁▁ 6 cyl 5 gears
4 drat 0 1 3.62 NA 3.62 3.62 3.62 3.62 3.62 ▁▁▇▁▁ 6 cyl 5 gears
5 wt 0 1 2.77 NA 2.77 2.77 2.77 2.77 2.77 ▁▁▇▁▁ 6 cyl 5 gears
6 qsec 0 1 15.5 NA 15.5 15.5 15.5 15.5 15.5 ▁▁▇▁▁ 6 cyl 5 gears
7 vs 0 1 0 NA 0 0 0 0 0 ▁▁▇▁▁ 6 cyl 5 gears
8 am 0 1 1 NA 1 1 1 1 1 ▁▁▇▁▁ 6 cyl 5 gears
9 carb 0 1 6 NA 6 6 6 6 6 ▁▁▇▁▁ 6 cyl 5 gears
$"8 cyl - 3 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 15.0 2.77 10.4 14.0 15.2 16.6 19.2 ▃▂▇▃▃ 8 cyl 3 gears
2 disp 0 1 358. 71.8 276. 297. 355 410 472 ▇▃▃▂▆ 8 cyl 3 gears
3 hp 0 1 194. 33.4 150 175 180 219. 245 ▃▇▂▂▅ 8 cyl 3 gears
4 drat 0 1 3.12 0.230 2.76 3.05 3.08 3.16 3.73 ▃▇▆▁▂ 8 cyl 3 gears
5 wt 0 1 4.10 0.768 3.44 3.56 3.81 4.36 5.42 ▇▃▁▁▃ 8 cyl 3 gears
6 qsec 0 1 17.1 0.802 15.4 17.0 17.4 17.7 18 ▃▁▂▇▆ 8 cyl 3 gears
7 vs 0 1 0 0 0 0 0 0 0 ▁▁▇▁▁ 8 cyl 3 gears
8 am 0 1 0 0 0 0 0 0 0 ▁▁▇▁▁ 8 cyl 3 gears
9 carb 0 1 3.08 0.900 2 2 3 4 4 ▆▁▅▁▇ 8 cyl 3 gears
$"8 cyl - 5 gears"
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 32
Number of columns 11
_______________________
Column type frequency:
numeric 9
________________________
Group variables None
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist cyl gear
1 mpg 0 1 15.4 0.566 15 15.2 15.4 15.6 15.8 ▇▁▁▁▇ 8 cyl 5 gears
2 disp 0 1 326 35.4 301 314. 326 338. 351 ▇▁▁▁▇ 8 cyl 5 gears
3 hp 0 1 300. 50.2 264 282. 300. 317. 335 ▇▁▁▁▇ 8 cyl 5 gears
4 drat 0 1 3.88 0.481 3.54 3.71 3.88 4.05 4.22 ▇▁▁▁▇ 8 cyl 5 gears
5 wt 0 1 3.37 0.283 3.17 3.27 3.37 3.47 3.57 ▇▁▁▁▇ 8 cyl 5 gears
6 qsec 0 1 14.6 0.0707 14.5 14.5 14.6 14.6 14.6 ▇▁▁▁▇ 8 cyl 5 gears
7 vs 0 1 0 0 0 0 0 0 0 ▁▁▇▁▁ 8 cyl 5 gears
8 am 0 1 1 0 1 1 1 1 1 ▁▁▇▁▁ 8 cyl 5 gears
9 carb 0 1 6 2.83 4 5 6 7 8 ▇▁▁▁▇ 8 cyl 5 gears
`