Skip to content

Commit 2615b99

Browse files
authored
merge pr #405: rename generate()s cols argument to variables
2 parents 671608f + 462aa05 commit 2615b99

File tree

8 files changed

+145
-38
lines changed

8 files changed

+145
-38
lines changed

NEWS.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,13 +269,13 @@ gss %>%
269269
#> # … with 290 more rows
270270
```
271271

272-
If `type = "permute"`, a set of unquoted column names in the data to permute (independently of each other) can be passed via the `cols` argument to `generate`. It defaults to only the response variable.
272+
If `type = "permute"`, a set of unquoted column names in the data to permute (independently of each other) can be passed via the `variables` argument to `generate`. It defaults to only the response variable.
273273

274274
``` r
275275
gss %>%
276276
specify(hours ~ age + college) %>%
277277
hypothesize(null = "independence") %>%
278-
generate(reps = 100, type = "permute", cols = c(age, college)) %>%
278+
generate(reps = 100, type = "permute", variables = c(age, college)) %>%
279279
fit()
280280
#> # A tibble: 300 x 3
281281
#> # Groups: replicate [100]

R/fit.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ generics::fit
5757
#' beyond those required for one explanatory variable. Namely, the distribution
5858
#' of the response variable must be similar to the distribution of the errors
5959
#' under the null hypothesis' specification of a fixed effect of the explanatory
60-
#' variables. (This null hypothesis is reflected in the `cols` argument to
60+
#' variables. (This null hypothesis is reflected in the `variables` argument to
6161
#' [generate()]. By default, all of the explanatory variables are treated
6262
#' as fixed.) A general rule of thumb here is, if there are large outliers
6363
#' in the distributions of any of the explanatory variables, this distributional

R/generate.R

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
#' @param type The method used to generate resamples of the observed
1616
#' data reflecting the null hypothesis. Currently one of
1717
#' `"bootstrap"`, `"permute"`, or `"draw"` (see below).
18-
#' @param cols If `type = "permute"`, a set of unquoted column names in the
18+
#' @param variables If `type = "permute"`, a set of unquoted column names in the
1919
#' data to permute (independently of each other). Defaults to only the
20-
#' response variable.
20+
#' response variable. Note that any derived effects that depend on these
21+
#' columns (e.g., interaction effects) will also be affected.
2122
#' @param ... Currently ignored.
2223
#'
2324
#' @return A tibble containing `reps` generated datasets, indicated by the
@@ -64,7 +65,7 @@
6465
#' @family core functions
6566
#' @export
6667
generate <- function(x, reps = 1, type = NULL,
67-
cols = !!response_expr(x), ...) {
68+
variables = !!response_expr(x), ...) {
6869
# Check type argument, warning if necessary
6970
type <- sanitize_generation_type(type)
7071
auto_type <- sanitize_generation_type(attr(x, "type"))
@@ -74,7 +75,7 @@ generate <- function(x, reps = 1, type = NULL,
7475
use_auto_type(auto_type)
7576
}
7677

77-
check_cols(x, rlang::enquo(cols), type, missing(cols))
78+
check_cols(x, rlang::enquo(variables), type, missing(variables))
7879

7980
attr(x, "generated") <- TRUE
8081

@@ -83,7 +84,7 @@ generate <- function(x, reps = 1, type = NULL,
8384
bootstrap = bootstrap(x, reps, ...),
8485
permute = {
8586
check_permutation_attributes(x)
86-
permute(x, reps, rlang::enquo(cols), ...)
87+
permute(x, reps, rlang::enquo(variables), ...)
8788
},
8889
draw = draw(x, reps, ...),
8990
simulate = draw(x, reps, ...)
@@ -145,22 +146,27 @@ check_permutation_attributes <- function(x, attr) {
145146
}
146147
}
147148

148-
check_cols <- function(x, cols, type, missing) {
149-
if (!rlang::is_symbolic(rlang::get_expr(cols))) {
149+
check_cols <- function(x, variables, type, missing) {
150+
if (!rlang::is_symbolic(rlang::get_expr(variables))) {
150151
stop_glue(
151-
"The `cols` argument should be one or more unquoted variable names ",
152+
"The `variables` argument should be one or more unquoted variable names ",
152153
"(not strings in quotation marks)."
153154
)
154155
}
155156

156-
col_names <- all.vars(rlang::get_expr(cols))
157-
158157
if (!missing && type != "permute") {
159158
warning_glue(
160-
'The `cols` argument is only relevant for the "permute" ',
159+
'The `variables` argument is only relevant for the "permute" ',
161160
'generation type and will be ignored.'
162161
)
162+
163+
should_prompt <- FALSE
164+
} else {
165+
should_prompt <- TRUE
163166
}
167+
168+
col_names <- process_variables(variables, should_prompt)
169+
164170

165171
if (any(!col_names %in% colnames(x))) {
166172
bad_cols <- col_names[!col_names %in% colnames(x)]
@@ -171,7 +177,7 @@ check_cols <- function(x, cols, type, missing) {
171177

172178
stop_glue(
173179
'The column{plurals[1]} `{list(bad_cols)}` provided to ',
174-
'the `cols` argument {plurals[2]} not in the supplied data.'
180+
'the `variables` argument {plurals[2]} not in the supplied data.'
175181
)
176182
}
177183
}
@@ -204,8 +210,8 @@ bootstrap <- function(x, reps = 1, ...) {
204210
}
205211

206212
#' @importFrom dplyr bind_rows group_by
207-
permute <- function(x, reps = 1, cols, ...) {
208-
df_out <- replicate(reps, permute_once(x, cols), simplify = FALSE) %>%
213+
permute <- function(x, reps = 1, variables, ...) {
214+
df_out <- replicate(reps, permute_once(x, variables), simplify = FALSE) %>%
209215
dplyr::bind_rows() %>%
210216
dplyr::mutate(replicate = rep(1:reps, each = nrow(x))) %>%
211217
dplyr::group_by(replicate)
@@ -215,12 +221,12 @@ permute <- function(x, reps = 1, cols, ...) {
215221
append_infer_class(df_out)
216222
}
217223

218-
permute_once <- function(x, cols, ...) {
224+
permute_once <- function(x, variables, ...) {
219225
dots <- list(...)
220226

221227
if (is_hypothesized(x) && (attr(x, "null") == "independence")) {
222228
# for each column, determine whether it should be permuted
223-
needs_permuting <- colnames(x) %in% all.vars(rlang::get_expr(cols))
229+
needs_permuting <- colnames(x) %in% process_variables(variables, FALSE)
224230

225231
# pass each to permute_column with its associated logical
226232
out <- purrr::map2_dfc(x, needs_permuting, permute_column)
@@ -234,6 +240,36 @@ permute_once <- function(x, cols, ...) {
234240
}
235241
}
236242

243+
process_variables <- function(variables, should_prompt) {
244+
# extract the expression and convert each element to string
245+
out <- rlang::get_expr(variables)
246+
247+
if (length(out) == 1) {
248+
out <- as.character(out)
249+
} else {
250+
out <- purrr::map(out, as.character)
251+
}
252+
253+
254+
# drop c()
255+
out[out == "c"] <- NULL
256+
257+
# drop interactions and message
258+
interactions <- purrr::map_lgl(out, `%in%`, x = "*")
259+
260+
if (any(interactions) && should_prompt) {
261+
message_glue(
262+
"Message: Please supply only data columns to the `variables` argument. ",
263+
"Note that any derived effects that depend on these columns will also ",
264+
"be affected."
265+
)
266+
}
267+
268+
out <- out[!interactions]
269+
270+
out
271+
}
272+
237273
permute_column <- function(col, permute) {
238274
if (permute) {
239275
sample(col, size = length(col), replace = FALSE)

man/fit.infer.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/generate.Rd

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-generate.R

Lines changed: 82 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ test_that("generate() can permute with multiple explanatory variables", {
323323
expect_equal(ncol(x), 4)
324324
})
325325

326-
test_that("generate is sensitive to the cols argument", {
326+
test_that("generate is sensitive to the variables argument", {
327327
# default argument works appropriately
328328
expect_equal({
329329
set.seed(1)
@@ -338,15 +338,15 @@ test_that("generate is sensitive to the cols argument", {
338338
gss[1:10,] %>%
339339
specify(hours ~ age + college) %>%
340340
hypothesize(null = "independence") %>%
341-
generate(reps = 2, type = "permute", cols = hours)
341+
generate(reps = 2, type = "permute", variables = hours)
342342
})
343343

344344
# permuting changes output
345345
expect_silent(
346346
perm_age <- gss[1:10,] %>%
347347
specify(hours ~ age + college) %>%
348348
hypothesize(null = "independence") %>%
349-
generate(reps = 2, type = "permute", cols = age)
349+
generate(reps = 2, type = "permute", variables = age)
350350
)
351351

352352
expect_false(all(perm_age$age[1:10] == perm_age$age[11:20]))
@@ -357,7 +357,7 @@ test_that("generate is sensitive to the cols argument", {
357357
perm_college <- gss[1:10,] %>%
358358
specify(hours ~ age + college) %>%
359359
hypothesize(null = "independence") %>%
360-
generate(reps = 2, type = "permute", cols = college)
360+
generate(reps = 2, type = "permute", variables = college)
361361
)
362362

363363
expect_true(all(perm_college$age[1:10] == perm_college$age[11:20]))
@@ -368,46 +368,116 @@ test_that("generate is sensitive to the cols argument", {
368368
perm_college_age <- gss[1:10,] %>%
369369
specify(hours ~ age + college) %>%
370370
hypothesize(null = "independence") %>%
371-
generate(reps = 2, type = "permute", cols = c(college, age))
371+
generate(reps = 2, type = "permute", variables = c(college, age))
372372
)
373373

374374
expect_false(all(perm_college_age$age[1:10] == perm_college_age$age[11:20]))
375375
expect_true(all(perm_college_age$hours[1:10] == perm_college_age$hours[11:20]))
376376
expect_false(all(perm_college_age$college[1:10] == perm_college_age$college[11:20]))
377+
378+
# interaction effects are ignored
379+
expect_equal({
380+
set.seed(1)
381+
382+
expect_message(
383+
gss[1:10,] %>%
384+
specify(hours ~ age + college) %>%
385+
hypothesize(null = "independence") %>%
386+
generate(reps = 2, type = "permute", variables = c(hours, age*college))
387+
)
388+
}, {
389+
set.seed(1)
390+
391+
gss[1:10,] %>%
392+
specify(hours ~ age + college) %>%
393+
hypothesize(null = "independence") %>%
394+
generate(reps = 2, type = "permute", variables = hours)
395+
})
377396
})
378397

379-
test_that("cols argument prompts when it ought to", {
398+
test_that("variables argument prompts when it ought to", {
380399
expect_error(
381400
gss[1:10,] %>%
382401
specify(hours ~ age + college) %>%
383402
hypothesize(null = "independence") %>%
384-
generate(reps = 2, type = "permute", cols = c(howdy)),
385-
"column `howdy`.*is not in the supplied data."
403+
generate(reps = 2, type = "permute", variables = c(howdy)),
404+
"howdy.*is not in the supplied data."
386405
)
387406

388407
expect_error(
389408
gss[1:10,] %>%
390409
specify(hours ~ age + college) %>%
391410
hypothesize(null = "independence") %>%
392-
generate(reps = 2, type = "permute", cols = c(howdy, doo)),
393-
'columns `c\\("howdy", "doo"\\)`.*are not in the supplied data.'
411+
generate(reps = 2, type = "permute", variables = c(howdy, doo)),
412+
'columns.*"howdy", "doo".*are not in the supplied data.'
394413
)
395414

396415
expect_warning(
397416
gss[1:10,] %>%
398417
specify(hours ~ NULL) %>%
399418
hypothesize(null = "point", mu = 40) %>%
400-
generate(reps = 2, type = "bootstrap", cols = c(hours)),
419+
generate(reps = 2, type = "bootstrap", variables = c(hours)),
401420
"is only relevant for.*will be ignored."
402421
)
403422

404423
expect_error(
405424
gss[1:10,] %>%
406425
specify(hours ~ age + college) %>%
407426
hypothesize(null = "independence") %>%
408-
generate(reps = 2, type = "permute", cols = "hours"),
427+
generate(reps = 2, type = "permute", variables = "hours"),
409428
'unquoted variable names'
410429
)
430+
431+
expect_message(
432+
gss[1:10,] %>%
433+
specify(hours ~ age + college + age*college) %>%
434+
hypothesize(null = "independence") %>%
435+
generate(reps = 2, type = "permute", variables = age*college),
436+
"supply only data columns"
437+
)
438+
439+
expect_message(
440+
gss[1:10,] %>%
441+
specify(hours ~ age + college + age*college) %>%
442+
hypothesize(null = "independence") %>%
443+
generate(reps = 2, type = "permute", variables = c(hours, age*college)),
444+
"supply only data columns"
445+
)
446+
447+
expect_silent(
448+
gss[1:10,] %>%
449+
specify(hours ~ age + college + age*college) %>%
450+
hypothesize(null = "independence") %>%
451+
generate(reps = 2, type = "permute", variables = c(hours))
452+
)
453+
454+
expect_silent(
455+
gss[1:10,] %>%
456+
specify(hours ~ age + college + age*college) %>%
457+
hypothesize(null = "independence") %>%
458+
generate(reps = 2, type = "permute")
459+
)
460+
461+
expect_silent(
462+
gss[1:10,] %>%
463+
specify(hours ~ age + college) %>%
464+
hypothesize(null = "independence") %>%
465+
generate(reps = 2, type = "permute")
466+
)
467+
468+
# warn on type != permute but don't raise message re: interaction
469+
# effects unless otherwise used appropriately
470+
expect_silent(
471+
expect_warning(
472+
gss[1:10,] %>%
473+
specify(hours ~ age*college) %>%
474+
generate(
475+
reps = 2,
476+
type = "bootstrap",
477+
variables = c(hours, age*college)
478+
)
479+
)
480+
)
411481
})
412482

413483
test_that("type = 'draw'/'simulate' superseding handled gracefully", {

vignettes/infer.Rmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ null_fits <- gss %>%
314314
null_fits
315315
```
316316

317-
To permute variables other than the response variable, the `cols` argument to `generate()` allows you to choose any of the `specify()`ed variables to permute independently of each other.
317+
To permute variables other than the response variable, the `variables` argument to `generate()` allows you to choose columns from the data to permute. Note that any derived effects that depend on these columns (e.g., interaction effects) will also be affected.
318318

319319
Beyond this point, observed fits and distributions from null fits interface exactly like analogous outputs from `calculate()`. For instance, we can use the following code to calculate a 95% confidence interval from these objects.
320320

vignettes/observed_stat_examples.Rmd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,7 +1003,7 @@ Generating a distribution of fits where each explanatory variable is permuted in
10031003
null_distn2 <- gss %>%
10041004
specify(hours ~ age + college) %>%
10051005
hypothesize(null = "independence") %>%
1006-
generate(reps = 1000, type = "permute", cols = c(age, college)) %>%
1006+
generate(reps = 1000, type = "permute", variables = c(age, college)) %>%
10071007
fit()
10081008
```
10091009

@@ -1597,7 +1597,7 @@ Alternatively, generating a distribution of fits where each explanatory variable
15971597
null_distn2 <- gss %>%
15981598
specify(hours ~ age + college) %>%
15991599
hypothesize(null = "independence") %>%
1600-
generate(reps = 1000, type = "permute", cols = c(age, college)) %>%
1600+
generate(reps = 1000, type = "permute", variables = c(age, college)) %>%
16011601
fit()
16021602
```
16031603

0 commit comments

Comments
 (0)