Merge PR #405: rename generate()'s `cols` argument to `variables`

simonpcouch · web-flow · commit 2615b99bc3af · 2021-08-10T11:50:56.000-04:00
diff --git a/NEWS.md b/NEWS.md
@@ -269,13 +269,13 @@ gss %>%
 #> # … with 290 more rows
 ```
 
-If `type = "permute"`, a set of unquoted column names in the data to permute (independently of each other) can be passed via the `cols` argument to `generate`. It defaults to only the response variable.
+If `type = "permute"`, a set of unquoted column names in the data to permute (independently of each other) can be passed via the `variables` argument to `generate`. It defaults to only the response variable.
 
 ``` r
 gss %>%
   specify(hours ~ age + college) %>%
   hypothesize(null = "independence") %>%
-  generate(reps = 100, type = "permute", cols = c(age, college)) %>%
+  generate(reps = 100, type = "permute", variables = c(age, college)) %>%
   fit()
 #> # A tibble: 300 x 3
 #> # Groups:   replicate [100]
diff --git a/R/fit.R b/R/fit.R
@@ -57,7 +57,7 @@ generics::fit
 #' beyond those required for one explanatory variable. Namely, the distribution
 #' of the response variable must be similar to the distribution of the errors
 #' under the null hypothesis' specification of a fixed effect of the explanatory 
-#' variables. (This null hypothesis is reflected in the `cols` argument to 
+#' variables. (This null hypothesis is reflected in the `variables` argument to 
 #' [generate()]. By default, all of the explanatory variables are treated
 #' as fixed.) A general rule of thumb here is, if there are large outliers
 #' in the distributions of any of the explanatory variables, this distributional
diff --git a/R/generate.R b/R/generate.R
@@ -15,9 +15,10 @@
 #' @param type The method used to generate resamples of the observed
 #'   data reflecting the null hypothesis. Currently one of
 #'   `"bootstrap"`, `"permute"`, or `"draw"` (see below).
-#' @param cols If `type = "permute"`, a set of unquoted column names in the
+#' @param variables If `type = "permute"`, a set of unquoted column names in the
 #'   data to permute (independently of each other). Defaults to only the
-#'   response variable.
+#'   response variable. Note that any derived effects that depend on these
+#'   columns (e.g., interaction effects) will also be affected.
 #' @param ... Currently ignored.
 #'
 #' @return A tibble containing `reps` generated datasets, indicated by the
@@ -64,7 +65,7 @@
 #' @family core functions
 #' @export
 generate <- function(x, reps = 1, type = NULL,
-                     cols = !!response_expr(x), ...) {
+                     variables = !!response_expr(x), ...) {
   # Check type argument, warning if necessary
   type <- sanitize_generation_type(type)
   auto_type <- sanitize_generation_type(attr(x, "type"))
@@ -74,7 +75,7 @@ generate <- function(x, reps = 1, type = NULL,
     use_auto_type(auto_type)
   }
 
-  check_cols(x, rlang::enquo(cols), type, missing(cols))
+  check_cols(x, rlang::enquo(variables), type, missing(variables))
 
   attr(x, "generated") <- TRUE
 
@@ -83,7 +84,7 @@ generate <- function(x, reps = 1, type = NULL,
     bootstrap = bootstrap(x, reps, ...),
     permute = {
       check_permutation_attributes(x)
-      permute(x, reps, rlang::enquo(cols), ...)
+      permute(x, reps, rlang::enquo(variables), ...)
     },
     draw = draw(x, reps, ...),
     simulate = draw(x, reps, ...)
@@ -145,22 +146,27 @@ check_permutation_attributes <- function(x, attr) {
   }
 }
 
-check_cols <- function(x, cols, type, missing) {
-  if (!rlang::is_symbolic(rlang::get_expr(cols))) {
+check_cols <- function(x, variables, type, missing) {
+  if (!rlang::is_symbolic(rlang::get_expr(variables))) {
     stop_glue(
-      "The `cols` argument should be one or more unquoted variable names ",
+      "The `variables` argument should be one or more unquoted variable names ",
       "(not strings in quotation marks)."
     )
   }
 
-  col_names <- all.vars(rlang::get_expr(cols))
-
   if (!missing && type != "permute") {
     warning_glue(
-      'The `cols` argument is only relevant for the "permute" ',
+      'The `variables` argument is only relevant for the "permute" ',
       'generation type and will be ignored.'
     )
+    
+    should_prompt <- FALSE
+  } else {
+    should_prompt <- TRUE
   }
+  
+  col_names <- process_variables(variables, should_prompt)
+  
 
   if (any(!col_names %in% colnames(x))) {
     bad_cols <- col_names[!col_names %in% colnames(x)]
@@ -171,7 +177,7 @@ check_cols <- function(x, cols, type, missing) {
 
     stop_glue(
       'The column{plurals[1]} `{list(bad_cols)}` provided to ',
-      'the `cols` argument {plurals[2]} not in the supplied data.'
+      'the `variables` argument {plurals[2]} not in the supplied data.'
     )
   }
 }
@@ -204,8 +210,8 @@ bootstrap <- function(x, reps = 1, ...) {
 }
 
 #' @importFrom dplyr bind_rows group_by
-permute <- function(x, reps = 1, cols, ...) {
-  df_out <- replicate(reps, permute_once(x, cols), simplify = FALSE) %>%
+permute <- function(x, reps = 1, variables, ...) {
+  df_out <- replicate(reps, permute_once(x, variables), simplify = FALSE) %>%
     dplyr::bind_rows() %>%
     dplyr::mutate(replicate = rep(1:reps, each = nrow(x))) %>%
     dplyr::group_by(replicate)
@@ -215,12 +221,12 @@ permute <- function(x, reps = 1, cols, ...) {
   append_infer_class(df_out)
 }
 
-permute_once <- function(x, cols, ...) {
+permute_once <- function(x, variables, ...) {
   dots <- list(...)
 
   if (is_hypothesized(x) && (attr(x, "null") == "independence")) {
     # for each column, determine whether it should be permuted
-    needs_permuting <- colnames(x) %in% all.vars(rlang::get_expr(cols))
+    needs_permuting <- colnames(x) %in% process_variables(variables, FALSE)
 
     # pass each to permute_column with its associated logical
     out <- purrr::map2_dfc(x, needs_permuting, permute_column)
@@ -234,6 +240,36 @@ permute_once <- function(x, cols, ...) {
   }
 }
 
+process_variables <- function(variables, should_prompt) {
+  # extract the expression and convert each element to string
+  out <- rlang::get_expr(variables)
+  
+  if (length(out) == 1) {
+    out <- as.character(out)
+  } else {
+    out <- purrr::map(out, as.character)
+  }
+    
+  
+  # drop c()
+  out[out == "c"] <- NULL
+  
+  # drop interactions and message
+  interactions <- purrr::map_lgl(out, `%in%`, x = "*")
+  
+  if (any(interactions) && should_prompt) {
+    message_glue(
+      "Message: Please supply only data columns to the `variables` argument. ",
+      "Note that any derived effects that depend on these columns will also ",
+      "be affected."
+    )
+  }
+  
+  out <- out[!interactions]
+  
+  out
+}
+
 permute_column <- function(col, permute) {
   if (permute) {
     sample(col, size = length(col), replace = FALSE)
diff --git a/man/fit.infer.Rd b/man/fit.infer.Rd
diff --git a/man/generate.Rd b/man/generate.Rd
diff --git a/tests/testthat/test-generate.R b/tests/testthat/test-generate.R
@@ -323,7 +323,7 @@ test_that("generate() can permute with multiple explanatory variables", {
   expect_equal(ncol(x), 4)
 })
 
-test_that("generate is sensitive to the cols argument", {
+test_that("generate is sensitive to the variables argument", {
   # default argument works appropriately
   expect_equal({ 
       set.seed(1)
@@ -338,15 +338,15 @@ test_that("generate is sensitive to the cols argument", {
       gss[1:10,] %>%
         specify(hours ~ age + college) %>%
         hypothesize(null = "independence") %>%
-        generate(reps = 2, type = "permute", cols = hours)
+        generate(reps = 2, type = "permute", variables = hours)
   })
   
   # permuting changes output
   expect_silent(
     perm_age <- gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = age)
+      generate(reps = 2, type = "permute", variables = age)
   )
   
   expect_false(all(perm_age$age[1:10] == perm_age$age[11:20]))
@@ -357,7 +357,7 @@ test_that("generate is sensitive to the cols argument", {
     perm_college <- gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = college)
+      generate(reps = 2, type = "permute", variables = college)
   )
   
   expect_true(all(perm_college$age[1:10] == perm_college$age[11:20]))
@@ -368,46 +368,116 @@ test_that("generate is sensitive to the cols argument", {
     perm_college_age <- gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = c(college, age))
+      generate(reps = 2, type = "permute", variables = c(college, age))
   )
   
   expect_false(all(perm_college_age$age[1:10] == perm_college_age$age[11:20]))
   expect_true(all(perm_college_age$hours[1:10] == perm_college_age$hours[11:20]))
   expect_false(all(perm_college_age$college[1:10] == perm_college_age$college[11:20]))
+  
+  # interaction effects are ignored
+  expect_equal({ 
+    set.seed(1)
+    
+    expect_message(
+      gss[1:10,] %>%
+        specify(hours ~ age + college) %>%
+        hypothesize(null = "independence") %>%
+        generate(reps = 2, type = "permute", variables = c(hours, age*college))
+    )
+  }, { 
+    set.seed(1)
+    
+    gss[1:10,] %>%
+      specify(hours ~ age + college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute", variables = hours)
+  })
 })
 
-test_that("cols argument prompts when it ought to", {
+test_that("variables argument prompts when it ought to", {
   expect_error(
     gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = c(howdy)),
-    "column `howdy`.*is not in the supplied data."
+      generate(reps = 2, type = "permute", variables = c(howdy)),
+    "howdy.*is not in the supplied data."
   )
   
   expect_error(
     gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = c(howdy, doo)),
-    'columns `c\\("howdy", "doo"\\)`.*are not in the supplied data.'
+      generate(reps = 2, type = "permute", variables = c(howdy, doo)),
+    'columns.*"howdy", "doo".*are not in the supplied data.'
   )
   
   expect_warning(
     gss[1:10,] %>%
       specify(hours ~ NULL) %>%
       hypothesize(null = "point", mu = 40) %>%
-      generate(reps = 2, type = "bootstrap", cols = c(hours)),
+      generate(reps = 2, type = "bootstrap", variables = c(hours)),
     "is only relevant for.*will be ignored."
   )
   
   expect_error(
     gss[1:10,] %>%
       specify(hours ~ age + college) %>%
       hypothesize(null = "independence") %>%
-      generate(reps = 2, type = "permute", cols = "hours"),
+      generate(reps = 2, type = "permute", variables = "hours"),
     'unquoted variable names'
   )
+  
+  expect_message(
+    gss[1:10,] %>%
+      specify(hours ~ age + college + age*college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute", variables = age*college),
+    "supply only data columns"
+  )
+  
+  expect_message(
+    gss[1:10,] %>%
+      specify(hours ~ age + college + age*college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute", variables = c(hours, age*college)),
+    "supply only data columns"
+  )
+  
+  expect_silent(
+    gss[1:10,] %>%
+      specify(hours ~ age + college + age*college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute", variables = c(hours))
+  )
+  
+  expect_silent(
+    gss[1:10,] %>%
+      specify(hours ~ age + college + age*college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute")
+  )
+  
+  expect_silent(
+    gss[1:10,] %>%
+      specify(hours ~ age + college) %>%
+      hypothesize(null = "independence") %>%
+      generate(reps = 2, type = "permute")
+  )
+  
+  # warn on type != permute but don't raise message re: interaction
+  # effects unless otherwise used appropriately
+  expect_silent(
+    expect_warning(
+      gss[1:10,] %>%
+        specify(hours ~ age*college) %>%
+        generate(
+          reps = 2, 
+          type = "bootstrap", 
+          variables = c(hours, age*college)
+        )
+    )
+  )
 })
 
 test_that("type = 'draw'/'simulate' superseding handled gracefully", {
diff --git a/vignettes/infer.Rmd b/vignettes/infer.Rmd
@@ -314,7 +314,7 @@ null_fits <- gss %>%
 null_fits
 ```
 
-To permute variables other than the response variable, the `cols` argument to `generate()` allows you to choose any of the `specify()`ed variables to permute independently of each other.
+To permute variables other than the response variable, the `variables` argument to `generate()` allows you to choose columns from the data to permute. Note that any derived effects that depend on these columns (e.g., interaction effects) will also be affected.
 
 Beyond this point, observed fits and distributions from null fits interface exactly like analogous outputs from `calculate()`. For instance, we can use the following code to calculate a 95% confidence interval from these objects.
 
diff --git a/vignettes/observed_stat_examples.Rmd b/vignettes/observed_stat_examples.Rmd
@@ -1003,7 +1003,7 @@ Generating a distribution of fits where each explanatory variable is permuted in
 null_distn2 <- gss %>%
   specify(hours ~ age + college) %>%
   hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute", cols = c(age, college)) %>%
+  generate(reps = 1000, type = "permute", variables = c(age, college)) %>%
   fit()
 ```
 
@@ -1597,7 +1597,7 @@ Alternatively, generating a distribution of fits where each explanatory variable
 null_distn2 <- gss %>%
   specify(hours ~ age + college) %>%
   hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute", cols = c(age, college)) %>%
+  generate(reps = 1000, type = "permute", variables = c(age, college)) %>%
   fit()
 ```