handle missing values when calculating confidence intervals (#521)

simonpcouch · web-flow · commit 3866325ef35c · 2024-01-31T08:33:09.000-06:00
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # infer v1.0.5.9000 (development version)
 
+* Fixed bug where `get_confidence_interval()` would error uninformatively when the supplied distribution of estimates contained missing values. The function will now warn and return a confidence interval calculated using the non-missing estimates.
+
 * Updated infrastructure for errors, warnings, and messages (#513). Most of these changes will not be visible to users, though:
      - Many longer error messages are now broken up into several lines.
      - For references to help-files, users can now click on the error message's text to navigate to the cited documentation.
diff --git a/R/get_confidence_interval.R b/R/get_confidence_interval.R
@@ -227,9 +227,23 @@ switch_ci <- function(type, x, level, point_estimate) {
   )
 }
 
+# Drops missing estimates, warning with the number removed when there are any.
+remove_missing_estimates <- function(estimates) {
+  na_estimates <- is.na(estimates)
+  na_estimates_n <- sum(na_estimates)
+  if (na_estimates_n > 0) {
+    # `{?...}` is cli pluralization, so one NA reads "1 estimate was missing"
+    cli_warn("{na_estimates_n} estimate{?s} {?was/were} missing and \\
+              {?was/were} removed when calculating the confidence interval.")
+  }
+  estimates[!na_estimates]
+}
+
 ci_percentile <- function(x, level) {
   # x[[ncol(x)]] pulls out the stat or estimate column
-  ci_vec <- stats::quantile(x[[ncol(x)]], probs = (1 + c(-level, level)) / 2)
+  estimates <- remove_missing_estimates(x[[ncol(x)]])
+  # drop NAs first: stats::quantile() errors on them when na.rm = FALSE
+  ci_vec <- stats::quantile(estimates, probs = (1 + c(-level, level)) / 2)
 
   make_ci_df(ci_vec)
 }
@@ -247,7 +261,9 @@ ci_se <- function(x, level, point_estimate) {
     }
   } else {
     # x[[ncol(x)]] pulls out the stat or estimate column
-    se <- stats::sd(x[[ncol(x)]])
+    estimates <- remove_missing_estimates(x[[ncol(x)]])
+    se <- stats::sd(estimates)
+
     qfn <- "qnorm"
   }
 
@@ -269,14 +285,16 @@ ci_bias_corrected <- function(x, level, point_estimate) {
   point_estimate <- check_obs_stat(point_estimate)
 
   # x[[ncol(x)]] pulls out the stat or estimate column
-  p <- mean(x[[ncol(x)]] <= point_estimate)
+  estimates <- remove_missing_estimates(x[[ncol(x)]])
+
+  p <- mean(estimates <= point_estimate)
+
   z0 <- stats::qnorm(p)
   # z_alpha_2 is z_(alpha/2)
   z_alpha_2 <- stats::qnorm((1 + c(-level, level)) / 2)
   new_probs <- stats::pnorm(2 * z0 + z_alpha_2)
 
-  # x[[ncol(x)]] pulls out the stat or estimate column
-  ci_vec <- stats::quantile(x[[ncol(x)]], probs = new_probs)
+  ci_vec <- stats::quantile(estimates, probs = new_probs)
 
   make_ci_df(ci_vec)
 }
diff --git a/tests/testthat/_snaps/get_confidence_interval.md b/tests/testthat/_snaps/get_confidence_interval.md
@@ -175,3 +175,11 @@
       Error in `get_confidence_interval()`:
       ! Confidence intervals using a `z` distribution for `stat = mean` are not implemented.
 
+# handles missing values gracefully (#520)
+
+    Code
+      res <- get_confidence_interval(boot_dist, 0.95)
+    Condition
+      Warning:
+      4 estimates were missing and were removed when calculating the confidence interval.
+
diff --git a/tests/testthat/test-get_confidence_interval.R b/tests/testthat/test-get_confidence_interval.R
@@ -471,3 +471,22 @@ test_that("theoretical CIs check arguments properly", {
     )
   )
 })
+
+test_that("handles missing values gracefully (#520)", {
+  data <- data.frame(
+    prop = seq(0, 1, length.out = 10),
+    group = rep(c("a", "b"), each = 5L)
+  )
+  # with this seed a few bootstrap estimates are missing (see the snapshot)
+  set.seed(1)
+  boot_dist <- data %>%
+    specify(prop ~ group) %>%
+    hypothesize(null = "independence") %>%
+    generate(reps = 1000, type = "bootstrap") %>%
+    calculate(stat = "diff in medians", order = c("b", "a"))
+
+  expect_snapshot(res <- get_confidence_interval(boot_dist, .95))
+  expect_s3_class(res, "data.frame")
+  # the bounds come from the non-missing estimates, so neither may be NA
+  expect_false(anyNA(res))
+})