diff --git a/.Rbuildignore b/.Rbuildignore index cb0b7ed2..6ee732f2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -17,4 +17,8 @@ ^DEVELOPMENT.md$ man-roxygen ^.venv$ -^sandbox.R$ \ No newline at end of file +^sandbox.R$ +^README.Rmd$ +^README_cache$ +^pkgdown-watch.R$ +^scrap.Rmd$ \ No newline at end of file diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..5ed5ba34 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# EditorConfig helps developers define and maintain consistent +# coding styles between different editors and IDEs +# editorconfig.org + +root = true + + +[*] + +# Change these settings to your own preference +indent_style = space +indent_size = 2 + +# We recommend you keep these unchanged +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8dc001be..4ac3061c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,9 @@ docs renv/ renv.lock .Rprofile -sandbox.R \ No newline at end of file +sandbox.R +# Vignette caches +*_cache/ +vignettes/*.html +vignettes/*.R +!vignettes/_common.R diff --git a/DESCRIPTION b/DESCRIPTION index b021ec3d..a6ea0289 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ -Type: Package Package: epiprocess +Type: Package Title: Tools for basic signal processing in epidemiology Version: 0.9.5 Authors@R: c( @@ -28,11 +28,11 @@ Authors@R: c( person("Carnegie Mellon University Delphi Group", role = "dtc", comment = "Owner of claims-based CLI data from the Delphi Epidata API") ) -Description: This package introduces a common data structure for - epidemiological data reported by location and time, provides another - data structure to work with revisions to these data sets over time, - and offers associated utilities to perform basic signal processing - tasks. +Description: This package introduces common data structures for working with + epidemiological data reported by location and time and offers associated + utilities to perform basic signal processing tasks. The package is designed + to be used in conjunction with `epipredict` for building and evaluating + epidemiological models.
License: MIT + file LICENSE URL: https://cmu-delphi.github.io/epiprocess/ Depends: @@ -62,6 +62,7 @@ Imports: Suggests: devtools, epidatr, + here, knitr, outbreaks, readr, @@ -88,7 +89,7 @@ Collate: 'correlation.R' 'epi_df.R' 'epi_df_forbidden_methods.R' - 'epiprocess.R' + 'epiprocess-package.R' 'group_by_epi_df_methods.R' 'methods-epi_archive.R' 'grouped_epi_archive.R' diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 46fa3125..60265a84 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -1,10 +1,8 @@ ## Setting up the development environment ```r -install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies -devtools::install_deps(dependencies = TRUE) # install package dependencies -devtools::document() # generate package meta data and man files -devtools::build() # build package +install.packages(c('devtools', 'pkgdown', 'styler', 'lintr', 'pak')) # install dev dependencies +pak::pkg_install(".") # install package and dependencies ``` ## Validating the package @@ -13,8 +11,12 @@ devtools::build() # build package styler::style_pkg() # format code lintr::lint_package() # lint code +devtools::check() # run R CMD check, which runs everything below +devtools::document() # generate package meta data and man files devtools::test() # test package -devtools::check() # check package for errors +devtools::build_vignettes() # build vignettes only +devtools::run_examples() # run doc examples +devtools::check(vignettes = FALSE) # check package without vignettes ``` ## Developing the documentation site @@ -24,20 +26,16 @@ Our CI builds two versions of the documentation: - https://cmu-delphi.github.io/epiprocess/ from the `main` branch and - https://cmu-delphi.github.io/epiprocess/dev from the `dev` branch. -The documentation site can be previewed locally by running in R: - -```r -# Should automatically open a browser -pkgdown::build_site(preview=TRUE) -``` - -If the above does not open a browser, you can try using a Python server from the -command line: +We include the script `pkgdown-watch.R` that will automatically rebuild the +documentation locally and preview it.
It can be used with: -```bash -R -e 'devtools::document()' -R -e 'pkgdown::build_site()' -python -m http.server -d docs +```sh +# Make sure you have servr installed +R -e 'renv::install("servr")' +# Will start a local server +Rscript pkgdown-watch.R +# You may need to first build the site with +R -e 'pkgdown::build_site(".", examples = FALSE, devel = TRUE, preview = FALSE)' ``` ## Versioning diff --git a/NAMESPACE b/NAMESPACE index aa136af5..1fd65d37 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,8 +42,6 @@ S3method(key_colnames,default) S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(mean,epi_df) -S3method(next_after,Date) -S3method(next_after,integer) S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) @@ -65,6 +63,7 @@ export(complete) export(covid_case_death_rates_extended) export(covid_incidence_county_subset) export(covid_incidence_outliers) +export(deprecated_quo_is_present) export(detect_outlr) export(detect_outlr_rm) export(detect_outlr_stl) @@ -89,11 +88,9 @@ export(guess_period) export(is_epi_df) export(is_grouped_epi_archive) export(key_colnames) -export(max_version_with_row_in) export(mutate) export(new_epi_archive) export(new_epi_df) -export(next_after) export(relocate) export(rename) export(revision_summary) diff --git a/R/archive.R b/R/archive.R index d8102165..a50529a2 100644 --- a/R/archive.R +++ b/R/archive.R @@ -9,8 +9,9 @@ #' Validate a version bound arg #' -#' Expected to be used on `clobberable_versions_start`, `versions_end`, -#' and similar arguments. Some additional context-specific checks may be needed. +#' Expected to be used on `clobberable_versions_start`, `versions_end`, and +#' similar arguments. Some additional context-specific checks may be needed. +#' Side effects: raises an error if version bound appears invalid. #' #' @param version_bound the version bound to validate #' @param x a data frame containing a version column with which to check @@ -20,9 +21,7 @@ #' @param version_bound_arg optional string; what to call the version bound in #' error messages #' -#' @section Side effects: raises an error if version bound appears invalid -#' -#' @noRd +#' @keywords internal validate_version_bound <- function(version_bound, x, na_ok = FALSE, version_bound_arg = rlang::caller_arg(version_bound), x_arg = rlang::caller_arg(x)) { @@ -77,7 +76,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, #' #' @importFrom checkmate check_names #' -#' @export +#' @keywords internal max_version_with_row_in <- function(x) { if (nrow(x) == 0L) { cli_abort( @@ -108,72 +107,71 @@ max_version_with_row_in <- function(x) { #' @param x the starting "value"(s) #' @return same class, typeof, and length as `x` #' -#' @export +#' @keywords internal next_after <- function(x) UseMethod("next_after") -#' @export +#' @keywords internal next_after.integer <- function(x) x + 1L -#' @export +#' @keywords internal next_after.Date <- function(x) x + 1L -#' Compactify -#' -#' This section describes the internals of how compactification works in an -#' `epi_archive()`. Compactification can potentially improve code speed or -#' memory usage, depending on your data. -#' -#' In general, the last version of each observation is carried forward (LOCF) to -#' fill in data between recorded versions, and between the last recorded -#' update and the `versions_end`. 
One consequence is that the `DT` doesn't -#' have to contain a full snapshot of every version (although this generally -#' works), but can instead contain only the rows that are new or changed from -#' the previous version (see `compactify`, which does this automatically). -#' Currently, deletions must be represented as revising a row to a special -#' state (e.g., making the entries `NA` or including a special column that -#' flags the data as removed and performing some kind of post-processing), and -#' the archive is unaware of what this state is. Note that `NA`s *can* be -#' introduced by `epi_archive` methods for other reasons, e.g., in -#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to -#' represent potential update data that we do not yet have access to; or in -#' [`epix_merge`] to represent the "value" of an observation before the -#' version in which it was first released, or if no version of that -#' observation appears in the archive data at all. -#' -#' @name compactify -NULL - - -#' Epi Archive +#' `epi_archive` object #' -#' @title `epi_archive` object +#' @description The second main data structure for storing time series in +#' `epiprocess`. It is similar to `epi_df` in that it is fundamentally a table with +#' a few required columns that stores epidemiological time series data. An +#' `epi_archive` requires a `geo_value`, `time_value`, and `version` column (and +#' possibly other key columns) along with measurement values. In brief, an +#' `epi_archive` is a history of the time series data, where the `version` +#' column tracks the time at which the data was available. This allows for +#' version-aware forecasting. #' +#' `new_epi_archive` is the constructor for `epi_archive` objects that assumes +#' all arguments have been validated. Most users should use `as_epi_archive`. #' -#' @description An `epi_archive` is an S3 class which contains a data table -#' along with several relevant pieces of metadata. The data table can be seen -#' as the full archive (version history) for some signal variables of -#' interest. #' -#' @details An `epi_archive` contains a data table `DT`, of class `data.table` -#' from the `data.table` package, with (at least) the following columns: +#' @details An `epi_archive` contains a `data.table` object `DT` (from the +#' `{data.table}` package), with (at least) the following columns: #' -#' * `geo_value`: the geographic value associated with each row of measurements. -#' * `time_value`: the time value associated with each row of measurements. +#' * `geo_value`: the geographic value associated with each row of measurements, +#' * `time_value`: the time value associated with each row of measurements, #' * `version`: the time value specifying the version for each row of #' measurements. For example, if in a given row the `version` is January 15, #' 2022 and `time_value` is January 14, 2022, then this row contains the #' measurements of the data for January 14, 2022 that were available one day #' later. #' -#' The data table `DT` has key variables `geo_value`, `time_value`, `version`, -#' as well as any others (these can be specified when instantiating the -#' `epi_archive` object via the `other_keys` argument, and/or set by operating -#' on `DT` directly). Note that there can only be a single row per unique -#' combination of key variables. +#' The variables `geo_value`, `time_value`, `version` serve as key variables for +#' the data table (in addition to any other keys specified in the metadata).
+#' There can only be a single row per unique combination of key variables. The +#' keys for an `epi_archive` can be viewed with `key(epi_archive$DT)`. +#' +#' ## Compactification +#' +#' By default, an `epi_archive` will compactify the data table to remove +#' redundant rows: a row is not stored if it matches the preceding version in +#' every column other than `version` (this is essentially a last observation +#' carried forward, but along the version index), which saves space and +#' improves performance. If you do not want to compactify the data, you can set +#' `compactify = FALSE` in `as_epi_archive()`. +#' +#' Note that in some data scenarios, LOCF may not be appropriate. For instance, +#' if you expected data to be updated on a given day, but your data source did +#' not update, then it could be reasonable to code the data as `NA` for that +#' day, instead of assuming LOCF. +#' +#' `NA`s *can* be introduced by `epi_archive` methods for other +#' reasons, e.g., in [`epix_fill_through_version`] and [`epix_merge`], if +#' requested, to represent potential update data that we do not yet have access +#' to; or in [`epix_merge`] to represent the "value" of an observation before +#' the version in which it was first released, or if no version of that +#' observation appears in the archive data at all. +#' +#' ## Metadata #' -#' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: #' @@ -187,20 +185,6 @@ NULL #' archive. Unexpected behavior may result from modifying the metadata #' directly. #' -#' @section Generating Snapshots: -#' An `epi_archive` object can be used to generate a snapshot of the data in -#' `epi_df` format, which represents the most up-to-date time series values up -#' to a point in time. This is accomplished by calling `epix_as_of()`. -#' -#' @section Sliding Computations: -#' We can run a sliding computation over an `epi_archive` object, much like -#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling -#' the `slide()` method for an `epi_archive` object, which works similarly to -#' the way `epi_slide()` works for an `epi_df` object, but with one key -#' difference: it is version-aware. That is, for an `epi_archive` object, the -#' sliding computation at any given reference time point t is performed on -#' **data that would have been available as of t**. -#' #' @param x A data.frame, data.table, or tibble, with columns `geo_value`, #' `time_value`, `version`, and then any additional number of columns. #' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the @@ -239,9 +223,11 @@ NULL #' value of `clobberable_versions_start` does not fully trust these empty #' updates, and assumes that any version `>= max(x$version)` could be #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. -#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification +#' @param compactify_tol double. the tolerance used to detect approximate +#' equality for compactification #' @return An `epi_archive` object. #' +#' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`] #' @importFrom data.table as.data.table key setkeyv #' @importFrom dplyr if_any if_all everything #' @importFrom utils capture.output @@ -356,12 +342,13 @@ new_epi_archive <- function( ) } -#' given a tibble as would be found in an epi_archive, remove duplicate entries.
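To make the compactification rule above concrete, here is a minimal sketch (illustrative values, not part of this diff; it assumes only the `compactify` behavior of `as_epi_archive()` documented above):

```r
# Minimal compactification sketch (illustrative; not from this diff).
# The second recorded version repeats the first version's value, so under
# LOCF it is redundant and is dropped from the stored DT by default.
library(epiprocess)

d <- as.Date("2020-01-01")
x <- data.frame(
  geo_value = "ca",
  time_value = d,
  version = d + 1:2, # two recorded versions of the same observation...
  value = c(10, 10) # ...with an unchanged value
)
as_epi_archive(x)$DT # compactified: one row
as_epi_archive(x, compactify = FALSE)$DT # both rows kept
```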
-#' @description -#' works by shifting all rows except the version, then comparing values to see +#' Given a tibble as would be found in an epi_archive, remove duplicate entries. +#' +#' Works by shifting all rows except the version, then comparing values to see #' if they've changed. We need to arrange in descending order, but note that #' we don't need to group, since at least one column other than version has #' changed, and so is kept. +#' #' @keywords internal #' @importFrom dplyr filter apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) { @@ -466,6 +453,7 @@ validate_epi_archive <- function( #' `as_epi_archive` converts a data frame, data table, or tibble into an #' `epi_archive` object. +#' #' @param ... used for specifying column names, as in [`dplyr::rename`]. For #' example `version = release_date` #' @param .versions_end location based versions_end, used to avoid prefix diff --git a/R/epi_df.R b/R/epi_df.R index 070ddb06..66ccd88a 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -1,8 +1,10 @@ #' `epi_df` object #' -#' An `epi_df` is a tibble with certain minimal column structure and metadata. -#' It can be seen as a snapshot of a data set that contains the most -#' up-to-date values of some signal variables of interest, as of a given time. +#' One of the two main data structures for storing time series in `epiprocess`. +#' It is simply a tibble with at least two columns, `geo_value` and `time_value`, +#' that provide the keys for the time series. It can have any other columns, +#' which can be seen as measured variables at each key. In brief, an `epi_df` +#' represents a snapshot of an epidemiological data set at a point in time. #' #' @details An `epi_df` is a tibble with (at least) the following columns: #' @@ -40,6 +42,9 @@ #' single snapshot of a data set that contains the most up-to-date values of #' the signals variables, as of the time specified in the `as_of` field. #' +#' If an `epi_df` ever loses its `geo_value` or `time_value` columns, it will +#' decay into a regular tibble. +#' #' A companion object is the `epi_archive` object, which contains the full #' version history of a given data set. Revisions are common in many types of #' epidemiological data streams, and paying attention to data revisions can be @@ -49,7 +54,8 @@ #' generate `epi_df` objects, as data snapshots, from an `epi_archive` #' object). #' -#' @section Geo Types: +#' ## Geo Types +#' #' The following geo types are recognized in an `epi_df`. #' #' * `"county"`: each observation corresponds to a U.S. county; coded by 5-digit @@ -67,7 +73,8 @@ #' #' An unrecognizable geo type is labeled "custom". #' -#' @section Time Types: +#' ## Time Types +#' #' The following time types are recognized in an `epi_df`.
#' #' * `"day"`: each observation corresponds to a day; coded as a `Date` object, @@ -85,33 +92,30 @@ #' @examples #' # Convert a `tsibble` that has county code as an extra key #' # Notice that county code should be a character string to preserve any leading zeroes -#' #' ex1_input <- tibble::tibble( -#' geo_value = rep(c("ca", "fl", "pa"), each = 3), -#' county_code = c( +#' geo_value = c( #' "06059", "06061", "06067", #' "12111", "12113", "12117", #' "42101", "42103", "42105" #' ), +#' state_name = rep(c("ca", "fl", "pa"), each = 3), #' time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), #' by = "day" #' ), length.out = length(geo_value)), #' value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) #' ) %>% -#' tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) +#' tsibble::as_tsibble(index = time_value, key = c(geo_value, state_name)) #' -#' # The `other_keys` metadata (`"county_code"` in this case) is automatically +#' # The `other_keys` metadata (`"state_name"` in this case) is automatically #' # inferred from the `tsibble`'s `key`: #' ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") #' attr(ex1, "metadata")[["other_keys"]] #' -#' #' # Dealing with misspecified column names: #' # Geographical and temporal information must be provided in columns named #' # `geo_value` and `time_value`; if we start from a data frame with a #' # different format, it must be converted to use `geo_value` and `time_value` #' # before calling `as_epi_df`. -#' #' ex2_input <- tibble::tibble( #' state = rep(c("ca", "fl", "pa"), each = 3), # misnamed #' pol = rep(c("blue", "swing", "swing"), each = 3), # extra key @@ -120,7 +124,6 @@ #' ), length.out = length(state)), # misnamed #' value = 1:length(state) + 0.01 * rnorm(length(state)) #' ) -#' #' print(ex2_input) #' #' ex2 <- ex2_input %>% @@ -129,12 +132,9 @@ #' as_of = "2020-06-03", #' other_keys = "pol" #' ) -#' #' attr(ex2, "metadata") #' -#' #' # Adding additional keys to an `epi_df` object -#' #' ex3_input <- covid_incidence_county_subset %>% #' dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% #' dplyr::slice_tail(n = 6) @@ -149,6 +149,10 @@ #' as_epi_df(other_keys = c("state", "pol")) #' #' attr(ex3, "metadata") +#' +#' # Decays to a tibble +#' covid_incidence_county_subset %>% +#' select(-geo_value) NULL #' @describeIn epi_df Lower-level constructor for `epi_df` object @@ -298,7 +302,7 @@ as_epi_df.tbl_df <- function( #' @method as_epi_df data.frame #' @export as_epi_df.data.frame <- function(x, as_of, other_keys = character(), ...) { - as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) + as_epi_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) } #' @rdname epi_df @@ -310,7 +314,7 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) { if (length(tsibble_other_keys) > 0) { other_keys <- unique(c(other_keys, tsibble_other_keys)) } - as_epi_df.tbl_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) + as_epi_df(x = tibble::as_tibble(x), as_of = as_of, other_keys = other_keys, ...) } #' Test for `epi_df` format @@ -318,6 +322,7 @@ as_epi_df.tbl_ts <- function(x, as_of, other_keys = character(), ...) { #' @param x An object. #' @return `TRUE` if the object inherits from `epi_df`. 
#' +#' @rdname epi_df #' @export is_epi_df <- function(x) { inherits(x, "epi_df") } diff --git a/R/epiprocess-package.R b/R/epiprocess-package.R new file mode 100644 index 00000000..200e6f64 --- /dev/null +++ b/R/epiprocess-package.R @@ -0,0 +1,23 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @import epidatasets +#' @importFrom checkmate anyInfinite anyMissing assert assert_character +#' @importFrom checkmate assert_class assert_data_frame assert_int assert_list +#' @importFrom checkmate assert_logical assert_numeric assert_scalar checkInt +#' @importFrom checkmate check_atomic check_data_frame expect_class test_int +#' @importFrom checkmate test_subset test_set_equal vname +#' @importFrom cli cli_abort cli_warn +#' @importFrom lifecycle deprecated +#' @importFrom rlang %||% +## usethis namespace: end +NULL + +utils::globalVariables(c( + ".x", ".group_key", ".ref_time_value", "resid", + "fitted", ".response", "geo_value", "time_value", + "value", ".real", "lag", "max_value", "min_value", + "median_value", "spread", "rel_spread", "time_to", + "time_near_latest", "n_revisions" +)) diff --git a/R/epiprocess.R b/R/epiprocess.R deleted file mode 100644 index 147d4ef9..00000000 --- a/R/epiprocess.R +++ /dev/null @@ -1,24 +0,0 @@ -#' epiprocess: Tools for basic signal processing in epidemiology -#' -#' This package introduces a common data structure for epidemiological data sets -#' measured over space and time, and offers associated utilities to perform -#' basic signal processing tasks. -#' -#' @importFrom checkmate assert assert_scalar assert_data_frame anyMissing -#' assert_logical assert_list assert_character assert_class -#' assert_int assert_numeric check_data_frame vname check_atomic -#' anyInfinite test_subset test_set_equal checkInt expect_class -#' test_int -#' @importFrom cli cli_abort cli_warn -#' @importFrom rlang %||% -#' @importFrom lifecycle deprecated -#' @import epidatasets -#' @name epiprocess -"_PACKAGE" -utils::globalVariables(c( - ".x", ".group_key", ".ref_time_value", "resid", - "fitted", ".response", "geo_value", "time_value", - "value", ".real", "lag", "max_value", "min_value", - "median_value", "spread", "rel_spread", "time_to", - "time_near_latest", "n_revisions" -)) diff --git a/R/growth_rate.R b/R/growth_rate.R index b9b9a440..307309b5 100644 --- a/R/growth_rate.R +++ b/R/growth_rate.R @@ -56,7 +56,8 @@ #' `genlasso::trendfilter()`, divided by the fitted value of the discrete #' spline at `x0`. #' -#' @section Log Scale: +#' ## Log Scale +#' #' An alternative view for the growth rate of a function f in general is given #' by defining g(t) = log(f(t)), and then observing that g'(t) = f'(t) / #' f(t). Therefore, any method that estimates the derivative can be simply @@ -65,7 +66,8 @@ #' "trend_filter") has a log scale analog, which can be used by setting #' `log_scale = TRUE`. #' -#' @section Sliding Windows: +#' ## Sliding Windows +#' #' For the local methods, "rel_change" and "linear_reg", we use a sliding window #' centered at the reference point of bandwidth `h`. In other words, the #' sliding window consists of all points in `x` whose distance to the @@ -75,7 +77,8 @@ #' sliding window contains all data in between January 1 and 14 (matching the #' behavior of `epi_slide()` with `before = h - 1` and `after = h`). #' -#' @section Additional Arguments: +#' ## Additional Arguments +#' #' For the global methods, "smooth_spline" and "trend_filter", additional #' arguments can be specified via `...` for the underlying estimation #' function.
For the smoothing spline case, these additional arguments are diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 0304d9a6..d16b1485 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -138,33 +138,45 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, #' Fill `epi_archive` unobserved history #' #' @description -#' Sometimes, due to upstream data pipeline issues, we have to work with a -#' version history that isn't completely up to date, but with functions that -#' expect archives that are completely up to date, or equally as up-to-date as -#' another archive. This function provides one way to approach such mismatches: -#' pretend that we've "observed" additional versions, filling in these versions -#' with NAs or extrapolated values. +#' This function fills in missing version history in an `epi_archive` object up +#' to a specified version, updating the `versions_end` field as necessary. Note +#' that the filling is done in a compactified way; see details. #' #' @param x An `epi_archive` -#' @param fill_versions_end Length-1, same class&type as `x$version`: the -#' version through which to fill in missing version history; this will be the -#' result's `$versions_end` unless it already had a later -#' `$versions_end`. -#' @param how Optional; `"na"` or `"locf"`: `"na"` will fill in any missing -#' required version history with `NA`s, by inserting (if necessary) an update -#' immediately after the current `$versions_end` that revises all -#' existing measurements to be `NA` (this is only supported for `version` -#' classes with a `next_after` implementation); `"locf"` will fill in missing -#' version history with the last version of each observation carried forward -#' (LOCF), by leaving the update `$DT` alone (other `epi_archive` methods are -#' based on LOCF). Default is `"na"`. +#' @param fill_versions_end a scalar of the same class&type as `x$version`: the +#' version through which to fill in missing version history; the +#' `epi_archive`'s `versions_end` attribute will be set to this, unless it +#' already had a later `$versions_end`. +#' @param how Optional; `"na"` or `"locf"`: `"na"` fills missing version history +#' with `NA`s, `"locf"` fills missing version history with the last version of +#' each observation carried forward (LOCF). Default is `"na"`. +#' @return An `epi_archive` +#' @details +#' Note that we generally store `epi_archive`s in a compacted form, meaning +#' that, implicitly, if a version does not exist, but the `versions_end` +#' attribute is greater, then it is understood that all the versions in between +#' had the same value as the last observed version. This affects the behavior of +#' this function in the following ways: +#' +#' - if `how = "na"`, then the function will fill in at most one missing version +#' with `NA` and the rest will be implicit. +#' - if `how = "locf"`, then the function will not fill any values.
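A small sketch of the "implicit versions" idea described above (illustrative, not part of this diff; it relies only on `epix_as_of()`, whose signature appears in the hunk header above):

```r
# Version 3 was never recorded, so a snapshot as of that version is
# reconstructed by carrying the last recorded version (version 1) forward.
library(epiprocess)

d <- as.Date("2020-01-01")
ea <- as_epi_archive(data.frame(
  geo_value = "ak",
  time_value = d,
  version = d + c(1, 5),
  value = c(1, 2)
))
epix_as_of(ea, d + 3) # LOCF: returns the value from version d + 1
```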
#' @importFrom data.table copy ":=" #' @importFrom rlang arg_match #' @return An `epi_archive` #' @export -epix_fill_through_version <- function(x, fill_versions_end, - how = c("na", "locf")) { +#' @examples +#' test_date <- as.Date("2020-01-01") +#' ea_orig <- as_epi_archive(data.table::data.table( +#' geo_value = "ak", +#' time_value = test_date + c(rep(0L, 5L), 1L), +#' version = test_date + c(1:5, 2L), +#' value = 1:6 +#' )) +#' epix_fill_through_version(ea_orig, test_date + 8, "na") +#' epix_fill_through_version(ea_orig, test_date + 8, "locf") +epix_fill_through_version <- function(x, fill_versions_end, how = c("na", "locf")) { assert_class(x, "epi_archive") validate_version_bound(fill_versions_end, x$DT, na_ok = FALSE) @@ -233,24 +245,38 @@ epix_fill_through_version <- function(x, fill_versions_end, #' parameter controls what is done. #' #' @param x,y Two `epi_archive` objects to join together. -#' @param sync Optional; `"forbid"`, `"na"`, `"locf"`, or `"truncate"`; in the -#' case that `x$versions_end` doesn't match `y$versions_end`, what do we do?: -#' `"forbid"`: emit an error; "na": use `max(x$versions_end, y$versions_end)` -#' as the result's `versions_end`, but ensure that, if we request a snapshot -#' as of a version after `min(x$versions_end, y$versions_end)`, the -#' observation columns from the less up-to-date archive will be all NAs (i.e., -#' imagine there was an update immediately after its `versions_end` which -#' revised all observations to be `NA`); `"locf"`: use `max(x$versions_end, -#' y$versions_end)` as the result's `versions_end`, allowing the last version -#' of each observation to be carried forward to extrapolate unavailable -#' versions for the less up-to-date input archive (i.e., imagining that in the -#' less up-to-date archive's data set remained unchanged between its actual -#' `versions_end` and the other archive's `versions_end`); or `"truncate"`: -#' use `min(x$versions_end, y$versions_end)` as the result's `versions_end`, -#' and discard any rows containing update rows for later versions. -#' @param compactify Optional; `TRUE`, `FALSE`, or `NULL`; should the result be -#' compactified? See `as_epi_archive()` for an explanation of what this means. -#' Default here is `TRUE`. +#' @param sync Optional; character. The argument that decides how to handle the +#' situation when one signal has a more recent revision than another signal +#' for a key that they have both already observed. The options are: +#' +#' - `"forbid"`: the default and the strictest option, throws an error; this +#' is likely not what you want, but it forces the user to notice the version +#' mismatch and decide how to handle it, +#' - `"locf"`: carry forward the last observed version of the missing signal +#' to the new version and use `max(x$versions_end, y$versions_end)` as the +#' result's `versions_end`, +#' - `"na"`: fill the unobserved values with `NA`'s (this can be handy when +#' you know that source data is truly missing upstream and you want to +#' represent the lack of information accurately, for instance) and use +#' `max(x$versions_end, y$versions_end)` as the result's `versions_end`, +#' - `"truncate"`: discard any update rows for later versions +#' and use `min(x$versions_end, y$versions_end)` as the result's +#' `versions_end`. +#' +#' @param compactify Optional; `TRUE` (default), `FALSE`, or `NULL`; should the +#' result be compactified? See `as_epi_archive()` for details.
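To make the non-default `sync` options concrete, a hedged sketch reusing the `s1`/`s2` archives from the first `locf` example below (the comments describe the behavior as documented above, not verified output):

```r
# s2's versions_end (2024-08-03) is later than s1's (2024-08-02).
library(epiprocess)
library(dplyr)

s1 <- tibble::tibble(
  geo_value = c("ca", "ca", "ca"),
  time_value = as.Date(c("2024-08-01", "2024-08-01", "2024-08-02")),
  version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")),
  signal1 = c(10, 11, 7)
) %>% as_epi_archive()
s2 <- tibble::tibble(
  geo_value = c("ca", "ca"),
  time_value = as.Date(c("2024-08-01", "2024-08-02")),
  version = as.Date(c("2024-08-03", "2024-08-03")),
  signal2 = c(2, 3)
) %>% as_epi_archive()

# "na": versions_end is the max; signal1 is coded NA for versions after
# s1's own versions_end
epix_merge(s1, s2, sync = "na")[["DT"]]
# "truncate": versions_end is the min; update rows for later versions are
# discarded
epix_merge(s1, s2, sync = "truncate")[["DT"]]
```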
+#' @details +#' When merging archives, unless the archives have identical data release +#' patterns, we often have to handle the situation when one signal has a more +#' recent observation for a key than another signal. In this case, we have two +#' options: +#' +#' - if the other signal has never observed that key, we need to introduce +#' `NA`s in the non-key variables for the missing signal, +#' - if the other signal has observed that key previously, but at an earlier +#' revision date, then we need to decide how to handle the missing value in the +#' more recent signal; the `sync` argument controls this behavior. +#' #' @return the resulting `epi_archive` #' #' @details In all cases, `clobberable_versions_start` will be set to the @@ -265,18 +291,14 @@ epix_fill_through_version <- function(x, fill_versions_end, #' version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")), #' signal1 = c(10, 11, 7) #' ) -#' #' s2 <- tibble::tibble( #' geo_value = c("ca", "ca"), #' time_value = as.Date(c("2024-08-01", "2024-08-02")), #' version = as.Date(c("2024-08-03", "2024-08-03")), #' signal2 = c(2, 3) #' ) -#' -#' #' s1 <- s1 %>% as_epi_archive() #' s2 <- s2 %>% as_epi_archive() -#' #' merged <- epix_merge(s1, s2, sync = "locf") #' merged[["DT"]] #' @@ -288,18 +310,14 @@ epix_fill_through_version <- function(x, fill_versions_end, #' version = as.Date(c("2024-08-01", "2024-08-03", "2024-08-03", "2024-08-03")), #' signal1 = c(12, 13, 22, 19) #' ) -#' #' s2 <- tibble::tibble( #' geo_value = c("ca", "ca"), #' time_value = as.Date(c("2024-08-01", "2024-08-02")), #' version = as.Date(c("2024-08-02", "2024-08-02")), #' signal2 = c(4, 5), #' ) -#' -#' #' s1 <- s1 %>% as_epi_archive() #' s2 <- s2 %>% as_epi_archive() -#' #' merged <- epix_merge(s1, s2, sync = "locf") #' merged[["DT"]] #' @@ -311,7 +329,6 @@ epix_fill_through_version <- function(x, fill_versions_end, #' version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")), #' signal1 = c(14, 11, 9) #' ) -#' #' # The s2 signal at August 1st gets revised from 3 to 5 on August 3rd #' s2 <- tibble::tibble( #' geo_value = c("ca", "ca", "ca"), @@ -319,11 +336,8 @@ epix_fill_through_version <- function(x, fill_versions_end, #' version = as.Date(c("2024-08-02", "2024-08-03", "2024-08-03")), #' signal2 = c(3, 5, 2), #' ) -#' #' s1 <- s1 %>% as_epi_archive() #' s2 <- s2 %>% as_epi_archive() -#' -#' # Some LOCF for signal 1 as signal 2 gets updated #' merged <- epix_merge(s1, s2, sync = "locf") #' merged[["DT"]] #' @importFrom data.table key set setkeyv diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 901b9b32..84a75e46 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -7,10 +7,8 @@ #' `as_tibble()` on `epi_df`s but you actually want them to remain `epi_df`s, #' use `attr(your_epi_df, "decay_to_tibble") <- FALSE` beforehand. #' -#' @template x -#' +#' @param x an `epi_df` #' @inheritParams tibble::as_tibble -#' #' @importFrom tibble as_tibble #' @export as_tibble.epi_df <- function(x, ...) { @@ -34,7 +32,7 @@ as_tibble.epi_df <- function(x, ...) { #' others in the `other_keys` field of the metadata, or else explicitly set. #' #' @method as_tsibble epi_df -#' @template x +#' @param x an `epi_df` #' @param key Optional. Any additional keys (other than `geo_value`) to add to #' the `tsibble`. #' @param ... additional arguments passed on to `tsibble::as_tsibble()` @@ -54,8 +52,7 @@ as_tsibble.epi_df <- function(x, key, ...) { #' #' Print and summary functions for an `epi_df` object.
#' -#' @template x -#' +#' @param x an `epi_df` #' @method print epi_df #' @param ... additional arguments to forward to `NextMethod()`, or unused #' @export @@ -89,6 +86,7 @@ print.epi_df <- function(x, ...) { #' @method summary epi_df #' @importFrom rlang .data #' @importFrom stats median +#' @rdname print.epi_df #' @export summary.epi_df <- function(object, ...) { cat("An `epi_df` x, with metadata:\n") @@ -123,7 +121,7 @@ summary.epi_df <- function(object, ...) { #' @return `x` with any metadata dropped and the `"epi_df"` class, if previously #' present, dropped #' -#' @noRd +#' @keywords internal decay_epi_df <- function(x) { attributes(x)$metadata <- NULL class(x) <- class(x)[class(x) != "epi_df"] @@ -140,6 +138,8 @@ decay_epi_df <- function(x) { # We'll implement `[` to allow either 1d or 2d. We'll also implement some other # methods where we want to (try to) maintain an `epi_df`. +#' dplyr_reconstruct +#' #' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may #' directly feed in latter from our other methods) #' @param template `epi_df` template to use to restore #' @importFrom dplyr dplyr_reconstruct #' @importFrom cli cli_vec #' @export -#' @noRd +#' @keywords internal dplyr_reconstruct.epi_df <- function(data, template) { # Start from a reconstruction for the backing S3 classes; this ensures that we # keep any grouping that has been applied: @@ -258,9 +258,9 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { #' Complete epi_df #' -#' A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function +#' A `tidyr::complete()` analogue for `epi_df` objects. This function #' can be used, for example, to add rows for missing combinations -#' of ‘geo_value’ and ‘time_value’, filling other columns with `NA`s. +#' of `geo_value` and `time_value`, filling other columns with `NA`s. #' See the examples for usage details. #' #' @param data an `epi_df` diff --git a/R/outliers.R b/R/outliers.R index 43c41d6e..68e921bb 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -1,12 +1,15 @@ #' Detect outliers #' -#' Applies one or more outlier detection methods to a given signal variable, and +#' @description Applies one or more outlier detection methods to a given signal variable, and #' optionally aggregates the outputs to create a consensus result. See the #' [outliers #' vignette](https://cmu-delphi.github.io/epiprocess/articles/outliers.html) for #' examples. #' -#' @template x-y +#' @param x Design points corresponding to the signal values `y`. Default is +#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of +#' `y`). +#' @param y Signal values. #' @param methods A tibble specifying the method(s) to use for outlier #' detection, with one row per method, and the following columns: #' * `method`: Either "rm" or "stl", or a custom function for outlier #' detection #' * `args`: Named list of arguments that will be passed to the detection method #' * `abbr`: Abbreviation to use in naming output columns @@ -22,7 +25,9 @@ #' summarized results are calculated. Note that if the number of `methods` #' (number of rows) is odd, then "median" is equivalent to a majority vote for #' purposes of determining whether a given observation is an outlier. -#' @template detect-outlr-return +#' @return A tibble with number of rows equal to `length(y)` and columns +#' giving the outlier detection thresholds (`lower` and `upper`) and +#' replacement values from each detection method (`replacement`).
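For orientation, a hedged sketch of such a `methods` tibble (the `args` and `abbr` columns follow the column descriptions above; the specific argument names used are recalled from the package's docs and should be treated as assumptions):

```r
# Hypothetical two-method specification: a rolling-median detector and an
# STL detector, with per-method arguments and output-name abbreviations.
library(dplyr)
library(tibble)

methods <- bind_rows(
  tibble(method = "rm", args = list(list(detect_negatives = TRUE)), abbr = "rm"),
  tibble(method = "stl", args = list(list(seasonal_period = 7)), abbr = "stl")
)
# Passed as detect_outlr(x, y, methods = methods, combiner = "median")
```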
#' #' @details Each outlier detection method, one per row of the passed `methods` #' tibble, is a function that must take as its first two arguments `x` and @@ -38,6 +43,7 @@ #' "stl", shorthand for `detect_outlr_stl()`, which detects outliers via an #' STL decomposition. #' +#' @rdname detect_outlr #' @export #' @importFrom dplyr select #' @examples @@ -138,20 +144,18 @@ detect_outlr <- function(x = seq_along(y), y, return(results) } -#' Detect outliers based on a rolling median +#' @description `detect_outlr_rm` detects outliers based on a distance from the +#' rolling median specified in terms of multiples of the rolling interquartile +#' range (IQR). #' -#' Detects outliers based on a distance from the rolling median specified in -#' terms of multiples of the rolling interquartile range (IQR). -#' -#' @template x-y #' @param n Number of time steps to use in the rolling window. Default is 21. #' This value is centrally aligned. When `n` is an odd number, the rolling #' window extends from `(n-1)/2` time steps before each design point to `(n-1)/2` #' time steps after. When `n` is even, then the rolling range extends from #' `n/2-1` time steps before to `n/2` time steps after. #' @template outlier-detection-options -#' @template detect-outlr-return #' +#' @rdname detect_outlr #' @export #' @examples #' # Detect outliers based on a rolling median @@ -208,11 +212,9 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, return(z) } -#' Detect outliers based on an STL decomposition -#' -#' Detects outliers based on a seasonal-trend decomposition using LOESS (STL). +#' @description `detect_outlr_stl` detects outliers based on a seasonal-trend +#' decomposition using LOESS (STL). #' -#' @template x-y #' @param n_trend Number of time steps to use in the rolling window for trend. #' Default is 21. #' @param n_seasonal Number of time steps to use in the rolling window for @@ -233,7 +235,6 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' `seasonal_period` will still have an impact on the result, though, by #' impacting the estimation of the trend component. #' @template outlier-detection-options -#' @template detect-outlr-return #' #' @details The STL decomposition is computed using [`stats::stl()`]. Once #' computed, the outlier detection method is analogous to the rolling median @@ -244,6 +245,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' The last set of arguments, `log_transform` through `replacement_multiplier`, #' are exactly as in `detect_outlr_rm()`. #' +#' @rdname detect_outlr #' @importFrom stats median #' @importFrom tidyselect starts_with #' @export diff --git a/R/revision_analysis.R b/R/revision_analysis.R index 7be0cd24..27944489 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -1,9 +1,11 @@ -#' A function to describe revision behavior for an archive +#' A function to describe revision behavior for an archive. +#' #' @description #' `revision_summary` removes all missing values (if requested), and then #' computes some basic statistics about the revision behavior of an archive, -#' returning a tibble summarizing the revisions per time_value+epi_key features. If `print_inform` is true, it -#' prints a concise summary. The columns returned are: +#' returning a tibble summarizing the revisions per time_value+epi_key +#' combination. If `print_inform` is true, it prints a concise summary. The +#' columns returned are: #' 1. `n_revisions`: the total number of revisions for that entry #' 2.
`min_lag`: the minimum time to any value (if `drop_nas=FALSE`, this #' includes `NA`'s) @@ -17,11 +19,12 @@ #' 8. `rel_spread`: `spread` divided by the largest value (so it will #' always be less than 1). Note that this need not be the final value. It will #' be `NA` whenever `spread` is 0. -#' 9. `time_near_latest`: This gives the lag when the value is within -#' `within_latest` (default 20%) of the value at the latest time. For example, -#' consider the series (0,20, 99, 150, 102, 100); then `time_near_latest` is -#' the 5th index, since even though 99 is within 20%, it is outside the window -#' afterwards at 150. +#' 9. `time_near_latest`: the time taken for the revisions to settle to within +#' `within_latest` (default 20%) of the final value and stay there. For +#' example, consider the series (0, 20, 99, 150, 102, 100); then +#' `time_near_latest` is 5, since even though 99 is within 20%, it is outside +#' the window afterwards at 150. +#' #' @param epi_arch an epi_archive to be analyzed #' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to #' summarize. If empty, it chooses the first. Currently only implemented for @@ -57,11 +60,11 @@ #' @param compactify_tol float, used if `drop_nas=TRUE`, it determines the #' threshold for when two floats are considered identical. #' @param should_compactify bool. Compactify if `TRUE`. -#' @examples #' +#' @examples #' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) -#' #' revision_example %>% arrange(desc(spread)) +#' #' @export #' @importFrom cli cli_inform cli_abort cli_li #' @importFrom rlang list2 syms diff --git a/R/slide.R b/R/slide.R index c792187e..62a0d03e 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,91 +1,124 @@ #' Slide a function over variables in an `epi_df` object #' -#' Slides a given function over variables in an `epi_df` object. See the -#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) -#' for examples. +#' @description Slides a given function over variables in an `epi_df` object. +#' This is useful for computations like rolling averages. The function supports +#' many ways to specify the computation, but by far the most common use case is +#' as follows: +#' +#' ``` +#' # To compute the 7-day trailing average of cases +#' epi_slide(edf, cases_7dav = mean(cases), .window_size = 7) +#' ``` +#' +#' This will create the new column `cases_7dav` that contains a 7-day rolling +#' average of values in "cases". See `vignette("epi_df")` for more examples. #' #' @template basic-slide-params #' @param .f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation within a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `.window_size` and `.align` parameters, see the details -#' section for more. If a function, `.f` must have the form `function(x, g, t, -#' ...)`, where +#' computation to slide. The return of the computation should either be a +#' scalar or a 1-row data frame. Data frame returns will be +#' `tidyr::unpack()`-ed, if unnamed, and will be [`tidyr::pack`]-ed columns, if +#' named. See examples. +#' +#' - If `.f` is missing, then `...` will specify the computation via +#' tidy-evaluation. This is usually the most convenient way to use +#' `epi_slide`. See examples. +#' - If `.f` is a formula, then the formula should use `.x` (not the same as +#' the input `epi_df`) to operate on the columns of the input `epi_df`, e.g.
+#' `~mean(.x$var)` to compute a mean of `var`. +#' - If a function, `.f` must have the form `function(x, g, t, ...)`, where: +#' - `x` is a data frame with the same column names as the original object, +#' minus any grouping variables, with only the windowed data for one +#' group-`.ref_time_value` combination +#' - `g` is a one-row tibble containing the values of the grouping variables +#' for the associated group +#' - `t` is the `.ref_time_value` for the current window +#' - `...` are additional arguments #' -#' - `x` is a data frame with the same column names as the original object, -#' minus any grouping variables, with only the windowed data for one -#' group-`.ref_time_value` combination -#' - `g` is a one-row tibble containing the values of the grouping variables -#' for the associated group -#' - `t` is the `.ref_time_value` for the current window -#' - `...` are additional arguments -#' -#' If a formula, `.f` can operate directly on columns accessed via `.x$var` or -#' `.$var`, as in `~mean(.x$var)` to compute a mean of a column `var` for each -#' `ref_time_value`-group combination. The group key can be accessed via `.y`. -#' If `.f` is missing, then `...` will specify the computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted #' as a ["data-masking"][rlang::args_data_masking] expression or expressions -#' for tidy evaluation; in addition to referring columns directly by name, the -#' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs, -#' and can also refer to `.x` (not the same as the input epi_df), -#' `.group_key`, and `.ref_time_value`. See details. -#' @param .new_col_name String indicating the name of the new column that will -#' contain the derivative values. The default is "slide_value" unless your -#' slide computations output data frames, in which case they will be unpacked +#' for tidy evaluation. +#' @param .new_col_name Name for the new column that will contain the computed +#' values. The default is "slide_value" unless your slide computations output +#' data frames, in which case they will be unpacked (as in `tidyr::unpack()`) #' into the constituent columns and those names used. New columns should not -#' be given names that clash with the existing columns of `.x`; see details. +#' be given names that clash with the existing columns of `.x`. +#' +#' @details +#' ## Advanced uses of `.f` via tidy evaluation #' -#' @template basic-slide-details +#' If specifying `.f` via tidy evaluation, in addition to the standard [`.data`] +#' and [`.env`], we make some additional "pronoun"-like bindings available: +#' +#' - `.x`, which is like `.x` in [`dplyr::group_modify`]; an ordinary object +#' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun] +#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and +#' `epiprocess` operations. If you have multiple expressions in `...`, this +#' won't let you refer to the output of the earlier expressions, but `.data` +#' will. +#' - `.group_key`, which is like `.y` in [`dplyr::group_modify`]. +#' - `.ref_time_value`, which is the element of `.ref_time_values` that +#' determined the time window for the current computation. #' #' @importFrom lubridate days weeks #' @importFrom dplyr bind_rows group_map group_vars filter select #' @importFrom rlang .data .env !!
enquos sym env missing_arg #' @export -#' @seealso [`epi_slide_opt`] [`epi_slide_mean`] [`epi_slide_sum`] +#' @seealso [`epi_slide_opt`] for optimized slide functions #' @examples -#' # slide a 7-day trailing average formula on cases -#' # Simple sliding means and sums are much faster to do using -#' # the `epi_slide_mean` and `epi_slide_sum` functions instead. -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% -#' ungroup() -#' -#' # slide a 7-day leading average +#' # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases #' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% -#' ungroup() +#' epi_slide( +#' cases_7sd = sd(cases, na.rm = TRUE), +#' cases_7dav = mean(cases, na.rm = TRUE), +#' .window_size = 7 +#' ) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) #' -#' # slide a 7-day centre-aligned average +#' # The same as above, but unpacking using an unnamed data.frame with a formula #' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% -#' ungroup() +#' epi_slide( +#' ~ data.frame( +#' cases_7sd = sd(.x$cases, na.rm = TRUE), +#' cases_7dav = mean(.x$cases, na.rm = TRUE) +#' ), +#' .window_size = 7 +#' ) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) #' -#' # slide a 14-day centre-aligned average +#' # The same as above, but packing using a named data.frame with a tidy evaluation +#' # expression #' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") %>% -#' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% -#' ungroup() +#' epi_slide( +#' slide_packed = data.frame( +#' cases_7sd = sd(.x$cases, na.rm = TRUE), +#' cases_7dav = mean(.x$cases, na.rm = TRUE) +#' ), +#' .window_size = 7 +#' ) %>% +#' dplyr::select(geo_value, time_value, cases, slide_packed) #' #' # Same as above, but using the function interface #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide( -#' cases_2d = list(data.frame( -#' cases_2dav = mean(cases), -#' cases_2dma = mad(cases) -#' )), -#' .window_size = 2 +#' function(x, g, t) { +#' data.frame( +#' cases_7sd = sd(x$cases, na.rm = TRUE), +#' cases_7dav = mean(x$cases, na.rm = TRUE) +#' ) +#' }, +#' .window_size = 7 #' ) %>% -#' ungroup() +#' dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav) +#' +#' # Use the geo_value or the ref_time_value in the slide computation +#' cases_deaths_subset %>% +#' epi_slide(~ .x$geo_value[[1]], .window_size = 7) +#' +#' cases_deaths_subset %>% +#' epi_slide(~ .x$time_value[[1]], .window_size = 7) epi_slide <- function( .x, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -492,16 +525,24 @@ get_before_after_from_window <- function(window_size, align, time_type) { return(list(before = before, after = after)) } -#' Optimized slide function for performing common rolling computations on an -#' `epi_df` object +#' Optimized slide functions for common cases #' -#' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function -#' over variables in an `epi_df` object.
See the -#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) -#' for examples. +#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] +#' or [slider::summary-slide] function over variables in an `epi_df` object. +#' These functions tend to be much faster than `epi_slide()`. See +#' `vignette("epi_df")` for more examples. #' #' @template basic-slide-params -#' @template opt-slide-params +#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column +#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), +#' [other tidy-select expression][tidyselect::language], or a vector of +#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if +#' they were positions in the data frame, so expressions like `x:y` can be +#' used to select a range of variables. +#' +#' The tidy-selection renaming interface is not supported, and cannot be used +#' to provide output column names; if you want to customize the output column +#' names, use [`dplyr::rename`] after the slide. #' @param .f Function; together with `...` specifies the computation to slide. #' `.f` must be one of `data.table`'s rolling functions #' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one @@ -518,7 +559,6 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' example, `algo` or `na.rm` in data.table functions. You don't need to #' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider #' functions). -#' @template opt-slide-details #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select all_of #' @importFrom rlang enquo quo_get_expr as_label expr_label caller_arg @@ -529,53 +569,24 @@ get_before_after_from_window <- function(window_size, align, time_type) { #' @importFrom checkmate assert_function #' @importFrom slider slide_sum slide_prod slide_mean slide_min slide_max slide_all slide_any #' @export -#' @seealso [`epi_slide`] [`epi_slide_mean`] [`epi_slide_sum`] +#' @seealso [`epi_slide`] for the more general slide function #' @examples -#' # slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean` +#' # Compute a 7-day trailing average on cases. #' cases_deaths_subset %>% #' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' .f = data.table::frollmean, .window_size = 7 -#' ) %>% -#' # Remove a nonessential var. to ensure new col is printed, and rename new col -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() +#' epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) %>% +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) #' -#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed -#' # and accuracy, and to allow partially-missing windows. +#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and +#' # to allow partially-missing windows. #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, #' .f = data.table::frollmean, .window_size = 7, -#' # `frollmean` options #' algo = "exact", hasNA = TRUE, na.rm = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() -#' -#' # slide a 7-day leading average -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' .f = slider::slide_mean, .window_size = 7, .align = "left" -#' ) %>% -#' # Remove a nonessential var.
to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() -#' -#' # slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum` -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_opt( -#' cases, -#' .f = data.table::frollsum, .window_size = 6, .align = "center" -#' ) %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) epi_slide_opt <- function( .x, .col_names, .f, ..., .window_size = NULL, .align = c("right", "center", "left"), @@ -808,70 +819,28 @@ epi_slide_opt <- function( return(result) } -#' Optimized slide function for performing rolling averages on an `epi_df` object -#' -#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. -#' -#' Wrapper around `epi_slide_opt` with `.f = datatable::frollmean`. -#' -#' @template basic-slide-params -#' @template opt-slide-params -#' @param ... Additional arguments to pass to the slide computation `.f`, for -#' example, `algo` or `na.rm` in data.table functions. You don't need to -#' specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider -#' functions). -#' -#' @template opt-slide-details +#' @rdname epi_slide_opt +#' @description `epi_slide_mean` is a wrapper around `epi_slide_opt` with `.f = +#' data.table::frollmean`. #' #' @export -#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`] #' @examples -#' # slide a 7-day trailing average formula on cases +#' # Compute a 7-day trailing average on cases. #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean(cases, .window_size = 7) %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() +#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) #' -#' # slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed -#' # and accuracy, and to allow partially-missing windows. +#' # Same as above, but adjust `frollmean` settings for speed, accuracy, and +#' # to allow partially-missing windows. #' cases_deaths_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean( #' cases, #' .window_size = 7, -#' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() -#' -#' # slide a 7-day leading average -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(cases, .window_size = 7, .align = "right") %>% -#' # Remove a nonessential var. to ensure new col is printed -#' dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>% -#' ungroup() -#' -#' # slide a 7-day centre-aligned average -#' cases_deaths_subset %>% -#' group_by(geo_value) %>% -#' epi_slide_mean(cases, .window_size = 7, .align = "center") %>% -#' # Remove a nonessential var.
to ensure new col is printed
-#'   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) %>%
-#'   ungroup()
-#'
-#' # slide a 14-day centre-aligned average
-#' cases_deaths_subset %>%
-#'   group_by(geo_value) %>%
-#'   epi_slide_mean(cases, .window_size = 14, .align = "center") %>%
-#'   # Remove a nonessential var. to ensure new col is printed
-#'   dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) %>%
-#'   ungroup()
+#'   dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
 epi_slide_mean <- function(
     .x, .col_names, ...,
     .window_size = NULL, .align = c("right", "center", "left"),
@@ -922,33 +891,17 @@ epi_slide_mean <- function(
   )
 }
 
-#' Optimized slide function for performing rolling sums on an `epi_df` object
-#'
-#' Slides an n-timestep sum over variables in an `epi_df` object. See the [slide
-#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for
-#' examples.
-#'
-#' Wrapper around `epi_slide_opt` with `.f = datatable::frollsum`.
-#'
-#' @template basic-slide-params
-#' @template opt-slide-params
-#' @param ... Additional arguments to pass to the slide computation `.f`, for
-#'   example, `algo` or `na.rm` in data.table functions. You don't need to
-#'   specify `.x`, `.window_size`, or `.align` (or `before`/`after` for slider
-#'   functions).
-#'
-#' @template opt-slide-details
+#' @rdname epi_slide_opt
+#' @description `epi_slide_sum` is a wrapper around `epi_slide_opt` with `.f =
+#'   data.table::frollsum`.
 #'
 #' @export
-#' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`]
 #' @examples
-#' # slide a 7-day trailing sum formula on cases
+#' # Compute a 7-day trailing sum on cases.
 #' cases_deaths_subset %>%
 #'   group_by(geo_value) %>%
 #'   epi_slide_sum(cases, .window_size = 7) %>%
-#'   # Remove a nonessential var. to ensure new col is printed
-#'   dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) %>%
-#'   ungroup()
+#'   dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases)
 epi_slide_sum <- function(
     .x, .col_names, ...,
     .window_size = NULL, .align = c("right", "center", "left"),
@@ -1006,7 +959,7 @@ epi_slide_sum <- function(
 #'   function (using `validate_slide_window_arg`). 
#'
 #' @importFrom checkmate assert_function
-#' @noRd
+#' @keywords internal
 full_date_seq <- function(x, before, after, time_type) {
   if (!time_type %in% c("day", "week", "yearmonth", "integer")) {
     cli_abort(
diff --git a/R/utils.R b/R/utils.R
index 066b374e..1bfd2129 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -182,7 +182,7 @@ format_tibble_row <- function(x, empty = "*none*") {
 #' @importFrom purrr map_lgl
 #' @importFrom utils tail
 #'
-#' @noRd
+#' @keywords internal
 assert_sufficient_f_args <- function(.f, ..., .ref_time_value_label) {
   mandatory_f_args_labels <- c("window data", "group key", .ref_time_value_label)
   n_mandatory_f_args <- length(mandatory_f_args_labels)
@@ -670,6 +670,7 @@ upcase_snake_case <- function(vec) {
 #' the full list of potential substitutions for the `time_value` column name:
 #' `r time_column_names()`
 #' @export
+#' @keywords internal
 time_column_names <- function() {
   substitutions <- c(
     "time_value", "date", "time", "datetime", "dateTime", "date_time", "target_date",
@@ -686,6 +687,7 @@ time_column_names <- function() {
 #' the full list of potential substitutions for the `geo_value` column name:
 #' `r geo_column_names()`
 #' @export
+#' @keywords internal
 geo_column_names <- function() {
   substitutions <- c(
     "geo_value", "geo_values", "geo_id", "geos", "location", "jurisdiction", "fips", "zip",
@@ -702,6 +704,7 @@ geo_column_names <- function() {
 #' the full list of potential substitutions for the `version` column name:
 #' `r version_column_names()`
 #' @export
+#' @keywords internal
 version_column_names <- function() {
   substitutions <- c(
     "version", "issue", "release"
@@ -833,7 +836,8 @@ list2var <- function(x) {
 #'
 #' @importFrom lifecycle deprecated
 #'
-#' @noRd
+#' @export
+#' @keywords internal
 deprecated_quo_is_present <- function(quo) {
   if (!rlang::is_quosure(quo)) {
     cli_abort("`quo` must be a quosure; `enquo` the arg first",
@@ -991,6 +995,7 @@ gcd_num <- function(dividends, ..., rrtol = 1e-6, pqlim = 1e6, irtol = 1e-6) {
 #' by adding `k * result` for an integer k, and such that there is no smaller
 #' `result` that can achieve this.
 #'
+#' @keywords internal
 #' @export
 guess_period <- function(time_values, time_values_arg = rlang::caller_arg(time_values), ...) {
   UseMethod("guess_period")
diff --git a/README.Rmd b/README.Rmd
new file mode 100644
index 00000000..ab3fa72f
--- /dev/null
+++ b/README.Rmd
@@ -0,0 +1,119 @@
+---
+output: github_document
+---
+
+
+
+```{r, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+knitr::opts_chunk$set(
+  fig.path = "man/figures/README-"
+)
+```
+
+# epiprocess
+
+The `{epiprocess}` package works with epidemiological time series data and
+provides tools to manage, analyze, and process the data in preparation for
+modeling. It is designed to work in tandem with
+`{epipredict}`, which provides
+pre-built epiforecasting models as well as tools to build custom models.
+Both packages are designed to lower the barrier to entry and implementation cost
+for epidemiological time series analysis and forecasting.
+
+`{epiprocess}` contains:
+
+- `epi_df()` and `epi_archive()`, two data frame classes (that work like a
+`{tibble}` with `{dplyr}` verbs) for working with epidemiological time
+series data;
+- signal processing tools building on these data structures such as
+  - `epi_slide()` for sliding window operations;
+  - `epix_slide()` for sliding window operations on archives;
+  - `growth_rate()` for computing growth rates;
+  - `detect_outlr()` for outlier detection;
+  - `epi_cor()` for computing correlations;
+
+If you are new to this set of tools, you may be interested in learning through a
+book format: [Introduction to Epidemiological
+Forecasting](https://cmu-delphi.github.io/delphi-tooling-book/).
+
+You may also be interested in:
+
+- `{epidatr}`, for accessing a wide range
+of epidemiological data sets, including COVID-19 data, flu data, and more.
+- `{rtestim}`, a package for estimating
+the time-varying reproduction number of an epidemic.
+
+This package is provided by the [Delphi group](https://delphi.cmu.edu/) at
+Carnegie Mellon University.
+
+## Installation
+
+To install:
+
+```{r, eval=FALSE}
+# Stable version
+pak::pkg_install("cmu-delphi/epiprocess@main")
+
+# Dev version
+pak::pkg_install("cmu-delphi/epiprocess@dev")
+```
+
+The package is not yet on CRAN.
+
+## Usage
+
+Once `epiprocess` and `epidatr` are installed, you can use the following code to
+get started:
+
+```{r, results=FALSE, warning=FALSE, message=FALSE}
+library(epiprocess)
+library(epidatr)
+library(dplyr)
+library(magrittr)
+```
+
+Get COVID-19 confirmed cumulative case data from JHU CSSE for California,
+Florida, New York, and Texas, from March 1, 2020 to January 31, 2022.
+
+```{r}
+df <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_cumulative_num",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200301, 20220131)
+) %>%
+  select(geo_value, time_value, cases_cumulative = value)
+df
+```
+
+Convert the data to an epi_df object and sort by geo_value and time_value. You
+can work with an `epi_df` like you can with a `{tibble}` by using `{dplyr}`
+verbs.
+
+```{r}
+edf <- df %>%
+  as_epi_df() %>%
+  arrange_canonical() %>%
+  group_by(geo_value) %>%
+  mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0))
+edf
+```
+
+Compute the 7-day moving average of the confirmed daily cases for each geo_value.
+
+```{r}
+edf <- edf %>%
+  group_by(geo_value) %>%
+  epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>%
+  rename(smoothed_cases_daily = slide_value_cases_daily)
+```
+
+Autoplot the smoothed daily cases for each geo_value.
+
+```{r}
+edf %>%
+  autoplot(smoothed_cases_daily)
+```
diff --git a/README.md b/README.md
index d0a1c740..837791b4 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,47 @@
+
+
+
 # epiprocess
 
-
+The `{epiprocess}` package works with epidemiological time series data
+and provides tools to manage, analyze, and process the data in
+preparation for modeling. It is designed to work in tandem with
+`{epipredict}`, which provides pre-built epiforecasting models as
+well as tools to build custom models. Both packages are designed to
+lower the barrier to entry and implementation cost for epidemiological
+time series analysis and forecasting.
+
+`{epiprocess}` contains:
+
+  - `epi_df()` and `epi_archive()`, two data frame classes (that work
+    like a `{tibble}` with `{dplyr}` verbs) for working with
+    epidemiological time series data;
+  - signal processing tools building on these data structures such as
+      - `epi_slide()` for sliding window operations;
+      - `epix_slide()` for sliding window operations on archives;
+      - `growth_rate()` for computing growth rates;
+      - `detect_outlr()` for outlier detection;
+      - `epi_cor()` for computing correlations;
+
+If you are new to this set of tools, you may be interested in learning
+through a book format: [Introduction to Epidemiological
+Forecasting](https://cmu-delphi.github.io/delphi-tooling-book/).
-[![R-CMD-check](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/cmu-delphi/epiprocess/actions/workflows/R-CMD-check.yaml)
+You may also be interested in:
-
+  - `{epidatr}`, for accessing a wide range of epidemiological data sets,
+    including COVID-19 data, flu data, and more.
+  - `{rtestim}`, a package for estimating the time-varying reproduction
+    number of an epidemic.
-This package introduces a common data structure for epidemiological data sets
-measured over space and time, and offers associated utilities to perform basic
-signal processing tasks. See the getting started guide and vignettes for
-examples.
+This package is provided by the [Delphi group](https://delphi.cmu.edu/)
+at Carnegie Mellon University.
 
 ## Installation
 
-To install (unless you're making changes to the package, use the stable version):
+To install:
 
-```r
+``` r
 # Stable version
 pak::pkg_install("cmu-delphi/epiprocess@main")
@@ -23,56 +49,91 @@ pak::pkg_install("cmu-delphi/epiprocess@main")
 # Dev version
 pak::pkg_install("cmu-delphi/epiprocess@dev")
 ```
 
-## `epi_df`: snapshot of a data set
-
-The first main data structure in the `epiprocess` package is called
-[`epi_df`](reference/epi_df.html). This is simply a tibble with a couple of
-required columns, `geo_value` and `time_value`. It can have any other number of
-columns, which can be seen as measured variables, which we also call signal
-variables. In brief, an `epi_df` object represents a snapshot of a data set that
-contains the most up-to-date values of the signals variables, as of a given
-time.
-
-By convention, functions in the `epiprocess` package that operate on `epi_df`
-objects begin with `epi`. For example:
-
-- `epi_slide()`, for iteratively applying a custom computation to a variable in
-  an `epi_df` object over sliding windows in time;
-- `epi_cor()`, for computing lagged correlations between variables in an
-  `epi_df` object, (allowing for grouping by geo value, time value, or any other
-  variables).
-
-Functions in the package that operate directly on given variables do not begin
-with `epi`. For example:
-
-- `growth_rate()`, for estimating the growth rate of a given signal at given
-  time values, using various methodologies;
-- `detect_outlr()`, for detecting outliers in a given signal over time, using
-  either built-in or custom methodologies.
-
-## `epi_archive`: full version history of a data set
-
-The second main data structure in the package is called
-[`epi_archive`](reference/epi_archive.html). This is a special class (R6 format)
-wrapped around a data table that stores the archive (version history) of some
-signal variables of interest.
-
-By convention, functions in the `epiprocess` package that operate on
-`epi_archive` objects begin with `epix` (the "x" is meant to remind you of
-"archive"). 
These are just wrapper functions around the public methods for the
-`epi_archive` R6 class. For example:
-
-- `epix_as_of()`, for generating a snapshot in `epi_df` format from the data
-  archive, which represents the most up-to-date values of the signal variables,
-  as of the specified version;
-- `epix_fill_through_version()`, for filling in some fake version data following
-  simple rules, for use when downstream methods expect an archive that is more
-  up-to-date (e.g., if it is a forecasting deadline date and one of our data
-  sources cannot be accessed to provide the latest versions of its data)
-- `epix_merge()`, for merging two data archives with each other, with support
-  for various approaches to handling when one of the archives is more up-to-date
-  version-wise than the other;
-- `epix_slide()`, for sliding a custom computation to a data archive over local
-  windows in time, much like `epi_slide` for an `epi_df` object, but with one
-  key difference: the sliding computation at any given reference time t is
-  performed only on the **data that would have been available as of t**.
+The package is not yet on CRAN.
+
+## Usage
+
+Once `epiprocess` and `epidatr` are installed, you can use the following
+code to get started:
+
+``` r
+library(epiprocess)
+library(epidatr)
+library(dplyr)
+library(magrittr)
+```
+
+Get COVID-19 confirmed cumulative case data from JHU CSSE for
+California, Florida, New York, and Texas, from March 1, 2020 to January
+31, 2022.
+
+``` r
+df <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_cumulative_num",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200301, 20220131)
+) %>%
+  select(geo_value, time_value, cases_cumulative = value)
+df
+#> # A tibble: 2,808 × 3
+#>   geo_value time_value cases_cumulative
+#>   <chr>     <date>                <dbl>
+#> 1 ca        2020-03-01               19
+#> 2 fl        2020-03-01                0
+#> 3 ny        2020-03-01                0
+#> 4 tx        2020-03-01                0
+#> 5 ca        2020-03-02               23
+#> 6 fl        2020-03-02                1
+#> # ℹ 2,802 more rows
+```
+
+Convert the data to an epi\_df object and sort by geo\_value and
+time\_value. 
You can work with an `epi_df` like you can with a
+`{tibble}` by using `{dplyr}` verbs.
+
+``` r
+edf <- df %>%
+  as_epi_df() %>%
+  arrange_canonical() %>%
+  group_by(geo_value) %>%
+  mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0))
+edf
+#> An `epi_df` object, 2,808 x 4 with metadata:
+#> * geo_type  = state
+#> * time_type = day
+#> * as_of     = 2024-10-15 02:26:30.787809
+#>
+#> # A tibble: 2,808 × 4
+#> # Groups:   geo_value [4]
+#>   geo_value time_value cases_cumulative cases_daily
+#> * <chr>     <date>                <dbl>       <dbl>
+#> 1 ca        2020-03-01               19          19
+#> 2 ca        2020-03-02               23           4
+#> 3 ca        2020-03-03               29           6
+#> 4 ca        2020-03-04               40          11
+#> 5 ca        2020-03-05               50          10
+#> 6 ca        2020-03-06               68          18
+#> # ℹ 2,802 more rows
+```
+
+Compute the 7-day moving average of the confirmed daily cases for each
+geo\_value.
+
+``` r
+edf <- edf %>%
+  group_by(geo_value) %>%
+  epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE) %>%
+  rename(smoothed_cases_daily = slide_value_cases_daily)
+```
+
+Autoplot the smoothed daily cases for each geo\_value.
+
+``` r
+edf %>%
+  autoplot(smoothed_cases_daily)
+```
+
+
diff --git a/_pkgdown.yml b/_pkgdown.yml
index e8c05a65..2214df7c 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -42,12 +42,14 @@ articles:
   navbar: ~
   contents:
   - epiprocess
-  - slide
+  - epi_df
+  - epi_archive
+  - outliers
   - growth_rate
   - correlation
-  - aggregation
-  - outliers
-  - archive
+
+- title: Developer
+  contents:
   - compactify
 
 repo:
@@ -59,35 +61,46 @@ repo:
 
 reference:
 - title: "`epi_df` basics"
-  desc: Details on `epi_df` format, and basic functionality.
+  desc: Constructors and information for `epi_df` objects.
 - contents:
-  - matches("epi_df")
-  - matches("column_names")
-- title: "`epi_*()` functions"
-  desc: Functions that act on `epi_df` objects.
+  - epi_df
+  - print.epi_df
+  - group_epi_df
+  - autoplot.epi_df
+
+- title: "`epi_df` manipulation"
+  desc: Functions operating on `epi_df` objects.
 - contents:
+  - complete.epi_df
   - epi_slide
   - epi_slide_mean
-  - epi_slide_sum
   - epi_slide_opt
+  - epi_slide_sum
+  - sum_groups_epi_df
   - epi_cor
-- title: Vector functions
-  desc: Functions that act directly on signal variables.
-- contents:
-  - growth_rate
   - detect_outlr
-  - detect_outlr_rm
-  - detect_outlr_stl
+  - growth_rate
+  - as_tibble.epi_df
+  - as_tsibble.epi_df
+
 - title: "`epi_archive` basics"
-  desc: Details on `epi_archive`, and basic functionality.
+  desc: Constructors and information for `epi_archive` objects.
 - contents:
-  - matches("archive")
-  - revision_summary
-- title: "`epix_*()` functions"
-  desc: Functions that act on an `epi_archive` and/or `grouped_epi_archive` object.
-- contents:
-  - starts_with("epix")
+  - epi_archive
+  - print.epi_archive
+  - clone
   - group_by.epi_archive
+
+- title: "`epi_archive` manipulation"
+  desc: Functions operating on `epi_archive` objects. 
+ - contents: + - epix_as_of + - epix_slide + - epix_merge + - revision_summary + - epix_fill_through_version + - epix_truncate_versions_after + - title: Example data - contents: - cases_deaths_subset @@ -95,16 +108,7 @@ reference: - covid_incidence_county_subset - covid_incidence_outliers - covid_case_death_rates_extended - - title: Basic automatic plotting - - contents: - - autoplot.epi_df - - title: Advanced internals - - contents: - - compactify + - title: internal - contents: - - epiprocess - - max_version_with_row_in - - next_after - - guess_period - - key_colnames + - starts_with("internal") diff --git a/man-roxygen/basic-slide-details.R b/man-roxygen/basic-slide-details.R deleted file mode 100644 index df87f882..00000000 --- a/man-roxygen/basic-slide-details.R +++ /dev/null @@ -1,69 +0,0 @@ -#' @details To "slide" means to apply a function or formula over a rolling -#' window. The `.window_size` arg determines the width of the window -#' (including the reference time) and the `.align` arg governs how the window -#' is aligned (see below for examples). The `.ref_time_values` arg controls -#' which time values to consider for the slide and `.all_rows` allows you to -#' keep NAs around. -#' -#' `epi_slide()` does not require a complete window (such as on the left -#' boundary of the dataset) and will attempt to perform the computation -#' anyway. The issue of what to do with partial computations (those run on -#' incomplete windows) is therefore left up to the user, either through the -#' specified function or formula, or through post-processing. -#' -#' Let's look at some window examples, assuming that the reference time value -#' is "tv". With .align = "right" and .window_size = 3, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 2, tv - 1, tv -#' -#' With .align = "center" and .window_size = 3, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 1, tv, tv + 1 -#' -#' With .align = "center" and .window_size = 4, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 2, tv - 1, tv, tv + 1 -#' -#' With .align = "left" and .window_size = 3, the window will be: -#' -#' time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv, tv + 1, tv + 2 -#' -#' If `.f` is missing, then ["data-masking"][rlang::args_data_masking] -#' expression(s) for tidy evaluation can be specified, for example, as in: -#' ``` -#' epi_slide(x, cases_7dav = mean(cases), .window_size = 7) -#' ``` -#' which would be equivalent to: -#' ``` -#' epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7, -#' .new_col_name = "cases_7dav") -#' ``` -#' In a manner similar to [`dplyr::mutate`]: -#' * Expressions evaluating to length-1 vectors will be recycled to -#' appropriate lengths. -#' * `, name_var := value` can be used to set the output column name based on -#' a variable `name_var` rather than requiring you to use a hard-coded -#' name. (The leading comma is needed to make sure that `.f` is treated as -#' missing.) -#' * `= NULL` can be used to remove results from previous expressions (though -#' we don't allow it to remove pre-existing columns). -#' * `, fn_returning_a_data_frame(.x)` will unpack the output of the function -#' into multiple columns in the result. -#' * Named expressions evaluating to data frames will be placed into -#' [`tidyr::pack`]ed columns. 
-#' -#' In addition to [`.data`] and [`.env`], we make some additional -#' "pronoun"-like bindings available: -#' * .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object -#' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun] -#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and -#' `epiprocess` operations. If you have multiple expressions in `...`, this -#' won't let you refer to the output of the earlier expressions, but `.data` -#' will. -#' * .group_key, which is like `.y` in [`dplyr::group_modify`]. -#' * .ref_time_value, which is the element of `.ref_time_values` that -#' determined the time window for the current computation. diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R index dfa2512f..638307d6 100644 --- a/man-roxygen/basic-slide-params.R +++ b/man-roxygen/basic-slide-params.R @@ -1,36 +1,35 @@ -#' @param .x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `.x` will be treated as part of a -#' single data group. -#' @param .window_size The size of the sliding window. By default, this is 1, -#' meaning that only the current ref_time_value is included. The accepted values -#' here depend on the `time_value` column: +#' @param .x An `epi_df` object. If ungrouped, we group by `geo_value` and any +#' columns in `other_keys`. If grouped, we make sure the grouping is by +#' `geo_value` and `other_keys`. +#' @param .window_size The size of the sliding window. The accepted values +#' depend on the type of the `time_value` column in `.x`: #' -#' - if time_type is Date and the cadence is daily, then `.window_size` can be -#' an integer (which will be interpreted in units of days) or a difftime +#' - if time type is `Date` and the cadence is daily, then `.window_size` can +#' be an integer (which will be interpreted in units of days) or a difftime #' with units "days" -#' - if time_type is Date and the cadence is weekly, then `.window_size` must -#' be a difftime with units "weeks" -#' - if time_type is an yearmonth or integer, then `.window_size` must be an +#' - if time type is `Date` and the cadence is weekly, then `.window_size` must +#' be a `difftime` with units "weeks" +#' - if time type is a `yearmonth` or an integer, then `.window_size` must be an #' integer #' -#' @param .align The alignment of the sliding window. If `right` (default), then -#' the window has its end at the reference time; if `center`, then the window is -#' centered at the reference time; if `left`, then the window has its start at -#' the reference time. If the alignment is `center` and the window size is odd, -#' then the window will have floor(window_size/2) points before and after the -#' reference time. If the window size is even, then the window will be -#' asymmetric and have one less value on the right side of the reference time -#' (assuming time increases from left to right). -#' @param .ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. 
-#' @param .all_rows If `.all_rows = TRUE`, then all rows of `.x` will be kept in
-#'   the output even with `.ref_time_values` provided, with some type of missing
-#'   value marker for the slide computation output column(s) for `time_value`s
-#'   outside `.ref_time_values`; otherwise, there will be one row for each row in
-#'   `.x` that had a `time_value` in `.ref_time_values`. Default is `FALSE`. The
-#'   missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type
-#'   of the slide computation output.
-#' @return An `epi_df` object given by appending one or more new columns to `.x`,
-#'   named according to the `.new_col_name` argument.
+#' @param .align The alignment of the sliding window.
+#'
+#' - If "right" (default), then the window has its end at the reference time.
+#' This is likely the most common use case, e.g. `.window_size=7` and
+#' `.align="right"` slides over the past week of data.
+#' - If "left", then the window has its start at the reference time.
+#' - If "center", then the window is centered at the reference time. If the
+#' window size is odd, then the window will have floor(window_size/2) points
+#' before and after the reference time; if the window size is even, then the
+#' window will be asymmetric and have one more value before the reference time
+#' than after.
+#'
+#' @param .ref_time_values The time values at which to compute the slide
+#' values. By default, this is all the unique time values in `.x`.
+#' @param .all_rows If `.all_rows = FALSE`, the default, then the output
+#' `epi_df` will have only the rows that had a `time_value` in
+#' `.ref_time_values`. Otherwise, all the rows from `.x` are included, with
+#' a missing value marker (typically NA, but more technically the result of
+#' `vctrs::vec_cast`-ing `NA` to the type of the slide computation output).
+#' @return An `epi_df` object with one or more new slide computation columns
+#' added.
diff --git a/man-roxygen/detect-outlr-return.R b/man-roxygen/detect-outlr-return.R
deleted file mode 100644
index 50222e0e..00000000
--- a/man-roxygen/detect-outlr-return.R
+++ /dev/null
@@ -1,3 +0,0 @@
-#' @return An tibble with number of rows equal to `length(y)` and columns
-#'   giving the outlier detection thresholds (`lower` and `upper`) and
-#'   replacement values from each detection method (`replacement`).
diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R
deleted file mode 100644
index a8d93d93..00000000
--- a/man-roxygen/opt-slide-details.R
+++ /dev/null
@@ -1,33 +0,0 @@
-#' @details To "slide" means to apply a function or formula over a rolling
-#'   window. The `.window_size` arg determines the width of the window
-#'   (including the reference time) and the `.align` arg governs how the window
-#'   is aligned (see below for examples). The `.ref_time_values` arg controls
-#'   which time values to consider for the slide and `.all_rows` allows you to
-#'   keep NAs around.
-#'
-#' `epi_slide_*()` does not require a complete window (such as on the left
-#'   boundary of the dataset) and will attempt to perform the computation
-#'   anyway. The issue of what to do with partial computations (those run on
-#'   incomplete windows) is therefore left up to the user, either through the
-#'   specified function or formula `f`, or through post-processing.
-#'
-#' Let's look at some window examples, assuming that the reference time value
-#'   is "tv". 
With .align = "right" and .window_size = 3, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 2, tv - 1, tv -#' -#' With .align = "center" and .window_size = 3, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 1, tv, tv + 1 -#' -#' With .align = "center" and .window_size = 4, the window will be: -#' -#' time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv - 2, tv - 1, tv, tv + 1 -#' -#' With .align = "left" and .window_size = 3, the window will be: -#' -#' time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -#' window: tv, tv + 1, tv + 2 diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R deleted file mode 100644 index ba4b4877..00000000 --- a/man-roxygen/opt-slide-params.R +++ /dev/null @@ -1,10 +0,0 @@ -#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column -#' name(e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), -#' [other tidy-select expression][tidyselect::language], or a vector of -#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if -#' they were positions in the data frame, so expressions like `x:y` can be -#' used to select a range of variables. -#' -#' The tidy-selection renaming interface is not supported, and cannot be used -#' to provide output column names; if you want to customize the output column -#' names, use [`dplyr::rename`] after the slide. diff --git a/man-roxygen/x-y.R b/man-roxygen/x-y.R deleted file mode 100644 index a4f9d1d7..00000000 --- a/man-roxygen/x-y.R +++ /dev/null @@ -1,4 +0,0 @@ -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. diff --git a/man-roxygen/x.R b/man-roxygen/x.R deleted file mode 100644 index a26f9f25..00000000 --- a/man-roxygen/x.R +++ /dev/null @@ -1 +0,0 @@ -#' @param x an `epi_df` diff --git a/man/apply_compactify.Rd b/man/apply_compactify.Rd index 14b884c6..0e1f0b3c 100644 --- a/man/apply_compactify.Rd +++ b/man/apply_compactify.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/archive.R \name{apply_compactify} \alias{apply_compactify} -\title{given a tibble as would be found in an epi_archive, remove duplicate entries.} +\title{Given a tibble as would be found in an epi_archive, remove duplicate entries.} \usage{ apply_compactify(df, keys, tolerance = .Machine$double.eps^0.5) } \description{ -works by shifting all rows except the version, then comparing values to see +Works by shifting all rows except the version, then comparing values to see if they've changed. We need to arrange in descending order, but note that we don't need to group, since at least one column other than version has changed, and so is kept. 
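(Aside: a minimal dplyr sketch of the LOCF compactification idea that `apply_compactify()` is described as implementing above. This is an illustration under simplifying assumptions — a single measurement column `cases` and exact equality — not the package's implementation, which operates on a keyed `data.table` and uses a tolerance:)

```r
library(dplyr)

# Toy archive rows: one location/date, four versions; versions 2 and 4
# repeat the previous value, so LOCF makes them redundant.
toy <- tibble::tibble(
  geo_value  = "ca",
  time_value = as.Date("2020-06-01"),
  version    = as.Date("2020-06-01") + 0:3,
  cases      = c(1, 1, 2, 2)
)

toy %>%
  group_by(geo_value, time_value) %>%
  arrange(version, .by_group = TRUE) %>%
  # keep the first recorded version, plus any version where the value changed
  filter(row_number() == 1 | cases != lag(cases)) %>%
  ungroup()
# only the version-1 and version-3 rows remain
```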
diff --git a/man/assert_sufficient_f_args.Rd b/man/assert_sufficient_f_args.Rd
new file mode 100644
index 00000000..a0c2cbb8
--- /dev/null
+++ b/man/assert_sufficient_f_args.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{assert_sufficient_f_args}
+\alias{assert_sufficient_f_args}
+\title{Assert that a sliding computation function takes enough args}
+\usage{
+assert_sufficient_f_args(.f, ..., .ref_time_value_label)
+}
+\arguments{
+\item{...}{Dots that will be forwarded to \code{f} from the dots of \code{epi_slide} or
+\code{epix_slide}.}
+
+\item{.ref_time_value_label}{String; how to describe/label the \code{ref_time_value} in
+error messages; e.g., "reference time value" or "version".}
+
+\item{f}{Function; specifies a computation to slide over an \code{epi_df} or
+\code{epi_archive} in \code{epi_slide} or \code{epix_slide}.}
+}
+\description{
+Assert that a sliding computation function takes enough args
+}
+\keyword{internal}
diff --git a/man/compactify.Rd b/man/compactify.Rd
deleted file mode 100644
index 2f210315..00000000
--- a/man/compactify.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/archive.R
-\name{compactify}
-\alias{compactify}
-\title{Compactify}
-\description{
-This section describes the internals of how compactification works in an
-\code{epi_archive()}. Compactification can potentially improve code speed or
-memory usage, depending on your data.
-}
-\details{
-In general, the last version of each observation is carried forward (LOCF) to
-fill in data between recorded versions, and between the last recorded
-update and the \code{versions_end}. One consequence is that the \code{DT} doesn't
-have to contain a full snapshot of every version (although this generally
-works), but can instead contain only the rows that are new or changed from
-the previous version (see \code{compactify}, which does this automatically).
-Currently, deletions must be represented as revising a row to a special
-state (e.g., making the entries \code{NA} or including a special column that
-flags the data as removed and performing some kind of post-processing), and
-the archive is unaware of what this state is. Note that \code{NA}s \emph{can} be
-introduced by \code{epi_archive} methods for other reasons, e.g., in
-\code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if requested, to
-represent potential update data that we do not yet have access to; or in
-\code{\link{epix_merge}} to represent the "value" of an observation before the
-version in which it was first released, or if no version of that
-observation appears in the archive data at all.
-}
diff --git a/man/complete.epi_df.Rd b/man/complete.epi_df.Rd
index 9d791fb7..71dbcb38 100644
--- a/man/complete.epi_df.Rd
+++ b/man/complete.epi_df.Rd
@@ -16,9 +16,7 @@
 \item{explicit}{see \code{\link[tidyr:complete]{tidyr::complete}}}
 }
 \description{
-A ‘tidyr::complete()’ analogue for ‘epi_df’ objects. This function
-can be used, for example, to add rows for missing combinations
-of ‘geo_value’ and ‘time_value’, filling other columns with \code{NA}s.
+A \code{tidyr::complete()} analogue for \code{epi_df} objects. This function can be used, for example, to add rows for missing combinations of \code{geo_value} and \code{time_value}, filling other columns with \code{NA}s.
 See the examples for usage details. 
} \examples{ diff --git a/man/decay_epi_df.Rd b/man/decay_epi_df.Rd new file mode 100644 index 00000000..d581db08 --- /dev/null +++ b/man/decay_epi_df.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_df.R +\name{decay_epi_df} +\alias{decay_epi_df} +\title{Drop any \code{epi_df} metadata and class on a data frame} +\usage{ +decay_epi_df(x) +} +\arguments{ +\item{x}{an \code{epi_df} or other data frame} +} +\value{ +\code{x} with any metadata dropped and the \code{"epi_df"} class, if previously +present, dropped +} +\description{ +Useful in implementing \code{?dplyr_extending} when manipulations cause invariants +of \code{epi_df}s to be violated and we need to return some other class. Note that +this will maintain any grouping (keeping the \code{grouped_df} class and +associated attributes, if present). +} +\keyword{internal} diff --git a/man/deprecated_quo_is_present.Rd b/man/deprecated_quo_is_present.Rd new file mode 100644 index 00000000..add87529 --- /dev/null +++ b/man/deprecated_quo_is_present.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{deprecated_quo_is_present} +\alias{deprecated_quo_is_present} +\title{\code{\link[lifecycle:deprecated]{lifecycle::is_present}} for enquosed deprecated NSE arg} +\usage{ +deprecated_quo_is_present(quo) +} +\arguments{ +\item{quo}{\link[rlang:enquo]{enquosed} arg} +} +\value{ +bool; was \code{quo} "present", or did it look like a missing quosure or +have an expr that looked like \code{deprecated()} or \code{lifecycle::deprecated()}? +} +\description{ +\code{\link[lifecycle:deprecated]{lifecycle::is_present}} is designed for use with args that undergo standard +evaluation, rather than non-standard evaluation (NSE). This function is +designed to fulfill a similar purpose, but for args we have +\link[rlang:enquo]{enquosed} in preparation for NSE. +} +\examples{ + +fn <- function(x = deprecated()) { + deprecated_quo_is_present(rlang::enquo(x)) +} + +fn() # FALSE +fn(.data$something) # TRUE + +# Functions that wrap `fn` should forward the NSE arg to `fn` using +# [`{{ arg }}`][rlang::embrace-operator] (or, if they are working from an +# argument that has already been defused into a quosure, `!!quo`). (This is +# already how NSE arguments that will be enquosed should be forwarded.) + +wrapper1 <- function(x = deprecated()) fn({{ x }}) +wrapper2 <- function(x = lifecycle::deprecated()) fn({{ x }}) +wrapper3 <- function(x) fn({{ x }}) +wrapper4 <- function(x) fn(!!rlang::enquo(x)) + +wrapper1() # FALSE +wrapper2() # FALSE +wrapper3() # FALSE +wrapper4() # FALSE + +# More advanced: wrapper that receives an already-enquosed arg: + +inner_wrapper <- function(quo) fn(!!quo) +outer_wrapper1 <- function(x = deprecated()) inner_wrapper(rlang::enquo(x)) + +outer_wrapper1() # FALSE + +# Improper argument forwarding from a wrapper function will cause this +# function to produce incorrect results. 
+bad_wrapper1 <- function(x) fn(x) +bad_wrapper1() # TRUE, bad + +} +\keyword{internal} diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 744b9345..bee62aec 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -2,6 +2,8 @@ % Please edit documentation in R/outliers.R \name{detect_outlr} \alias{detect_outlr} +\alias{detect_outlr_rm} +\alias{detect_outlr_stl} \title{Detect outliers} \usage{ detect_outlr( @@ -10,6 +12,32 @@ detect_outlr( methods = tibble::tibble(method = "rm", args = list(list()), abbr = "rm"), combiner = c("median", "mean", "none") ) + +detect_outlr_rm( + x = seq_along(y), + y, + n = 21, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0 +) + +detect_outlr_stl( + x = seq_along(y), + y, + n_trend = 21, + n_seasonal = 21, + n_threshold = 21, + seasonal_period, + seasonal_as_residual = FALSE, + log_transform = FALSE, + detect_negatives = FALSE, + detection_multiplier = 2, + min_radius = 0, + replacement_multiplier = 0 +) } \arguments{ \item{x}{Design points corresponding to the signal values \code{y}. Default is @@ -36,6 +64,56 @@ as well as a replacement value for any outliers. If "none", then no summarized results are calculated. Note that if the number of \code{methods} (number of rows) is odd, then "median" is equivalent to a majority vote for purposes of determining whether a given observation is an outlier.} + +\item{n}{Number of time steps to use in the rolling window. Default is 21. +This value is centrally aligned. When \code{n} is an odd number, the rolling +window extends from \code{(n-1)/2} time steps before each design point to \code{(n-1)/2} +time steps after. When \code{n} is even, then the rolling range extends from +\code{n/2-1} time steps before to \code{n/2} time steps after.} + +\item{log_transform}{Should a log transform be applied before running outlier +detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the +log transform will be padded by 1.} + +\item{detect_negatives}{Should negative values automatically count as +outliers? Default is \code{FALSE}.} + +\item{detection_multiplier}{Value determining how far the outlier detection +thresholds are from the rolling median, which are calculated as (rolling +median) +/- (detection multiplier) * (rolling IQR). Default is 2.} + +\item{min_radius}{Minimum distance between rolling median and threshold, on +transformed scale. Default is 0.} + +\item{replacement_multiplier}{Value determining how far the replacement +values are from the rolling median. The replacement is the original value +if it is within the detection thresholds, or otherwise it is rounded to the +nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). +Default is 0.} + +\item{n_trend}{Number of time steps to use in the rolling window for trend. +Default is 21.} + +\item{n_seasonal}{Number of time steps to use in the rolling window for +seasonality. Default is 21. Can also be the string "periodic". See +\code{s.window} in \code{\link[stats:stl]{stats::stl}}.} + +\item{n_threshold}{Number of time steps to use in rolling window for the IQR +outlier thresholds.} + +\item{seasonal_period}{Integer specifying period of "seasonality". For +example, for daily data, a period 7 means weekly seasonality. It must be +strictly larger than 1. 
Also impacts the size of the low-pass filter +window; see \code{l.window} in \code{\link[stats:stl]{stats::stl}}.} + +\item{seasonal_as_residual}{Boolean specifying whether the seasonal(/weekly) +component should be treated as part of the residual component instead of as +part of the predictions. The default, FALSE, treats them as part of the +predictions, so large seasonal(/weekly) components will not lead to +flagging points as outliers. \code{TRUE} may instead consider the extrema of +large seasonal variations to be outliers; \code{n_seasonal} and +\code{seasonal_period} will still have an impact on the result, though, by +impacting the estimation of the trend component.} } \value{ An tibble with number of rows equal to \code{length(y)} and columns @@ -47,6 +125,13 @@ Applies one or more outlier detection methods to a given signal variable, and optionally aggregates the outputs to create a consensus result. See the \href{https://cmu-delphi.github.io/epiprocess/articles/outliers.html}{outliers vignette} for examples. + +\code{detect_outlr_rm} detects outliers based on a distance from the +rolling median specified in terms of multiples of the rolling interquartile +range (IQR). + +\code{detect_outlr_stl} detects outliers based on a seasonal-trend +decomposition using LOESS (STL). } \details{ Each outlier detection method, one per row of the passed \code{methods} @@ -62,6 +147,15 @@ For convenience, the outlier detection method can be specified (in the \code{detect_outlr_rm()}, which detects outliers via a rolling median; or by "stl", shorthand for \code{detect_outlr_stl()}, which detects outliers via an STL decomposition. + +The STL decomposition is computed using \code{\link[stats:stl]{stats::stl()}}. Once +computed, the outlier detection method is analogous to the rolling median +method in \code{\link[=detect_outlr_rm]{detect_outlr_rm()}}, except with the fitted values and residuals +from the STL decomposition taking the place of the rolling median and +residuals to the rolling median, respectively. + +The last set of arguments, \code{log_transform} through \code{replacement_multiplier}, +are exactly as in \code{detect_outlr_rm()}. 
} \examples{ detection_methods <- dplyr::bind_rows( @@ -104,4 +198,21 @@ x <- covid_incidence_outliers \%>\% combiner = "median" )) \%>\% unnest(outlier_info) +# Detect outliers based on a rolling median +covid_incidence_outliers \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% + as_epi_df() \%>\% + group_by(geo_value) \%>\% + mutate(outlier_info = detect_outlr_rm( + x = time_value, y = cases + )) +# Detects outliers based on a seasonal-trend decomposition using LOESS +covid_incidence_outliers \%>\% + dplyr::select(geo_value, time_value, cases) \%>\% + as_epi_df() \%>\% + group_by(geo_value) \%>\% + mutate(outlier_info = detect_outlr_stl( + x = time_value, y = cases, + seasonal_period = 7 # weekly seasonality for daily data + )) } diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd deleted file mode 100644 index 36e784ca..00000000 --- a/man/detect_outlr_rm.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/outliers.R -\name{detect_outlr_rm} -\alias{detect_outlr_rm} -\title{Detect outliers based on a rolling median} -\usage{ -detect_outlr_rm( - x = seq_along(y), - y, - n = 21, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0 -) -} -\arguments{ -\item{x}{Design points corresponding to the signal values \code{y}. Default is -\code{seq_along(y)} (that is, equally-spaced points from 1 to the length of -\code{y}).} - -\item{y}{Signal values.} - -\item{n}{Number of time steps to use in the rolling window. Default is 21. -This value is centrally aligned. When \code{n} is an odd number, the rolling -window extends from \code{(n-1)/2} time steps before each design point to \code{(n-1)/2} -time steps after. When \code{n} is even, then the rolling range extends from -\code{n/2-1} time steps before to \code{n/2} time steps after.} - -\item{log_transform}{Should a log transform be applied before running outlier -detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the -log transform will be padded by 1.} - -\item{detect_negatives}{Should negative values automatically count as -outliers? Default is \code{FALSE}.} - -\item{detection_multiplier}{Value determining how far the outlier detection -thresholds are from the rolling median, which are calculated as (rolling -median) +/- (detection multiplier) * (rolling IQR). Default is 2.} - -\item{min_radius}{Minimum distance between rolling median and threshold, on -transformed scale. Default is 0.} - -\item{replacement_multiplier}{Value determining how far the replacement -values are from the rolling median. The replacement is the original value -if it is within the detection thresholds, or otherwise it is rounded to the -nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). -Default is 0.} -} -\value{ -An tibble with number of rows equal to \code{length(y)} and columns -giving the outlier detection thresholds (\code{lower} and \code{upper}) and -replacement values from each detection method (\code{replacement}). -} -\description{ -Detects outliers based on a distance from the rolling median specified in -terms of multiples of the rolling interquartile range (IQR). 
-} -\examples{ -# Detect outliers based on a rolling median -covid_incidence_outliers \%>\% - dplyr::select(geo_value, time_value, cases) \%>\% - as_epi_df() \%>\% - group_by(geo_value) \%>\% - mutate(outlier_info = detect_outlr_rm( - x = time_value, y = cases - )) -} diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd deleted file mode 100644 index 27204142..00000000 --- a/man/detect_outlr_stl.Rd +++ /dev/null @@ -1,101 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/outliers.R -\name{detect_outlr_stl} -\alias{detect_outlr_stl} -\title{Detect outliers based on an STL decomposition} -\usage{ -detect_outlr_stl( - x = seq_along(y), - y, - n_trend = 21, - n_seasonal = 21, - n_threshold = 21, - seasonal_period, - seasonal_as_residual = FALSE, - log_transform = FALSE, - detect_negatives = FALSE, - detection_multiplier = 2, - min_radius = 0, - replacement_multiplier = 0 -) -} -\arguments{ -\item{x}{Design points corresponding to the signal values \code{y}. Default is -\code{seq_along(y)} (that is, equally-spaced points from 1 to the length of -\code{y}).} - -\item{y}{Signal values.} - -\item{n_trend}{Number of time steps to use in the rolling window for trend. -Default is 21.} - -\item{n_seasonal}{Number of time steps to use in the rolling window for -seasonality. Default is 21. Can also be the string "periodic". See -\code{s.window} in \code{\link[stats:stl]{stats::stl}}.} - -\item{n_threshold}{Number of time steps to use in rolling window for the IQR -outlier thresholds.} - -\item{seasonal_period}{Integer specifying period of "seasonality". For -example, for daily data, a period 7 means weekly seasonality. It must be -strictly larger than 1. Also impacts the size of the low-pass filter -window; see \code{l.window} in \code{\link[stats:stl]{stats::stl}}.} - -\item{seasonal_as_residual}{Boolean specifying whether the seasonal(/weekly) -component should be treated as part of the residual component instead of as -part of the predictions. The default, FALSE, treats them as part of the -predictions, so large seasonal(/weekly) components will not lead to -flagging points as outliers. \code{TRUE} may instead consider the extrema of -large seasonal variations to be outliers; \code{n_seasonal} and -\code{seasonal_period} will still have an impact on the result, though, by -impacting the estimation of the trend component.} - -\item{log_transform}{Should a log transform be applied before running outlier -detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the -log transform will be padded by 1.} - -\item{detect_negatives}{Should negative values automatically count as -outliers? Default is \code{FALSE}.} - -\item{detection_multiplier}{Value determining how far the outlier detection -thresholds are from the rolling median, which are calculated as (rolling -median) +/- (detection multiplier) * (rolling IQR). Default is 2.} - -\item{min_radius}{Minimum distance between rolling median and threshold, on -transformed scale. Default is 0.} - -\item{replacement_multiplier}{Value determining how far the replacement -values are from the rolling median. The replacement is the original value -if it is within the detection thresholds, or otherwise it is rounded to the -nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). 
-Default is 0.}
-}
-\value{
-An tibble with number of rows equal to \code{length(y)} and columns
-giving the outlier detection thresholds (\code{lower} and \code{upper}) and
-replacement values from each detection method (\code{replacement}).
-}
-\description{
-Detects outliers based on a seasonal-trend decomposition using LOESS (STL).
-}
-\details{
-The STL decomposition is computed using \code{\link[stats:stl]{stats::stl()}}. Once
-computed, the outlier detection method is analogous to the rolling median
-method in \code{\link[=detect_outlr_rm]{detect_outlr_rm()}}, except with the fitted values and residuals
-from the STL decomposition taking the place of the rolling median and
-residuals to the rolling median, respectively.
-
-The last set of arguments, \code{log_transform} through \code{replacement_multiplier},
-are exactly as in \code{detect_outlr_rm()}.
-}
-\examples{
-# Detects outliers based on a seasonal-trend decomposition using LOESS
-covid_incidence_outliers \%>\%
-  dplyr::select(geo_value, time_value, cases) \%>\%
-  as_epi_df() \%>\%
-  group_by(geo_value) \%>\%
-  mutate(outlier_info = detect_outlr_stl(
-    x = time_value, y = cases,
-    seasonal_period = 7 # weekly seasonality for daily data
-  ))
-}
diff --git a/man/dplyr_reconstruct.epi_df.Rd b/man/dplyr_reconstruct.epi_df.Rd
new file mode 100644
index 00000000..36557154
--- /dev/null
+++ b/man/dplyr_reconstruct.epi_df.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/methods-epi_df.R
+\name{dplyr_reconstruct.epi_df}
+\alias{dplyr_reconstruct.epi_df}
+\title{dplyr_reconstruct}
+\usage{
+\method{dplyr_reconstruct}{epi_df}(data, template)
+}
+\arguments{
+\item{data}{tibble or \code{epi_df} (\code{dplyr} feeds in former, but we may
+directly feed in latter from our other methods)}
+
+\item{template}{\code{epi_df} template to use to restore}
+}
+\value{
+\code{epi_df} or degrade into \code{tbl_df}
+}
+\description{
+dplyr_reconstruct
+}
+\keyword{internal}
diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd
index a5055f4e..4b459d5e 100644
--- a/man/epi_archive.Rd
+++ b/man/epi_archive.Rd
@@ -84,7 +84,8 @@ value of \code{clobberable_versions_start} does not fully trust these empty
 updates, and assumes that any version \verb{>= max(x$version)} could be
 clobbered.)
 If \code{nrow(x) == 0}, then this argument is mandatory.}
-\item{compactify_tol}{double. the tolerance used to detect approximate equality for compactification}
+\item{compactify_tol}{double. the tolerance used to detect approximate
+equality for compactification}
 \item{.versions_end}{location based versions_end, used to avoid prefix
 \code{version = issue} from being assigned to \code{versions_end} instead of being
@@ -97,19 +98,24 @@ example \code{version = release_date}}
 }
 \value{
 An \code{epi_archive} object.
 }
 \description{
-An \code{epi_archive} is an S3 class which contains a data table
-along with several relevant pieces of metadata. The data table can be seen
-as the full archive (version history) for some signal variables of
-interest.
+The second main data structure for storing time series in
+\code{epiprocess}. It is similar to \code{epi_df} in that it is fundamentally a table with
+a few required columns that stores epidemiological time series data. An
+\code{epi_archive} requires a \code{geo_value}, \code{time_value}, and \code{version} column (and
+possibly other key columns) along with measurement values. 
In brief, an +\code{epi_archive} is a history of the time series data, where the \code{version} +column tracks the time at which the data was available. This allows for +version-aware forecasting. + +\code{new_epi_archive} is the constructor for \code{epi_archive} objects that assumes +all arguments have been validated. Most users should use \code{as_epi_archive}. } \details{ -Epi Archive - -An \code{epi_archive} contains a data table \code{DT}, of class \code{data.table} -from the \code{data.table} package, with (at least) the following columns: +An \code{epi_archive} contains a \code{data.table} object \code{DT} (from the +\code{{data.table}} package), with (at least) the following columns: \itemize{ -\item \code{geo_value}: the geographic value associated with each row of measurements. -\item \code{time_value}: the time value associated with each row of measurements. +\item \code{geo_value}: the geographic value associated with each row of measurements, +\item \code{time_value}: the time value associated with each row of measurements, \item \code{version}: the time value specifying the version for each row of measurements. For example, if in a given row the \code{version} is January 15, 2022 and \code{time_value} is January 14, 2022, then this row contains the @@ -117,13 +123,33 @@ measurements of the data for January 14, 2022 that were available one day later. } -The data table \code{DT} has key variables \code{geo_value}, \code{time_value}, \code{version}, -as well as any others (these can be specified when instantiating the -\code{epi_archive} object via the \code{other_keys} argument, and/or set by operating -on \code{DT} directly). Note that there can only be a single row per unique -combination of key variables. +The variables \code{geo_value}, \code{time_value}, \code{version} serve as key variables for +the data table (in addition to any other keys specified in the metadata). +There can only be a single row per unique combination of key variables. The +keys for an \code{epi_archive} can be viewed with \code{key(epi_archive$DT)}. +\subsection{Compactification}{ + +By default, an \code{epi_archive} will compactify the data table to remove +redundant rows. This is done by not storing rows that have the same value, +except for the \code{version} column (this is essentially a last observation +carried forward, but along the version index). This is done to save space and +improve performance. If you do not want to compactify the data, you can set +\code{compactify = FALSE} in \code{as_epi_archive()}. + +Note that in some data scenarios, LOCF may not be appropriate. For instance, +if you expected data to be updated on a given day, but your data source did +not update, then it could be reasonable to code the data as \code{NA} for that +day, instead of assuming LOCF. + +\code{NA}s \emph{can} be introduced by \code{epi_archive} methods for other +reasons, e.g., in \code{\link{epix_fill_through_version}} and \code{\link{epix_merge}}, if +requested, to represent potential update data that we do not yet have access +to; or in \code{\link{epix_merge}} to represent the "value" of an observation before +the version in which it was first released, or if no version of that +observation appears in the archive data at all. } -\section{Metadata}{ + +\subsection{Metadata}{ The following pieces of metadata are included as fields in an \code{epi_archive} object: @@ -139,25 +165,7 @@ as read-only, and to use the \code{epi_archive} methods to interact with the dat archive. 
Unexpected behavior may result from modifying the metadata directly.
 }
-
-\section{Generating Snapshots}{
-
-An \code{epi_archive} object can be used to generate a snapshot of the data in
-\code{epi_df} format, which represents the most up-to-date time series values up
-to a point in time. This is accomplished by calling \code{epix_as_of()}.
-}
-
-\section{Sliding Computations}{
-
-We can run a sliding computation over an \code{epi_archive} object, much like
-\code{epi_slide()} does for an \code{epi_df} object. This is accomplished by calling
-the \code{slide()} method for an \code{epi_archive} object, which works similarly to
-the way \code{epi_slide()} works for an \code{epi_df} object, but with one key
-difference: it is version-aware. That is, for an \code{epi_archive} object, the
-sliding computation at any given reference time point t is performed on
-\strong{data that would have been available as of t}.
-}
-
 \examples{
 # Simple ex. with necessary keys
 tib <- tibble::tibble(
@@ -197,3 +205,6 @@ df <- data.frame(
 x <- df \%>\%
   as_epi_archive(other_keys = "county")
 }
+\seealso{
+\code{\link{epix_as_of}} \code{\link{epix_merge}} \code{\link{epix_slide}}
+}
diff --git a/man/epi_df.Rd b/man/epi_df.Rd
index d863f655..71aa6ce1 100644
--- a/man/epi_df.Rd
+++ b/man/epi_df.Rd
@@ -8,6 +8,7 @@
 \alias{as_epi_df.tbl_ts}
 \alias{new_epi_df}
 \alias{epi_df}
+\alias{is_epi_df}
 \title{\code{epi_df} object}
 \usage{
 as_epi_df(x, ...)
@@ -35,10 +36,11 @@ new_epi_df(
   other_keys = character(),
   ...
 )
+
+is_epi_df(x)
 }
 \arguments{
-\item{x}{An \code{epi_df}, \code{data.frame}, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble}
-to be converted}
+\item{x}{An object.}
 
 \item{...}{Additional arguments passed to methods.}
 
@@ -64,11 +66,15 @@ as a character vector here (typical examples are "age" or sub-geographies).}
 }
 \value{
 An \code{epi_df} object.
+
+\code{TRUE} if the object inherits from \code{epi_df}.
 }
 \description{
-An \code{epi_df} is a tibble with certain minimal column structure and metadata.
-It can be seen as a snapshot of a data set that contains the most
-up-to-date values of some signal variables of interest, as of a given time.
+One of the two main data structures for storing time series in \code{epiprocess}.
+It is simply a tibble with at least two columns, \code{geo_value} and \code{time_value},
+that provide the keys for the time series. It can have any other columns,
+which can be seen as measured variables at each key. In brief, an \code{epi_df}
+represents a snapshot of an epidemiological data set at a point in time.
 }
 \details{
 An \code{epi_df} is a tibble with (at least) the following columns:
@@ -109,6 +115,9 @@ of an \code{epi_df} object. In brief, we can think of an \code{epi_df} object as
 single snapshot of a data set that contains the most up-to-date values of the
 signals variables, as of the time specified in the \code{as_of} field.
 
+If an \code{epi_df} ever loses its \code{geo_value} or \code{time_value} columns, it will
+decay into a regular tibble.
+
 A companion object is the \code{epi_archive} object, which contains the full
 version history of a given data set. Revisions are common in many types of
 epidemiological data streams, and paying attention to data revisions can be
 the documentation for \code{\link{epi_archive}} for more details on how data
 versioning works in the \code{epiprocess} package (including how to generate
 \code{epi_df} objects, as data snapshots, from an \code{epi_archive} object). 
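(Aside: the snapshot generation mentioned in the paragraph above looks roughly like this, assuming the `archive_cases_dv_subset` example archive that ships with the package:)

```r
library(epiprocess)

# Reconstruct the epi_df that would have been observable on 2021-06-01;
# the snapshot's `as_of` metadata is set to that version.
snapshot <- epix_as_of(archive_cases_dv_subset, as.Date("2021-06-01"))
attr(snapshot, "metadata")$as_of
```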
-} -\section{Functions}{ -\itemize{ -\item \code{as_epi_df()}: The preferred way of constructing \code{epi_df}s - -\item \code{new_epi_df()}: Lower-level constructor for \code{epi_df} object - -}} -\section{Geo Types}{ +\subsection{Geo Types}{ The following geo types are recognized in an \code{epi_df}. \itemize{ @@ -146,7 +147,7 @@ alpha-2 country codes (lowercase). An unrecognizable geo type is labeled "custom". } -\section{Time Types}{ +\subsection{Time Types}{ The following time types are recognized in an \code{epi_df}. \itemize{ @@ -162,37 +163,41 @@ arbitrary (as to whether a week starts on a Monday, Tuesday); coded as a An unrecognizable time type is labeled "custom". } +} +\section{Functions}{ +\itemize{ +\item \code{as_epi_df()}: The preferred way of constructing \code{epi_df}s + +\item \code{new_epi_df()}: Lower-level constructor for \code{epi_df} object +}} \examples{ # Convert a `tsibble` that has county code as an extra key # Notice that county code should be a character string to preserve any leading zeroes - ex1_input <- tibble::tibble( - geo_value = rep(c("ca", "fl", "pa"), each = 3), - county_code = c( + geo_value = c( "06059", "06061", "06067", "12111", "12113", "12117", "42101", "42103", "42105" ), + state_name = rep(c("ca", "fl", "pa"), each = 3), time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day" ), length.out = length(geo_value)), value = 1:length(geo_value) + 0.01 * rnorm(length(geo_value)) ) \%>\% - tsibble::as_tsibble(index = time_value, key = c(geo_value, county_code)) + tsibble::as_tsibble(index = time_value, key = c(geo_value, state_name)) -# The `other_keys` metadata (`"county_code"` in this case) is automatically +# The `other_keys` metadata (`"state_name"` in this case) is automatically # inferred from the `tsibble`'s `key`: ex1 <- as_epi_df(x = ex1_input, as_of = "2020-06-03") attr(ex1, "metadata")[["other_keys"]] - # Dealing with misspecified column names: # Geographical and temporal information must be provided in columns named # `geo_value` and `time_value`; if we start from a data frame with a # different format, it must be converted to use `geo_value` and `time_value` # before calling `as_epi_df`. - ex2_input <- tibble::tibble( state = rep(c("ca", "fl", "pa"), each = 3), # misnamed pol = rep(c("blue", "swing", "swing"), each = 3), # extra key @@ -201,7 +206,6 @@ ex2_input <- tibble::tibble( ), length.out = length(state)), # misnamed value = 1:length(state) + 0.01 * rnorm(length(state)) ) - print(ex2_input) ex2 <- ex2_input \%>\% @@ -210,12 +214,9 @@ ex2 <- ex2_input \%>\% as_of = "2020-06-03", other_keys = "pol" ) - attr(ex2, "metadata") - # Adding additional keys to an `epi_df` object - ex3_input <- covid_incidence_county_subset \%>\% dplyr::filter(time_value > "2021-12-01", state_name == "Massachusetts") \%>\% dplyr::slice_tail(n = 6) @@ -230,4 +231,8 @@ ex3 <- ex3_input \%>\% as_epi_df(other_keys = c("state", "pol")) attr(ex3, "metadata") + +# Decays to a tibble +covid_incidence_county_subset \%>\% + select(-geo_value) } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 71734cc1..bd3f2f68 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -16,15 +16,23 @@ epi_slide( ) } \arguments{ -\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a -single data group.} +\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any +columns in \code{other_keys}. 
If grouped, we make sure the grouping is by
+\code{geo_value} and \code{other_keys}.}
 
 \item{.f}{Function, formula, or missing; together with \code{...} specifies the
-computation to slide. To "slide" means to apply a computation within a
-sliding (a.k.a. "rolling") time window for each data group. The window is
-determined by the \code{.window_size} and \code{.align} parameters, see the details
-section for more. If a function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where
+computation to slide. The return of the computation should either be a
+scalar or a 1-row data frame. Data frame returns will be
+\code{tidyr::unpack()}-ed, if unnamed, and will be \code{\link[tidyr:pack]{tidyr::pack}}-ed columns, if
+named. See examples.
+\itemize{
+\item If \code{.f} is missing, then \code{...} will specify the computation via
+tidy-evaluation. This is usually the most convenient way to use
+\code{epi_slide}. See examples.
+\item If \code{.f} is a formula, then the formula should use \code{.x} (not the same as
+the input \code{epi_df}) to operate on the columns of the input \code{epi_df}, e.g.
+\code{~mean(.x$var)} to compute a mean of \code{var}.
+\item If a function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where:
 \itemize{
 \item \code{x} is a data frame with the same column names as the original object,
 minus any grouping variables, with only the windowed data for one
@@ -34,135 +42,75 @@ for the associated group
 \item \code{t} is the \code{.ref_time_value} for the current window
 \item \code{...} are additional arguments
 }
-
-If a formula, \code{.f} can operate directly on columns accessed via \code{.x$var} or
-\code{.$var}, as in \code{~mean(.x$var)} to compute a mean of a column \code{var} for each
-\code{ref_time_value}-group combination. The group key can be accessed via \code{.y}.
-If \code{.f} is missing, then \code{...} will specify the computation.}
+}}
 
 \item{...}{Additional arguments to pass to the function or formula specified
 via \code{.f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted as a
 \link[rlang:args_data_masking]{"data-masking"} expression or expressions
-for tidy evaluation; in addition to referring columns directly by name, the
-expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs,
-and can also refer to \code{.x} (not the same as the input epi_df),
-\code{.group_key}, and \code{.ref_time_value}. See details.}
+for tidy evaluation.}
 
-\item{.window_size}{The size of the sliding window. By default, this is 1,
-meaning that only the current ref_time_value is included. The accepted values
-here depend on the \code{time_value} column:
+\item{.window_size}{The size of the sliding window.
The accepted values
+depend on the type of the \code{time_value} column in \code{.x}:
 \itemize{
-\item if time_type is Date and the cadence is daily, then \code{.window_size} can be
-an integer (which will be interpreted in units of days) or a difftime
+\item if time type is \code{Date} and the cadence is daily, then \code{.window_size} can
+be an integer (which will be interpreted in units of days) or a difftime
 with units "days"
-\item if time_type is Date and the cadence is weekly, then \code{.window_size} must
-be a difftime with units "weeks"
-\item if time_type is an yearmonth or integer, then \code{.window_size} must be an
+\item if time type is \code{Date} and the cadence is weekly, then \code{.window_size} must
+be a \code{difftime} with units "weeks"
+\item if time type is a \code{yearmonth} or an integer, then \code{.window_size} must be an
 integer
 }}
 
-\item{.align}{The alignment of the sliding window. If \code{right} (default), then
-the window has its end at the reference time; if \code{center}, then the window is
-centered at the reference time; if \code{left}, then the window has its start at
-the reference time. If the alignment is \code{center} and the window size is odd,
-then the window will have floor(window_size/2) points before and after the
-reference time. If the window size is even, then the window will be
-asymmetric and have one less value on the right side of the reference time
-(assuming time increases from left to right).}
+\item{.align}{The alignment of the sliding window.
+\itemize{
+\item If "right" (default), then the window has its end at the reference time.
+This is likely the most common use case, e.g. \code{.window_size=7} and
+\code{.align="right"} slides over the past week of data.
+\item If "left", then the window has its start at the reference time.
+\item If "center", then the window is centered at the reference time. If the
+window size is odd, then the window will have floor(window_size/2) points
+before and after the reference time; if the window size is even, then the
+window will be asymmetric and have one more value before the reference time
+than after.
+}}
 
-\item{.ref_time_values}{Time values for sliding computations, meaning, each
-element of this vector serves as the reference time point for one sliding
-window. If missing, then this will be set to all unique time values in the
-underlying data table, by default.}
+\item{.ref_time_values}{The time values at which to compute the slide
+values. By default, this is all the unique time values in \code{.x}.}
 
-\item{.new_col_name}{String indicating the name of the new column that will
-contain the derivative values. The default is "slide_value" unless your
-slide computations output data frames, in which case they will be unpacked
+\item{.new_col_name}{Name for the new column that will contain the computed
+values. The default is "slide_value" unless your slide computations output
+data frames, in which case they will be unpacked
 (as in \code{tidyr::unpack()}) into the constituent columns and those names used.
New columns should not
-be given names that clash with the existing columns of \code{.x}; see details.}
+be given names that clash with the existing columns of \code{.x}.}
 
-\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in
-the output even with \code{.ref_time_values} provided, with some type of missing
-value marker for the slide computation output column(s) for \code{time_value}s
-outside \code{.ref_time_values}; otherwise, there will be one row for each row in
-\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The
-missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output.}
+\item{.all_rows}{If \code{.all_rows = FALSE}, the default, then the output
+\code{epi_df} will have only the rows that had a \code{time_value} in
+\code{.ref_time_values}. Otherwise, all the rows from \code{.x} are included, with
+a missing value marker (typically NA, but more technically the result of
+\code{vctrs::vec_cast}-ing \code{NA} to the type of the slide computation output).}
 }
 \value{
-An \code{epi_df} object given by appending one or more new columns to \code{.x},
-named according to the \code{.new_col_name} argument.
+An \code{epi_df} object with one or more new slide computation columns
+added.
 }
 \description{
-Slides a given function over variables in an \code{epi_df} object. See the
-\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
-for examples.
-}
-\details{
-To "slide" means to apply a function or formula over a rolling
-window. The \code{.window_size} arg determines the width of the window
-(including the reference time) and the \code{.align} arg governs how the window
-is aligned (see below for examples). The \code{.ref_time_values} arg controls
-which time values to consider for the slide and \code{.all_rows} allows you to
-keep NAs around.
-
-\code{epi_slide()} does not require a complete window (such as on the left
-boundary of the dataset) and will attempt to perform the computation
-anyway. The issue of what to do with partial computations (those run on
-incomplete windows) is therefore left up to the user, either through the
-specified function or formula, or through post-processing.
-
-Let's look at some window examples, assuming that the reference time value
-is "tv". With .align = "right" and .window_size = 3, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 2, tv - 1, tv
-
-With .align = "center" and .window_size = 3, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 1, tv, tv + 1
-
-With .align = "center" and .window_size = 4, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 2, tv - 1, tv, tv + 1
-
-With .align = "left" and .window_size = 3, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv, tv + 1, tv + 2
-
-If \code{.f} is missing, then \link[rlang:args_data_masking]{"data-masking"}
-expression(s) for tidy evaluation can be specified, for example, as in:
-
-\if{html}{\out{
}}\preformatted{epi_slide(x, cases_7dav = mean(cases), .window_size = 7) -}\if{html}{\out{
}} - -which would be equivalent to: +Slides a given function over variables in an \code{epi_df} object. +This is useful for computations like rolling averages. The function supports +many ways to specify the computation, but by far the most common use case is +as follows: -\if{html}{\out{
}}\preformatted{epi_slide(x, function(x, g, t) mean(x$cases), .window_size = 7, - .new_col_name = "cases_7dav") +\if{html}{\out{
}}\preformatted{# To compute the 7-day trailing average of cases +epi_slide(edf, cases_7dav = mean(cases), .window_size = 7) }\if{html}{\out{
}} -In a manner similar to \code{\link[dplyr:mutate]{dplyr::mutate}}: -\itemize{ -\item Expressions evaluating to length-1 vectors will be recycled to -appropriate lengths. -\item \verb{, name_var := value} can be used to set the output column name based on -a variable \code{name_var} rather than requiring you to use a hard-coded -name. (The leading comma is needed to make sure that \code{.f} is treated as -missing.) -\item \verb{= NULL} can be used to remove results from previous expressions (though -we don't allow it to remove pre-existing columns). -\item \verb{, fn_returning_a_data_frame(.x)} will unpack the output of the function -into multiple columns in the result. -\item Named expressions evaluating to data frames will be placed into -\code{\link[tidyr:pack]{tidyr::pack}}ed columns. +This will create the new column \code{cases_7dav} that contains a 7-day rolling +average of values in "cases". See \code{vignette("epi_df")} for more examples. } +\details{ +\subsection{Advanced uses of \code{.f} via tidy evaluation}{ -In addition to \code{\link{.data}} and \code{\link{.env}}, we make some additional -"pronoun"-like bindings available: +If specifying \code{.f} via tidy evaluation, in addition to the standard \code{\link{.data}} +and \code{\link{.env}}, we make some additional "pronoun"-like bindings available: \itemize{ \item .x, which is like \code{.x} in \code{\link[dplyr:group_map]{dplyr::group_modify}}; an ordinary object like an \code{epi_df} rather than an rlang \link[rlang:as_data_mask]{pronoun} @@ -175,49 +123,61 @@ will. determined the time window for the current computation. } } +} \examples{ -# slide a 7-day trailing average formula on cases -# Simple sliding means and sums are much faster to do using -# the `epi_slide_mean` and `epi_slide_sum` functions instead. 
-cases_deaths_subset \%>\%
-  group_by(geo_value) \%>\%
-  epi_slide(cases_7dav = mean(cases), .window_size = 7) \%>\%
-  dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\%
-  ungroup()
-
-# slide a 7-day leading average
+# Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases
 cases_deaths_subset \%>\%
-  group_by(geo_value) \%>\%
-  epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "left") \%>\%
-  dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\%
-  ungroup()
+  epi_slide(
+    cases_7sd = sd(cases, na.rm = TRUE),
+    cases_7dav = mean(cases, na.rm = TRUE),
+    .window_size = 7
+  ) \%>\%
+  dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav)
 
-# slide a 7-day centre-aligned average
+# The same as above, but unpacking using an unnamed data.frame with a formula
 cases_deaths_subset \%>\%
-  group_by(geo_value) \%>\%
-  epi_slide(cases_7dav = mean(cases), .window_size = 7, .align = "center") \%>\%
-  dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\%
-  ungroup()
+  epi_slide(
+    ~ data.frame(
+      cases_7sd = sd(.x$cases, na.rm = TRUE),
+      cases_7dav = mean(.x$cases, na.rm = TRUE)
+    ),
+    .window_size = 7
+  ) \%>\%
+  dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav)
 
-# slide a 14-day centre-aligned average
+# The same as above, but packing using a named data.frame with a tidy evaluation
+# expression
 cases_deaths_subset \%>\%
-  group_by(geo_value) \%>\%
-  epi_slide(cases_14dav = mean(cases), .window_size = 14, .align = "center") \%>\%
-  dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\%
-  ungroup()
+  epi_slide(
+    slide_packed = data.frame(
+      cases_7sd = sd(.x$cases, na.rm = TRUE),
+      cases_7dav = mean(.x$cases, na.rm = TRUE)
+    ),
+    .window_size = 7
+  ) \%>\%
+  dplyr::select(geo_value, time_value, cases, slide_packed)
 
-# nested new columns
+# The same as above, but specifying the computation as a function
 cases_deaths_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide(
-    cases_2d = list(data.frame(
-      cases_2dav = mean(cases),
-      cases_2dma = mad(cases)
-    )),
-    .window_size = 2
+    function(x, g, t) {
+      data.frame(
+        cases_7sd = sd(x$cases, na.rm = TRUE),
+        cases_7dav = mean(x$cases, na.rm = TRUE)
+      )
+    },
+    .window_size = 7
   ) \%>\%
-  ungroup()
+  dplyr::select(geo_value, time_value, cases, cases_7sd, cases_7dav)
+
+# Use the geo_value or the time_value in the slide computation
+cases_deaths_subset \%>\%
+  epi_slide(~ .x$geo_value[[1]], .window_size = 7)
+
+cases_deaths_subset \%>\%
+  epi_slide(~ .x$time_value[[1]], .window_size = 7)
 }
 \seealso{
-\code{\link{epi_slide_opt}} \code{\link{epi_slide_mean}} \code{\link{epi_slide_sum}}
+\code{\link{epi_slide_opt}} for optimized slide functions
 }
diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd
deleted file mode 100644
index e075f759..00000000
--- a/man/epi_slide_mean.Rd
+++ /dev/null
@@ -1,166 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/slide.R
-\name{epi_slide_mean}
-\alias{epi_slide_mean}
-\title{Optimized slide function for performing rolling averages on an \code{epi_df} object}
-\usage{
-epi_slide_mean(
-  .x,
-  .col_names,
-  ...,
-  .window_size = NULL,
-  .align = c("right", "center", "left"),
-  .ref_time_values = NULL,
-  .all_rows = FALSE
-)
-}
-\arguments{
-\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped}
-or ungrouped.
If ungrouped, all data in \code{.x} will be treated as part of a -single data group.} - -\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), -\link[tidyselect:language]{other tidy-select expression}, or a vector of -characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if -they were positions in the data frame, so expressions like \code{x:y} can be -used to select a range of variables. - -The tidy-selection renaming interface is not supported, and cannot be used -to provide output column names; if you want to customize the output column -names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} - -\item{...}{Additional arguments to pass to the slide computation \code{.f}, for -example, \code{algo} or \code{na.rm} in data.table functions. You don't need to -specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider -functions).} - -\item{.window_size}{The size of the sliding window. By default, this is 1, -meaning that only the current ref_time_value is included. The accepted values -here depend on the \code{time_value} column: -\itemize{ -\item if time_type is Date and the cadence is daily, then \code{.window_size} can be -an integer (which will be interpreted in units of days) or a difftime -with units "days" -\item if time_type is Date and the cadence is weekly, then \code{.window_size} must -be a difftime with units "weeks" -\item if time_type is an yearmonth or integer, then \code{.window_size} must be an -integer -}} - -\item{.align}{The alignment of the sliding window. If \code{right} (default), then -the window has its end at the reference time; if \code{center}, then the window is -centered at the reference time; if \code{left}, then the window has its start at -the reference time. If the alignment is \code{center} and the window size is odd, -then the window will have floor(window_size/2) points before and after the -reference time. If the window size is even, then the window will be -asymmetric and have one less value on the right side of the reference time -(assuming time increases from left to right).} - -\item{.ref_time_values}{Time values for sliding computations, meaning, each -element of this vector serves as the reference time point for one sliding -window. If missing, then this will be set to all unique time values in the -underlying data table, by default.} - -\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in -the output even with \code{.ref_time_values} provided, with some type of missing -value marker for the slide computation output column(s) for \code{time_value}s -outside \code{.ref_time_values}; otherwise, there will be one row for each row in -\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The -missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} -} -\value{ -An \code{epi_df} object given by appending one or more new columns to \code{.x}, -named according to the \code{.new_col_name} argument. -} -\description{ -Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for -examples. -} -\details{ -Wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollmean}. 
- -To "slide" means to apply a function or formula over a rolling -window. The \code{.window_size} arg determines the width of the window -(including the reference time) and the \code{.align} arg governs how the window -is aligned (see below for examples). The \code{.ref_time_values} arg controls -which time values to consider for the slide and \code{.all_rows} allows you to -keep NAs around. - -\verb{epi_slide_*()} does not require a complete window (such as on the left -boundary of the dataset) and will attempt to perform the computation -anyway. The issue of what to do with partial computations (those run on -incomplete windows) is therefore left up to the user, either through the -specified function or formula \code{f}, or through post-processing. - -Let's look at some window examples, assuming that the reference time value -is "tv". With .align = "right" and .window_size = 3, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 2, tv - 1, tv - -With .align = "center" and .window_size = 3, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 1, tv, tv + 1 - -With .align = "center" and .window_size = 4, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 2, tv - 1, tv, tv + 1 - -With .align = "left" and .window_size = 3, the window will be: - -time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv, tv + 1, tv + 2 -} -\examples{ -# slide a 7-day trailing average formula on cases -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7) \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% - ungroup() - -# slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed -# and accuracy, and to allow partially-missing windows. -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_mean( - cases, - .window_size = 7, - # `frollmean` options - na.rm = TRUE, algo = "exact", hasNA = TRUE - ) \%>\% - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% - ungroup() - -# slide a 7-day leading average -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7, .align = "right") \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% - ungroup() - -# slide a 7-day centre-aligned average -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 7, .align = "center") \%>\% - # Remove a nonessential var. to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% - ungroup() - -# slide a 14-day centre-aligned average -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_mean(cases, .window_size = 14, .align = "center") \%>\% - # Remove a nonessential var. 
to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_14dav = slide_value_cases) \%>\% - ungroup() -} -\seealso{ -\code{\link{epi_slide}} \code{\link{epi_slide_opt}} \code{\link{epi_slide_sum}} -} diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 7ec78828..cd293ee1 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -2,8 +2,9 @@ % Please edit documentation in R/slide.R \name{epi_slide_opt} \alias{epi_slide_opt} -\title{Optimized slide function for performing common rolling computations on an -\code{epi_df} object} +\alias{epi_slide_mean} +\alias{epi_slide_sum} +\title{Optimized slide functions for common cases} \usage{ epi_slide_opt( .x, @@ -15,11 +16,31 @@ epi_slide_opt( .ref_time_values = NULL, .all_rows = FALSE ) + +epi_slide_mean( + .x, + .col_names, + ..., + .window_size = NULL, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE +) + +epi_slide_sum( + .x, + .col_names, + ..., + .window_size = NULL, + .align = c("right", "center", "left"), + .ref_time_values = NULL, + .all_rows = FALSE +) } \arguments{ -\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a -single data group.} +\item{.x}{An \code{epi_df} object. If ungrouped, we group by \code{geo_value} and any +columns in \code{other_keys}. If grouped, we make sure the grouping is by +\code{geo_value} and \code{other_keys}.} \item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), @@ -50,133 +71,93 @@ example, \code{algo} or \code{na.rm} in data.table functions. You don't need to specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider functions).} -\item{.window_size}{The size of the sliding window. By default, this is 1, -meaning that only the current ref_time_value is included. The accepted values -here depend on the \code{time_value} column: +\item{.window_size}{The size of the sliding window. The accepted values +depend on the type of the \code{time_value} column in \code{.x}: \itemize{ -\item if time_type is Date and the cadence is daily, then \code{.window_size} can be -an integer (which will be interpreted in units of days) or a difftime +\item if time type is \code{Date} and the cadence is daily, then \code{.window_size} can +be an integer (which will be interpreted in units of days) or a difftime with units "days" -\item if time_type is Date and the cadence is weekly, then \code{.window_size} must -be a difftime with units "weeks" -\item if time_type is an yearmonth or integer, then \code{.window_size} must be an +\item if time type is \code{Date} and the cadence is weekly, then \code{.window_size} must +be a \code{difftime} with units "weeks" +\item if time type is a \code{yearmonth} or an integer, then \code{.window_size} must be an integer }} -\item{.align}{The alignment of the sliding window. If \code{right} (default), then -the window has its end at the reference time; if \code{center}, then the window is -centered at the reference time; if \code{left}, then the window has its start at -the reference time. If the alignment is \code{center} and the window size is odd, -then the window will have floor(window_size/2) points before and after the -reference time. 
If the window size is even, then the window will be
-asymmetric and have one less value on the right side of the reference time
-(assuming time increases from left to right).}
+\item{.align}{The alignment of the sliding window.
+\itemize{
+\item If "right" (default), then the window has its end at the reference time.
+This is likely the most common use case, e.g. \code{.window_size=7} and
+\code{.align="right"} slides over the past week of data.
+\item If "left", then the window has its start at the reference time.
+\item If "center", then the window is centered at the reference time. If the
+window size is odd, then the window will have floor(window_size/2) points
+before and after the reference time; if the window size is even, then the
+window will be asymmetric and have one more value before the reference time
+than after.
+}}
 
-\item{.ref_time_values}{Time values for sliding computations, meaning, each
-element of this vector serves as the reference time point for one sliding
-window. If missing, then this will be set to all unique time values in the
-underlying data table, by default.}
+\item{.ref_time_values}{The time values at which to compute the slide
+values. By default, this is all the unique time values in \code{.x}.}
 
-\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in
-the output even with \code{.ref_time_values} provided, with some type of missing
-value marker for the slide computation output column(s) for \code{time_value}s
-outside \code{.ref_time_values}; otherwise, there will be one row for each row in
-\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The
-missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type
-of the slide computation output.}
+\item{.all_rows}{If \code{.all_rows = FALSE}, the default, then the output
+\code{epi_df} will have only the rows that had a \code{time_value} in
+\code{.ref_time_values}. Otherwise, all the rows from \code{.x} are included, with
+a missing value marker (typically NA, but more technically the result of
+\code{vctrs::vec_cast}-ing \code{NA} to the type of the slide computation output).}
 }
 \value{
-An \code{epi_df} object given by appending one or more new columns to \code{.x},
-named according to the \code{.new_col_name} argument.
+An \code{epi_df} object with one or more new slide computation columns
+added.
 }
 \description{
-Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function
-over variables in an \code{epi_df} object. See the
-\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette}
-for examples.
-}
-\details{
-To "slide" means to apply a function or formula over a rolling
-window. The \code{.window_size} arg determines the width of the window
-(including the reference time) and the \code{.align} arg governs how the window
-is aligned (see below for examples). The \code{.ref_time_values} arg controls
-which time values to consider for the slide and \code{.all_rows} allows you to
-keep NAs around.
-
-\verb{epi_slide_*()} does not require a complete window (such as on the left
-boundary of the dataset) and will attempt to perform the computation
-anyway. The issue of what to do with partial computations (those run on
-incomplete windows) is therefore left up to the user, either through the
-specified function or formula \code{f}, or through post-processing.
-
-Let's look at some window examples, assuming that the reference time value
-is "tv". With .align = "right" and .window_size = 3, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 2, tv - 1, tv
+\code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll}
+or \link[slider:summary-slide]{slider::summary-slide} function over variables in an \code{epi_df} object.
+These functions tend to be much faster than \code{epi_slide()}. See
+\code{vignette("epi_df")} for more examples.
 
-With .align = "center" and .window_size = 3, the window will be:
+\code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}.
 
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 1, tv, tv + 1
-
-With .align = "center" and .window_size = 4, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv - 2, tv - 1, tv, tv + 1
-
-With .align = "left" and .window_size = 3, the window will be:
-
-time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3
-window: tv, tv + 1, tv + 2
+\code{epi_slide_sum} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollsum}.
 }
 \examples{
-# slide a 7-day trailing average formula on cases. This can also be done with `epi_slide_mean`
+# Compute a 7-day trailing average on cases.
 cases_deaths_subset \%>\%
   group_by(geo_value) \%>\%
-  epi_slide_opt(
-    cases,
-    .f = data.table::frollmean, .window_size = 7
-  ) \%>\%
-  # Remove a nonessential var. to ensure new col is printed, and rename new col
-  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
-  ungroup()
+  epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7) \%>\%
+  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
 
-# slide a 7-day trailing average formula on cases. Adjust `frollmean` settings for speed
-# and accuracy, and to allow partially-missing windows.
+# Same as above, but adjust `frollmean` settings for speed, accuracy, and
+# to allow partially-missing windows.
 cases_deaths_subset \%>\%
   group_by(geo_value) \%>\%
   epi_slide_opt(
     cases,
     .f = data.table::frollmean, .window_size = 7,
-    # `frollmean` options
     algo = "exact", hasNA = TRUE, na.rm = TRUE
   ) \%>\%
-  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
-  ungroup()
-
-# slide a 7-day leading average
+  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
+
+# Same as the first example, but using the `epi_slide_mean` wrapper.
 cases_deaths_subset \%>\%
   group_by(geo_value) \%>\%
-  epi_slide_opt(
-    cases,
-    .f = slider::slide_mean, .window_size = 7, .align = "left"
-  ) \%>\%
-  # Remove a nonessential var. to ensure new col is printed
-  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\%
-  ungroup()
+  epi_slide_mean(cases, .window_size = 7) \%>\%
+  dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases)
 
-# slide a 7-day centre-aligned sum. This can also be done with `epi_slide_sum`
+# Same as above, but adjust `frollmean` settings for speed, accuracy, and
+# to allow partially-missing windows.
 cases_deaths_subset \%>\%
   group_by(geo_value) \%>\%
-  epi_slide_opt(
+  epi_slide_mean(
     cases,
-    .f = data.table::frollsum, .window_size = 6, .align = "center"
+    .window_size = 7,
+    na.rm = TRUE, algo = "exact", hasNA = TRUE
   ) \%>\%
-  # Remove a nonessential var.
to ensure new col is printed - dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) \%>\% - ungroup() + dplyr::select(geo_value, time_value, cases, cases_7dav = slide_value_cases) +# Compute a 7-day trailing sum on cases. +cases_deaths_subset \%>\% + group_by(geo_value) \%>\% + epi_slide_sum(cases, .window_size = 7) \%>\% + dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) } \seealso{ -\code{\link{epi_slide}} \code{\link{epi_slide_mean}} \code{\link{epi_slide_sum}} +\code{\link{epi_slide}} for the more general slide function } diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd deleted file mode 100644 index 920aa370..00000000 --- a/man/epi_slide_sum.Rd +++ /dev/null @@ -1,129 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/slide.R -\name{epi_slide_sum} -\alias{epi_slide_sum} -\title{Optimized slide function for performing rolling sums on an \code{epi_df} object} -\usage{ -epi_slide_sum( - .x, - .col_names, - ..., - .window_size = NULL, - .align = c("right", "center", "left"), - .ref_time_values = NULL, - .all_rows = FALSE -) -} -\arguments{ -\item{.x}{The \code{epi_df} object under consideration, \link[dplyr:group_by]{grouped} -or ungrouped. If ungrouped, all data in \code{.x} will be treated as part of a -single data group.} - -\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column -name(e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), -\link[tidyselect:language]{other tidy-select expression}, or a vector of -characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if -they were positions in the data frame, so expressions like \code{x:y} can be -used to select a range of variables. - -The tidy-selection renaming interface is not supported, and cannot be used -to provide output column names; if you want to customize the output column -names, use \code{\link[dplyr:rename]{dplyr::rename}} after the slide.} - -\item{...}{Additional arguments to pass to the slide computation \code{.f}, for -example, \code{algo} or \code{na.rm} in data.table functions. You don't need to -specify \code{.x}, \code{.window_size}, or \code{.align} (or \code{before}/\code{after} for slider -functions).} - -\item{.window_size}{The size of the sliding window. By default, this is 1, -meaning that only the current ref_time_value is included. The accepted values -here depend on the \code{time_value} column: -\itemize{ -\item if time_type is Date and the cadence is daily, then \code{.window_size} can be -an integer (which will be interpreted in units of days) or a difftime -with units "days" -\item if time_type is Date and the cadence is weekly, then \code{.window_size} must -be a difftime with units "weeks" -\item if time_type is an yearmonth or integer, then \code{.window_size} must be an -integer -}} - -\item{.align}{The alignment of the sliding window. If \code{right} (default), then -the window has its end at the reference time; if \code{center}, then the window is -centered at the reference time; if \code{left}, then the window has its start at -the reference time. If the alignment is \code{center} and the window size is odd, -then the window will have floor(window_size/2) points before and after the -reference time. 
If the window size is even, then the window will be -asymmetric and have one less value on the right side of the reference time -(assuming time increases from left to right).} - -\item{.ref_time_values}{Time values for sliding computations, meaning, each -element of this vector serves as the reference time point for one sliding -window. If missing, then this will be set to all unique time values in the -underlying data table, by default.} - -\item{.all_rows}{If \code{.all_rows = TRUE}, then all rows of \code{.x} will be kept in -the output even with \code{.ref_time_values} provided, with some type of missing -value marker for the slide computation output column(s) for \code{time_value}s -outside \code{.ref_time_values}; otherwise, there will be one row for each row in -\code{.x} that had a \code{time_value} in \code{.ref_time_values}. Default is \code{FALSE}. The -missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} -} -\value{ -An \code{epi_df} object given by appending one or more new columns to \code{.x}, -named according to the \code{.new_col_name} argument. -} -\description{ -Slides an n-timestep sum over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for -examples. -} -\details{ -Wrapper around \code{epi_slide_opt} with \code{.f = datatable::frollsum}. - -To "slide" means to apply a function or formula over a rolling -window. The \code{.window_size} arg determines the width of the window -(including the reference time) and the \code{.align} arg governs how the window -is aligned (see below for examples). The \code{.ref_time_values} arg controls -which time values to consider for the slide and \code{.all_rows} allows you to -keep NAs around. - -\verb{epi_slide_*()} does not require a complete window (such as on the left -boundary of the dataset) and will attempt to perform the computation -anyway. The issue of what to do with partial computations (those run on -incomplete windows) is therefore left up to the user, either through the -specified function or formula \code{f}, or through post-processing. - -Let's look at some window examples, assuming that the reference time value -is "tv". With .align = "right" and .window_size = 3, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 2, tv - 1, tv - -With .align = "center" and .window_size = 3, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 1, tv, tv + 1 - -With .align = "center" and .window_size = 4, the window will be: - -time_values: tv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv - 2, tv - 1, tv, tv + 1 - -With .align = "left" and .window_size = 3, the window will be: - -time_values: ttv - 3, tv - 2, tv - 1, tv, tv + 1, tv + 2, tv + 3 -window: tv, tv + 1, tv + 2 -} -\examples{ -# slide a 7-day trailing sum formula on cases -cases_deaths_subset \%>\% - group_by(geo_value) \%>\% - epi_slide_sum(cases, .window_size = 7) \%>\% - # Remove a nonessential var. 
to ensure new col is printed
-  dplyr::select(geo_value, time_value, cases, cases_7dsum = slide_value_cases) \%>\%
-  ungroup()
-}
-\seealso{
-\code{\link{epi_slide}} \code{\link{epi_slide_opt}} \code{\link{epi_slide_mean}}
-}
diff --git a/man/epiprocess.Rd b/man/epiprocess-package.Rd
similarity index 77%
rename from man/epiprocess.Rd
rename to man/epiprocess-package.Rd
index bf5f5279..b4f3e174 100644
--- a/man/epiprocess.Rd
+++ b/man/epiprocess-package.Rd
@@ -1,14 +1,12 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/epiprocess.R
+% Please edit documentation in R/epiprocess-package.R
 \docType{package}
-\name{epiprocess}
-\alias{epiprocess-package}
+\name{epiprocess-package}
 \alias{epiprocess}
+\alias{epiprocess-package}
 \title{epiprocess: Tools for basic signal processing in epidemiology}
 \description{
-This package introduces a common data structure for epidemiological data sets
-measured over space and time, and offers associated utilities to perform
-basic signal processing tasks.
+This package introduces common data structures for working with epidemiological data reported by location and time and offers associated utilities to perform basic signal processing tasks. The package is designed to be used in conjunction with `epipredict` for building and evaluating epidemiological models.
 }
 \seealso{
 Useful links:
@@ -46,3 +44,4 @@ Other contributors:
 }
 }
+\keyword{internal}
diff --git a/man/epix_fill_through_version.Rd b/man/epix_fill_through_version.Rd
index a6f9c360..bd1bf4de 100644
--- a/man/epix_fill_through_version.Rd
+++ b/man/epix_fill_through_version.Rd
@@ -9,28 +9,45 @@ epix_fill_through_version(x, fill_versions_end, how = c("na", "locf"))
 \arguments{
 \item{x}{An \code{epi_archive}}
 
-\item{fill_versions_end}{Length-1, same class&type as \code{x$version}: the
-version through which to fill in missing version history; this will be the
-result's \verb{$versions_end} unless it already had a later
-\verb{$versions_end}.}
+\item{fill_versions_end}{A scalar of the same class and type as \code{x$version}:
+the version through which to fill in missing version history; the
+\code{epi_archive}'s \code{versions_end} attribute will be set to this, unless it
+already had a later \verb{$versions_end}.}
 
-\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} will fill in any missing
-required version history with \code{NA}s, by inserting (if necessary) an update
-immediately after the current \verb{$versions_end} that revises all
-existing measurements to be \code{NA} (this is only supported for \code{version}
-classes with a \code{next_after} implementation); \code{"locf"} will fill in missing
-version history with the last version of each observation carried forward
-(LOCF), by leaving the update \verb{$DT} alone (other \code{epi_archive} methods are
-based on LOCF). Default is \code{"na"}.}
+\item{how}{Optional; \code{"na"} or \code{"locf"}: \code{"na"} fills missing version history
+with \code{NA}s, \code{"locf"} fills missing version history with the last version of
+each observation carried forward (LOCF). Default is \code{"na"}.}
 }
 \value{
 An \code{epi_archive}
 }
 \description{
-Sometimes, due to upstream data pipeline issues, we have to work with a
-version history that isn't completely up to date, but with functions that
-expect archives that are completely up to date, or equally as up-to-date as
-another archive.
This function provides one way to approach such mismatches:
-pretend that we've "observed" additional versions, filling in these versions
-with NAs or extrapolated values.
+This function fills in missing version history in an \code{epi_archive} object up
+to a specified version, updating the \code{versions_end} field as necessary. Note
+that the filling is done in a compactified way; see the details section.
+}
+\details{
+Note that we generally store \code{epi_archive}s in a compactified form:
+implicitly, if a version is missing but the \code{versions_end} attribute is
+greater, then all the versions in between are understood to have the same
+value as the last observed version. This affects the behavior of this
+function in the following ways:
+\itemize{
+\item if \code{how = "na"}, then the function will fill in at most one missing version
+with \code{NA} and the rest will be implicit.
+\item if \code{how = "locf"}, then the function will not fill any values.
+}
+}
+\examples{
+test_date <- as.Date("2020-01-01")
+ea_orig <- as_epi_archive(data.table::data.table(
+  geo_value = "ak",
+  time_value = test_date + c(rep(0L, 5L), 1L),
+  version = test_date + c(1:5, 2L),
+  value = 1:6
+))
+epix_fill_through_version(ea_orig, test_date + 8, "na")
+epix_fill_through_version(ea_orig, test_date + 8, "locf")
 }
diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd
index 564a1fdc..3ffebc99 100644
--- a/man/epix_merge.Rd
+++ b/man/epix_merge.Rd
@@ -14,24 +14,27 @@ epix_merge(
 \arguments{
 \item{x, y}{Two \code{epi_archive} objects to join together.}
 
-\item{sync}{Optional; \code{"forbid"}, \code{"na"}, \code{"locf"}, or \code{"truncate"}; in the
-case that \code{x$versions_end} doesn't match \code{y$versions_end}, what do we do?:
-\code{"forbid"}: emit an error; "na": use \code{max(x$versions_end, y$versions_end)}
-as the result's \code{versions_end}, but ensure that, if we request a snapshot
-as of a version after \code{min(x$versions_end, y$versions_end)}, the
-observation columns from the less up-to-date archive will be all NAs (i.e.,
-imagine there was an update immediately after its \code{versions_end} which
-revised all observations to be \code{NA}); \code{"locf"}: use \code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end}, allowing the last version
-of each observation to be carried forward to extrapolate unavailable
-versions for the less up-to-date input archive (i.e., imagining that in the
-less up-to-date archive's data set remained unchanged between its actual
-\code{versions_end} and the other archive's \code{versions_end}); or \code{"truncate"}:
-use \code{min(x$versions_end, y$versions_end)} as the result's \code{versions_end},
-and discard any rows containing update rows for later versions.}
-
-\item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{NULL}; should the result be
-compactified? See \code{as_epi_archive()} for an explanation of what this means.
-Default here is \code{TRUE}.}
+\item{sync}{Optional; character. Decides how to handle the situation where
+one signal has a more recent revision than the other signal for a key that
+they have both already observed.
The options are:
+\itemize{
+\item \code{"forbid"}: the default and strictest option; throws an error. This is
+likely not what you want, but the strictness makes sure the user is aware of
+the version mismatch,
+\item \code{"locf"}: carry forward the last observed version of the missing signal
+to the new version and use \code{max(x$versions_end, y$versions_end)} as the
+result's \code{versions_end},
+\item \code{"na"}: fill the unobserved values with \code{NA}s (this can be handy when
+you know that source data is truly missing upstream and you want to
+represent the lack of information accurately, for instance) and use
+\code{max(x$versions_end, y$versions_end)} as the result's \code{versions_end},
+\item \code{"truncate"}: discard any rows containing update rows for later versions
+and use \code{min(x$versions_end, y$versions_end)} as the result's
+\code{versions_end}.
+}}
+
+\item{compactify}{Optional; \code{TRUE} (default), \code{FALSE}, or \code{NULL}; should the
+result be compactified? See \code{as_epi_archive()} for details.}
 }
 \value{
 the resulting \code{epi_archive}
@@ -46,6 +49,18 @@ clobberable versions). If the \code{versions_end} values differ, the \code{sync}
 parameter controls what is done.
 }
 \details{
+When merging archives, unless the archives have identical data release
+patterns, we often have to handle the situation where one signal has a more
+recent observation for a key than the other. In this case, we have two
+options:
+\itemize{
+\item if the other signal has never observed that key, we need to introduce
+\code{NA}s in the non-key variables for the missing signal,
+\item if the other signal has observed that key previously, but at an earlier
+revision date, then we need to decide how to handle the missing value in the
+more recent signal; the \code{sync} argument controls this behavior.
+}
+
 In all cases, \code{clobberable_versions_start} will be set to the earliest
 version that could be clobbered in either input archive.
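+
+For example, a minimal sketch (reusing the \code{s1} and \code{s2} archives
+constructed in the examples below) of how the \code{sync} choice changes the
+result:
+
+\preformatted{# "na": unobserved values in the less up-to-date archive become NA
+epix_merge(s1, s2, sync = "na")
+
+# "truncate": drop updates past min(s1$versions_end, s2$versions_end)
+epix_merge(s1, s2, sync = "truncate")
+}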
}
@@ -58,18 +73,14 @@ s1 <- tibble::tibble(
   version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-02")),
   signal1 = c(10, 11, 7)
 )
-
 s2 <- tibble::tibble(
   geo_value = c("ca", "ca"),
   time_value = as.Date(c("2024-08-01", "2024-08-02")),
   version = as.Date(c("2024-08-03", "2024-08-03")),
   signal2 = c(2, 3)
 )
-
-
 s1 <- s1 \%>\% as_epi_archive()
 s2 <- s2 \%>\% as_epi_archive()
-
 merged <- epix_merge(s1, s2, sync = "locf")
 merged[["DT"]]
 
@@ -81,18 +92,14 @@ s1 <- tibble::tibble(
   version = as.Date(c("2024-08-01", "2024-08-03", "2024-08-03", "2024-08-03")),
   signal1 = c(12, 13, 22, 19)
 )
-
 s2 <- tibble::tibble(
   geo_value = c("ca", "ca"),
   time_value = as.Date(c("2024-08-01", "2024-08-02")),
   version = as.Date(c("2024-08-02", "2024-08-02")),
   signal2 = c(4, 5),
 )
-
-
 s1 <- s1 \%>\% as_epi_archive()
 s2 <- s2 \%>\% as_epi_archive()
-
 merged <- epix_merge(s1, s2, sync = "locf")
 merged[["DT"]]
 
@@ -104,7 +111,6 @@ s1 <- tibble::tibble(
   version = as.Date(c("2024-08-01", "2024-08-02", "2024-08-03")),
   signal1 = c(14, 11, 9)
 )
-
 # The s2 signal at August 1st gets revised from 3 to 5 on August 3rd
 s2 <- tibble::tibble(
   geo_value = c("ca", "ca", "ca"),
@@ -112,11 +118,8 @@ s2 <- tibble::tibble(
   version = as.Date(c("2024-08-02", "2024-08-03", "2024-08-03")),
   signal2 = c(3, 5, 2),
 )
-
 s1 <- s1 \%>\% as_epi_archive()
 s2 <- s2 \%>\% as_epi_archive()
-
-# Some LOCF for signal 1 as signal 2 gets updated
 merged <- epix_merge(s1, s2, sync = "locf")
 merged[["DT"]]
 }
diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png
new file mode 100644
index 00000000..b435c651
Binary files /dev/null and b/man/figures/README-unnamed-chunk-6-1.png differ
diff --git a/man/figures/README-unnamed-chunk-6-1.svg b/man/figures/README-unnamed-chunk-6-1.svg
new file mode 100644
index 00000000..adde4157
--- /dev/null
+++ b/man/figures/README-unnamed-chunk-6-1.svg
@@ -0,0 +1,383 @@
[383 lines of SVG figure markup omitted]
diff --git a/man/figures/README-unnamed-chunk-7-1.svg b/man/figures/README-unnamed-chunk-7-1.svg
new file mode 100644
index 00000000..30058a57
--- /dev/null
+++ b/man/figures/README-unnamed-chunk-7-1.svg
@@ -0,0 +1,399 @@
[399 lines of SVG figure markup omitted]
diff --git a/man/figures/README-unnamed-chunk-8-1.svg b/man/figures/README-unnamed-chunk-8-1.svg
new file mode 100644
index 00000000..5ec5ba01
--- /dev/null
+++ b/man/figures/README-unnamed-chunk-8-1.svg
@@ -0,0 +1,365 @@
[365 lines of SVG figure markup omitted]
diff --git a/man/full_date_seq.Rd b/man/full_date_seq.Rd
new file mode 100644
index 00000000..eb36b2c1
--- /dev/null
+++ b/man/full_date_seq.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/slide.R
+\name{full_date_seq}
+\alias{full_date_seq}
+\title{Make a complete date sequence between min(x$time_value) and
+max(x$time_value). Produce lists of dates before min(x$time_value) and after
+max(x$time_value) for padding initial and final windows to size \code{n}.}
+\usage{
+full_date_seq(x, before, after, time_type)
+}
+\description{
+\code{before} and \code{after} args are assumed to have been validated by the calling
+function (using \code{validate_slide_window_arg}).
+}
+\keyword{internal}
diff --git a/man/geo_column_names.Rd b/man/geo_column_names.Rd
index 4b3810dc..839d7ce8 100644
--- a/man/geo_column_names.Rd
+++ b/man/geo_column_names.Rd
@@ -10,3 +10,4 @@ geo_column_names()
 the full list of potential substitutions for the \code{geo_value} column name: geo_value, geo_values, geo_id, geos, location, jurisdiction, fips, zip, county, hrr, msa, state, province, nation, states, provinces, counties, geo_Value, Geo_Value, Geo_Values, Geo_Id, Geos, Location, Jurisdiction, Fips, Zip, County, Hrr, Msa, State, Province, Nation, States, Provinces, Counties, Geo_Value
 }
+\keyword{internal}
diff --git a/man/growth_rate.Rd b/man/growth_rate.Rd
index c4e82a09..0e22508c 100644
--- a/man/growth_rate.Rd
+++ b/man/growth_rate.Rd
@@ -84,8 +84,7 @@ filtering (a discrete spline) fit to \code{x} and \code{y}, via
 \code{genlasso::trendfilter()}, divided by the fitted value of the discrete
 spline at \code{x0}.
 }
-}
-\section{Log Scale}{
+\subsection{Log Scale}{
 
 An alternative view for the growth rate of a function f in general is given
 by defining g(t) = log(f(t)), and then observing that g'(t) = f'(t) /
@@ -96,7 +95,7 @@ method above ("rel_change", "linear_reg", "smooth_spline", and
 \code{log_scale = TRUE}.
 }
 
-\section{Sliding Windows}{
+\subsection{Sliding Windows}{
 
 For the local methods, "rel_change" and "linear_reg", we use a sliding
 window centered at the reference point of bandwidth \code{h}. In other words, the
 sliding window contains all data in between January 1 and 14 (matching the
 behavior of \code{epi_slide()} with \code{before = h - 1} and \code{after = h}).
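+
+For example, a minimal sketch (following this package's example data;
+\code{h = 7} is just an illustrative choice):
+
+\preformatted{cases_deaths_subset \%>\%
+  group_by(geo_value) \%>\%
+  mutate(cases_gr = growth_rate(x = time_value, y = cases, h = 7))
+}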
} -\section{Additional Arguments}{ +\subsection{Additional Arguments}{ For the global methods, "smooth_spline" and "trend_filter", additional arguments can be specified via \code{...} for the underlying estimation @@ -133,7 +132,7 @@ rule, respectively. Default is "min" (going along with the default \code{cv = TR user. } } - +} \examples{ # COVID cases growth rate by state using default method relative change cases_deaths_subset \%>\% diff --git a/man/guess_period.Rd b/man/guess_period.Rd index 0be9fdf2..5f17cf4e 100644 --- a/man/guess_period.Rd +++ b/man/guess_period.Rd @@ -30,3 +30,4 @@ by adding \code{k * result} for an integer k, and such that there is no smaller \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} } +\keyword{internal} diff --git a/man/is_epi_df.Rd b/man/is_epi_df.Rd deleted file mode 100644 index 62e2f43a..00000000 --- a/man/is_epi_df.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_df.R -\name{is_epi_df} -\alias{is_epi_df} -\title{Test for \code{epi_df} format} -\usage{ -is_epi_df(x) -} -\arguments{ -\item{x}{An object.} -} -\value{ -\code{TRUE} if the object inherits from \code{epi_df}. -} -\description{ -Test for \code{epi_df} format -} diff --git a/man/max_version_with_row_in.Rd b/man/max_version_with_row_in.Rd index cca554fa..47ce2abb 100644 --- a/man/max_version_with_row_in.Rd +++ b/man/max_version_with_row_in.Rd @@ -16,3 +16,4 @@ an \code{NA} version value \description{ Exported to make defaults more easily copyable. } +\keyword{internal} diff --git a/man/next_after.Rd b/man/next_after.Rd index 5170e8d9..da9e321c 100644 --- a/man/next_after.Rd +++ b/man/next_after.Rd @@ -15,3 +15,4 @@ same class, typeof, and length as \code{x} \description{ Get the next possible value greater than \code{x} of the same type } +\keyword{internal} diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index d1664cd7..5a232de0 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/methods-epi_df.R \name{print.epi_df} \alias{print.epi_df} +\alias{summary.epi_df} \alias{group_by.epi_df} \alias{ungroup.epi_df} \alias{group_modify.epi_df} @@ -10,6 +11,8 @@ \usage{ \method{print}{epi_df}(x, ...) +\method{summary}{epi_df}(object, ...) + \method{group_by}{epi_df}(.data, ...) \method{ungroup}{epi_df}(x, ...) @@ -21,7 +24,10 @@ \arguments{ \item{x}{an \code{epi_df}} -\item{...}{additional arguments to forward to \code{NextMethod()}, or unused} +\item{...}{Additional arguments, for compatibility with \code{summary()}. +Currently unused.} + +\item{object}{an \code{epi_df}} \item{.data}{an \code{epi_df}} @@ -33,4 +39,7 @@ } \description{ Print and summary functions for an \code{epi_df} object. + +Prints a variety of summary statistics about the \code{epi_df} object, such as +the time range included and geographic coverage. 
} diff --git a/man/revision_summary.Rd b/man/revision_summary.Rd index 590a1ed5..39a72b9a 100644 --- a/man/revision_summary.Rd +++ b/man/revision_summary.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/revision_analysis.R \name{revision_summary} \alias{revision_summary} -\title{A function to describe revision behavior for an archive} +\title{A function to describe revision behavior for an archive.} \usage{ revision_summary( epi_arch, @@ -70,8 +70,9 @@ threshold for when two floats are considered identical.} \description{ \code{revision_summary} removes all missing values (if requested), and then computes some basic statistics about the revision behavior of an archive, -returning a tibble summarizing the revisions per time_value+epi_key features. If \code{print_inform} is true, it -prints a concise summary. The columns returned are: +returning a tibble summarizing the revisions per time_value+epi_key +features. If \code{print_inform} is true, it prints a concise summary. The +columns returned are: \enumerate{ \item \code{n_revisions}: the total number of revisions for that entry \item \code{min_lag}: the minimum time to any value (if \code{drop_nas=FALSE}, this @@ -86,16 +87,15 @@ always excludes \code{NA} values) \item \code{rel_spread}: \code{spread} divided by the largest value (so it will always be less than 1). Note that this need not be the final value. It will be \code{NA} whenever \code{spread} is 0. -\item \code{time_near_latest}: This gives the lag when the value is within -\code{within_latest} (default 20\%) of the value at the latest time. For example, -consider the series (0,20, 99, 150, 102, 100); then \code{time_near_latest} is -the 5th index, since even though 99 is within 20\%, it is outside the window -afterwards at 150. +\item \code{time_near_latest}: the time taken for the revisions to settle to within +\code{within_latest} (default 20\%) of the final value and stay there. For +example, consider the series (0, 20, 99, 150, 102, 100); then +\code{time_near_latest} is 5, since even though 99 is within 20\%, it is outside +the window afterwards at 150. } } \examples{ - revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) - revision_example \%>\% arrange(desc(spread)) + } diff --git a/man/summary.epi_df.Rd b/man/summary.epi_df.Rd deleted file mode 100644 index 831d4d4e..00000000 --- a/man/summary.epi_df.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_df.R -\name{summary.epi_df} -\alias{summary.epi_df} -\title{Summarize \code{epi_df} object} -\usage{ -\method{summary}{epi_df}(object, ...) -} -\arguments{ -\item{object}{an \code{epi_df}} - -\item{...}{Additional arguments, for compatibility with \code{summary()}. -Currently unused.} -} -\description{ -Prints a variety of summary statistics about the \code{epi_df} object, such as -the time range included and geographic coverage. 
-} diff --git a/man/time_column_names.Rd b/man/time_column_names.Rd index 2e2db6b5..01668ebd 100644 --- a/man/time_column_names.Rd +++ b/man/time_column_names.Rd @@ -10,3 +10,4 @@ time_column_names() the full list of potential substitutions for the \code{time_value} column name: time_value, date, time, datetime, dateTime, date_time, target_date, week, epiweek, month, mon, year, yearmon, yearmonth, yearMon, yearMonth, dates, time_values, target_dates, time_Value, Time_Value, Date, Time, Datetime, DateTime, Date_Time, Target_Date, Week, Epiweek, Month, Mon, Year, Yearmon, Yearmonth, YearMon, YearMonth, Dates, Time_Values, Target_Dates, Time_Value } +\keyword{internal} diff --git a/man/validate_version_bound.Rd b/man/validate_version_bound.Rd new file mode 100644 index 00000000..2eb0be6b --- /dev/null +++ b/man/validate_version_bound.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/archive.R +\name{validate_version_bound} +\alias{validate_version_bound} +\title{Validate a version bound arg} +\usage{ +validate_version_bound( + version_bound, + x, + na_ok = FALSE, + version_bound_arg = rlang::caller_arg(version_bound), + x_arg = rlang::caller_arg(x) +) +} +\arguments{ +\item{version_bound}{the version bound to validate} + +\item{x}{a data frame containing a version column with which to check +compatibility} + +\item{na_ok}{Boolean; is \code{NA} an acceptable "bound"? (If so, \code{NA} will +have a special context-dependent meaning.)} + +\item{version_bound_arg}{optional string; what to call the version bound in +error messages} +} +\description{ +Expected to be used on \code{clobberable_versions_start}, \code{versions_end}, and +similar arguments. Some additional context-specific checks may be needed. +Side effects: raises an error if version bound appears invalid. +} +\keyword{internal} diff --git a/man/version_column_names.Rd b/man/version_column_names.Rd index 75ee2315..2761dedd 100644 --- a/man/version_column_names.Rd +++ b/man/version_column_names.Rd @@ -10,3 +10,4 @@ version_column_names() the full list of potential substitutions for the \code{version} column name: version, issue, release, Version, Issue, Release } +\keyword{internal} diff --git a/pkgdown-watch.R b/pkgdown-watch.R new file mode 100644 index 00000000..00aae322 --- /dev/null +++ b/pkgdown-watch.R @@ -0,0 +1,58 @@ +# Run with: Rscript pkgdown-watch.R +# +# Modifying this: https://gist.github.com/gadenbuie/d22e149e65591b91419e41ea5b2e0621 +# - Removed docopts cli interface and various configs/features I didn't need. +# - Sped up reference building by not running examples. +# +# Note that the `pattern` regex is case sensitive, so make sure your Rmd files +# end in `.Rmd` and not `.rmd`. 
+ + rlang::check_installed(c("pkgdown", "servr", "devtools", "here", "cli", "fs")) + +pkg <- pkgdown::as_pkgdown(".") + +servr::httw( + dir = here::here("docs"), + watch = here::here(), + pattern = "[.](Rm?d|y?ml|s[ac]ss|css|js)$", + handler = function(files) { + devtools::load_all() + + files_rel <- fs::path_rel(files, start = getwd()) + cli::cli_inform("{cli::col_yellow('Updated')} {.val {files_rel}}") + + articles <- grep("vignettes.+Rmd$", files, value = TRUE) + + if (length(articles) == 1) { + name <- fs::path_ext_remove(fs::path_rel(articles, fs::path(pkg$src_path, "vignettes"))) + pkgdown::build_article(name, pkg) + } else if (length(articles) > 1) { + pkgdown::build_articles(pkg, preview = FALSE) + } + + refs <- grep("man.+R(m?d)?$", files, value = TRUE) + if (length(refs)) { + # TODO: This does not work for me, so I just run it manually. + # pkgdown::build_reference(pkg, preview = FALSE, examples = FALSE, lazy = FALSE) + } + + pkgdown <- grep("pkgdown", files, value = TRUE) + if (length(pkgdown) && !all(pkgdown %in% c(articles, refs))) { + pkgdown::init_site(pkg) + } + + pkgdown_index <- grep("index[.]Rmd$", files_rel, value = TRUE) + if (length(pkgdown_index)) { + devtools::build_rmd(pkgdown_index) + pkgdown::build_home(pkg) + } + + readme <- grep("README[.]rmd$", files, value = TRUE, ignore.case = TRUE) + if (length(readme)) { + devtools::build_readme(".") + pkgdown::build_home(pkg) + } + + cli::cli_alert("Site rebuild done!") + } +)
diff --git a/scrap.Rmd b/scrap.Rmd new file mode 100644 index 00000000..cf09a8aa --- /dev/null +++ b/scrap.Rmd @@ -0,0 +1,174 @@ +This notebook contains sections removed from other notebooks. It currently doesn't compile. + +## Getting data into `epi_df` format + +As another example, here we will import the daily new (not cumulative) SARS +cases in Canada in 2003, from the +[outbreaks](https://github.com/reconverse/outbreaks) package: + +```{r} +edf <- outbreaks::sars_canada_2003 %>% + mutate(geo_value = "ca") %>% + select(geo_value, time_value = date, starts_with("cases")) %>% + pivot_longer(starts_with("cases"), names_to = "type") %>% + mutate(type = substring(type, 7)) %>% + as_epi_df(other_keys = "type") + +head(edf) + +edf %>% + autoplot() +``` + +Get confirmed cases of Ebola in Sierra Leone from 2014 to 2015 by province and +date of onset, prepared from line list data from the same package: + +```{r, fig.width = 9, fig.height = 6} +edf <- outbreaks::ebola_sierraleone_2014 %>% + select(district, date_of_onset, status) %>% + mutate(province = case_when( + district %in% c("Kailahun", "Kenema", "Kono") ~ + "Eastern", + district %in% c( + "Bombali", "Kambia", "Koinadugu", "Port Loko", + "Tonkolili" + ) ~ + "Northern", + district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ + "Southern", + district %in% c("Western Rural", "Western Urban") ~ + "Western" + )) %>% + group_by(geo_value = province, time_value = date_of_onset) %>% + summarise(cases = sum(status == "confirmed"), .groups = "drop") %>% + complete(geo_value, + time_value = full_seq(time_value, period = 1), + fill = list(cases = 0) + ) %>% + as_epi_df() + +head(edf) + +edf %>% + autoplot() +``` + + +## Some details on metadata + +In general, an `epi_df` object has the following fields in its metadata: + +* `geo_type`: the type for the geo values. +* `as_of`: the time value at which the given data were available. + +Metadata for an `epi_df` object `x` can be accessed (and altered) via +`attributes(x)$metadata`. 
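+For example, a quick sketch inspecting the metadata of the Ebola `edf` built +above (`geo_type` and `as_of` are among its fields): + +```{r} +# Look at the metadata list attached to an epi_df +attributes(edf)$metadata +``` + 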
The `geo_type` field is not currently used by any +downstream functions in the `epiprocess` package, and serves only as a useful +bit of information to convey about the data set at hand. The last field here, +`as_of`, is one of the most unique aspects of an `epi_df` object. + +If the `geo_type` or `as_of` arguments are missing in a call to `as_epi_df()`, then +this function will try to infer them from the passed object. Usually, `geo_type` +can be inferred from the `geo_value` column, but inferring the +`as_of` field is not as easy. See the documentation for `as_epi_df()` for more +details. + +## Using additional key columns in `epi_df` + +In the following examples we will show how to create an `epi_df` with additional keys. + +### Converting a `tsibble` that has county code as an extra key + +```{r} +ex1 <- tibble( + geo_value = rep(c("ca", "fl", "pa"), each = 3), + county_code = c( + "06059", "06061", "06067", + "12111", "12113", "12117", + "42101", "42103", "42105" + ), + time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)), + value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(geo_value)))) +) %>% + as_tsibble(index = time_value, key = c(geo_value, county_code)) + +ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03") +``` + +The metadata now includes `county_code` as an extra key. +```{r} +attr(ex1, "metadata") +``` + +### Dealing with misspecified column names + +`epi_df` requires there to be columns `geo_value` and `time_value`; if they do not exist, then `as_epi_df()` throws an error. + +```{r, error = TRUE} +data.frame( + # misnamed + state = rep(c("ca", "fl", "pa"), each = 3), + # extra key + pol = rep(c("blue", "swing", "swing"), each = 3), + # misnamed + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9), + value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(9))) +) %>% as_epi_df(as_of = as.Date("2024-03-20")) +``` + +The columns can be renamed to match the `epi_df` format. In the example below, notice there is also an additional key `pol`. + +```{r} +ex2 <- tibble( + # misnamed + state = rep(c("ca", "fl", "pa"), each = 3), + # extra key + pol = rep(c("blue", "swing", "swing"), each = 3), + # misnamed + reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(state)), + value = seq_along(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, rnorm(length(state)))) +) %>% data.frame() + +head(ex2) + +ex2 <- ex2 %>% + rename(geo_value = state, time_value = reported_date) %>% + as_epi_df( + as_of = "2020-06-03", + other_keys = "pol" + ) + +attr(ex2, "metadata") +``` + +### Adding additional keys to an `epi_df` object + +In the above examples, all the keys are added to objects that are not `epi_df` objects. We now illustrate how to add keys to an `epi_df` object. + +We use a toy data set included in `epiprocess`, prepared using the `covidcast` library, and filter to a single state for simplicity. + +```{r} +ex3 <- epidatasets::covid_incidence_county_subset %>% + filter(time_value > "2021-12-01", state_name == "Massachusetts") %>% + slice_tail(n = 6) + +attr(ex3, "metadata") # geo_type is county currently +``` + +Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. As a reminder, lower-case state abbreviations are what we would expect if this were a `geo_value` column. 
+ +```{r} +ex3 <- ex3 %>% + as_tibble() %>% # needed to add the additional metadata + mutate( + state = rep(tolower("MA"), 6), + pol = rep(c("blue", "swing", "swing"), each = 2) + ) %>% + as_epi_df(other_keys = c("state", "pol"), as_of = as.Date("2024-03-20")) + +attr(ex3, "metadata") +``` + +Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` argument. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys. + +Currently `other_keys` metadata in `epi_df` doesn't impact `epi_slide()`, contrary to `other_keys` in `as_epi_archive` which affects how the update data is interpreted. diff --git a/tests/testthat/test-archive.R b/tests/testthat/test-archive.R index 1a03141b..0a06cf79 100644 --- a/tests/testthat/test-archive.R +++ b/tests/testthat/test-archive.R @@ -90,22 +90,18 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea1 <- as_epi_archive(df, compactify = FALSE) expect_equal(key(ea1$DT), c("geo_value", "time_value", "version")) - expect_null(ea1$additional_metadata) ea2 <- as_epi_archive(df, other_keys = "value", compactify = FALSE) expect_equal(key(ea2$DT), c("geo_value", "time_value", "value", "version")) - expect_null(ea2$additional_metadata) # Tibble tib <- tibble::tibble(df, code = "x") ea3 <- as_epi_archive(tib, compactify = FALSE) expect_equal(key(ea3$DT), c("geo_value", "time_value", "version")) - expect_null(ea3$additional_metadata) ea4 <- as_epi_archive(tib, other_keys = "code", compactify = FALSE) expect_equal(key(ea4$DT), c("geo_value", "time_value", "code", "version")) - expect_null(ea4$additional_metadata) # Keyed data.table kdt <- data.table::data.table( @@ -120,12 +116,10 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea5 <- as_epi_archive(kdt, compactify = FALSE) # Key from data.table isn't absorbed when as_epi_archive is used expect_equal(key(ea5$DT), c("geo_value", "time_value", "version")) - expect_null(ea5$additional_metadata) ea6 <- as_epi_archive(kdt, other_keys = "value", compactify = FALSE) # Mismatched keys, but the one from as_epi_archive overrides expect_equal(key(ea6$DT), c("geo_value", "time_value", "value", "version")) - expect_null(ea6$additional_metadata) # Unkeyed data.table udt <- data.table::data.table( @@ -138,11 +132,9 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea7 <- as_epi_archive(udt, compactify = FALSE) expect_equal(key(ea7$DT), c("geo_value", "time_value", "version")) - expect_null(ea7$additional_metadata) ea8 <- as_epi_archive(udt, other_keys = "code", compactify = FALSE) expect_equal(key(ea8$DT), c("geo_value", "time_value", "code", "version")) - expect_null(ea8$additional_metadata) # epi_df edf1 <- cases_deaths_subset %>% @@ -151,11 +143,9 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea9 <- as_epi_archive(edf1, compactify = FALSE) expect_equal(key(ea9$DT), c("geo_value", "time_value", "version")) - expect_null(ea9$additional_metadata) ea10 <- as_epi_archive(edf1, other_keys = "code", compactify = FALSE) expect_equal(key(ea10$DT), c("geo_value", "time_value", "code", "version")) - expect_null(ea10$additional_metadata) # Keyed epi_df edf2 <- data.frame( @@ -172,11 +162,9 @@ test_that("epi_archives are correctly instantiated with a variety of data types" ea11 <- as_epi_archive(edf2, compactify = FALSE) expect_equal(key(ea11$DT), 
c("geo_value", "time_value", "version")) - expect_null(ea11$additional_metadata) ea12 <- as_epi_archive(edf2, other_keys = "misc", compactify = FALSE) expect_equal(key(ea12$DT), c("geo_value", "time_value", "misc", "version")) - expect_null(ea12$additional_metadata) }) test_that("`epi_archive` rejects nonunique keys", { diff --git a/vignettes/.gitignore b/vignettes/.gitignore deleted file mode 100644 index bc04c23a..00000000 --- a/vignettes/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.html -*.R -.DS_Store diff --git a/vignettes/_common.R b/vignettes/_common.R new file mode 100644 index 00000000..260fadf5 --- /dev/null +++ b/vignettes/_common.R @@ -0,0 +1,25 @@ +knitr::opts_chunk$set( + digits = 3, + comment = "#>", + collapse = TRUE, + cache = TRUE, + dev = "svg", + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + tidy = "styler", + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 +) diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd deleted file mode 100644 index 0b65c71f..00000000 --- a/vignettes/aggregation.Rmd +++ /dev/null @@ -1,235 +0,0 @@ ---- -title: Aggregate signals over space and time -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Aggregate signals over space and time} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -Aggregation, both time-wise and geo-wise, are common tasks when working with -epidemiological data sets. This vignette demonstrates how to carry out these -kinds of tasks with `epi_df` objects. We'll work with county-level reported -COVID-19 cases in MA and VT. - -The data is included in this package (via the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: - -```{r, warning = FALSE, message = FALSE} -library(epiprocess) -library(dplyr) -library(readr) - -x <- covid_incidence_county_subset -``` - -The data can also be fetched from the Delphi Epidata API with the following query: -```{r, message = FALSE, eval = FALSE, warning = FALSE} -library(epidatr) - -d <- as.Date("2024-03-20") - -# Get mapping between FIPS codes and county&state names: -y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter - col_types = c( - FIPS = col_character(), - CTYNAME = col_character(), - STNAME = col_character() - ) -) %>% - filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>% - select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME) - -# Fetch only counties from Massachusetts and Vermont, then append names columns as well -x <- pub_covidcast( - source = "jhu-csse", - signals = "confirmed_incidence_num", - geo_type = "county", - time_type = "day", - geo_values = paste(y$geo_value, collapse = ","), - time_values = epirange(20200601, 20211231), - as_of = d -) %>% - select(geo_value, time_value, cases = value) %>% - inner_join(y, by = "geo_value", relationship = "many-to-one", unmatched = c("error", "drop")) %>% - as_epi_df(as_of = d) -``` - -The data contains 16,212 rows and 5 columns. 
- -## Converting to `tsibble` format - -For manipulating and wrangling time series data, the -[`tsibble`](https://tsibble.tidyverts.org/index.html) already provides a host of -useful tools. A tsibble object (formerly, of class `tbl_ts`) is basically a -tibble (data frame) but with two specially-marked columns: an **index** column -representing the time variable (defining an order from past to present), and a -**key** column identifying a unique observational unit for each time point. In -fact, the key can be made up of any number of columns, not just a single one. - -In an `epi_df` object, the index variable is `time_value`, and the key variable -is typically `geo_value` (though this need not always be the case: for example, -if we have an age group variable as another column, then this could serve as a -second key variable). The `epiprocess` package thus provides an implementation -of `as_tsibble()` for `epi_df` objects, which sets these variables according to -these defaults. - -```{r, message = FALSE} -library(tsibble) - -xt <- as_tsibble(x) -head(xt) -key(xt) -index(xt) -interval(xt) -``` - -We can also set the key variable(s) directly in a call to `as_tsibble()`. -Similar to SQL keys, if the key does not uniquely identify each time point (that -is, the key and index together do not not uniquely identify each row), then -`as_tsibble()` throws an error: - -```{r, error = TRUE} -head(as_tsibble(x, key = "county_name")) -``` - -As we can see, there are duplicate county names between Massachusetts and -Vermont, which caused the error. - -```{r, message = FALSE} -head(duplicates(x, key = "county_name")) -``` - -Keying by both county name and state name, however, does work: - -```{r, message = FALSE} -head(as_tsibble(x, key = c("county_name", "state_name"))) -``` - -## Detecting and filling time gaps - -One of the major advantages of the `tsibble` package is its ability to handle -**implicit gaps** in time series data. In other words, it can infer what time -scale we're interested in (say, daily data), and detect apparent gaps (say, when -values are reported on January 1 and 3 but not January 2). We can subsequently -use functionality to make these missing entries explicit, which will generally -help avoid bugs in further downstream data processing tasks. - -Let's first remove certain dates from our data set to create gaps: - -```{r} -state_naming <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv", # nolint: line_length_linter - col_types = c(NAME = col_character(), ABBR = col_character()) -) %>% - transmute(state_name = NAME, abbr = tolower(ABBR)) %>% - as_tibble() - -# First make geo value more readable for tables, plots, etc. -x <- x %>% - inner_join(state_naming, by = "state_name", relationship = "many-to-one", unmatched = c("error", "drop")) %>% - mutate(geo_value = paste(substr(county_name, 1, nchar(county_name) - 7), state_name, sep = ", ")) %>% - select(geo_value, time_value, cases) - -xt <- as_tsibble(x) %>% filter(cases >= 3) -``` - -The functions `has_gaps()`, `scan_gaps()`, `count_gaps()` in the `tsibble` -package each provide useful summaries, in slightly different formats. 
- -```{r} -head(has_gaps(xt)) -head(scan_gaps(xt)) -head(count_gaps(xt)) -``` - -We can also visualize the patterns of missingness: - -```{r, message = FALSE, warning = FALSE} -library(ggplot2) -theme_set(theme_bw()) - -ggplot( - count_gaps(xt), - aes( - x = reorder(geo_value, desc(geo_value)), - color = geo_value - ) -) + - geom_linerange(aes(ymin = .from, ymax = .to)) + - geom_point(aes(y = .from)) + - geom_point(aes(y = .to)) + - coord_flip() + - labs(x = "County", y = "Date") + - theme(legend.position = "none") -``` - -Using the `fill_gaps()` function from `tsibble`, we can replace all gaps by an -explicit value. The default is `NA`, but in the current case, where missingness -is not at random but rather represents a small value that was censored (only a -hypothetical with COVID-19 reports, but certainly a real phenomenon that occurs -in other signals), it is better to replace it by zero, which is what we do here. -(Other approaches, such as LOCF: last observation carried forward in time, could -be accomplished by first filling with `NA` values and then following up with a -second call to `tidyr::fill()`.) - -```{r} -fill_gaps(xt, cases = 0) %>% - head() -``` - -Note that the time series for Addison, VT only starts on August 27, 2020, even -though the original (uncensored) data set itself was drawn from a period that -went back to June 6, 2020. By setting `.full = TRUE`, we can at zero-fill over -the entire span of the observed (censored) data. - -```{r} -xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) - -head(xt_filled) -``` - -Explicit imputation for missingness (zero-filling in our case) can be important -for protecting against bugs in all sorts of downstream tasks. For example, even -something as simple as a 7-day trailing average is complicated by missingness. -The function `epi_slide()` looks for all rows within a window of 7 days anchored -on the right at the reference time point (when `.window_size = 7`). -But when some days in a given week are missing because they were censored -because they had small case counts, taking an average of the observed case -counts can be misleading and is unintentionally biased upwards. Meanwhile, -running `epi_slide()` on the zero-filled data brings these trailing averages -(appropriately) downwards, as we can see inspecting Plymouth, MA around July 1, -2021. - -```{r} -xt %>% - as_epi_df(as_of = as.Date("2024-03-20")) %>% - group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% - ungroup() %>% - filter( - geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3 - ) %>% - print(n = 7) - -xt_filled %>% - as_epi_df(as_of = as.Date("2024-03-20")) %>% - group_by(geo_value) %>% - epi_slide(cases_7dav = mean(cases), .window_size = 7) %>% - ungroup() %>% - filter( - geo_value == "Plymouth, MA", - abs(time_value - as.Date("2021-07-01")) <= 3 - ) %>% - print(n = 7) -``` - -## Geographic aggregation - -TODO - -## Attribution -This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). 
This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. - diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd deleted file mode 100644 index 86fc2c2b..00000000 --- a/vignettes/archive.Rmd +++ /dev/null @@ -1,694 +0,0 @@ ---- -title: Work with archive objects and data revisions -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Work with archive objects and data revisions} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -In addition to the `epi_df` data structure, the `epiprocess` package has a -companion structure called `epi_archive`. In comparison to an `epi_df` object, -which can be seen as storing a single snapshot of a data set with the most -up-to-date signal values as of some given time, an `epi_archive` object stores -the full version history of a data set. Many signals of interest for -epidemiological tracking are subject to revision (some more than others) and -paying attention to data revisions can be important for all sorts of downstream -data analysis and modeling tasks. - -This vignette walks through working with `epi_archive()` objects and demonstrates -some of their key functionality. We'll work with a signal on the percentage of -doctor's visits with CLI (COVID-like illness) computed from medical insurance -claims, available through the [COVIDcast -API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This -signal is subject to very heavy and regular revision; you can read more about it -on its [API documentation -page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). - -The data is included in this package (via the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with: - -```{r, message = FALSE, warning = FALSE} -library(epiprocess) -library(data.table) -library(dplyr) -library(purrr) -library(ggplot2) - -# This fetches the raw data backing the archive_cases_dv_subset object. -dv <- archive_cases_dv_subset$DT %>% - as_tibble() -``` - -The data can also be fetched from the Delphi Epidata API with the following query: -```{r, message = FALSE, warning = FALSE, eval = FALSE} -library(epidatr) - -dv <- pub_covidcast( - source = "doctor-visits", - signals = "smoothed_adj_cli", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl,ny,tx", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) %>% - rename(version = issue, percent_cli = value) -``` - -## Getting data into `epi_archive` format - -An `epi_archive()` object can be constructed from a data frame, data table, or -tibble, provided that it has (at least) the following columns: - -* `geo_value`: the geographic value associated with each row of measurements. -* `time_value`: the time value associated with each row of measurements. -* `version`: the time value specifying the version for each row of measurements. 
- For example, if in a given row the `version` is January 15, 2022 and - `time_value` is January 14, 2022, then this row contains the measurements of - the data for January 14, 2022 that were available one day later. - -As we can see from the above, the data frame returned by -`epidatr::pub_covidcast()` has the columns required for the `epi_archive` -format, with `issue` playing the role of `version`. We can now use -`as_epi_archive()` to bring it into `epi_archive` format. For removal of -redundant version updates in `as_epi_archive` using compactify, please refer to -the [compactify vignette](articles/compactify.html). - -```{r} -x <- dv %>% - select(geo_value, time_value, version, percent_cli) %>% - as_epi_archive(compactify = TRUE) - -class(x) -print(x) -``` - -An `epi_archive` is consists of a primary field `DT`, which is a data table -(from the `data.table` package) that has at least the required columns -`geo_value`, `time_value`, and `version`; and other metadata fields, such as -`geo_type`. - -The variables `geo_value`, `time_value`, `version` serve as **key variables** -for the data table, as well as any other specified in the metadata (described -below). There can only be a single row per unique combination of key variables, -and therefore the key variables are critical for figuring out how to generate a -snapshot of data from the archive, as of a given version (also described below). - -```{r, error=TRUE} -key(x$DT) -``` - -In general, the last version of each observation is carried forward (LOCF) to -fill in data between recorded versions. - -## Some details on metadata - -The following pieces of metadata are included as fields in an `epi_archive` -object: - -* `geo_type`: the type for the geo values. - -Metadata for an `epi_archive` object `x` can be accessed (and altered) directly, -as in `x$geo_type`, etc. Just like `as_epi_df()`, the function -`as_epi_archive()` attempts to guess metadata fields when an `epi_archive` -object is instantiated, if they are not explicitly specified in the function -call (as it did in the case above). - -## Summarizing Revision Behavior - -There are many ways to examine the ways that signals change across different -revisions. The simplest that is included directly in epiprocess is -`revision_summary()`, which computes simple summary statistics for each key (by -default, `(geo_value,time_value)` pairs), such as the lag to the first value -(latency). In addition to the per key summary, it also returns an overall -summary: - -```{r} -revision_details <- revision_summary(x, print_inform = TRUE) -``` - -So as was mentioned at the top, this is clearly a data set where basically -everything has some amount of revisions, only 0.37% have no revision at all, and -0.92 have fewer than 3. Over 94% change by more than 10%. On the other hand, -most are within plus or minus 20% within 5-9 days, so the revisions converge -relatively quickly, even if the revisions continue for longer. - -To do more detailed analysis than is possible with the above printing, we have -`revision_details`: - -```{r} -revision_details %>% - group_by(geo_value) %>% - summarise( - n_rev = mean(n_revisions), - min_lag = min(min_lag), - max_lag = max(max_lag), - spread = mean(spread), - rel_spread = mean(rel_spread), - time_near_latest = mean(time_near_latest) - ) -``` - -Most of the states have similar stats on most of these features, except for -Florida, which takes nearly double the amount of time to get close to the right -value, with California not too far behind. 
- -## Producing snapshots in `epi_df` form - -A key method of an `epi_archive` class is `epix_as_of()`, which generates a -snapshot of the archive in `epi_df` format. This represents the most up-to-date -values of the signal variables as of a given version. - -```{r} -x_snapshot <- epix_as_of(x, as.Date("2021-06-01")) -class(x_snapshot) -head(x_snapshot) -max(x_snapshot$time_value) -attributes(x_snapshot)$metadata$as_of -``` - -We can see that the max time value in the `epi_df` object `x_snapshot` that was -generated from the archive is May 29, 2021, even though the specified version -date was June 1, 2021. From this we can infer that the doctor's visits signal -was 2 days latent on June 1. Also, we can see that the metadata in the `epi_df` -object has the version date recorded in the `as_of` field. - -Below, we pull several snapshots from the archive, spaced one month apart. We -overlay the corresponding signal curves as colored lines, with the version dates -marked by dotted vertical lines, and draw the latest curve in black (from the -latest snapshot `x_latest` that the archive can provide). - -```{r, fig.width = 8, fig.height = 7} -theme_set(theme_bw()) - -x_latest <- epix_as_of(x, x$versions_end) -self_max <- max(x$DT$version) -versions <- seq(as.Date("2020-06-01"), self_max - 1, by = "1 month") -snapshots <- map_dfr(versions, function(v) { - epix_as_of(x, v) %>% mutate(version = v) -}) %>% - bind_rows( - x_latest %>% mutate(version = self_max) - ) %>% - mutate(latest = version == self_max) - -ggplot( - snapshots %>% filter(!latest), - aes(x = time_value, y = percent_cli) -) + - geom_line(aes(color = factor(version)), na.rm = TRUE) + - geom_vline(aes(color = factor(version), xintercept = version), lty = 2) + - facet_wrap(~geo_value, scales = "free_y", ncol = 1) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "% of doctor's visits with CLI") + - theme(legend.position = "none") + - geom_line( - data = snapshots %>% filter(latest), - aes(x = time_value, y = percent_cli), - inherit.aes = FALSE, color = "black", na.rm = TRUE - ) -``` - -We can see some interesting and highly nontrivial revision behavior: at some -points in time the provisional data snapshots grossly underestimate the latest -curve (look in particular at Florida close to the end of 2021), and at others -they overestimate it (both states towards the beginning of 2021), though not -quite as dramatically. Modeling the revision process, which is often called -*backfill modeling*, is an important statistical problem in it of itself. - -## Merging `epi_archive` objects - -Now we demonstrate how to merge two `epi_archive` objects together, e.g., so -that grabbing data from multiple sources as of a particular version can be -performed with a single `epix_as_of` call. The function `epix_merge()` is made -for this purpose. Below we merge the working `epi_archive` of versioned -percentage CLI from outpatient visits to another one of versioned COVID-19 case -reporting data, which we fetch the from the [COVIDcast -API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the -rate scale (counts per 100,000 people in the population). 
- -When merging archives, unless the archives have identical data release patterns, -`NA`s can be introduced in the non-key variables for a few reasons: -- to represent the "value" of an observation before its initial release (when we - need to pair it with additional observations from the other archive that have - been released) -- to represent the "value" of an observation that has no recorded versions at - all (in the same sort of situation) -- if requested via `sync="na"`, to represent potential update data that we do - not yet have access to (e.g., due to encountering issues while attempting to - download the currently available version data for one of the archives, but not - the other). - -```{r, message = FALSE, warning = FALSE, eval=FALSE} -y <- pub_covidcast( - source = "jhu-csse", - signals = "confirmed_7dav_incidence_prop", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl,ny,tx", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) %>% - select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>% - as_epi_archive(compactify = TRUE) - -x <- epix_merge(x, y, sync = "locf", compactify = TRUE) -print(x) -head(x$DT) -``` - -```{r, echo=FALSE, message=FALSE, warning=FALSE} -x <- archive_cases_dv_subset -print(x) -head(x$DT) -``` - -## Sliding version-aware computations - -Lastly, we demonstrate another key method for archives, which is the -`epix_slide()`. It works just like `epi_slide()` does for an `epi_df` object, -but with one key difference: it performs version-aware computations. That is, -for the computation at any given reference time t, it only uses **data that -would have been available as of t**. - -For the demonstration, we'll revisit the forecasting example from the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html), and now -we'll build a forecaster that uses properly-versioned data (that would have been -available in real-time) to forecast future COVID-19 case rates from current and -past COVID-19 case rates, as well as current and past values of the outpatient -CLI signal from medical claims. We'll extend the `prob_ar()` function from the -slide vignette to accomodate exogenous variables in the autoregressive model, -which is often referred to as an ARX model. - -```{r} -prob_arx <- function(x, y, lags = c(0, 7, 14), ahead = 7, min_train_window = 20, - lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE, - intercept = FALSE, nonneg = TRUE) { - # Return NA if insufficient training data - if (length(y) < min_train_window + max(lags) + ahead) { - return(data.frame(point = NA, lower = NA, upper = NA)) - } - - # Useful transformations - if (!missing(x)) { - x <- data.frame(x, y) - } else { - x <- data.frame(y) - } - if (!is.list(lags)) lags <- list(lags) - lags <- rep(lags, length.out = ncol(x)) - - # Build features and response for the AR model, and then fit it - dat <- do.call( - data.frame, - unlist( # Below we loop through and build the lagged features - purrr::map(seq_len(ncol(x)), function(i) { - purrr::map(lags[[i]], function(j) lag(x[, i], n = j)) - }), - recursive = FALSE - ) - ) - names(dat) <- paste0("x", seq_len(ncol(dat))) - if (intercept) dat$x0 <- rep(1, nrow(dat)) - dat$y <- lead(y, n = ahead) - obj <- lm(y ~ . 
+ 0, data = dat) - - # Use LOCF to fill NAs in the latest feature values, make a prediction - setDT(dat) - setnafill(dat, type = "locf") - point <- predict(obj, newdata = tail(dat, 1)) - - # Compute a band - r <- residuals(obj) - s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized? - q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE) - lower <- point + q[1] - upper <- point + q[2] - - # Clip at zero if we need to, then return - if (nonneg) { - point <- max(point, 0) - lower <- max(lower, 0) - upper <- max(upper, 0) - } - return(data.frame(point = point, lower = lower, upper = upper)) -} -``` - -Next we slide this forecaster over the working `epi_archive` object, in order to -forecast COVID-19 case rates 7 days into the future. - -```{r} -fc_time_values <- seq(as.Date("2020-08-01"), as.Date("2021-11-30"), by = "1 month") - -z <- x %>% - group_by(geo_value) %>% - epix_slide( - fc = prob_arx(x = percent_cli, y = case_rate_7d_av, ahead = 7), - .before = 119, - .versions = fc_time_values - ) %>% - ungroup() - -head(z, 10) -``` - -We get back a tibble `z` with the grouping variables (here geo value), the -(reference) time values, and a ["packed"][tidyr::pack] data frame column `fc` -containing `fc$point`, `fc$lower`, and `fc$upper` that correspond to the point -forecast, and the lower and upper endpoints of the 95\% prediction band, -respectively. (We could also have used `, prob_ar(cases_7dav)` to get three -separate columns `point`, `lower`, and `upper`, or used `fc = -list(prob_ar(cases_7dav))` to make an `fc` column with a ["nested"][tidyr::nest] -format (list of data frames) instead.) - -On the whole, `epix_slide()` works similarly to `epix_slide()`, though there are -a few notable differences, even apart from the version-aware aspect. You can -read the documentation for `epix_slide()` for details. - -We finish off by comparing version-aware and -unaware forecasts at various -points in time and forecast horizons. The former comes from using -`epix_slide()` with the `epi_archive` object `x`, and the latter from applying -`epi_slide()` to the latest snapshot of the data `x_latest`. 
- -```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -x_latest <- epix_as_of(x, x$versions_end) - -# Simple function to produce forecasts k weeks ahead -forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { - if (as_of) { - x %>% - group_by(geo_value) %>% - epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .before = 119, - .versions = fc_time_values - ) %>% - mutate(target_date = .data$version + ahead, as_of = TRUE) %>% - ungroup() - } else { - x_latest %>% - group_by(geo_value) %>% - epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, ahead = ahead), .window_size = 120, - .ref_time_values = fc_time_values - ) %>% - mutate(target_date = .data$time_value + ahead, as_of = FALSE) %>% - ungroup() - } -} - -# Generate the forecasts, and bind them together -fc <- bind_rows( - forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) -) - -# Plot them, on top of latest COVID-19 case rates -ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + - geom_line( - data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = fc$point)) + - geom_point(aes(y = fc$point), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(as_of), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` - -Each row displays the forecasts for a different location (CA, FL, NY, and TX), and each -column corresponds to whether properly-versioned data is used (`FALSE` means no, -and `TRUE` means yes). We can see that the properly-versioned forecaster is, at -some points in time, more problematic; for example, it massively overpredicts -the peak in both locations in winter wave of 2020. However, performance is -pretty poor across the board here, whether or not properly-versioned data is -used. Similar to what we saw in the [slide -vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html), the -ARX forecasts can too volatile, overconfident, or both. - -Some of the volatility can be attenuated here by training an ARX model jointly -over locations; the [advanced sliding -vignette](https://cmu-delphi.github.io/epiprocess/articles/advanced.html) gives -a demonstration of how to do this. But really, the -[`epipredict`](https://cmu-delphi.github.io/epipredict/) package, which builds -off the data structures and functionality in the current package, is the place -to look for more robust forecasting methodology. The forecasters that appear in -the vignettes in the current package are only meant to demo the slide -functionality with some of the most basic forecasting methodology possible. - -## Sliding version-aware computations with geo-pooling - -First, we fetch the versioned data and build the archive. 
- -```{r, message = FALSE, warning = FALSE, eval =FALSE} -library(epidatr) -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -y1 <- pub_covidcast( - source = "doctor-visits", - signals = "smoothed_adj_cli", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -y2 <- pub_covidcast( - source = "jhu-csse", - signal = "confirmed_7dav_incidence_prop", - geo_type = "state", - time_type = "day", - geo_values = "ca,fl", - time_values = epirange(20200601, 20211201), - issues = epirange(20200601, 20211201) -) - -x <- y1 %>% - select(geo_value, time_value, - version = issue, - percent_cli = value - ) %>% - as_epi_archive(compactify = FALSE) - -# mutating merge operation: -x <- epix_merge( - x, - y2 %>% - select(geo_value, time_value, - version = issue, - case_rate_7d_av = value - ) %>% - as_epi_archive(compactify = FALSE), - sync = "locf", - compactify = FALSE -) -``` - -```{r, message = FALSE, echo =FALSE} -library(data.table) -library(ggplot2) -theme_set(theme_bw()) - -x <- archive_cases_dv_subset$DT %>% - filter(geo_value %in% c("ca", "fl")) %>% - as_epi_archive(compactify = FALSE) -``` - -Next, we extend the ARX function to handle multiple geo values, since in the -present case, we will not be grouping by geo value and each slide computation -will be run on multiple geo values at once. Note that, because `epix_slide()` -only returns the grouping variables, `time_value`, and the slide computations in -the eventual returned tibble, we need to include `geo_value` as a column in the -output data frame from our ARX computation. - -```{r} -library(tidyr) -library(purrr) - -prob_arx_args <- function(lags = c(0, 7, 14), - ahead = 7, - min_train_window = 20, - lower_level = 0.05, - upper_level = 0.95, - symmetrize = TRUE, - intercept = FALSE, - nonneg = TRUE) { - return(list( - lags = lags, - ahead = ahead, - min_train_window = min_train_window, - lower_level = lower_level, - upper_level = upper_level, - symmetrize = symmetrize, - intercept = intercept, - nonneg = nonneg - )) -} - -prob_arx <- function(x, y, geo_value, time_value, args = prob_arx_args()) { - # Return NA if insufficient training data - if (length(y) < args$min_train_window + max(args$lags) + args$ahead) { - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = NA, lower = NA, upper = NA - )) - } - - # Set up x, y, lags list - if (!missing(x)) { - x <- data.frame(x, y) - } else { - x <- data.frame(y) - } - if (!is.list(args$lags)) args$lags <- list(args$lags) - args$lags <- rep(args$lags, length.out = ncol(x)) - - # Build features and response for the AR model, and then fit it - dat <- tibble(i = seq_len(ncol(x)), lag = args$lags) %>% - unnest(lag) %>% - mutate(name = paste0("x", seq_len(nrow(.)))) %>% # nolint: object_usage_linter - # One list element for each lagged feature - pmap(function(i, lag, name) { - tibble( - geo_value = geo_value, - time_value = time_value + lag, # Shift back - !!name := x[, i] - ) - }) %>% - # One list element for the response vector - c(list( - tibble( - geo_value = geo_value, - time_value = time_value - args$ahead, # Shift forward - y = y - ) - )) %>% - # Combine them together into one data frame - reduce(full_join, by = c("geo_value", "time_value")) %>% - arrange(time_value) - if (args$intercept) dat$x0 <- rep(1, nrow(dat)) - obj <- lm(y ~ . 
+ 0, data = select(dat, -geo_value, -time_value)) - - # Use LOCF to fill NAs in the latest feature values (do this by geo value) - setDT(dat) # Convert to a data.table object by reference - cols <- setdiff(names(dat), c("geo_value", "time_value")) - dat[, (cols) := nafill(.SD, type = "locf"), .SDcols = cols, by = "geo_value"] - - # Make predictions - test_time_value <- max(time_value) - point <- predict( - obj, - newdata = dat %>% - dplyr::group_by(geo_value) %>% - dplyr::filter(time_value == test_time_value) - ) - - # Compute bands - r <- residuals(obj) - s <- ifelse(args$symmetrize, -1, NA) # Should the residuals be symmetrized? - q <- quantile(c(r, s * r), probs = c(args$lower, args$upper), na.rm = TRUE) - lower <- point + q[1] - upper <- point + q[2] - - # Clip at zero if we need to, then return - if (args$nonneg) { - point <- pmax(point, 0) - lower <- pmax(lower, 0) - upper <- pmax(upper, 0) - } - return(data.frame( - geo_value = unique(geo_value), # Return geo value! - point = point, lower = lower, upper = upper - )) -} -``` - -We now make forecasts on the archive and compare to forecasts on the latest -data. - -```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6} -# Latest snapshot of data, and forecast dates -x_latest <- epix_as_of(x, version = max(x$DT$version)) -fc_time_values <- seq(as.Date("2020-08-01"), - as.Date("2021-11-30"), - by = "1 month" -) - -# Simple function to produce forecasts k weeks ahead -forecast_k_week_ahead <- function(x, ahead = 7, as_of = TRUE) { - if (as_of) { - x %>% - epix_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .before = 219, .versions = fc_time_values - ) %>% - mutate( - target_date = .data$version + ahead, as_of = TRUE, - geo_value = .data$fc$geo_value - ) - } else { - x_latest %>% - epi_slide( - fc = prob_arx(.data$percent_cli, .data$case_rate_7d_av, .data$geo_value, .data$time_value, - args = prob_arx_args(ahead = ahead) - ), - .window_size = 220, .ref_time_values = fc_time_values - ) %>% - mutate(target_date = .data$time_value + ahead, as_of = FALSE) - } -} - -# Generate the forecasts, and bind them together -fc <- bind_rows( - forecast_k_week_ahead(x, ahead = 7, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 14, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 21, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 28, as_of = TRUE), - forecast_k_week_ahead(x, ahead = 7, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 14, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 21, as_of = FALSE), - forecast_k_week_ahead(x, ahead = 28, as_of = FALSE) -) - -# Plot them, on top of latest COVID-19 case rates -ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) + - geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper), alpha = 0.4) + - geom_line( - data = x_latest, aes(x = time_value, y = case_rate_7d_av), - inherit.aes = FALSE, color = "gray50" - ) + - geom_line(aes(y = fc$point)) + - geom_point(aes(y = fc$point), size = 0.5) + - geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + - facet_grid(vars(geo_value), vars(as_of), scales = "free") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Reported COVID-19 case rates") + - theme(legend.position = "none") -``` - -We can see that these forecasts, which come from training an ARX model jointly -over CA and FL, exhibit generally less variability and wider prediction bands -compared to the ones from the archive 
vignette, which come from training a -separate ARX model on each state. As in the archive vignette, we can see a -difference between version-aware (right column) and -unaware (left column) -forecasting, as well. - -## Attribution - -This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. diff --git a/vignettes/compactify.Rmd b/vignettes/compactify.Rmd index 72a2d266..a791f67a 100644 --- a/vignettes/compactify.Rmd +++ b/vignettes/compactify.Rmd @@ -7,6 +7,10 @@ vignette: > %\VignetteEncoding{UTF-8} --- +```{r, include = FALSE} +source(here::here("vignettes", "_common.R")) +``` + ## Removing redundant update data to save space We do not need to store version update rows that look like the last version of diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd index 073812b3..14d6c76a 100644 --- a/vignettes/correlation.Rmd +++ b/vignettes/correlation.Rmd @@ -1,14 +1,18 @@ --- -title: Correlate signals over space and time +title: Correlate signals across locations and time output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Correlate signals over space and time} + %\VignetteIndexEntry{Correlate signals across locations and time} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- +```{r, include = FALSE} +source(here::here("vignettes", "_common.R")) +``` + The `epiprocess` package provides some simple functionality for computing lagged -correlations between two signals, over space or time (or other variables), via +correlations between two signals, across locations or time (or other variables), via `epi_cor()`. This function is really just a convenience wrapper over some basic commands: it first performs specified time shifts, then computes correlations, grouped in a specified way. In this vignette, we'll examine correlations between @@ -73,7 +77,6 @@ time value, and by geo value. The former is obtained via `cor_by = time_value`. 
```{r, message = FALSE, warning = FALSE}
library(ggplot2)
-theme_set(theme_bw())

z1 <- epi_cor(x, case_rate, death_rate, cor_by = "time_value")
diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd
new file mode 100644
index 00000000..aca4702a
--- /dev/null
+++ b/vignettes/epi_archive.Rmd
@@ -0,0 +1,465 @@
+---
+title: Working with epi_archive objects and data revisions
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Working with epi_archive objects and data revisions}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+```
+
+The `epi_archive` data structure in `epiprocess` provides convenient ways to
+work with data sets that are subject to revision (a common occurrence in the
+public health space, as situational awareness improves). In comparison to an
+`epi_df` object, which can be viewed as a data snapshot at a point in time, an
+`epi_archive` object stores the full version history of a data set. Paying
+attention to data revisions can be important for all sorts of downstream data
+analysis and modeling tasks.
+
+In this vignette, we will:
+
+- construct an `epi_archive` object from a data frame,
+- summarize revision behavior in the archive,
+- produce snapshots of the data in `epi_df` form,
+- merge `epi_archive` objects together,
+- run a simple autoregressive forecaster (version-unaware) on a single date,
+- slide a simple autoregressive forecaster (version-unaware),
+- slide a simple autoregressive forecaster (version-aware),
+- compare version-aware and -unaware forecasts.
+
+## Getting data into `epi_archive` format
+
+We will work with a signal on the percentage of doctor's visits with CLI
+(COVID-like illness) computed from medical insurance claims, available through
+the [COVIDcast
+API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html). This
+signal is subject to very heavy and regular revision; you can read more about it
+on its [API documentation
+page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).
+
+The data is included in this package (via the [`epidatasets`
+package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with:
+
+```{r, message = FALSE, warning = FALSE}
+library(epiprocess)
+library(data.table)
+library(dplyr)
+library(purrr)
+library(ggplot2)
+
+# This fetches the raw data backing the archive_cases_dv_subset object.
+dv <- archive_cases_dv_subset$DT %>%
+  as_tibble()
+```
+
+The data can also be fetched from the Delphi Epidata API with the following query:
+
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
+library(epidatr)
+
+dv <- pub_covidcast(
+  source = "doctor-visits",
+  signals = "smoothed_adj_cli",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200601, 20211201),
+  issues = epirange(20200601, 20211201)
+) %>%
+  rename(version = issue, percent_cli = value)
+```
+
+An `epi_archive()` object can be constructed from a data frame, data table, or
+tibble, provided that it has (at least) the following columns:
+
+* `geo_value`: the geographic value associated with each row of measurements.
+* `time_value`: the time value associated with each row of measurements.
+* `version`: the time value specifying the version for each row of measurements.
+  For example, if in a given row the `version` is January 15, 2022 and
+  `time_value` is January 14, 2022, then this row contains the measurements of
+  the data for January 14, 2022 that were available one day later.
+
+As we can see from the above, the data frame returned by
+`epidatr::pub_covidcast()` has the columns required for the `epi_archive`
+format, with `issue` playing the role of `version`. We can now use
+`as_epi_archive()` to bring it into `epi_archive` format.
+
+```{r}
+dv_archive <- dv %>%
+  select(geo_value, time_value, version, percent_cli) %>%
+  as_epi_archive(compactify = TRUE)
+dv_archive
+```
+
+See the `epi_archive()` documentation for more information about its internal
+structure.
+
+## Producing snapshots in `epi_df` form
+
+A key method of the `epi_archive` class is `epix_as_of()`, which generates a
+snapshot of the archive in `epi_df` format. This represents the most up-to-date
+values of the signal variables as of a given version.
+
+```{r}
+edf <- epix_as_of(dv_archive, as.Date("2021-06-01"))
+print(edf)
+print(max(edf$time_value))
+```
+
+Note that the max time value in the `epi_df` object is May 29, 2021, even
+though the specified version date was June 1, 2021 (note that the `as_of` field
+printed above helps us see the date of the snapshot). From this we can infer
+that the doctor's visits signal was 2 days latent on June 1.
+
+Now, let's investigate how much this data was revised. We will plot the most
+up-to-date version of the time series in black (`edf_latest` below) and then
+overlay several revisions from the archive, spaced one month apart, as colored
+lines (`monthly_snapshots` below). We will also mark the version dates with
+dashed vertical lines.
+
+```{r}
+edf_latest <- epix_as_of(dv_archive, dv_archive$versions_end)
+max_version <- max(dv_archive$DT$version)
+versions <- seq(as.Date("2020-06-01"), max_version - 1, by = "1 month")
+monthly_snapshots <- map(versions, function(v) {
+  epix_as_of(dv_archive, v) %>% mutate(version = v)
+}) %>%
+  bind_rows(
+    edf_latest %>% mutate(version = max_version)
+  ) %>%
+  mutate(latest = version == max_version)
+
+ggplot(
+  monthly_snapshots %>% filter(!latest),
+  aes(x = time_value, y = percent_cli)
+) +
+  geom_line(aes(color = factor(version)), na.rm = TRUE) +
+  geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
+  facet_wrap(~geo_value, scales = "free_y", ncol = 1) +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "% of doctor's visits with CLI") +
+  theme(legend.position = "none") +
+  geom_line(
+    data = monthly_snapshots %>% filter(latest),
+    aes(x = time_value, y = percent_cli),
+    inherit.aes = FALSE, color = "black", na.rm = TRUE
+  )
+```
+
+We can see some interesting and highly nontrivial revision behavior: at some
+points in time the provisional data snapshots grossly underestimate the latest
+curve (look in particular at Florida close to the end of 2021), and at others
+they overestimate it (several states towards the beginning of 2021), though not
+quite as dramatically. Modeling the revision process, which is often called
+*backfill modeling*, is an important statistical problem in and of itself.
+
+## Summarizing revision behavior
+
+There are many ways to examine how signals change across revisions. We provide
+the convenient analysis wrapper `revision_summary()`, which computes simple
+summary statistics for each key (by default, `(geo_value,time_value)` pairs). In
+addition to the per-key summary, it also returns an overall summary. Here is
+a sample of the output:
+
+```{r}
+revision_details <- revision_summary(dv_archive, print_inform = TRUE)
+```
+
+We can see from the output that, as mentioned above, this data set has a lot of
+revisions: no key is entirely free of revisions, and 34% of the keys change by
+10% or more when revised.
+
+To do more detailed analysis than is possible with the above printing, we can
+inspect the returned `revision_details` tibble. Here we collect a number of
+statistics for each state:
+
+```{r}
+revision_details %>%
+  group_by(geo_value) %>%
+  summarise(
+    n_rev = mean(n_revisions),
+    min_lag = min(min_lag),
+    max_lag = max(max_lag),
+    spread = mean(spread),
+    rel_spread = mean(rel_spread),
+    time_near_latest = mean(time_near_latest)
+  )
+```
+
+Most of the states have similar stats on most of these features, except for the
+`time_near_latest` stat, which is the amount of time that it takes for the
+revisions to converge to within 20% of the final value and stay there. It is the
+highest for CA and the lowest for TX.
+
+## Merging `epi_archive` objects
+
+A common operation on datasets is merging (or joining) them together, such as
+when we grab data from multiple sources for joint analysis or modeling. Merging
+two `epi_archive` objects together is a bit tricky, however, since we need to
+handle datasets that might get revised at different times. The function
+`epix_merge()` is made to smooth this out. Below we merge the working
+`epi_archive` of versioned percentage CLI from outpatient visits with another
+archive of versioned COVID-19 case reporting data, which we fetch from the
+[COVIDcast
+API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html/), on the
+rate scale (counts per 100,000 people in the population).
+
+```{r, message = FALSE, warning = FALSE, eval=FALSE}
+library(epidatr)
+
+y <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200601, 20211201),
+  issues = epirange(20200601, 20211201)
+) %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
+  as_epi_archive(compactify = TRUE)
+
+dv_cases_archive <- epix_merge(dv_archive, y, sync = "locf", compactify = TRUE)
+print(dv_cases_archive)
+```
+
+```{r, echo=FALSE, message=FALSE, warning=FALSE}
+dv_cases_archive <- archive_cases_dv_subset
+print(dv_cases_archive)
+```
+
+Note that we have used the `sync = "locf"` argument to specify that we want to
+synchronize the two datasets on their disjoint revisions by using the last
+observation carried forward (LOCF). For more information, see `epix_merge()`.
+
+## Running a simple autoregressive forecaster (version-unaware)
+
+One of the most common use cases of the `epi_archive` object is for accurate
+model backtesting. In this section we will:
+
+- develop a simple autoregressive forecaster that predicts the next value of the
+signal based on the current and past values of the signal itself,
+- demonstrate how to slide this forecaster over the `epi_archive` object to
+produce forecasts at a few dates, using version-unaware and -aware
+computations, and
+- compare the two approaches.
+
+Before we get started, we should mention that all the work of constructing the
+forecaster that we're about to do is just a simple demo. The `epipredict`
+package, which is a companion package to `epiprocess`, offers a suite of
+predictive modeling tools that can improve on some of the shortcomings of the
+simple AR model we will use here, while also allowing the forecasters to be
+built from building blocks. A better version of the function below exists as
+`epipredict::arx_forecaster()`. See the vignette in the `epipredict` package for
+a similar demo, but using the forecasters in that package. TODO: Link.
+
+First, let's define the forecaster. While AR models can be fit in numerous ways
+(using base R or external packages), here we define it "by hand" because we
+would like to demonstrate a *probabilistic* forecaster: one that outputs not
+just a point prediction, but a notion of uncertainty around this. In particular,
+our forecaster will output a point prediction along with a 90\% uncertainty
+band, represented by predictive quantiles at the 5\% and 95\% levels (lower
+and upper endpoints of the uncertainty band).
+
+The function defined below, `prob_ar()`, is our probabilistic AR forecaster. Our
+forecasting target will be the `percent_cli` signal. The function is as follows:
+
+```{r}
+#' `ahead` - the number of time units ahead to forecast (in this case days),
+#' `lags` - the autoregressive lags to use in the model (in this case, the
+#' current value and the values from 7 and 14 days ago),
+#' `min_train_window` - the minimum number of observations required to fit the
+#' forecaster (used to control for edge cases),
+#' `lower_level` and `upper_level` - the quantiles to use for the uncertainty
+#' bands,
+#' `symmetrize` - whether to symmetrize the residuals when computing the
+#' uncertainty bands,
+#' `intercept` - whether to include an intercept in the model,
+#' `nonneg` - whether to clip the forecasts at zero.
+prob_ar <- function(y, ahead = 7, lags = c(0, 7, 14), min_train_window = 90,
+                    lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
+                    intercept = FALSE, nonneg = TRUE) {
+  # Return NA if insufficient training data
+  if (length(y) < min_train_window + max(lags) + ahead) {
+    return(data.frame(point = NA, lower = NA, upper = NA))
+  }
+
+  # Drop missing values (these typically occur at the edges of the series)
+  y <- y[!is.na(y)]
+
+  # Build features and response for the AR model
+  dat <- do.call(
+    data.frame,
+    purrr::map(lags, function(j) lag(y, n = j))
+  )
+  names(dat) <- paste0("x", seq_len(ncol(dat)))
+  if (intercept) dat$x0 <- rep(1, nrow(dat))
+  dat$y <- lead(y, n = ahead)
+
+  # Now fit the AR model and make a prediction
+  obj <- lm(y ~ . + 0, data = dat)
+  point <- predict(obj, newdata = tail(dat, 1))
+
+  # Compute a band
+  r <- residuals(obj)
+  s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
+  q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
+  lower <- point + q[1]
+  upper <- point + q[2]
+
+  # Clip at zero if we need to, then return
+  if (nonneg) {
+    point <- max(point, 0)
+    lower <- max(lower, 0)
+    upper <- max(upper, 0)
+  }
+  return(data.frame(point = point, lower = lower, upper = upper))
+}
+```
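+
+As a quick sanity check (a sketch on synthetic data, not part of the analysis
+below), we can apply `prob_ar()` to a hypothetical noisy upward trend and
+confirm that it returns a point prediction bracketed by the lower and upper
+quantiles:
+
+```{r}
+# Hypothetical series: a random walk drifting upward
+set.seed(1)
+y_toy <- 50 + cumsum(rnorm(200, mean = 0.1))
+prob_ar(y_toy, ahead = 7)
+```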
+
+Next, let's run this forecaster on a single date (say, the last date in the
+archive) to see how it performs. We will use the `epix_as_of()` method to
+generate a snapshot of the archive at the last date, and then run the forecaster.
+
+```{r}
+edf_latest <- epix_as_of(dv_archive, dv_archive$versions_end) %>% drop_na()
+edf_latest %>%
+  group_by(geo_value) %>%
+  summarize(fc = prob_ar(percent_cli), time_value = last(time_value), percent_cli = last(percent_cli))
+```
+
+The result now contains three new columns: `fc$point`, `fc$lower`, and
+`fc$upper`, corresponding to the point forecast, and the lower and upper
+endpoints of the 90\% prediction band, respectively. The forecasts look
+reasonable and in line with the data. The point forecast is close to the last
+observed value, and the uncertainty bands are wide enough to capture the
+variability in the data.
+
+Note that the same can be achieved with `epix_slide()` using the following code:
+
+```{r}
+dv_archive %>%
+  group_by(geo_value) %>%
+  epix_slide(fc = prob_ar(percent_cli), .versions = dv_archive$versions_end)
+```
+
+Here we used `epix_slide()` to slide the forecaster over the `epi_archive`, but
+by specifying the `.versions` argument to be the last version in the archive, we
+effectively ran the forecaster on the last date in the archive.
+
+Now let's go ahead and slide this forecaster in a version-unaware way. First, we
+need to snapshot the latest version of the data, and then make a faux archive by
+setting `version = time_value`. This has the effect of simulating a data set in
+which each day's observations are finalized on the day they are first reported,
+i.e., never revised.
+
+```{r}
+dv_archive_faux <- edf_latest %>%
+  mutate(version = time_value) %>%
+  as_epi_archive()
+```
+
+Now we can slide the forecaster over the faux archive to produce forecasts at a
+number of dates in the past, spaced a month apart. We will forecast at horizons
+of 1 to 28 days ahead; to reduce typing, we create the wrapper function
+`k_week_ahead()`. We also produce forecasts in a version-aware way, which simply
+requires us to use the true `epi_archive` object instead of the faux one.
+
+```{r}
+# Generate a sequence of forecast dates. Starting 3 months into the data, so we have
+# enough data to train the AR model.
+forecast_dates <- seq(as.Date("2020-10-01"), as.Date("2021-11-01"), by = "1 months")
+k_week_ahead <- function(archive, ahead = 7) {
+  archive %>%
+    group_by(geo_value) %>%
+    epix_slide(fc = prob_ar(percent_cli, ahead = ahead), .versions = forecast_dates) %>%
+    ungroup() %>%
+    mutate(target_date = version + ahead)
+}
+
+aheads <- 1:28
+forecasts <- bind_rows(
+  map(aheads, ~ k_week_ahead(dv_archive_faux, ahead = .x) %>% mutate(version_aware = FALSE)),
+  map(aheads, ~ k_week_ahead(dv_archive, ahead = .x) %>% mutate(version_aware = TRUE))
+)
+```
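+
+Before plotting, it helps to glance at the structure of the output; a quick
+sketch (note that `fc` is a packed data frame column holding `point`, `lower`,
+and `upper`):
+
+```{r}
+# One row per geo value, forecast date (version), and horizon
+forecasts %>%
+  filter(version_aware) %>%
+  head()
+```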
+
+Now let's plot the forecasts at each forecast date, at horizons of 1 to 28 days
+ahead. The solid lines show the `percent_cli` data: as it appeared at each
+forecast date in the version-aware panel (`TRUE`), and in finalized form in the
+version-unaware panel (`FALSE`). The ribbons and dashed lines show the
+forecasts, and the vertical dashed lines mark the forecast dates.
+
+```{r}
+edf_latest <- epix_as_of(dv_archive, dv_archive$versions_end)
+max_version <- max(dv_archive$DT$version)
+geo_choose <- "tx"
+
+forecasts_filtered <- forecasts %>%
+  filter(geo_value == geo_choose) %>%
+  mutate(time_value = version)
+percent_cli_data <- bind_rows(
+  # Snapshotted data for the version-aware forecasts
+  map(forecast_dates, ~ epix_as_of(dv_archive, .x) %>% mutate(version = .x)) %>%
+    bind_rows() %>%
+    mutate(version_aware = TRUE),
+  # Latest data for the version-unaware forecasts
+  edf_latest %>% mutate(version_aware = FALSE)
+) %>%
+  filter(geo_value == geo_choose)
+
+ggplot(data = forecasts_filtered, aes(x = target_date, group = time_value)) +
+  geom_ribbon(aes(ymin = fc$lower, ymax = fc$upper, fill = factor(time_value)), alpha = 0.4) +
+  geom_line(aes(y = fc$point, color = factor(time_value)), linetype = 2L) +
+  geom_point(aes(y = fc$point, color = factor(time_value)), size = 0.75) +
+  geom_vline(data = percent_cli_data, aes(color = factor(version), xintercept = version), lty = 2) +
+  geom_line(
+    data = percent_cli_data,
+    aes(x = time_value, y = percent_cli, color = factor(version)),
+    inherit.aes = FALSE, na.rm = TRUE
+  ) +
+  facet_grid(version_aware ~ geo_value, scales = "free") +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  scale_y_continuous(expand = expansion(c(0, 0.05))) +
+  labs(x = "Date", y = "% of doctor's visits with CLI (smoothed, day-of-week adjusted)") +
+  theme(legend.position = "none")
+```
+
+A few points are worth making. First, it's clear that training and making
+predictions on finalized data can lead to an overly optimistic sense of accuracy
+(see, for example, [McDonald et al.
+(2021)](https://www.pnas.org/content/118/51/e2111453118/), and references
+therein). Second, we can see that the properly-versioned forecaster is, at some
+points in time, more problematic; for example, it massively overpredicts the
+peak in the winter wave of 2020. However, performance is pretty poor across the
+board here, whether or not properly-versioned data is used, with volatile and
+overconfident forecasts in various places.
+
+As mentioned previously, this forecaster is meant only as a demo of the slide
+functionality with some of the most basic forecasting methodology possible. The
+[`epipredict`](https://cmu-delphi.github.io/epipredict/) package, which builds
+off the data structures and functionality in the current package, is the place
+to look for more robust forecasting methodology.
+
+## Attribution
+
+This document contains a dataset that is a modified part of the [COVID-19 Data
+Repository by the Center for Systems Science and Engineering (CSSE) at Johns
+Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished
+in the COVIDcast Epidata
+API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html).
+This data set is licensed under the terms of the [Creative Commons Attribution
+4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the
+Johns Hopkins University on behalf of its Center for Systems Science in
+Engineering. Copyright Johns Hopkins University 2020.
+
+The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor
+Visits
+data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).
+This dataset is licensed under the terms of the [Creative Commons Attribution
+4.0 International license](https://creativecommons.org/licenses/by/4.0/).
+Copyright Delphi Research Group at Carnegie Mellon University 2020.
diff --git a/vignettes/epi_df.Rmd b/vignettes/epi_df.Rmd
new file mode 100644
index 00000000..0d0452ca
--- /dev/null
+++ b/vignettes/epi_df.Rmd
@@ -0,0 +1,481 @@
+---
+title: Working with epi_df objects and time series data
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Working with epi_df objects and time series data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+```
+
+The `epi_df` data structure in `epiprocess` provides convenient ways to
+perform common processing tasks. In this vignette, we will:
+
+- construct an `epi_df` from a data frame
+- perform rolling time-window computations using `epi_slide()`
+- perform group-level aggregation using `sum_groups_epi_df()`
+- detect and fill time gaps using `complete.epi_df()` and `{tsibble}`
+- perform geographic aggregation (not yet implemented)
+
+## Getting data into `epi_df` format
+
+As in `vignette("epiprocess")`, we will work with daily reported COVID-19 cases
+for a handful of U.S. states (note: here we're using new, not cumulative, cases)
+fetched with the [`epidatr`](https://github.com/cmu-delphi/epidatr) package, and
+then convert this to `epi_df` format.
+
+```{r, message = FALSE, warning = FALSE}
+library(epiprocess)
+library(dplyr)
+```
+
+The data is included in this package (via the [`epidatasets`
+package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with:
+
+```{r}
+edf <- cases_deaths_subset %>%
+  select(geo_value, time_value, cases) %>%
+  arrange(geo_value, time_value)
+```
+
+The data can also be fetched from the Delphi Epidata API with the following query:
+
+```{r, message = FALSE, eval = FALSE}
+library(epidatr)
+
+d <- as.Date("2024-03-20")
+
+edf <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx,ga,pa",
+  time_values = epirange(20200301, 20211231),
+  as_of = d
+) %>%
+  select(geo_value, time_value, cases = value) %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_df(as_of = d)
+```
+
+The data has 2,684 rows and 3 columns.
+
+## Rolling computations using `epi_slide`
+
+A very common operation in time series processing is aggregating the values of
+the time series by applying some function on a rolling time window of data
+points. The key tool that allows this is `epi_slide()`. The function always
+first makes sure to group the data by the grouping variables of the `epi_df`
+object, which includes the `geo_value` and possibly `other_keys` columns. It
+then applies the rolling slide computation inside each group.
+
+The `epi_slide()` function has three ways to specify the computation to be
+performed:
+
+- by using a tidy evaluation approach
+- by passing a formula
+- by passing a function
+
+### Slide the tidy way
+
+Usually, the most convenient way to set up a computation in `epi_slide()` is to
+pass in an expression for tidy evaluation. In this case, we can simply define
+the name of the new column directly as part of the expression, setting it equal
+to a computation in which we can access any columns of `.x` by name, just as we
+would in a call to, say, `dplyr::mutate()`. For example:
+
+```{r}
+slide_output <- edf %>%
+  epi_slide(cases_7sd = sd(cases, na.rm = TRUE), .window_size = 7)
+```
+
+As a simple sanity check, we visualize the 7-day trailing standard deviations
+computed on top of the original counts:
+
+```{r, message = FALSE, warning = FALSE}
+library(ggplot2)
+
+ggplot(slide_output, aes(x = time_value)) +
+  geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) +
+  geom_line(aes(y = cases_7sd, col = geo_value), show.legend = FALSE) +
+  facet_wrap(~geo_value, scales = "free_y") +
+  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
+  labs(x = "Date", y = "Reported COVID-19 cases")
+```
+
+As we can see from the Texas plot, the state moved to weekly reporting of
+COVID-19 cases in summer of 2021.
+
+Note that without `epi_slide()`, the computation is much less convenient. For
+instance, a rough equivalent of the above computation would be the following,
+which is easy to get wrong:
+
+```{r}
+edf %>%
+  complete(geo_value, time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>%
+  arrange_canonical() %>%
+  group_by(geo_value) %>%
+  # .before = 6 plus the current day matches a 7-day trailing window
+  mutate(cases_7sd = slider::slide_dbl(cases, .f = sd, na.rm = TRUE, .before = 6, .after = 0))
+```
+
+Furthermore, `epi_slide()` allows for selecting `.ref_time_values`, which the
+latter recipe does not support.
+
+### Slide with a function
+
+We can also pass a function to the second argument in `epi_slide()`. In this
+case, the passed function `.f` must have the form `function(x, g, t, ...)`,
+where
+
+- `x` is an `epi_df` with the same column names as the input `epi_df`
+- `g` is a one-row tibble containing the values of the grouping variables
+  for the associated group, for instance `g$geo_value`
+- `t` is the `ref_time_value` for the current window
+- `...` are additional arguments
+
+The same computation as above can be done with a function:
+
+```{r}
+edf %>%
+  epi_slide(.f = function(x, g, t) sd(x$cases, na.rm = TRUE), .window_size = 7)
+```
+
+### Slide with a formula
+
+The same computation as above can be done with a formula, where all references
+to the columns must be made with the prefix `.x$...`, for instance:
+
+```{r}
+edf %>%
+  epi_slide(~ sd(.x$cases, na.rm = TRUE), .window_size = 7)
+```
+
+Note that the name of the column defaults to `slide_value` in the unnamed
+formula or function case. This can be adjusted with `.new_col_name`.
+
+### Rolling computations with multiple column outputs
+
+If your formula (or function) returns a data.frame, then the columns of the
+data.frame will be unpacked into the resulting `epi_df` (in the sense of
+`tidyr::unpack()`). For example, the following computes the 7-day trailing
+average of daily cases as well as the 7-day trailing standard deviation of
+daily cases:
+
+```{r}
+edf %>%
+  epi_slide(
+    ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sd = sd(.x$cases, na.rm = TRUE)),
+    .window_size = 7
+  )
+```
+
+### Optimized rolling means and sums
+
+For the two most common sliding operations, we offer two optimized versions:
+`epi_slide_mean()` and `epi_slide_sum()`. These are much faster than
+`epi_slide()`, so we recommend using them when you are only interested in the
+mean or sum of a column. The following computes the 7-day trailing mean and
+the 7-day trailing sum of daily cases:
+
+```{r}
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide_mean("cases", .window_size = 7, na.rm = TRUE)
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide_sum("cases", .window_size = 7, na.rm = TRUE)
+```
+
+### Running a forecaster on a sliding window of data
+
+The natural next step is to use the sliding window to forecast future values.
+However, to do this correctly, we should make sure that our data is historically
+accurate. The data structure we use for that is the `epi_archive` and the
+analogous slide function is `epix_slide()`. To read further along this train of
+thought, see `vignette("epi_archive")`.
+
+## Group-level aggregation with `sum_groups_epi_df`
+
+TODO
+
+## Detecting and filling time gaps with `complete.epi_df`
+
+Sometimes you may have missing data in your time series. This can be due to
+actual missing data, or it can be due to the fact that the data is only reported
+on certain days. In the latter case, it is often useful to fill in the missing
+data with explicit zeros. This can be done with the `complete.epi_df()`
+function.
+
+First, let's create a data set with some missing data. We will reuse the dataset
+`edf` from above, but modify it slightly.
+
+```{r}
+edf_missing <- edf %>%
+  filter(geo_value %in% c("ca", "tx")) %>%
+  group_by(geo_value) %>%
+  slice(1:3, 5:6)
+
+edf_missing %>%
+  print(n = 10)
+```
+
+Now let's fill in the missing data with explicit zeros:
+
+```{r}
+edf_missing %>%
+  complete(
+    time_value = seq.Date(min(time_value), max(time_value), by = "day"),
+    fill = list(cases = 0)
+  ) %>%
+  print(n = 12)
+```
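+
+If the missing entries represent delayed reporting rather than true zeros, one
+could instead carry the last observation forward. A minimal sketch using
+`tidyr::fill()` on the `edf_missing` object from above (first leaving the gaps
+as `NA`, then filling down within each geo value):
+
+```{r}
+edf_missing %>%
+  complete(time_value = seq.Date(min(time_value), max(time_value), by = "day")) %>%
+  tidyr::fill(cases, .direction = "down")
+```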
+
+### Detecting and filling time gaps with `tsibble`
+
+We can also use the `tsibble` package to detect and fill time gaps. We'll work
+with county-level reported COVID-19 cases in MA and VT.
+
+The data is included in this package (via the [`epidatasets`
+package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with:
+
+```{r, warning = FALSE, message = FALSE}
+library(epiprocess)
+library(dplyr)
+library(readr)
+
+x <- covid_incidence_county_subset
+```
+
+The data can also be fetched from the Delphi Epidata API with the following query:
+
+```{r, message = FALSE, eval = FALSE, warning = FALSE}
+library(epidatr)
+
+d <- as.Date("2024-03-20")
+
+# Get mapping between FIPS codes and county&state names:
+y <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/county_census.csv", # nolint: line_length_linter
+  col_types = c(
+    FIPS = col_character(),
+    CTYNAME = col_character(),
+    STNAME = col_character()
+  )
+) %>%
+  filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
+  select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
+
+# Fetch only counties from Massachusetts and Vermont, then append names columns as well
+x <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  geo_type = "county",
+  time_type = "day",
+  geo_values = paste(y$geo_value, collapse = ","),
+  time_values = epirange(20200601, 20211231),
+  as_of = d
+) %>%
+  select(geo_value, time_value, cases = value) %>%
+  inner_join(y, by = "geo_value", relationship = "many-to-one", unmatched = c("error", "drop")) %>%
+  as_epi_df(as_of = d)
+```
+
+The data contains 16,212 rows and 5 columns.
+
+## Converting to `tsibble` format
+
+For manipulating and wrangling time series data, the
+[`tsibble`](https://tsibble.tidyverts.org/index.html) package already provides a
+host of useful tools. A tsibble object (formally, of class `tbl_ts`) is
+basically a tibble (data frame) but with two specially-marked columns: an
+**index** column representing the time variable (defining an order from past to
+present), and a **key** column identifying a unique observational unit for each
+time point. In fact, the key can be made up of any number of columns, not just a
+single one.
+
+In an `epi_df` object, the index variable is `time_value`, and the key variable
+is typically `geo_value` (though this need not always be the case: for example,
+if we have an age group variable as another column, then this could serve as a
+second key variable). The `epiprocess` package thus provides an implementation
+of `as_tsibble()` for `epi_df` objects, which sets these variables according to
+these defaults.
+
+```{r, message = FALSE}
+library(tsibble)
+
+xt <- as_tsibble(x)
+head(xt)
+key(xt)
+index(xt)
+interval(xt)
+```
+
+We can also set the key variable(s) directly in a call to `as_tsibble()`.
+Similar to SQL keys, if the key does not uniquely identify each time point (that
+is, the key and index together do not uniquely identify each row), then
+`as_tsibble()` throws an error:
+
+```{r, error = TRUE}
+head(as_tsibble(x, key = "county_name"))
+```
+
+As we can see, there are duplicate county names between Massachusetts and
+Vermont, which caused the error.
+
+```{r, message = FALSE}
+head(duplicates(x, key = "county_name"))
+```
+
+Keying by both county name and state name, however, does work:
+
+```{r, message = FALSE}
+head(as_tsibble(x, key = c("county_name", "state_name")))
+```
+
+One of the major advantages of the `tsibble` package is its ability to handle
+**implicit gaps** in time series data. In other words, it can infer what time
+scale we're interested in (say, daily data), and detect apparent gaps (say, when
+values are reported on January 1 and 3 but not January 2). We can subsequently
+use functionality to make these missing entries explicit, which will generally
+help avoid bugs in further downstream data processing tasks.
+
+Let's first create some gaps in our data set, by dropping small case counts (as
+if they had been censored):
+
+```{r}
+state_naming <- read_csv("https://github.com/cmu-delphi/covidcast/raw/c89e4d295550ba1540d64d2cc991badf63ad04e5/Python-packages/covidcast-py/covidcast/geo_mappings/state_census.csv", # nolint: line_length_linter
+  col_types = c(NAME = col_character(), ABBR = col_character())
+) %>%
+  transmute(state_name = NAME, abbr = tolower(ABBR)) %>%
+  as_tibble()
+
+# First make geo value more readable for tables, plots, etc.
+x <- x %>%
+  inner_join(state_naming, by = "state_name", relationship = "many-to-one", unmatched = c("error", "drop")) %>%
+  mutate(geo_value = paste(substr(county_name, 1, nchar(county_name) - 7), toupper(abbr), sep = ", ")) %>%
+  select(geo_value, time_value, cases)
+
+xt <- as_tsibble(x) %>% filter(cases >= 3)
+```
+
+The functions `has_gaps()`, `scan_gaps()`, `count_gaps()` in the `tsibble`
+package each provide useful summaries, in slightly different formats.
+
+```{r}
+head(has_gaps(xt))
+head(scan_gaps(xt))
+head(count_gaps(xt))
+```
+
+We can also visualize the patterns of missingness:
+
+```{r, message = FALSE, warning = FALSE}
+library(ggplot2)
+
+ggplot(
+  count_gaps(xt),
+  aes(
+    x = reorder(geo_value, desc(geo_value)),
+    color = geo_value
+  )
+) +
+  geom_linerange(aes(ymin = .from, ymax = .to)) +
+  geom_point(aes(y = .from)) +
+  geom_point(aes(y = .to)) +
+  coord_flip() +
+  labs(x = "County", y = "Date") +
+  theme(legend.position = "none")
+```
+
+Using the `fill_gaps()` function from `tsibble`, we can replace all gaps by an
+explicit value. The default is `NA`, but in the current case, where missingness
+is not at random but rather represents a small value that was censored (only a
+hypothetical with COVID-19 reports, but certainly a real phenomenon that occurs
+in other signals), it is better to replace it by zero, which is what we do here.
+(Other approaches, such as LOCF: last observation carried forward in time, could
+be accomplished by first filling with `NA` values and then following up with a
+second call to `tidyr::fill()`.)
+
+```{r}
+fill_gaps(xt, cases = 0) %>%
+  head()
+```
+
+Note that the time series for Addison, VT only starts on August 27, 2020, even
+though the original (uncensored) data set itself was drawn from a period that
+went back to June 6, 2020. By setting `.full = TRUE`, we can zero-fill over
+the entire span of the observed (censored) data.
+
+```{r}
+xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE)
+
+head(xt_filled)
+```
+
+Explicit imputation for missingness (zero-filling in our case) can be important
+for protecting against bugs in all sorts of downstream tasks. For example, even
+something as simple as a 7-day trailing average is complicated by missingness.
+The function `epi_slide()` looks for all rows within a window of 7 days anchored
+on the right at the reference time point (when `.window_size = 7`).
+But when some days in a given week are missing, because they were censored for
+having small case counts, taking an average of the observed case counts can be
+misleading and is unintentionally biased upwards. Meanwhile,
+running `epi_slide()` on the zero-filled data brings these trailing averages
+(appropriately) downwards, as we can see inspecting Plymouth, MA around July 1,
+2021.
+
+```{r}
+xt %>%
+  as_epi_df(as_of = as.Date("2024-03-20")) %>%
+  group_by(geo_value) %>%
+  epi_slide(cases_7dav = mean(cases), .window_size = 7) %>%
+  ungroup() %>%
+  filter(
+    geo_value == "Plymouth, MA",
+    abs(time_value - as.Date("2021-07-01")) <= 3
+  ) %>%
+  print(n = 7)
+
+xt_filled %>%
+  as_epi_df(as_of = as.Date("2024-03-20")) %>%
+  group_by(geo_value) %>%
+  epi_slide(cases_7dav = mean(cases), .window_size = 7) %>%
+  ungroup() %>%
+  filter(
+    geo_value == "Plymouth, MA",
+    abs(time_value - as.Date("2021-07-01")) <= 3
+  ) %>%
+  print(n = 7)
+```
+
+## Geographic aggregation
+
+We do not yet provide tools for geographic aggregation in `epiprocess`. However,
+we have some Python geocoding utilities available. Reach out to us if this is
+functionality you would like to see us add to `epiprocess`.
+
+## Attribution
+
+The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor
+Visits
+data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).
+This dataset is licensed under the terms of the [Creative Commons Attribution
+4.0 International license](https://creativecommons.org/licenses/by/4.0/).
+Copyright Delphi Research Group at Carnegie Mellon University 2020. + +This document contains a dataset that is a modified part of the [COVID-19 Data +Repository by the Center for Systems Science and Engineering (CSSE) at Johns +Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished +in the COVIDcast Epidata +API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). +This data set is licensed under the terms of the [Creative Commons Attribution +4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the +Johns Hopkins University on behalf of its Center for Systems Science in +Engineering. Copyright Johns Hopkins University 2020. + +[From the COVIDcast Epidata +API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): +These signals are taken directly from the JHU CSSE [COVID-19 GitHub +repository](https://github.com/CSSEGISandData/COVID-19) without changes. + diff --git a/vignettes/epiprocess.Rmd b/vignettes/epiprocess.Rmd index 66c098ae..f45927c7 100644 --- a/vignettes/epiprocess.Rmd +++ b/vignettes/epiprocess.Rmd @@ -1,108 +1,42 @@ --- -title: Get started with `epiprocess` +title: Get started with epiprocess output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Get started with `epiprocess`} + %\VignetteIndexEntry{Get started with epiprocess} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: chunk_output_type: console --- -The [`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) package works -with epidemiological time series and version data to provide situational -awareness, processing and transformations in preparation for modeling, and -version-faithful model backtesting. It contains: - -- `epi_df`, a class for working with epidemiological time series data; -- `epi_archive`, a class for working with the version history of such time series data; -- sample data in these formats; -- [`{dplyr}`](https://dplyr.tidyverse.org/)-esque "verbs" for common data - transformations (e.g., 7-day averages); -- functions for exploratory data analysis and situational awareness (e.g., - outlier detection and growth rate estimation); and -- [`{dplyr}`](https://dplyr.tidyverse.org/)-esque "verbs" for version-faithful - "pseudoprospective" backtesting of models, and other version history analysis - and transformations. - -It is part of a broader suite of packages that includes -[`{epipredict}`](https://cmu-delphi.github.io/epipredict/), -[`{epidatr}`](https://cmu-delphi.github.io/epidatr/), -[`{rtestim}`](https://dajmcdon.github.io/rtestim/), and -[`{epidatasets}`](https://cmu-delphi.github.io/epidatasets/), for accessing, -analyzing, and forecasting epidemiological time series data. We have expanded -documentation and demonstrations for some of these packages available in an -online "book" format [here](https://cmu-delphi.github.io/delphi-tooling-book/). - -## Motivation - -[`{epiprocess}`](https://cmu-delphi.github.io/epiprocess/) and -[`{epipredict}`](https://cmu-delphi.github.io/epipredict/) are designed to lower -the barrier to entry and implementation cost for epidemiological time series -analysis and forecasting. Epidemiologists and forecasting groups repeatedly and -separately have had to rush to implement this type of functionality in a much -more ad hoc manner; we are trying to save such effort in the future by providing -well-documented, tested, and general packages that can be called for many common -tasks instead. 
- -[`{epiprocess}`](https://github.com/cmu-delphi/epiprocess/) also provides tools -to help avoid a particularly common pitfall in analysis and forecasting: -ignoring reporting latency and revisions to a data set. This can, for example, -lead to one retrospectively analyzing a surveillance signal or forecasting model -and concluding that it is much more accurate than it actually was in real time, -or producing always-decreasing forecasts on data sets where initial surveillance -estimates are systematically revised upward. Storing and working with version -history can help avoid these issues. - -## Intended audience - -We expect users to be proficient in R, and familiar with the -[`{dplyr}`](https://dplyr.tidyverse.org/) and -[`{tidyr}`](https://tidyr.tidyverse.org/) packages. - -## Installing - -This package is not on CRAN yet, so it can be installed using the -[`{devtools}`](https://devtools.r-lib.org) package: - -```{r, eval = FALSE} -devtools::install_github("cmu-delphi/epiprocess", ref = "main") +```{r, include = FALSE} +source(here::here("vignettes", "_common.R")) ``` -Building the vignettes, such as this getting started guide, takes a significant -amount of time. They are not included in the package by default. If you want to -include vignettes, then use this modified command: +## Overview -```{r, eval = FALSE} -devtools::install_github("cmu-delphi/epiprocess", - ref = "main", - build_vignettes = TRUE, dependencies = TRUE -) -``` - -## Getting data into `epi_df` format - -We'll start by showing how to get data into -epi_df format, which is just -a tibble with a bit of special structure, and is the format assumed by all of -the functions in the `epiprocess` package. An `epi_df` object has (at least) the -following columns: +This vignette provides a brief introduction to the `{epiprocess}` package. We will +do the following: -* `geo_value`: the geographic value associated with each row of measurements. -* `time_value`: the time value associated with each row of measurements. +- Get data into `epi_df()` format and plot the data +- Perform basic signal processing tasks (lagged differences, rolling average, + cumulative sum, etc.) +- Detect outliers in the data and apply corrections +- Calculate the growth rate of the data +- Get data into `epi_archive()` format and perform similar signal processing + tasks -It can have any number of other columns which can serve as measured variables, -which we also broadly refer to as signal variables. The documentation for - gives more details about this data format. +## Getting data into `epi_df` format -A data frame or tibble that has `geo_value` and `time_value` columns can be -converted into an `epi_df` object, using the function `as_epi_df()`. As an -example, we'll work with daily cumulative COVID-19 cases from four U.S. states: -CA, FL, NY, and TX, over time span from mid 2020 to early 2022. We have included -this example data in the `epidatasets::covid_confirmed_cumulative_num` object, -which we prepared by downloading the data using `epidatr::pub_covidcast()`. +We'll start by getting data into `epi_df()` format, which is just a tibble with +a bit of special structure. As an example, we will get COVID-19 confirmed +cumulative case data from JHU CSSE for California, Florida, New York, and Texas, +from March 1, 2020 to January 31, 2022. We have +included this example data in the `epidatasets::covid_confirmed_cumulative_num` +object, which we prepared by downloading the data using +`epidatr::pub_covidcast()`. 
-```{r, message = FALSE}
+```{r, results=FALSE, warning=FALSE, message=FALSE}
library(epidatasets)
library(epiprocess)
library(dplyr)
@@ -115,243 +49,153 @@ class(cases)
colnames(cases)
```

-As we can see, a data frame returned by `epidatr::pub_covidcast()` has the
-columns required for an `epi_df` object (along with many others). We can use
-`as_epi_df()`, with specification of some relevant metadata, to bring the data
-frame into `epi_df` format.
-
-```{r, message = FALSE}
-x <- as_epi_df(cases, as_of = max(cases$issue)) %>%
-  select(geo_value, time_value, total_cases = value)
+The same data can be downloaded with `{epidatr}` as follows:

-class(x)
-summary(x)
-head(x)
-attributes(x)$metadata
+```{r eval=FALSE}
+cases <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "confirmed_cumulative_num",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200301, 20220131)
+)
```

-## Some details on metadata
-
-In general, an `epi_df` object has the following fields in its metadata:
-
-* `geo_type`: the type for the geo values.
-* `as_of`: the time value at which the given data were available.
+The tibble returned has the columns required for an `epi_df` object, `geo_value`
+and `time_value`, so we can convert it directly to an `epi_df` object using
+`as_epi_df()`.

-Metadata for an `epi_df` object `x` can be accessed (and altered) via
-`attributes(x)$metadata`. The field, `geo_type`,is not currently used by any
-downstream functions in the `epiprocess` package, and serve only as useful bits
-of information to convey about the data set at hand. The last field here,
-`as_of`, is one of the most unique aspects of an `epi_df` object.
-
-In brief, we can think of an `epi_df` object as a single snapshot of a data set
-that contains the most up-to-date values of some signals of interest, as of the
-time specified `as_of`. For example, if `as_of` is January 31, 2022, then the
-`epi_df` object has the most up-to-date version of the data available as of
-January 31, 2022. The `epiprocess` package also provides a companion data
-structure called `epi_archive`, which stores the full version history of a given
-data set. See the [archive
-vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for
-more.
-
-If `geo_type` or `as_of` arguments are missing in a call to `as_epi_df()`, then
-this function will try to infer them from the passed object. Usually, `geo_type`
-can be inferred from the `geo_value` columns, respectively, but inferring the
-`as_of` field is not as easy. See the documentation for `as_epi_df()` more
-details.
-
-```{r}
-x <- as_epi_df(cases, as_of = as.Date("2024-03-20")) %>%
-  select(geo_value, time_value, total_cases = value)
-
-attributes(x)$metadata
+```{r, message = FALSE}
+edf <- cases %>%
+  select(geo_value, time_value, cases_cumulative = value) %>%
+  as_epi_df() %>%
+  group_by(geo_value) %>%
+  mutate(cases_daily = cases_cumulative - lag(cases_cumulative, default = 0))
+edf
```

-## Using additional key columns in `epi_df`
+In brief, we can think of an `epi_df` object as a snapshot of an epidemiological
+data set as it was at a particular point in time (recorded in the `as_of`
+attribute). We can easily plot the data using the `autoplot()` method (which is
+a convenience wrapper around `ggplot2`).

-In the following examples we will show how to create an `epi_df` with additional keys.
-
-### Converting a `tsibble` that has county code as an extra key
-
-```{r}
-ex1 <- tibble(
-  geo_value = rep(c("ca", "fl", "pa"), each = 3),
-  county_code = c(
-    "06059", "06061", "06067",
-    "12111", "12113", "12117",
-    "42101", "42103", "42105"
-  ),
-  time_value = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(geo_value)),
-  value = seq_along(geo_value) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(geo_value)))
-) %>%
-  as_tsibble(index = time_value, key = c(geo_value, county_code))
-
-ex1 <- as_epi_df(x = ex1, as_of = "2020-06-03")
+```{r, message = FALSE, warning = FALSE}
+edf %>%
+  autoplot(cases_cumulative)
```

-The metadata now includes `county_code` as an extra key.
+We can compute the 7-day moving average of the confirmed daily cases for each
+geo_value by using the `epi_slide_mean()` function. For a more in-depth guide to
+sliding, see `vignette("epi_df")`.
+
```{r}
-attr(ex1, "metadata")
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide_mean(cases_daily, .window_size = 7, na.rm = TRUE)
```

-### Dealing with misspecified column names
-
-`epi_df` requires there to be columns `geo_value` and `time_value`, if they do not exist then `as_epi_df()` throws an error.
+We can compute the growth rate of the confirmed cumulative cases for each
+geo_value. For a more in-depth guide to growth rates, see `vignette("growth_rate")`.

-```{r, error = TRUE}
-data.frame(
-  # misnamed
-  state = rep(c("ca", "fl", "pa"), each = 3),
-  # extra key
-  pol = rep(c("blue", "swing", "swing"), each = 3),
-  # misnamed
-  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = 9),
-  value = 1:9 + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, 9))
-) %>% as_epi_df(as_of = as.Date("2024-03-20"))
+```{r}
+edf %>%
+  group_by(geo_value) %>%
+  mutate(cases_growth = growth_rate(x = time_value, y = cases_cumulative, method = "rel_change", h = 7))
```

-The columns can be renamed to match `epi_df` format. In the example below, notice there is also an additional key `pol`.
+Next, we detect outliers in the daily reported cases for each geo_value. For a
+more in-depth guide to outlier detection, see `vignette("outliers")`.

```{r}
-ex2 <- tibble(
-  # misnamed
-  state = rep(c("ca", "fl", "pa"), each = 3),
-  # extra key
-  pol = rep(c("blue", "swing", "swing"), each = 3),
-  # misnamed
-  reported_date = rep(seq(as.Date("2020-06-01"), as.Date("2020-06-03"), by = "day"), length.out = length(state)),
-  value = seq_along(state) + 0.01 * withr::with_rng_version("3.0.0", withr::with_seed(42, length(state)))
-) %>% data.frame()
-
-head(ex2)
-
-ex2 <- ex2 %>%
-  rename(geo_value = state, time_value = reported_date) %>%
-  as_epi_df(
-    as_of = "2020-06-03",
-    other_keys = "pol"
-  )
-
-attr(ex2, "metadata")
+edf %>%
+  group_by(geo_value) %>%
+  mutate(outlier_info = detect_outlr(x = time_value, y = cases_daily)) %>%
+  ungroup()
```

-### Adding additional keys to an `epi_df` object
-
-In the above examples, all the keys are added to objects that are not `epi_df` objects. We illustrate how to add keys to an `epi_df` object.
-
-We use a toy data set included in `epiprocess` prepared using the `covidcast` library and are filtering to a single state for simplicity.
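+The `outlier_info` column is a packed data frame. As a minimal sketch (this
+assumes the default detection method, whose output includes a `replacement`
+column), the suggested corrections could be applied like so:
+
+```{r}
+edf %>%
+  group_by(geo_value) %>%
+  mutate(outlier_info = detect_outlr(x = time_value, y = cases_daily)) %>%
+  ungroup() %>%
+  mutate(cases_daily_corrected = outlier_info$replacement)
+```
+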
+Finally, we add a column to the `epi_df` object with the daily deaths for each
+geo_value and compute the correlations between cases and deaths for each
+geo_value. For a more in-depth guide to correlations, see `vignette("correlation")`.

```{r}
-ex3 <- covid_incidence_county_subset %>%
-  filter(time_value > "2021-12-01", state_name == "Massachusetts") %>%
-  slice_tail(n = 6)
-
-attr(ex3, "metadata") # geo_type is county currently
+df <- pub_covidcast(
+  source = "jhu-csse",
+  signals = "deaths_incidence_num",
+  geo_type = "state",
+  time_type = "day",
+  geo_values = "ca,fl,ny,tx",
+  time_values = epirange(20200301, 20220131)
+) %>%
+  select(geo_value, time_value, deaths_daily = value) %>%
+  as_epi_df() %>%
+  arrange_canonical()
+edf <- inner_join(edf, df, by = c("geo_value", "time_value"))
+edf %>%
+  group_by(geo_value) %>%
+  epi_slide_mean(deaths_daily, .window_size = 7, na.rm = TRUE) %>%
+  epi_cor(cases_daily, deaths_daily)
```

-Now we add `state` (MA) and `pol` as new columns to the data and as new keys to the metadata. Reminder that lower case state name abbreviations are what we would expect if this were a `geo_value` column.
+Note that if an `epi_df` object loses its `geo_value` or `time_value` columns, it
+will decay to a regular tibble.

```{r}
-ex3 <- ex3 %>%
-  as_tibble() %>% # needed to add the additional metadata
-  mutate(
-    state = rep(tolower("MA"), 6),
-    pol = rep(c("blue", "swing", "swing"), each = 2)
-  ) %>%
-  as_epi_df(other_keys = c("state", "pol"), as_of = as.Date("2024-03-20"))
-
-attr(ex3, "metadata")
+edf %>% select(-time_value)
```

-Note that the two additional keys we added, `state` and `pol`, are specified as a character vector in the `other_keys` argument. They must be specified in this manner so that downstream actions on the `epi_df`, like model fitting and prediction, can recognize and use these keys.
-
-Currently `other_keys` metadata in `epi_df` doesn't impact `epi_slide()`, contrary to `other_keys` in `as_epi_archive` which affects how the update data is interpreted.
-
-## Working with `epi_df` objects downstream
+## Getting data into `epi_archive` format

-Data in `epi_df` format should be easy to work with downstream, since it is a
-very standard tabular data format; in the other vignettes, we'll walk through
-some basic signal processing tasks using functions provided in the `epiprocess`
-package. Of course, we can also write custom code for other downstream uses,
-like plotting, which is pretty easy to do `ggplot2`.
+We can also get data into `epi_archive()` format, which can be thought of as an
+aggregation of many `epi_df` snapshots. We can perform similar signal processing
+tasks on `epi_archive` objects as we did on `epi_df` objects, though the
+interface is a bit different.

-```{r, message = FALSE, warning = FALSE}
+```{r, message = FALSE, warning = FALSE, eval=FALSE}
+library(epidatr)
+library(epiprocess)
+library(data.table)
+library(dplyr)
+library(purrr)
library(ggplot2)
-theme_set(theme_bw())

-ggplot(x, aes(x = time_value, y = total_cases, color = geo_value)) +
-  geom_line() +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Cumulative COVID-19 cases", color = "State")
-```
-
-As a last couple examples, we'll look at some more data sets just to show how
-we might get them into `epi_df` format.
Data on daily new (not cumulative) SARS -cases in Canada in 2003, from the -[outbreaks](https://github.com/reconverse/outbreaks) package: - -```{r} -x <- outbreaks::sars_canada_2003 %>% - mutate(geo_value = "ca") %>% - select(geo_value, time_value = date, starts_with("cases")) %>% - as_epi_df(as_of = as.Date("2024-03-20")) - -head(x) - -library(tidyr) -x <- x %>% - pivot_longer(starts_with("cases"), names_to = "type") %>% - mutate(type = substring(type, 7)) - -yrange <- range( - x %>% - group_by(time_value) %>% - summarize(value = sum(value)) %>% - pull(value) +dv <- pub_covidcast( + source = "doctor-visits", + signals = "smoothed_adj_cli", + geo_type = "state", + time_type = "day", + geo_values = "ca,fl,ny,tx", + time_values = epirange(20200601, 20211201), + issues = epirange(20200601, 20211201) ) - -ggplot(x, aes(x = time_value, y = value)) + - geom_col(aes(fill = type)) + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - scale_y_continuous(breaks = yrange[1]:yrange[2]) + - labs(x = "Date", y = "SARS cases in Canada", fill = "Type") ``` -Get confirmed cases of Ebola in Sierra Leone from 2014 to 2015 by province and -date of onset, prepared from line list data from the same package: - -```{r, fig.width = 9, fig.height = 6} -x <- outbreaks::ebola_sierraleone_2014 %>% - select(district, date_of_onset, status) %>% - mutate(province = case_when( - district %in% c("Kailahun", "Kenema", "Kono") ~ - "Eastern", - district %in% c( - "Bombali", "Kambia", "Koinadugu", "Port Loko", - "Tonkolili" - ) ~ - "Northern", - district %in% c("Bo", "Bonthe", "Moyamba", "Pujehun") ~ - "Sourthern", - district %in% c("Western Rural", "Western Urban") ~ - "Western" - )) %>% - group_by(geo_value = province, time_value = date_of_onset) %>% - summarise(cases = sum(status == "confirmed"), .groups = "drop") %>% - complete(geo_value, - time_value = full_seq(time_value, period = 1), - fill = list(cases = 0) - ) %>% - as_epi_df(as_of = as.Date("2024-03-20")) - -ggplot(x, aes(x = time_value, y = cases)) + - geom_col(aes(fill = geo_value), show.legend = FALSE) + - facet_wrap(~geo_value, scales = "free_y") + - scale_x_date(minor_breaks = "month", date_labels = "%b %y") + - labs(x = "Date", y = "Confirmed cases of Ebola in Sierra Leone") +```{r, echo=FALSE, message=FALSE, warning=FALSE} +library(epidatr) +library(epiprocess) +library(data.table) +library(dplyr) +library(purrr) +library(ggplot2) +dv <- archive_cases_dv_subset$DT %>% + select(-case_rate_7d_av) %>% + rename(issue = version, value = percent_cli) %>% + tibble() ``` -## Attribution -This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. - -[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): - These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. 
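+As a quick sketch of what comes next (see `vignette("epi_archive")` for the full
+treatment), we can convert the versioned `dv` data fetched above into an
+`epi_archive` and take a snapshot of it as of a particular version; the renaming
+below assumes the `issue`/`value` column names from the query above:
+
+```{r, message = FALSE, warning = FALSE}
+dv_archive <- dv %>%
+  select(geo_value, time_value, version = issue, percent_cli = value) %>%
+  as_epi_archive(compactify = TRUE)
+
+# The data as it would have appeared on June 1, 2021
+epix_as_of(dv_archive, as.Date("2021-06-01"))
+```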
+## Data attribution
+
+This document contains a dataset that is a modified part of the [COVID-19 Data
+Repository by the Center for Systems Science and Engineering (CSSE) at Johns
+Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished
+in the COVIDcast Epidata
+API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html).
+This data set is licensed under the terms of the [Creative Commons Attribution
+4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the
+Johns Hopkins University on behalf of its Center for Systems Science in
+Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata
+API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+These signals are taken directly from the JHU CSSE [COVID-19 GitHub
+repository](https://github.com/CSSEGISandData/COVID-19) without changes.
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd
index 326a07c4..4e1269fb 100644
--- a/vignettes/growth_rate.Rmd
+++ b/vignettes/growth_rate.Rmd
@@ -7,6 +7,10 @@ vignette: >
   %\VignetteEncoding{UTF-8}
 ---
 
+```{r, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+```
+
 A basic way of assessing growth in a signal is to look at its relative change
 over two neighboring time windows. The `epiprocess` package provides a function
 `growth_rate()` to compute such relative changes, as well as more sophisticated
@@ -105,7 +109,6 @@ red) and below -1% (in blue), faceting by geo value.
 
 ```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 4}
 library(ggplot2)
-theme_set(theme_bw())
 
 upper <- 0.01
 lower <- -0.01
diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 1c00ff6e..03b7015c 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -7,9 +7,13 @@ vignette: >
   %\VignetteEncoding{UTF-8}
 ---
 
+```{r, include = FALSE}
+source(here::here("vignettes", "_common.R"))
+```
+
 This vignette describes functionality for detecting and correcting outliers in
 signals in the `detect_outlr()` and `correct_outlr()` functions provided in the
-`epiprocess` package. These functions is designed to be modular and extendable,
+`epiprocess` package. These functions are designed to be modular and extendable,
 so that you can define your own outlier detection and correction routines and
 apply them to `epi_df` objects. We'll demonstrate this using state-level daily
 reported COVID-19 case counts from FL and NJ.
@@ -24,9 +28,8 @@ library(tidyr)
 x <- covid_incidence_outliers
 ```
 
-```{r, fig.width = 8, fig.height = 7, warning=FALSE,message=FALSE}
+```{r, warning=FALSE,message=FALSE}
 library(ggplot2)
-theme_set(theme_bw())
 
 ggplot(x, aes(x = time_value, y = cases)) +
   geom_line() +
@@ -203,7 +206,7 @@ returned by each outlier detection method. Below we use the replacement value
 from the combined method, which is defined by the median of replacement values
 from the base methods at each time point.
 
-```{r, fig.width = 8, fig.height = 7}
+```{r}
 y <- x %>%
   mutate(cases_corrected = combined_replacement) %>%
   select(geo_value, time_value, cases, cases_corrected)
diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
deleted file mode 100644
index 0257b3ee..00000000
--- a/vignettes/slide.Rmd
+++ /dev/null
@@ -1,381 +0,0 @@
----
-title: Slide a computation over signal values
-output: rmarkdown::html_vignette
-vignette: >
-  %\VignetteIndexEntry{Slide a computation over signal values}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-A central tool in the `epiprocess` package is `epi_slide()`, which is based on
-the powerful functionality provided in the
-[`slider`](https://cran.r-project.org/web/packages/slider) package. In
-`epiprocess`, to "slide" means to apply a computation---represented as a
-function or formula---over a sliding/rolling data window. The function always
-applies the slide inside each group and the grouping is assumed to be across all
-group keys of the `epi_df` (this is the grouping used by default if you do not
-group the `epi_df` with a `group_by()`).
-
-By default, the `.window_size` units depend on the `time_type` of the `epi_df`,
-which is determined from the types in the `time_value` column of the `epi_df`.
-See the "Details" in `epi_slide()` for more.
-
-As in getting started guide, we'll fetch daily reported COVID-19 cases from CA,
-FL, NY, and TX (note: here we're using new, not cumulative cases) using the
-[`epidatr`](https://github.com/cmu-delphi/epidatr) package, and then convert
-this to `epi_df` format.
-
-```{r, message = FALSE, warning = FALSE}
-library(epiprocess)
-library(dplyr)
-```
-
-The data is included in this package (via the [`epidatasets` package](https://cmu-delphi.github.io/epidatasets/)) and can be loaded with:
-
-```{r}
-edf <- cases_deaths_subset %>%
-  select(geo_value, time_value, cases) %>%
-  arrange(geo_value, time_value)
-```
-
-The data can also be fetched from the Delphi Epidata API with the following query:
-```{r, message = FALSE, eval = FALSE}
-library(epidatr)
-
-d <- as.Date("2024-03-20")
-
-edf <- pub_covidcast(
-  source = "jhu-csse",
-  signals = "confirmed_incidence_num",
-  geo_type = "state",
-  time_type = "day",
-  geo_values = "ca,fl,ny,tx,ga,pa",
-  time_values = epirange(20200301, 20211231),
-  as_of = d
-) %>%
-  select(geo_value, time_value, cases = value) %>%
-  arrange(geo_value, time_value) %>%
-  as_epi_df(as_of = d)
-```
-
-The data has 2,684 rows and 3 columns.
-
-## Optimized rolling mean and sums
-
-For the two most common sliding operations, we offer two optimized versions:
-`epi_slide_mean()` and `epi_slide_sum()`. This example gets the 7-day trailing
-average of the daily cases. Note that the name of the column(s) that we want to
-average is specified as the first argument of `epi_slide_mean()`.
-
-```{r}
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide_mean("cases", .window_size = 7, na.rm = TRUE) %>%
-  ungroup() %>%
-  head(10)
-```
-
-Note that we passed `na.rm = TRUE` to `data.table::frollmean()` via `...` to
-`epi_slide_mean`.
-
-The following computes the 7-day trailing sum of daily cases (and passed `na.rm`
-to `data.table::frollsum()` similarly):
-
-```{r}
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide_sum("cases", .window_size = 7, na.rm = TRUE) %>%
-  ungroup() %>%
-  head(10)
-```
-
-## General sliding with a formula
-
-The previous computations can also be performed using `epi_slide()`, which can
-be used for more general sliding computations (but is much slower for the
-specific cases of mean and sum).
-
-The same 7-day trailing average of daily cases can be computed by passing in a
-formula for the first argument of `epi_slide()`:
-
-```{r}
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(~ mean(.x$cases, na.rm = TRUE), .window_size = 7) %>%
-  ungroup() %>%
-  head(10)
-```
-
-If your formula returns a data.frame, then the columns of the data.frame
-will be unpacked into the resulting `epi_df`. For example, the following
-computes the 7-day trailing average of daily cases and the 7-day trailing sum of
-daily cases:
-
-```{r}
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(
-    ~ data.frame(cases_mean = mean(.x$cases, na.rm = TRUE), cases_sum = sum(.x$cases, na.rm = TRUE)),
-    .window_size = 7
-  ) %>%
-  ungroup() %>%
-  head(10)
-```
-
-Note that this formula has access to all non-grouping columns present in the
-original `epi_df` object and must refer to them with the prefix `.x$...`. As we
-can see, the function `epi_slide()` returns an `epi_df` object with a new column
-appended that contains the results (from sliding), named `slide_value` as the
-default.
-
-Some other information is available in additional variables:
-
-* `.group_key` is a one-row tibble containing the values of the grouping
-  variables for the associated group
-* `.ref_time_value` is the reference time value the time window was based on
-
-```{r}
-# Returning geo_value in the formula
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(~ .x$geo_value[[1]], .window_size = 7) %>%
-  ungroup() %>%
-  head(10)
-
-# Returning time_value in the formula
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(~ .x$time_value[[1]], .window_size = 7) %>%
-  ungroup() %>%
-  head(10)
-```
-
-While the computations above do not look very useful, these can be used as
-building blocks for computations that do something different depending on the
-geo_value or ref_time_value.
-
-## Slide the tidy way
-
-Perhaps the most convenient way to setup a computation in `epi_slide()` is to
-pass in an expression for tidy evaluation. In this case, we can simply define
-the name of the new column directly as part of the expression, setting it equal
-to a computation in which we can access any columns of `.x` by name, just as we
-would in a call to `dplyr::mutate()`, or any of the `dplyr` verbs. For example:
-
-```{r}
-slide_output <- edf %>%
-  group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(cases, na.rm = TRUE), .window_size = 7) %>%
-  ungroup() %>%
-  head(10)
-```
-
-In addition to referring to individual columns by name, you can refer to
-`epi_df` time window as `.x` (`.group_key` and `.ref_time_value` are still
-available). Also, the tidyverse "pronouns" `.data` and `.env` can also be used
-if you need distinguish between the data and environment.
-
-As a simple sanity check, we visualize the 7-day trailing averages computed on
-top of the original counts:
-
-```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
-library(ggplot2)
-theme_set(theme_bw())
-
-ggplot(slide_output, aes(x = time_value)) +
-  geom_col(aes(y = cases, fill = geo_value), alpha = 0.5, show.legend = FALSE) +
-  geom_line(aes(y = cases_7dav, col = geo_value), show.legend = FALSE) +
-  facet_wrap(~geo_value, scales = "free_y") +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 cases")
-```
-
-As we can see from the top right panel, it looks like Texas moved to weekly
-reporting of COVID-19 cases in summer of 2021.
-
-## Slide with a function
-
-We can also pass a function to the second argument in `epi_slide()`. In this
-case, the passed function `.f` must have the form `function(x, g, t, ...)`,
-where
-
-- "x" is an epi_df with the same column names as the archive's `DT`, minus
-  the `version` column
-- "g" is a one-row tibble containing the values of the grouping variables
-for the associated group
-- "t" is the ref_time_value for the current window
-- "..." are additional arguments
-
-Recreating the last example of a 7-day trailing average:
-
-```{r}
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(function(x, g, t) mean(x$cases, na.rm = TRUE), .window_size = 7) %>%
-  ungroup() %>%
-  head(10)
-```
-
-## Running a simple autoregressive forecaster
-
-As a more complex example, we create a forecaster based on an autoregression or
-AR model. AR models can be fit in numerous ways (using base R functions and
-various packages), but here we define it "by hand" both because it provides a
-more advanced example of sliding a function over an `epi_df` object, and because
-it allows us to be a bit more flexible in defining a *probabilistic* forecaster:
-one that outputs not just a point prediction, but a notion of uncertainty around
-this. In particular, our forecaster will output a point prediction along with an
-90\% uncertainty band, represented by a predictive quantiles at the 5\% and 95\%
-levels (lower and upper endpoints of the uncertainty band).
-
-The function defined below, `prob_ar()`, is our probabilistic AR forecaster. The
-`lags`argument indicates which lags to use in the model, and `ahead` indicates
-how far ahead in the future to make forecasts (both are encoded in terms of the
-units of the `time_value` column; so, days, in the working `epi_df` being
-considered in this vignette).
-
-```{r}
-prob_ar <- function(y, lags = c(0, 7, 14), ahead = 6, min_train_window = 20,
-                    lower_level = 0.05, upper_level = 0.95, symmetrize = TRUE,
-                    intercept = FALSE, nonneg = TRUE) {
-  # Return NA if insufficient training data
-  if (length(y) < min_train_window + max(lags) + ahead) {
-    return(data.frame(point = NA, lower = NA, upper = NA))
-  }
-
-  # Filter down the edge-NAs
-  y <- y[!is.na(y)]
-
-  # Build features and response for the AR model
-  dat <- do.call(
-    data.frame,
-    purrr::map(lags, function(j) lag(y, n = j))
-  )
-  names(dat) <- paste0("x", seq_len(ncol(dat)))
-  if (intercept) dat$x0 <- rep(1, nrow(dat))
-  dat$y <- lead(y, n = ahead)
-
-  # Now fit the AR model and make a prediction
-  obj <- lm(y ~ . + 0, data = dat)
-  point <- predict(obj, newdata = tail(dat, 1))
-
-  # Compute a band
-  r <- residuals(obj)
-  s <- ifelse(symmetrize, -1, NA) # Should the residuals be symmetrized?
-  q <- quantile(c(r, s * r), probs = c(lower_level, upper_level), na.rm = TRUE)
-  lower <- point + q[1]
-  upper <- point + q[2]
-
-  # Clip at zero if we need to, then return
-  if (nonneg) {
-    point <- max(point, 0)
-    lower <- max(lower, 0)
-    upper <- max(upper, 0)
-  }
-  return(data.frame(point = point, lower = lower, upper = upper))
-}
-```
-
-We go ahead and slide this AR forecaster over the working `epi_df` of COVID-19
-cases. Note that we actually model the `cases_7dav` column, to operate on the
-scale of smoothed COVID-19 cases. This is clearly equivalent, up to a constant,
-to modeling weekly sums of COVID-19 cases.
-
-```{r}
-fc_time_values <- seq(as.Date("2020-06-01"), as.Date("2021-12-01"), by = "1 months")
-edf %>%
-  group_by(geo_value) %>%
-  epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
-  epi_slide(fc = prob_ar(.data$cases_7dav), .window_size = 120, .ref_time_values = fc_time_values) %>%
-  ungroup() %>%
-  head(10)
-```
-
-Note that here we have utilized an argument `.ref_time_values` to perform the
-sliding computation (here, compute a forecast) at a specific subset of reference
-time values (the start of every month from mid 2020 to the end of 2021). The
-resulting epi_df now contains three new columns: `fc$point`, `fc$lower`, and
-`fc$upper` corresponding to the point forecast, and the lower and upper
-endpoints of the 95\% prediction band, respectively.
-
-To finish off, we plot the forecasts at some times (spaced out by a few months)
-over the last year, at multiple horizons: 7, 14, 21, and 28 days ahead. To do
-so, we encapsulate the process of generating forecasts into a simple function,
-so that we can call it a few times.
-
-```{r, message = FALSE, warning = FALSE, fig.width = 9, fig.height = 6}
-# Note the use of .all_rows = TRUE (keeps all original rows in the output)
-k_week_ahead <- function(x, ahead = 7) {
-  x %>%
-    group_by(geo_value) %>%
-    epi_slide(cases_7dav = mean(.data$cases, na.rm = TRUE), .window_size = 7) %>%
-    epi_slide(
-      fc = prob_ar(.data$cases_7dav, ahead = ahead),
-      .window_size = 120,
-      .ref_time_values = fc_time_values,
-      .all_rows = TRUE
-    ) %>%
-    ungroup() %>%
-    mutate(target_date = .data$time_value + ahead)
-}
-
-# First generate the forecasts, and bind them together
-z <- bind_rows(
-  k_week_ahead(edf, ahead = 7),
-  k_week_ahead(edf, ahead = 14),
-  k_week_ahead(edf, ahead = 21),
-  k_week_ahead(edf, ahead = 28)
-)
-
-# Now plot them, on top of actual COVID-19 case counts
-ggplot(z) +
-  geom_line(aes(x = time_value, y = cases_7dav), color = "gray50") +
-  geom_ribbon(aes(
-    x = target_date, ymin = fc$lower, ymax = fc$upper,
-    group = time_value
-  ), fill = 6, alpha = 0.4) +
-  geom_line(aes(x = target_date, y = fc$point, group = time_value)) +
-  geom_point(aes(x = target_date, y = fc$point, group = time_value),
-    size = 0.5
-  ) +
-  geom_vline(
-    data = tibble(x = fc_time_values), aes(xintercept = x),
-    linetype = 2, alpha = 0.5
-  ) +
-  facet_wrap(vars(geo_value), scales = "free_y") +
-  scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
-  labs(x = "Date", y = "Reported COVID-19 cases")
-```
-
-Two points are worth making. First, the AR model's performance here is pretty
-spotty. At various points in time, we can see that its forecasts are volatile
-(its point predictions are all over the place), or overconfident (its bands are
-too narrow), or both at the same time. This is only meant as a simple demo and
-not entirely unexpected given the way the AR model is set up. The
-[`epipredict`](https://cmu-delphi.github.io/epipredict) package, which is a
-companion package to `epiprocess`, offers a suite of predictive modeling tools
-that can improve on some of the shortcomings of the above simple AR model.
-
-Second, the AR forecaster here is using finalized data, meaning, it uses the
-latest versions of signal values (reported COVID-19 cases) available, for both
-training models and making predictions historically. However, this is not
-reflective of the provisional nature of the data that it must cope with in a
-true forecast task. Training and making predictions on finalized data can lead
-to an overly optimistic sense of accuracy; see, for example, [McDonald et al.
-(2021)](https://www.pnas.org/content/118/51/e2111453118/), and references
-therein. Fortunately, the `epiprocess` package provides a data structure called
-`epi_archive` that can be used to store all data revisions, and furthermore, an
-`epi_archive` object knows how to slide computations in the correct
-version-aware sense (for the computation at each reference time $t$, it uses
-only data that would have been available as of $t$). We will revisit this
-example in the [archive
-vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).
-
-## Attribution
-
-The `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020.
-
-This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020.
-
-[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
-These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.