Commit
doc: big doc overhaul
* clean up and reorganize reference
* unify detect_outlr, epi_slide_* reference pages
* add README.Rmd and rewrite landing page
* rewrite Getting Started vignette
* unify aggregation and slide vignette into epi_df
* rewrite archive vignette
* improve slide, epi_df and epi_archive reference pages
* update epix_fill_through_version #419
* add .editorconfig
* update DEVELOPMENT.md
dshemetov committed Oct 18, 2024
1 parent cb468db commit 0f62112
Showing 78 changed files with 3,876 additions and 3,191 deletions.
6 changes: 5 additions & 1 deletion .Rbuildignore
@@ -17,4 +17,8 @@
^DEVELOPMENT.md$
man-roxygen
^.venv$
^sandbox.R$
^sandbox.R$
^README.Rmd$
^README_cache$
^pkgdown-watch.R$
^scrap.Rmd$
21 changes: 21 additions & 0 deletions .editorconfig
@@ -0,0 +1,21 @@
# EditorConfig helps developers define and maintain consistent
# coding styles between different editors and IDEs
# editorconfig.org

root = true


[*]

# Change these settings to your own preference
indent_style = space
indent_size = 2

# We recommend you keep these unchanged
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.md]
trim_trailing_whitespace = false
7 changes: 6 additions & 1 deletion .gitignore
@@ -13,4 +13,9 @@ docs
renv/
renv.lock
.Rprofile
sandbox.R
sandbox.R
# Vignette caches
*_cache/
vignettes/*.html
vignettes/*.R
!vignettes/_common.R
15 changes: 8 additions & 7 deletions DESCRIPTION
@@ -1,5 +1,5 @@
Type: Package
Package: epiprocess
Type: Package
Title: Tools for basic signal processing in epidemiology
Version: 0.9.5
Authors@R: c(
@@ -28,11 +28,11 @@ Authors@R: c(
person("Carnegie Mellon University Delphi Group", role = "dtc",
comment = "Owner of claims-based CLI data from the Delphi Epidata API")
)
Description: This package introduces a common data structure for
epidemiological data reported by location and time, provides another
data structure to work with revisions to these data sets over time,
and offers associated utilities to perform basic signal processing
tasks.
Description: This package introduces common data structures for working with
epidemiological data reported by location and time and offers associated
utilities to perform basic signal processing tasks. The package is designed
to be used in conjunction with `epipredict` for building and evaluating
epidemiological models.
License: MIT + file LICENSE
URL: https://cmu-delphi.github.io/epiprocess/
Depends:
@@ -62,6 +62,7 @@ Imports:
Suggests:
devtools,
epidatr,
here,
knitr,
outbreaks,
readr,
@@ -88,7 +89,7 @@ Collate:
'correlation.R'
'epi_df.R'
'epi_df_forbidden_methods.R'
'epiprocess.R'
'epiprocess-package.R'
'group_by_epi_df_methods.R'
'methods-epi_archive.R'
'grouped_epi_archive.R'
34 changes: 16 additions & 18 deletions DEVELOPMENT.md
@@ -1,10 +1,8 @@
## Setting up the development environment

```r
install.packages(c('devtools', 'pkgdown', 'styler', 'lintr')) # install dev dependencies
devtools::install_deps(dependencies = TRUE) # install package dependencies
devtools::document() # generate package meta data and man files
devtools::build() # build package
install.packages(c('devtools', 'pkgdown', 'styler', 'lintr', 'pak')) # install dev dependencies
pak::pkg_install(".") # install package and dependencies
```

## Validating the package
@@ -13,8 +11,12 @@ devtools::build() # build package
styler::style_pkg() # format code
lintr::lint_package() # lint code

devtools::check() # run R CMD check, which runs everything below
devtools::document() # generate package meta data and man files
devtools::test() # test package
devtools::check() # check package for errors
devtools::build_vignettes() # build vignettes only
devtools::run_examples() # run doc examples
devtools::check(vignettes = FALSE) # check package without vignettes
```

## Developing the documentation site
@@ -24,20 +26,16 @@ Our CI builds two versions of the documentation:
- https://cmu-delphi.github.io/epiprocess/ from the `main` branch and
- https://cmu-delphi.github.io/epiprocess/dev from the `dev` branch.

The documentation site can be previewed locally by running in R:

```r
# Should automatically open a browser
pkgdown::build_site(preview=TRUE)
```

If the above does not open a browser, you can try using a Python server from the
command line:
We include the script `pkgdown-watch.R` that will automatically rebuild the
documentation locally and preview it. It can be used with:

```bash
R -e 'devtools::document()'
R -e 'pkgdown::build_site()'
python -m http.server -d docs
```sh
# Make sure you have servr installed
R -e 'renv::install("servr")'
# Will start a local server
Rscript pkgdown-watch.R
# You may need to first build the site with
R -e 'pkgdown::build_site(".", examples = FALSE, devel = TRUE, preview = FALSE)'
```

## Versioning
5 changes: 1 addition & 4 deletions NAMESPACE
@@ -42,8 +42,6 @@ S3method(key_colnames,default)
S3method(key_colnames,epi_archive)
S3method(key_colnames,epi_df)
S3method(mean,epi_df)
S3method(next_after,Date)
S3method(next_after,integer)
S3method(print,epi_archive)
S3method(print,epi_df)
S3method(print,grouped_epi_archive)
@@ -65,6 +63,7 @@ export(complete)
export(covid_case_death_rates_extended)
export(covid_incidence_county_subset)
export(covid_incidence_outliers)
export(deprecated_quo_is_present)
export(detect_outlr)
export(detect_outlr_rm)
export(detect_outlr_stl)
@@ -89,11 +88,9 @@ export(guess_period)
export(is_epi_df)
export(is_grouped_epi_archive)
export(key_colnames)
export(max_version_with_row_in)
export(mutate)
export(new_epi_archive)
export(new_epi_df)
export(next_after)
export(relocate)
export(rename)
export(revision_summary)
128 changes: 58 additions & 70 deletions R/archive.R
@@ -9,8 +9,9 @@

#' Validate a version bound arg
#'
#' Expected to be used on `clobberable_versions_start`, `versions_end`,
#' and similar arguments. Some additional context-specific checks may be needed.
#' Expected to be used on `clobberable_versions_start`, `versions_end`, and
#' similar arguments. Some additional context-specific checks may be needed.
#' Side effects: raises an error if version bound appears invalid.
#'
#' @param version_bound the version bound to validate
#' @param x a data frame containing a version column with which to check
@@ -20,9 +21,7 @@
#' @param version_bound_arg optional string; what to call the version bound in
#' error messages
#'
#' @section Side effects: raises an error if version bound appears invalid
#'
#' @noRd
#' @keywords internal
validate_version_bound <- function(version_bound, x, na_ok = FALSE,
version_bound_arg = rlang::caller_arg(version_bound),
x_arg = rlang::caller_arg(x)) {
@@ -77,7 +76,7 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE,
#'
#' @importFrom checkmate check_names
#'
#' @export
#' @keywords internal
max_version_with_row_in <- function(x) {
if (nrow(x) == 0L) {
cli_abort(
@@ -108,72 +107,71 @@ max_version_with_row_in <- function(x) {
#' @param x the starting "value"(s)
#' @return same class, typeof, and length as `x`
#'
#' @export
#' @keywords internal
next_after <- function(x) UseMethod("next_after")


#' @export
#' @keywords internal
next_after.integer <- function(x) x + 1L


#' @export
#' @keywords internal
next_after.Date <- function(x) x + 1L


#' Compactify
#'
#' This section describes the internals of how compactification works in an
#' `epi_archive()`. Compactification can potentially improve code speed or
#' memory usage, depending on your data.
#'
#' In general, the last version of each observation is carried forward (LOCF) to
#' fill in data between recorded versions, and between the last recorded
#' update and the `versions_end`. One consequence is that the `DT` doesn't
#' have to contain a full snapshot of every version (although this generally
#' works), but can instead contain only the rows that are new or changed from
#' the previous version (see `compactify`, which does this automatically).
#' Currently, deletions must be represented as revising a row to a special
#' state (e.g., making the entries `NA` or including a special column that
#' flags the data as removed and performing some kind of post-processing), and
#' the archive is unaware of what this state is. Note that `NA`s *can* be
#' introduced by `epi_archive` methods for other reasons, e.g., in
#' [`epix_fill_through_version`] and [`epix_merge`], if requested, to
#' represent potential update data that we do not yet have access to; or in
#' [`epix_merge`] to represent the "value" of an observation before the
#' version in which it was first released, or if no version of that
#' observation appears in the archive data at all.
#'
#' @name compactify
NULL
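The LOCF deduplication described above can be sketched in plain base R. This is a toy illustration of the idea only, not the package's actual implementation (`epiprocess` uses `apply_compactify()`, which also handles a tolerance for approximate equality):

```r
# Toy version history for one (geo_value, time_value) series: the value is
# revised on Jan 16, then repeated unchanged on Jan 17 (a redundant LOCF row).
dt <- data.frame(
  geo_value  = "ca",
  time_value = as.Date("2022-01-14"),
  version    = as.Date(c("2022-01-15", "2022-01-16", "2022-01-17")),
  value      = c(10, 12, 12)
)

# Sort by the key columns, then drop any row that matches the immediately
# preceding row in every column except `version`.
dt <- dt[order(dt$geo_value, dt$time_value, dt$version), ]
same_series <- c(FALSE, dt$geo_value[-1] == dt$geo_value[-nrow(dt)] &
                        dt$time_value[-1] == dt$time_value[-nrow(dt)])
same_value  <- c(FALSE, dt$value[-1] == dt$value[-nrow(dt)])
compactified <- dt[!(same_series & same_value), ]
compactified  # the redundant 2022-01-17 row is gone
```

Reconstructing any version later is then a matter of taking, per series, the latest row with `version <=` the requested version, which is exactly the LOCF fill the docs describe.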


#' Epi Archive
#' `epi_archive` object
#'
#' @title `epi_archive` object
#' @description The second main data structure for storing time series in
#' `epiprocess`. It is similar to `epi_df` in that it is fundamentally a table with
#' a few required columns that stores epidemiological time series data. An
#' `epi_archive` requires a `geo_value`, `time_value`, and `version` column (and
#' possibly other key columns) along with measurement values. In brief, an
#' `epi_archive` is a history of the time series data, where the `version`
#' column tracks the time at which the data was available. This allows for
#' version-aware forecasting.
#'
#' @description An `epi_archive` is an S3 class which contains a data table
#' along with several relevant pieces of metadata. The data table can be seen
#' as the full archive (version history) for some signal variables of
#' interest.
#' `new_epi_archive` is the constructor for `epi_archive` objects that assumes
#' all arguments have been validated. Most users should use `as_epi_archive`.
#'
#' @details An `epi_archive` contains a data table `DT`, of class `data.table`
#' from the `data.table` package, with (at least) the following columns:
#' @details An `epi_archive` contains a `data.table` object `DT` (from the
#' `{data.table}` package), with (at least) the following columns:
#'
#' * `geo_value`: the geographic value associated with each row of measurements.
#' * `time_value`: the time value associated with each row of measurements.
#' * `geo_value`: the geographic value associated with each row of measurements,
#' * `time_value`: the time value associated with each row of measurements,
#' * `version`: the time value specifying the version for each row of
#' measurements. For example, if in a given row the `version` is January 15,
#' 2022 and `time_value` is January 14, 2022, then this row contains the
#' measurements of the data for January 14, 2022 that were available one day
#' later.
#'
#' The data table `DT` has key variables `geo_value`, `time_value`, `version`,
#' as well as any others (these can be specified when instantiating the
#' `epi_archive` object via the `other_keys` argument, and/or set by operating
#' on `DT` directly). Note that there can only be a single row per unique
#' combination of key variables.
#' The variables `geo_value`, `time_value`, and `version` serve as key variables for
#' the data table (in addition to any other keys specified in the metadata).
#' There can only be a single row per unique combination of key variables. The
#' keys for an `epi_archive` can be viewed with `key(epi_archive$DT)`.
#'
#' ## Compactification
#'
#' By default, an `epi_archive` will compactify the data table to remove
#' redundant rows. A row that matches the preceding row for its key series in
#' every column except `version` is dropped (essentially a last observation
#' carried forward, but along the version index); this saves space and
#' improves performance. If you do not want to compactify the data, set
#' `compactify = FALSE` in `as_epi_archive()`.
#'
#' Note that in some data scenarios, LOCF may not be appropriate. For instance,
#' if you expected data to be updated on a given day, but your data source did
#' not update, then it could be reasonable to code the data as `NA` for that
#' day, instead of assuming LOCF.
#'
#' `NA`s *can* be introduced by `epi_archive` methods for other
#' reasons, e.g., in [`epix_fill_through_version`] and [`epix_merge`], if
#' requested, to represent potential update data that we do not yet have access
#' to; or in [`epix_merge`] to represent the "value" of an observation before
#' the version in which it was first released, or if no version of that
#' observation appears in the archive data at all.
#'
#' ## Metadata
#'
#' @section Metadata:
#' The following pieces of metadata are included as fields in an `epi_archive`
#' object:
#'
@@ -187,20 +185,6 @@ NULL
#' archive. Unexpected behavior may result from modifying the metadata
#' directly.
#'
#' @section Generating Snapshots:
#' An `epi_archive` object can be used to generate a snapshot of the data in
#' `epi_df` format, which represents the most up-to-date time series values up
#' to a point in time. This is accomplished by calling `epix_as_of()`.
#'
#' @section Sliding Computations:
#' We can run a sliding computation over an `epi_archive` object, much like
#' `epi_slide()` does for an `epi_df` object. This is accomplished by calling
#' the `slide()` method for an `epi_archive` object, which works similarly to
#' the way `epi_slide()` works for an `epi_df` object, but with one key
#' difference: it is version-aware. That is, for an `epi_archive` object, the
#' sliding computation at any given reference time point t is performed on
#' **data that would have been available as of t**.
#'
#' @param x A data.frame, data.table, or tibble, with columns `geo_value`,
#' `time_value`, `version`, and then any additional number of columns.
#' @param geo_type DEPRECATED Has no effect. Geo value type is inferred from the
@@ -239,9 +223,11 @@ NULL
#' value of `clobberable_versions_start` does not fully trust these empty
#' updates, and assumes that any version `>= max(x$version)` could be
#' clobbered.) If `nrow(x) == 0`, then this argument is mandatory.
#' @param compactify_tol double. the tolerance used to detect approximate equality for compactification
#' @param compactify_tol double. the tolerance used to detect approximate
#' equality for compactification
#' @return An `epi_archive` object.
#'
#' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`]
#' @importFrom data.table as.data.table key setkeyv
#' @importFrom dplyr if_any if_all everything
#' @importFrom utils capture.output
@@ -356,12 +342,13 @@ new_epi_archive <- function(
)
}

#' given a tibble as would be found in an epi_archive, remove duplicate entries.
#' @description
#' works by shifting all rows except the version, then comparing values to see
#' Given a tibble as would be found in an epi_archive, remove duplicate entries.
#'
#' Works by shifting all rows except the version, then comparing values to see
#' if they've changed. We need to arrange in descending order, but note that
#' we don't need to group, since at least one column other than version has
#' changed, and so is kept.
#'
#' @keywords internal
#' @importFrom dplyr filter
apply_compactify <- function(df, keys, tolerance = .Machine$double.eps^.5) {
@@ -466,6 +453,7 @@ validate_epi_archive <- function(

#' `as_epi_archive` converts a data frame, data table, or tibble into an
#' `epi_archive` object.
#'
#' @param ... used for specifying column names, as in [`dplyr::rename`]. For
#' example `version = release_date`
#' @param .versions_end location based versions_end, used to avoid prefix
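As a usage sketch of the interface documented above: the `version = release_date` rename comes from the `as_epi_archive` docs' own example, and `epix_as_of` is the snapshot function the `@seealso` references. Exact argument names may differ slightly across package versions, so treat this as an assumption-laden illustration rather than canonical usage:

```r
library(epiprocess)  # assumes the package is installed

# Long-format data with a release-date column standing in for `version`.
df <- data.frame(
  geo_value    = c("ca", "ca"),
  time_value   = as.Date(c("2022-01-14", "2022-01-14")),
  release_date = as.Date(c("2022-01-15", "2022-01-16")),
  value        = c(10, 12)
)

# Rename-style `...` maps release_date onto the required `version` column;
# compactification is on by default.
archive <- as_epi_archive(df, version = release_date)

# A version-aware snapshot: the data as it was available on Jan 15.
epix_as_of(archive, as.Date("2022-01-15"))
```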
