Skip to content

Commit abbe0e2

Browse files
authored
Add new datasets for R4DS (tidyverse#1333)
Fixes tidyverse#1331
1 parent 85b0538 commit abbe0e2

19 files changed

+1124
-92
lines changed

R/data.R

+106-15
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,43 @@
11
#' World Health Organization TB data
22
#'
3+
#' @description
34
#' A subset of data from the World Health Organization Global Tuberculosis
4-
#' Report, and accompanying global populations.
5+
#' Report, and accompanying global populations. `who` uses the original
6+
#' codes from the World Health Organization. The column names for columns
7+
#' 5 through 60 are made by combining `new_` with:
58
#'
6-
#' @format `who`: a data frame with 7,240 rows and the columns:
9+
#' * the method of diagnosis (`rel` = relapse, `sn` = negative pulmonary
10+
#' smear, `sp` = positive pulmonary smear, `ep` = extrapulmonary),
11+
#' * gender (`f` = female, `m` = male), and
12+
#' * age group (`014` = 0-14 yrs of age, `1524` = 15-24, `2534` = 25-34,
13+
#' `3544` = 35-44 years of age, `4554` = 45-54, `5564` = 55-64,
14+
#' `65` = 65 years or older).
15+
#'
16+
#' `who2` is a lightly modified version that makes teaching the basics
17+
#' easier by tweaking the variables to be slightly more consistent and
18+
#' dropping `iso2` and `iso3`. `newrel` is replaced by `new_rel`, and a
19+
#' `_` is added after the gender.
20+
#'
21+
#' @format ## `who`
22+
#' A data frame with 7,240 rows and 60 columns:
723
#' \describe{
824
#' \item{country}{Country name}
925
#' \item{iso2, iso3}{2 & 3 letter ISO country codes}
1026
#' \item{year}{Year}
1127
#' \item{new_sp_m014 - newrel_f65}{Counts of new TB cases recorded by group.
12-
#' Column names encode three variables that describe the group (see details).}
28+
#' Column names encode three variables that describe the group.}
1329
#' }
14-
#' @details The data uses the original codes given by the World Health
15-
#' Organization. The column names for columns five through 60 are made by
16-
#' combining `new_` to a code for method of diagnosis (`rel` =
17-
#' relapse, `sn` = negative pulmonary smear, `sp` = positive
18-
#' pulmonary smear, `ep` = extrapulmonary) to a code for gender
19-
#' (`f` = female, `m` = male) to a code for age group (`014` =
20-
#' 0-14 yrs of age, `1524` = 15-24 years of age, `2534` = 25 to
21-
#' 34 years of age, `3544` = 35 to 44 years of age, `4554` = 45 to
22-
#' 54 years of age, `5564` = 55 to 64 years of age, `65` = 65 years
23-
#' of age or older).
24-
#'
2530
#' @source <https://www.who.int/teams/global-tuberculosis-programme/data>
2631
"who"
2732

2833
#' @rdname who
29-
#' @format `population`: a data frame with 4,060 rows and three columns:
34+
#' @format ## `who2`
35+
#' A data frame with 7,240 rows and 58 columns.
36+
"who2"
37+
38+
#' @rdname who
39+
#' @format ## `population`
40+
#' A data frame with 4,060 rows and three columns:
3041
#' \describe{
3142
#' \item{country}{Country name}
3243
#' \item{year}{Year}
@@ -173,3 +184,83 @@
173184
#' The "Whitburn" project, <https://waxy.org/2008/05/the_whitburn_project/>,
174185
#' (downloaded April 2008)
175186
"billboard"
187+
188+
189+
#' Household data
190+
#'
191+
#' This dataset is based on an example in
192+
#' `vignette("datatable-reshape", package = "data.table")`
193+
#'
194+
#' @format A data frame with 5 rows and 5 columns:
195+
#' \describe{
196+
#' \item{family}{Family identifier}
197+
#' \item{dob_child1}{Date of birth of first child}
198+
#' \item{dob_child2}{Date of birth of second child}
199+
#' \item{name_child1}{Name of first child}
200+
#' \item{name_child2}{Name of second child}
201+
#' }
202+
"household"
203+
204+
#' Data from the Centers for Medicare & Medicaid Services
205+
#'
206+
#' @description
207+
#' Two datasets from public data provided by the Centers for Medicare & Medicaid
208+
#' Services, <https://data.cms.gov>.
209+
#'
210+
#' * `cms_patient_care` contains some lightly cleaned data from
211+
#' "Hospice - Provider Data", which provides a list of hospice agencies
212+
#' along with some data on quality of patient care,
213+
#' <https://data.cms.gov/provider-data/dataset/252m-zfp9>.
214+
#'
215+
#' * `cms_patient_experience` contains data from "Doctors and Clinicians
216+
#' Quality Payment Program PY 2020 Virtual Group Public Reporting",
217+
#' <https://data.cms.gov/provider-data/dataset/8c70-d353>
218+
#'
219+
#' @examples
220+
#' cms_patient_experience %>%
221+
#' dplyr::distinct(measure_cd, measure_title)
222+
#'
223+
#' cms_patient_experience %>%
224+
#' pivot_wider(
225+
#' id_cols = starts_with("org"),
226+
#' names_from = measure_cd,
227+
#' values_from = prf_rate
228+
#' )
229+
#'
230+
#' cms_patient_care %>%
231+
#' pivot_wider(
232+
#' names_from = type,
233+
#' values_from = score
234+
#' )
235+
#'
236+
#' cms_patient_care %>%
237+
#' pivot_wider(
238+
#' names_from = measure_abbr,
239+
#' values_from = score
240+
#' )
241+
#'
242+
#' cms_patient_care %>%
243+
#' pivot_wider(
244+
#' names_from = c(measure_abbr, type),
245+
#' values_from = score
246+
#' )
247+
#' @format `cms_patient_experience` is a data frame with 500 observations and
248+
#' five variables:
249+
#' \describe{
250+
#' \item{org_pac_id,org_nm}{Organisation ID and name}
251+
#' \item{measure_cd,measure_title}{Measure code and title}
252+
#' \item{prf_rate}{Measure performance rate}
253+
#' }
254+
"cms_patient_experience"
255+
256+
#' @format `cms_patient_care` is a data frame with 252 observations and
257+
#' five variables:
258+
#' \describe{
259+
#' \item{ccn,facility_name}{Facility ID and name}
260+
#' \item{measure_abbr}{Abbreviated measurement title, suitable for use as variable name}
261+
#' \item{score}{Measure score}
262+
#' \item{type}{Whether score refers to the rating out of 100 ("observed"), or
263+
#' the maximum possible value of the raw score ("denominator")}
264+
#' }
265+
#' @rdname cms_patient_experience
266+
"cms_patient_care"

_pkgdown.yml

+2
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,10 @@ reference:
7878
- title: Data
7979
contents:
8080
- billboard
81+
- cms_patient_experience
8182
- construction
8283
- fish_encounters
84+
- household
8385
- relig_income
8486
- smiths
8587
- table1

data-raw/cms.R

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
library(tidyverse)
2+
3+
# Doctors and Clinicians Quality Payment Program PY 2020 Group Public Reporting:
4+
# Patient Experience
5+
6+
# https://data.cms.gov/provider-data/dataset/8c70-d353
7+
url <- "https://data.cms.gov/provider-data/api/1/datastore/query/8c70-d353/0?offset=0&count=true&results=true&schema=true&keys=true&format=json&rowIds=false"
8+
9+
json <- jsonlite::read_json(url)
10+
cms_patient_experience <- json$results |>
11+
map_df(as_tibble) |>
12+
select(org_pac_id, org_nm, measure_cd, measure_title, prf_rate) |>
13+
arrange(org_pac_id, stringi::stri_rank(measure_cd, list(numeric = TRUE))) |>
14+
mutate(prf_rate = as.numeric(prf_rate))
15+
16+
write_csv(cms_patient_experience, "data-raw/cms_patient_experience.csv")
17+
usethis::use_data(cms_patient_experience, overwrite = TRUE)
18+
19+
20+
# -------------------------------------------------------------------------
21+
22+
# Hospice - Provider Data
23+
# A list of hospice agencies with data on the quality of patient care measures.
24+
# https://data.cms.gov/provider-data/dataset/252m-zfp9
25+
26+
# Recommended by
27+
# https://twitter.com/hunter_boost/status/1500212341463339008
28+
29+
url <- "https://data.cms.gov/provider-data/api/1/datastore/query/252m-zfp9/0?limit=500&offset=0&count=true&results=true&schema=true&keys=true&format=json&rowIds=false"
30+
json <- jsonlite::read_json(url)
31+
32+
abbr <- tribble(
33+
~measure_name , ~measure_abbr,
34+
"Hospice and Palliative Care Treatment Preferences" , "treat_pref",
35+
"Beliefs & Values Addressed (if desired by the patient)" , "beliefs_addressed",
36+
"Hospice and Palliative Care Pain Screening" , "pain_screening",
37+
"Hospice and Palliative Care Pain Assessment" , "pain_assessment",
38+
"Hospice and Palliative Care Dyspnea Screening" , "dyspnea_screening",
39+
"Hospice and Palliative Care Dyspnea Treatment" , "dyspena_treatment",
40+
"Patient Treated with an Opioid Who Are Given a Bowel Regimen", "opioid_bowel",
41+
"Hospice and Palliative Care Composite Process Measure" , "composite_process",
42+
"Hospice Visits When Death Is Imminent, Measure 1" , "visits_imminent",
43+
)
44+
45+
cms_patient_care <- json$results |>
46+
map_df(as_tibble) |>
47+
select(ccn = cms_certification_number_ccn, facility_name, measure_name, measure_code, score) |>
48+
mutate(measure_name = na_if(measure_name, "")) |>
49+
fill(measure_name, .direction = "up") |>
50+
filter(str_detect(measure_code, "^H")) |>
51+
mutate(score = as.numeric(na_if(score, "Not Available"))) |>
52+
mutate(
53+
type = str_to_lower(str_remove(measure_code, "H_\\d{3}_\\d{2}_")),
54+
measure_code = NULL
55+
) |>
56+
left_join(abbr, by = "measure_name") |>
57+
select(ccn, facility_name, measure_abbr, score, type) |>
58+
arrange(ccn, measure_abbr, type)
59+
60+
write_csv(cms_patient_care, "data-raw/cms_patient_care.csv")
61+
usethis::use_data(cms_patient_care, overwrite = TRUE)

0 commit comments

Comments
 (0)