diff --git a/.Rbuildignore b/.Rbuildignore index c4f1ad1..63c7030 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,3 +15,4 @@ ^readme-cache/ ^vignettes/ ^wercker\.yml$ +^_pkgdown\.yml$ diff --git a/.travis.yml b/.travis.yml index cccf3d9..7c0ba73 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,16 +3,6 @@ cache: packages matrix: include: - - os: linux - dist: precise - r: release - - os: linux - dist: precise - r: devel - - os: linux - dist: precise - r: oldrel - - os: linux dist: trusty r: release @@ -30,10 +20,6 @@ matrix: osx_image: xcode8.3 latex: false r: release - # - os: osx - # osx_image: xcode8.3 - # latex: false - # r: devel - os: osx osx_image: xcode8.3 latex: false @@ -43,9 +29,6 @@ matrix: - os: osx latex: false r: release - # - os: osx - # latex: false - # r: devel - os: osx latex: false r: oldrel diff --git a/DESCRIPTION b/DESCRIPTION index 0bfb257..d8342f9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dgo Title: Dynamic Estimation of Group-Level Opinion -Version: 0.2.10 -Date: 2017-05-29 +Version: 0.2.11 +Date: 2017-10-26 Description: Fit dynamic group-level IRT and MRP models from individual or aggregated item response data. This package handles common preprocessing tasks and extends functions for inspecting results, poststratification, and @@ -43,7 +43,6 @@ Collate: 'class-dgmrp_fit.r' 'dgirt.r' 'dichotomize_item_responses.r' - 'expand_rownames.r' 'methods-control.r' 'methods-dgirtfit-plot.r' 'methods-dgirtfit-poststratify.r' diff --git a/Makefile b/Makefile index 096f357..752c33f 100644 --- a/Makefile +++ b/Makefile @@ -25,18 +25,17 @@ build-cran: $(R) CMD build . --no-resave-data --no-manual check: - $(R) CMD check $(PKG)_$(VERSION).tar.gz + $(R) CMD check $(BINARY) check-cran: - $(R) CMD check --as-cran $(PKG)_$(VERSION).tar.gz + $(R) CMD check --as-cran $(BINARY) -check-quick $(PKG)_$(VERSION).tar.gz: +check-quick $(BINARY): $(R) $(R_ARGS) CMD build . - $(R) CMD check $(PKG)_$(VERSION).tar.gz + $(R) CMD check $(BINARY) -install: $(PKG)_$(VERSION).tar.gz - $(R) CMD INSTALL --no-multiarch --with-keep.source \ - $(PKG)_$(VERSION).tar.gz +install: $(BINARY) + $(R) CMD INSTALL --no-multiarch --with-keep.source $(BINARY) install-code: $(R) CMD INSTALL --no-multiarch --with-keep.source --no-docs . diff --git a/NAMESPACE b/NAMESPACE index 1ee7f46..d102e26 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ S3method(as.data.frame,dgo_fit) export(dgirt) export(dgmrp) -export(expand_rownames) export(plot_rhats) export(shape) export(summarize) diff --git a/NEWS.md b/NEWS.md index 17f2fa2..3923f0b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +## dgo 0.2.11 + +* Add poststratification over posterior samples (closes #21). +* `shape()` now accepts aggregated item response data unaccompanied by + individual-level item response data. The `item_data` and `item_names` + arguments are no longer required. +* Add a `max_raked_weight` argument to `shape()` for trimming raked weights. + Note that trimming occurs before raked weights are rescaled to have mean 1, + and the rescaled weights can be larger than `max_raked_weight`. +* Remove the unused function `expand_rownames()`. +* Bugfixes. + ## dgo 0.2.10 * Remove Rcpp dependency by rewriting `dichotomize()` in R. diff --git a/R/aggregate_item_responses.r b/R/aggregate_item_responses.r index 90bb4fa..9b5cc9e 100644 --- a/R/aggregate_item_responses.r +++ b/R/aggregate_item_responses.r @@ -23,78 +23,90 @@ make_group_counts <- function(item_data, aggregate_data, ctrl) { # Because of how DGIRT Stan code iterates over the data, the result must be # ordered by time, item, and then group. The order of the grouping variables # doesn't matter. - gt_names <- attr(item_data, "gt_items") - item_data[, c("n_responses") := list(rowSums(!is.na(.SD))), - .SDcols = gt_names] - if (!length(ctrl@weight_name)) { - item_data[, weight := 1L] - ctrl@weight_name <- "weight" - } - item_data[, c("def") := lapply(.SD, calc_design_effects), - .SDcols = ctrl@weight_name, - by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] - - # get design-effect-adjusted nonmissing response counts by group and item - item_n <- item_data[, lapply(.SD, count_items, get("n_responses"), get("def")), - .SDcols = c(gt_names), - by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] - # append _n_grp to the response count columns - item_n_vars <- paste0(gt_names, "_n_grp") - names(item_n) <- replace(names(item_n), match(gt_names, names(item_n)), item_n_vars) - data.table::setkeyv(item_n, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names)) - drop_cols <- setdiff(names(item_n), c(key(item_n), item_n_vars)) - if (length(drop_cols)) { - item_n[, c(drop_cols) := NULL] - } + if (length(item_data)) { + gt_names <- attr(item_data, "gt_items") + item_data[, c("n_responses") := list(rowSums(!is.na(.SD))), + .SDcols = gt_names] + if (!length(ctrl@weight_name)) { + item_data[, weight := 1L] + ctrl@weight_name <- "weight" + } + item_data[, c("def") := lapply(.SD, calc_design_effects), + .SDcols = ctrl@weight_name, + by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] - # get mean ystar - item_data[, c("adj_weight") := get(ctrl@weight_name) / get("n_responses")] - item_means <- item_data[, lapply(.SD, function(x) weighted.mean(x, .SD$adj_weight, na.rm = TRUE)), - .SDcols = c(gt_names, "adj_weight"), - by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] - # append _mean to the mean response columns - item_mean_vars <- paste0(gt_names, "_mean") - names(item_means) <- replace(names(item_means), match(gt_names, names(item_means)), item_mean_vars) - data.table::setkeyv(item_means, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names)) - drop_cols <- setdiff(names(item_means), c(key(item_means), item_mean_vars)) - if (length(drop_cols)) { - item_means[, c(drop_cols) := NULL] - } + # get design-effect-adjusted nonmissing response counts by group and item + item_n <- item_data[, lapply(.SD, count_items, get("n_responses"), get("def")), + .SDcols = c(gt_names), + by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] + # append _n_grp to the response count columns + item_n_vars <- paste0(gt_names, "_n_grp") + names(item_n) <- replace(names(item_n), match(gt_names, names(item_n)), item_n_vars) + data.table::setkeyv(item_n, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names)) + drop_cols <- setdiff(names(item_n), c(key(item_n), item_n_vars)) + if (length(drop_cols)) { + item_n[, c(drop_cols) := NULL] + } + + # get mean ystar + item_data[, c("adj_weight") := get(ctrl@weight_name) / get("n_responses")] + item_means <- item_data[, lapply(.SD, function(x) weighted.mean(x, .SD$adj_weight, na.rm = TRUE)), + .SDcols = c(gt_names, "adj_weight"), + by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)] + # append _mean to the mean response columns + item_mean_vars <- paste0(gt_names, "_mean") + names(item_means) <- replace(names(item_means), match(gt_names, names(item_means)), item_mean_vars) + data.table::setkeyv(item_means, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names)) + drop_cols <- setdiff(names(item_means), c(key(item_means), item_mean_vars)) + if (length(drop_cols)) { + item_means[, c(drop_cols) := NULL] + } + + # join response counts with means + count_means <- item_n[item_means] + count_means <- count_means[, c(ctrl@time_name, ctrl@geo_name, + ctrl@group_names, item_mean_vars, + item_n_vars), with = FALSE] - # join response counts with means - count_means <- item_n[item_means] - count_means <- count_means[, c(ctrl@time_name, ctrl@geo_name, - ctrl@group_names, item_mean_vars, - item_n_vars), with = FALSE] - - # the group success count for an item is the product of its count and mean - item_s_vars <- paste0(gt_names, "_s_grp") - count_means[, c(item_s_vars) := round(count_means[, (item_mean_vars), with = FALSE] * - count_means[, (item_n_vars), with = FALSE], 0)] - count_means <- count_means[, -grep("_mean$", names(count_means)), with = FALSE] - - - # we want a long table of successes (s_grp) and trials (n_grp) by group and - # item; items need to move from columns to rows - melted <- melt(count_means, id.vars = c(ctrl@time_name, ctrl@geo_name, - ctrl@group_names), - variable.name = "item") - melted[, c("variable") := list(gsub(".*([sn]_grp)$", "\\1", get("item")))] - melted[, c("item") := list(gsub("(.*)_[sn]_grp$", "\\1", get("item")))] - f <- as.formula(paste0(paste(ctrl@time_name, ctrl@geo_name, - paste(ctrl@group_names, collapse = " + "), - "item", sep = "+"), " ~ variable")) - counts <- data.table::dcast.data.table(melted, f, drop = FALSE, fill = 0L) + # the group success count for an item is the product of its count and mean + item_s_vars <- paste0(gt_names, "_s_grp") + count_means[, c(item_s_vars) := round(count_means[, (item_mean_vars), with = FALSE] * + count_means[, (item_n_vars), with = FALSE], 0)] + count_means <- count_means[, -grep("_mean$", names(count_means)), with = FALSE] + + + # we want a long table of successes (s_grp) and trials (n_grp) by group and + # item; items need to move from columns to rows + melted <- melt(count_means, id.vars = c(ctrl@time_name, ctrl@geo_name, + ctrl@group_names), + variable.name = "item") + melted[, c("variable") := list(gsub(".*([sn]_grp)$", "\\1", get("item")))] + melted[, c("item") := list(gsub("(.*)_[sn]_grp$", "\\1", get("item")))] + f <- as.formula(paste0(paste(ctrl@time_name, ctrl@geo_name, + paste(ctrl@group_names, collapse = " + "), + "item", sep = "+"), " ~ variable")) + counts <- data.table::dcast.data.table(melted, f, drop = FALSE, fill = 0L) + } # include aggregates, if any - if (length(aggregate_data) && nrow(aggregate_data) > 0) { + if (length(item_data) && length(aggregate_data) && nrow(aggregate_data) > 0) { + # invariant: we have both individual- and aggregate-level item responses counts <- data.table::rbindlist(list(counts, aggregate_data), use.names = TRUE) message("Added ", length(ctrl@aggregate_item_names), " items from aggregate data.") - data.table::setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names, - ctrl@geo_name)) + } else if (length(aggregate_data) && nrow(aggregate_data) > 0) { + # invariant: we have only aggregate-level item responses + # aggregate_data is already in the expected format + counts <- aggregate_data + message("Using ", length(ctrl@aggregate_item_names), " items from aggregate data.") + } else if (!length(item_data)) { + # invariant: we unexpectedly have neither individual- nor aggregate-level data + stop("can't proceed with neither item_data nor aggregate_data") } + data.table::setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names, + ctrl@geo_name)) + # include unobserved cells all_groups = expand.grid(c(setNames(list(unique(counts[[ctrl@geo_name]])), ctrl@geo_name), setNames(list(ctrl@time_filter), ctrl@time_name), diff --git a/R/assertions.r b/R/assertions.r index 4b0dd83..bd24864 100644 --- a/R/assertions.r +++ b/R/assertions.r @@ -41,7 +41,7 @@ has_all_names <- function(table, names, suggestion = NULL) { } assertthat::on_failure(has_all_names) <- function(call, env) { - paste0("not all ", call$names, " are names in ", deparse(call$table)) + paste0("not all of ", deparse(call$names), " are names in ", deparse(call$table)) } all_strings <- function(x) { diff --git a/R/class-control.r b/R/class-control.r index 74e35a6..110275e 100644 --- a/R/class-control.r +++ b/R/class-control.r @@ -1,6 +1,5 @@ setClass("Control", - slots = list(# item data - item_names = "character", + slots = list(item_names = "ANY", time_name = "character", geo_name = "character", group_names = "ANY", @@ -22,13 +21,21 @@ setClass("Control", weight_name = "ANY", proportion_name = "character", rake_names = "character", + max_raked_weight = "ANY", # modeling options - constant_item = "logical"), + constant_item = "logical", + # indicators for state + has_individual_data = "ANY", + has_aggregate_data = "ANY", + has_target_data = "ANY", + has_modifier_data = "ANY"), validity = function(object) { if (!length(object@time_name) == 1L) "\"time_name\" should be a single variable name" else if (!length(object@geo_name) == 1L) "\"geo_name\" should be a single variable name" + else if (length(object@item_names) && !is.character(object@item_names)) + "if specified \"item_names\" should give variable names in a character vector" else if (length(object@survey_name) && length(object@survey_name) != 1L) "if specified \"survey_name\" should be a single variable name" else if (length(object@survey_name) && !is.character(object@survey_name)) @@ -64,14 +71,17 @@ setClass("Control", else if (!length(object@constant_item) == 1L && is.logical(object@constant_item)) "\"constant_item\" should be a single logical value" - # else if (length(unique(object@time_filter)) == 1L) - # "if specified \"time_filter\" should give at least two time periods" else if (length(unique(object@geo_filter)) == 1L) "if specified \"geo_filter\" should give at least two local geographic areas" else if (length(object@min_survey_filter) != 1L || object@min_survey_filter <= 0L) "\"min_survey_filter\" should be a positive integer" else if (!length(object@min_t_filter) == 1L && object@min_t_filter > 0L) "\"min_t_filter\" should be a positive integer" + else if (length(object@max_raked_weight) && + (length(object@max_raked_weight) > 1 | + !is.numeric(object@max_raked_weight))) { + "if specified \"max_raked_weight\" should be a single number" + } else TRUE }) diff --git a/R/class-dgirtin.r b/R/class-dgirtin.r index c0ebfa6..3007250 100644 --- a/R/class-dgirtin.r +++ b/R/class-dgirtin.r @@ -1,7 +1,7 @@ #' Class \code{dgirtIn}: data prepared for modeling with \code{dgirt} #' #' \code{shape} generates objects of class \code{dgirtIn} for modeling with -#' \code{dgirt}. +#' \code{dgirt} and \code{dgmrp}. #' #' @aliases dgirtin-class, get_item_n, get_item_names, get_n, dgirtIn-method, #' print.dgirtIn, @@ -21,8 +21,11 @@ NULL setOldClass("dgirtIn", "R6") dgirtIn <- R6::R6Class("dgirtIn", public = c( + # model objects (N, G, T, ...) and shape objects (item_data, etc.) are + # public and initially NULL setNames(lapply(c(model_objects, shape_objects), function(x) NULL), c(model_objects, shape_objects)), + # the class is instantiated from a Control object initialize = function(ctrl) { if (length(ctrl@constant_item)) { self$constant_item <- ctrl@constant_item @@ -30,29 +33,51 @@ dgirtIn <- R6::R6Class("dgirtIn", self$mod_par_names <- c(ctrl@geo_name, ctrl@time_name) self$unmod_par_names <- ctrl@group_names }, + # the as_list method extracts attributes used in modeling as expected by + # rstan. its arguments will be passed from a dgirt or dgmrp call as_list = function(separate_t, delta_tbar_prior_mean, delta_tbar_prior_sd, innov_sd_delta_scale, innov_sd_theta_scale, hierarchical_model) { + + # model_objects is a character vector of attribute names for rstan data d_in_list <- Map(function(x) self[[x]], private$model_objects) - if (!length(separate_t) == 1L && is.logical(separate_t)) + + # separate_t is a boolean in the stan code + if (length(separate_t) != 1L || !is.logical(separate_t)) { stop("\"separate_t\" should be a single logical value") - else d_in_list$separate_t <- separate_t - if (!length(hierarchical_model) == 1L && is.logical(hierarchical_model)) + } + d_in_list$separate_t <- separate_t + + # hierarchical_model is also boolean in the stan code + if (length(hierarchical_model) != 1L || !is.logical(hierarchical_model)) { stop("\"hierarchical_model\" should be a single logical value") - else d_in_list$hierarchical_model <- hierarchical_model - if (!length(delta_tbar_prior_mean) == 1L && - is.numeric(delta_tbar_prior_mean)) + } + d_in_list$hierarchical_model <- hierarchical_model + + if (length(delta_tbar_prior_mean) != 1L || !is.numeric(delta_tbar_prior_mean)) { stop("\"delta_tbar_prior_mean\" should be a single real value") - else d_in_list$delta_tbar_prior_mean <- delta_tbar_prior_mean - if (!length(delta_tbar_prior_sd) == 1L && is.numeric(delta_tbar_prior_sd)) + } + d_in_list$delta_tbar_prior_mean <- delta_tbar_prior_mean + + if (length(delta_tbar_prior_sd) != 1L || !is.numeric(delta_tbar_prior_sd) + || delta_tbar_prior_sd < 0) + { stop("\"delta_tbar_prior_sd\" should be a single positive real value") - else d_in_list$delta_tbar_prior_sd <- delta_tbar_prior_sd - if (!length(innov_sd_delta_scale ) == 1L && is.numeric(innov_sd_delta_scale)) - stop("\"delta_tbar_delta_scale\" should be a single real value") - else d_in_list$innov_sd_delta_scale <- innov_sd_delta_scale - if (!length(innov_sd_theta_scale ) == 1L && is.numeric(innov_sd_theta_scale)) - stop("\"delta_tbar_theta_scale\" should be a single real value") - else d_in_list$innov_sd_theta_scale <- innov_sd_theta_scale + } + d_in_list$delta_tbar_prior_sd <- delta_tbar_prior_sd + + if (length(innov_sd_delta_scale) != 1L || + !is.numeric(innov_sd_delta_scale) || innov_sd_delta_scale < 0) { + stop("\"innov_sd_delta_scale\" should be a single real value") + } + d_in_list$innov_sd_delta_scale <- innov_sd_delta_scale + + if (length(innov_sd_theta_scale ) != 1L || + !is.numeric(innov_sd_theta_scale) || innov_sd_theta_scale < 0) { + stop("\"innov_sd_theta_scale\" should be a single positive real value") + } + d_in_list$innov_sd_theta_scale <- innov_sd_theta_scale + d_in_list }), - private = list(model_objects = model_objects, - shape_objects = shape_objects)) + # keep track of which items will be used in modeling + private = list(model_objects = model_objects, shape_objects = shape_objects)) diff --git a/R/expand_rownames.r b/R/expand_rownames.r deleted file mode 100644 index 4a458ff..0000000 --- a/R/expand_rownames.r +++ /dev/null @@ -1,65 +0,0 @@ -#' \code{expand_rownames}: expand parameter descriptions in rownames -#' -#' Move rownames that describe parameters (e.g. xi[2009]) to columns. -#' -#' It should rarely be necessary to call \code{expand_rownames} directly. But -#' elements extracted from \code{\link{dgirtfit}}-class objects may have -#' rownames of the format \code{param[group1__groupK,t]} for parameters indexed -#' by group and time period, or \code{param[t]} for parameters indexed by time -#' period. \code{expand_rownames} moves this information to columns whose names -#' are given by the \code{col_names} argument. The rownames in their original -#' format will appear in another column called \code{rn}. -#' -#' @param x A table with rownames in the format \code{param[group1__groupK,t]} -#' or \code{param[t]}. -#' -#' @param time_name A name for any resulting time variable. -#' -#' @param geo_name A name for any resulting geographic variable. -#' -#' @param group_names Names for any resulting group variables. -#' -#' @return \code{x} with additional columns (see details). -#' @seealso \code{\link{dgirtfit-class}} -#' @export -expand_rownames <- function(x, time_name, geo_name, group_names) { - if (is.matrix(x)) x <- as.data.frame(x, stringsAsFactors = FALSE, - rownames = rownames(x)) - x <- data.table::copy(data.table::setDT(x, keep.rownames = TRUE)) - if (!"rn" %in% names(x)) stop("After setDT(x, keep.rownames = TRUE), ", - "rownames couldn't be found. Did x ", - "have rownames?") - indexes <- gsub('.*\\[([A-Za-z0-9,_]+)\\].*', '\\1', x[["rn"]]) - parnames <- gsub('(.*)\\[[A-Za-z0-9,_]+\\].*', '\\1', x[["rn"]]) - comma_split <- data.table::tstrsplit(indexes, c(",")) - for (parname in unique(parnames)) { - # index_names is a list for looking up the names of parameter indexes - for (i in index_names[[parname]]) { - if (length(i)) { - index_pos <- which(index_names[[parname]] == i) - x[parnames == parname, c(i) := - list(comma_split[[index_pos]][parnames == parname])] - } - } - } - if ("group_names" %in% names(x)) { - us_split <- strsplit(x[["group_names"]], "__", fixed = TRUE) - n_col <- max(vapply(us_split, length, integer(1L))) - group_cols <- paste0("group_", seq.int(0, n_col - 1L)) - x[, c(group_cols) := data.table::tstrsplit(group_names, "__", fixed = TRUE)] - if (length(geo_name)) { - x[, c(geo_name, group_cols[1L]) := list(get(group_cols[1L]), NULL)] - } - if (length(group_names)) { - x[, c(group_names, group_cols[-1L]) := list(get(group_cols[-1L]), NULL)] - } - x[, c("group_names") := NULL] - } - if (length(time_name) && "time_name" %in% names(x)) { - names(x)[names(x) == "time_name"] <- time_name - if (is.character(x[[time_name]])) { - x[, c(time_name) := type.convert(x[[time_name]])] - } - } - x -} diff --git a/R/methods-control.r b/R/methods-control.r index aa102b6..871c8ce 100644 --- a/R/methods-control.r +++ b/R/methods-control.r @@ -23,10 +23,10 @@ init_control <- function(item_data = item_data, raking = raking, weight_name = weight_name, proportion_name = proportion_name, + max_raked_weight = max_raked_weight, constant_item = constant_item, ...) { ctrl <- new("Control", - # item data item_names = item_names, time_name = time_name, geo_name = geo_name, @@ -48,12 +48,17 @@ init_control <- function(item_data = item_data, raking = raking, weight_name = weight_name, proportion_name = proportion_name, + max_raked_weight = max_raked_weight, # modeling options constant_item = constant_item, ...) - is_item_name <- valid_names(item_data, ctrl, 1L) - is_item_name(c("time_name", "geo_name")) - has_type(c("time_name", "geo_name"), item_data, ctrl) + + if (length(item_data)) { + is_item_name <- valid_names(item_data, ctrl, 1L) + is_item_name(c("time_name", "geo_name")) + has_type(c("time_name", "geo_name"), item_data, ctrl) + } + if (length(aggregate_data)) { is_agg_name <- valid_names(aggregate_data, ctrl, 1L) is_agg_name(c("time_name", "geo_name")) @@ -65,6 +70,10 @@ init_control <- function(item_data = item_data, } } + if (!length(item_data) & !length(aggregate_data)) { + stop("either \"item_data\" or \"aggregate_data\" must be specified") + } + if (length(ctrl@modifier_names)) { if (!length(ctrl@t1_modifier_names)) { ctrl@t1_modifier_names <- ctrl@modifier_names @@ -72,7 +81,9 @@ init_control <- function(item_data = item_data, } if (!length(ctrl@time_filter)) { - ctrl@time_filter <- sort(unique(item_data[[ctrl@time_name]])) + if (length(item_data)) { + ctrl@time_filter <- sort(unique(item_data[[ctrl@time_name]])) + } if (length(aggregate_data)) { ctrl@time_filter <- sort(unique(c(ctrl@time_filter, aggregate_data[[ctrl@time_name]]))) @@ -80,7 +91,9 @@ init_control <- function(item_data = item_data, } if (!length(ctrl@geo_filter)) { - ctrl@geo_filter <- sort(unique(as.character(item_data[[ctrl@geo_name]]))) + if (length(item_data)) { + ctrl@geo_filter <- sort(unique(as.character(item_data[[ctrl@geo_name]]))) + } if (length(aggregate_data)) { ctrl@geo_filter <- sort(unique(c(ctrl@geo_filter, aggregate_data[[ctrl@geo_name]]))) @@ -94,5 +107,11 @@ init_control <- function(item_data = item_data, ctrl@rake_names = all.vars(ctrl@raking) } } + + ctrl@has_individual_data = ifelse(length(item_data), TRUE, FALSE) + ctrl@has_aggregate_data = ifelse(length(aggregate_data), TRUE, FALSE) + ctrl@has_modifier_data = ifelse(length(modifier_data), TRUE, FALSE) + ctrl@has_target_data = ifelse(length(target_data), TRUE, FALSE) + ctrl } diff --git a/R/methods-dgirtfit-poststratify.r b/R/methods-dgirtfit-poststratify.r index c7ad422..2569471 100644 --- a/R/methods-dgirtfit-poststratify.r +++ b/R/methods-dgirtfit-poststratify.r @@ -16,7 +16,7 @@ utils::globalVariables(c("value", "scaled_prop")) #' @param ... Additional arguments to methods. setGeneric("poststratify", signature = "x", function(x, target_data, strata_names, aggregated_names, - proportion_name = "proportion", ...) + proportion_name = "proportion", ...) standardGeneric("poststratify")) #' @param pars Selected parameter names. @@ -58,12 +58,11 @@ setMethod("poststratify", c("dgo_fit"), #' @export setMethod("poststratify", "data.frame", function(x, target_data, strata_names, aggregated_names, - proportion_name = "proportion", pars = "theta_bar") { + proportion_name = "proportion") { assert(is.data.frame(target_data)) assert(all_strings(strata_names)) - assert(all_strings(strata_names)) + assert(all_strings(aggregated_names)) assert(assertthat::is.string(proportion_name)) - assert(all_strings(pars)) if (anyDuplicated(c(strata_names, aggregated_names))) { stop("Variable names cannot be used more than once across ", @@ -112,39 +111,38 @@ setMethod("poststratify", "data.frame", targets[, c(extra_cols) := NULL] } - x_n <- nrow(x) - props <- merge(x, targets, all = FALSE, by = c(strata_names, - aggregated_names)) - if (!identical(x_n, nrow(props))) { - warning("Not all estimates could be matched with a population proportion ", - "using the stratifying and grouping variables. ", x_n - - nrow(props), " will be dropped from the output, ", - "and this may indicate a larger problem.") + for (varname in c(strata_names, aggregated_names)) { + check_target_levels(varname, x, targets) } - props <- scale_props(props, proportion_name, strata_names) - - check_proportions(props, strata_names) - res <- props[, list(value = sum(value * scaled_prop)), by = strata_names] + props <- merge(x, targets, all = FALSE, by = c(strata_names, + aggregated_names)) + by_vars = c(strata_names, 'iteration') + if (!'iteration' %in% names(props)) { + props[, iteration := 1] + no_iterations <- TRUE + } else { + no_iterations <- FALSE + } + props <- scale_props(props, proportion_name, by_vars) + check_proportions(props, by_vars) + res <- props[, list(value = sum(value * scaled_prop)), by = by_vars] + if (no_iterations) { + res[, iteration := NULL] + } res[] }) -check_estimates <- function(estimates, strata_names) { - estimates[, lapply(.SD, sum), by = c(strata_names)] - estimates -} - -scale_props <- function(props, proportion_name, strata_names) { - strata_sums <- props[, list(strata_sum = sum(get(proportion_name))), - by = strata_names] - props <- merge(props, strata_sums, all = FALSE, by = strata_names) +scale_props <- function(props, proportion_name, by_vars) { + strata_sums <- props[, list(strata_sum = sum(get(proportion_name))), by = + by_vars] + props <- merge(props, strata_sums, all = FALSE, by = by_vars) props[, c("scaled_prop") := get(proportion_name) / get("strata_sum")] return(props) } -check_proportions <- function(tabular, strata_names) { - prop_sums <- tabular[, lapply(.SD, sum), .SDcols = "scaled_prop", - by = strata_names] +check_proportions <- function(tabular, by_vars) { + prop_sums <- tabular[, lapply(.SD, sum), .SDcols = "scaled_prop", by = by_vars] if (!isTRUE(all.equal(rep(1L, nrow(prop_sums)), prop_sums$scaled_prop))) { stop("Not all proportions sum to 1 within stratifying variables even ", " though they should have been rescaled. (The mean sum is ", @@ -156,13 +154,13 @@ check_proportions <- function(tabular, strata_names) { check_target_levels <- function(variable, x, targets) { if (!identical(class(x[[variable]]), class(targets[[variable]]))) { stop("'", variable, "' inherits from '", class(x[[variable]]), - "' in x and '", class(targets[[variable]]), "' in targets.", - "Please reconcile the types.") + "' in estimates and '", class(targets[[variable]]), "' in ", + "targets. Please reconcile the types.") } else if (!all(x[[variable]] %in% targets[[variable]])) { x_levels <- setdiff(x[[variable]], targets[[variable]]) stop("Not all levels of '", variable, "' in estimates are levels of '", variable, "' in targets. Missing: ", paste(x_levels , collapse = ", "), - "missing. The target data should give the population proportion of each - ", "group represented in the estimates.") + ". The target data should give the population proportion of ", + "each group represented in the estimates.") } else TRUE } diff --git a/R/restrict_input_data.r b/R/restrict_input_data.r index da2ab82..a509b14 100644 --- a/R/restrict_input_data.r +++ b/R/restrict_input_data.r @@ -1,45 +1,47 @@ restrict_items <- function(item_data, ctrl) { - extra_colnames <- setdiff(names(item_data), - c(ctrl@item_names, - ctrl@survey_name, - ctrl@geo_name, - ctrl@time_name, - ctrl@group_names, - ctrl@weight_name, - ctrl@rake_names, - ctrl@id_vars - )) - if (length(extra_colnames)) { - item_data[, c(extra_colnames) := NULL] - } - coerce_factors(item_data, c(ctrl@group_names, ctrl@geo_name, - ctrl@survey_name)) - rename_numerics(item_data, c(ctrl@group_names, ctrl@geo_name, - ctrl@survey_name)) - initial_dim <- dim(item_data) - final_dim <- c() - iter <- 1L - while (!identical(initial_dim, final_dim)) { - message("Applying restrictions, pass ", iter, "...") - if (iter == 1L) { - item_data <- drop_rows_missing_covariates(item_data, ctrl) - item_data <- keep_t(item_data, ctrl) - item_data <- keep_geo(item_data, ctrl) + if (length(item_data)) { + extra_colnames <- setdiff(names(item_data), + c(ctrl@item_names, + ctrl@survey_name, + ctrl@geo_name, + ctrl@time_name, + ctrl@group_names, + ctrl@weight_name, + ctrl@rake_names, + ctrl@id_vars + )) + if (length(extra_colnames)) { + item_data[, c(extra_colnames) := NULL] } + coerce_factors(item_data, c(ctrl@group_names, ctrl@geo_name, + ctrl@survey_name)) + rename_numerics(item_data, c(ctrl@group_names, ctrl@geo_name, + ctrl@survey_name)) initial_dim <- dim(item_data) - drop_responseless_items(item_data, ctrl) - drop_items_rare_in_time(item_data, ctrl) - if (length(ctrl@survey_name)) { - drop_items_rare_in_polls(item_data, ctrl) + final_dim <- c() + iter <- 1L + while (!identical(initial_dim, final_dim)) { + message("Applying restrictions, pass ", iter, "...") + if (iter == 1L) { + item_data <- drop_rows_missing_covariates(item_data, ctrl) + item_data <- keep_t(item_data, ctrl) + item_data <- keep_geo(item_data, ctrl) + } + initial_dim <- dim(item_data) + drop_responseless_items(item_data, ctrl) + drop_items_rare_in_time(item_data, ctrl) + if (length(ctrl@survey_name)) { + drop_items_rare_in_polls(item_data, ctrl) + } + item_data <- drop_itemless_respondents(item_data, ctrl) + final_dim <- dim(item_data) + iter <- iter + 1L + if (identical(initial_dim, final_dim)) { + message("\tNo changes") + } } - item_data <- drop_itemless_respondents(item_data, ctrl) - final_dim <- dim(item_data) - iter <- iter + 1L - if (identical(initial_dim, final_dim)) { - message("\tNo changes") - } + setkeyv(item_data, c(ctrl@geo_name, ctrl@time_name)) } - setkeyv(item_data, c(ctrl@geo_name, ctrl@time_name)) invisible(item_data) } @@ -287,12 +289,6 @@ drop_items_rare_in_polls <- function(item_data, ctrl) { invisible(item_data) } -get_observed <- function(item_data, aggregate_data, varname) { - obs <- Map(unique.data.frame, list(item_data[, varname, with = FALSE], - aggregate_data[, varname, with = FALSE])) - sort.default(unique.default(unname(unlist(obs)))) -} - stop_if_any_na <- function(where, varnames) { # If there are NA values in any variable named in 'varnames', in the dataframe # given by 'where', stop diff --git a/R/reweight_item_responses.r b/R/reweight_item_responses.r index 3dc2a51..ff2f1e2 100644 --- a/R/reweight_item_responses.r +++ b/R/reweight_item_responses.r @@ -1,9 +1,15 @@ -utils::globalVariables(c("raked_weight")) +utils::globalVariables(c("raked_weight", "preweight")) weight <- function(item_data, target_data, control) { # Adjust individual survey weights given population targets item_data[, c("preweight") := rake_weights(item_data, target_data, control)] + + if (length(control@max_raked_weight)) { + item_data[preweight > control@max_raked_weight, preweight := + control@max_raked_weight] + } + item_data[, c("raked_weight") := list(get("preweight") / mean(get("preweight"), na.rm = TRUE)), by = eval(control@time_name)] @@ -43,3 +49,4 @@ rake_weights <- function(item_data, target_data, control) { } return(raked_weights) } + diff --git a/R/shape.r b/R/shape.r index 219f406..eacfe9b 100644 --- a/R/shape.r +++ b/R/shape.r @@ -39,11 +39,15 @@ #' @section Reweighting: #' Use argument \code{target_data} to adjust the weighting of groups toward #' population targets via raking, using an adaptation of -#' \code{\link[survey]{rake}}. To adjust individual survey weights in +#' \code{\link[survey]{rake}}. To adjust existing survey weights in #' \code{item_data}, provide argument \code{weight_name}. Otherwise, #' observations in \code{item_data} will be assigned equal starting weights. -#' Argument \code{raking} defines strata. Argument \code{proportion_name} -#' is optional. +#' Argument \code{raking} defines strata. If you pass it a list of formulas like +#' \code{list(~ x, ~ y)}, raking is first over \code{x}, then over \code{y}. +#' Given an additive formula like \code{~ x + y}, raking is over the +#' combinations of \code{x} and \code{y}. So, \code{list(~ x, ~ y + z)} is first +#' over \code{x}, then over \code{y}-\code{z} pairs. Argument +#' \code{proportion_name} is optional. #' #' @section Restrictions: #' For convenience, data in \code{item_data}, \code{modifier_data}, @@ -85,6 +89,10 @@ #' @param raking A formula or list of formulas specifying the variables on which #' to rake survey weights. #' +#' @param max_raked_weight A maximum over which raked weights will be trimmed. +#' Only applied after raking. To trim unraked weights, manipulate the input data +#' directly. +#' #' @param aggregate_data A table of trial and success counts by group and item. #' See details below. #' @@ -141,8 +149,8 @@ #' get_item_n(shaped_responses, by = "year") #' #' @export -shape <- function(item_data, - item_names, +shape <- function(item_data = NULL, + item_names = NULL, time_name, geo_name, group_names = NULL, @@ -158,6 +166,7 @@ shape <- function(item_data, standardize = TRUE, target_data = NULL, raking = NULL, + max_raked_weight = NULL, weight_name = NULL, proportion_name = "proportion", aggregate_data = NULL, @@ -187,6 +196,7 @@ shape <- function(item_data, raking = raking, weight_name = weight_name, proportion_name = proportion_name, + max_raked_weight = max_raked_weight, constant_item = constant_item, ...) @@ -210,7 +220,7 @@ shape <- function(item_data, aggregate_data$item) # rake survey weights # - if (length(target_data)) { + if (length(target_data) & length(item_data)) { item_data <- weight(item_data, target_data, ctrl) ctrl@weight_name <- "raked_weight" } @@ -241,7 +251,9 @@ init_dgirt_in <- function(item_data, aggregate_data, modifier_data, target_data, d_in <- dgirtIn$new(ctrl) # aggregate individual item response data to group level # - item_data <- dichotomize(item_data, ctrl) + if (length(item_data)) { + item_data <- dichotomize(item_data, ctrl) + } d_in$group_grid <- make_group_grid(item_data, aggregate_data, ctrl) d_in$group_grid_t <- make_group_grid_t(d_in$group_grid, ctrl) d_in$group_counts <- make_group_counts(item_data, aggregate_data, ctrl) diff --git a/R/validate_input_data.r b/R/validate_input_data.r index 1682334..3bdc376 100644 --- a/R/validate_input_data.r +++ b/R/validate_input_data.r @@ -62,31 +62,33 @@ check_modifiers <- function(modifier_data, ctrl) { } check_item <- function(item_data, ctrl) { - is_name <- valid_names(item_data, ctrl, 1L) - is_name(c("time_name", "geo_name")) - are_names <- valid_names(item_data, ctrl) - are_names("item_names") - if (length(ctrl@id_vars)) { - are_names("id_vars") - } - for (varname in c("weight_name", "survey_name")) { - if (length(slot(ctrl, varname))) { - is_name(varname) - has_type(varname, item_data, ctrl) + if (length(item_data)) { + is_name <- valid_names(item_data, ctrl, 1L) + is_name(c("time_name", "geo_name")) + are_names <- valid_names(item_data, ctrl) + are_names("item_names") + if (length(ctrl@id_vars)) { + are_names("id_vars") + } + for (varname in c("weight_name", "survey_name")) { + if (length(slot(ctrl, varname))) { + is_name(varname) + has_type(varname, item_data, ctrl) + } + } + has_type(c("time_name", "geo_name", "group_names"), item_data, ctrl) + check_time(item_data, ctrl@time_name) + if (is.list(ctrl@raking)) { + raking = unlist(lapply(ctrl@raking, all.vars)) + } else { + raking = all.vars(ctrl@raking) + } + are_valid_terms <- valid_names(item_data, len = 1, stub = "is a raking formula term but isn't") + are_valid_terms(raking) + for (name in c(ctrl@group_names, ctrl@geo_name)) { + if (!length(unique(item_data[[name]])) > 1) + stop(name, " doesn't vary in item_data") } - } - has_type(c("time_name", "geo_name", "group_names"), item_data, ctrl) - check_time(item_data, ctrl@time_name) - if (is.list(ctrl@raking)) { - raking = unlist(lapply(ctrl@raking, all.vars)) - } else { - raking = all.vars(ctrl@raking) - } - are_valid_terms <- valid_names(item_data, len = 1, stub = "is a raking formula term but isn't") - are_valid_terms(raking) - for (name in c(ctrl@group_names, ctrl@geo_name)) { - if (!length(unique(item_data[[name]])) > 1) - stop(name, " doesn't vary in item_data") } } diff --git a/README.Rmd b/README.Rmd index 436e6b8..88dc94c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -44,7 +44,7 @@ knitr::opts_chunk$set( # Installation -dgo can be installed from [CRAN](https://cran.r-project.org/web/packages/dgo/index.html): +dgo can be installed from [CRAN](https://CRAN.R-project.org/package=dgo): ```{r, eval = FALSE} install.packages("dgo") diff --git a/README.md b/README.md index 9b28ca9..aee9baf 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ This model opens up new areas of research on historical public opinion in the Un Installation ============ -dgo can be installed from [CRAN](https://cran.r-project.org/web/packages/dgo/index.html): +dgo can be installed from [CRAN](https://CRAN.R-project.org/package=dgo): ``` r install.packages("dgo") @@ -35,6 +35,14 @@ Load the package and set RStan's recommended options for a local, multicore mach ``` r library(dgo) +#> Loading required package: dgodata +#> Loading required package: rstan +#> Loading required package: ggplot2 +#> Loading required package: StanHeaders +#> rstan (Version 2.16.2, packaged: 2017-07-03 09:24:58 UTC, GitRev: 2e1f913d3ca3) +#> For execution on a local, multicore CPU with excess RAM we recommend calling +#> rstan_options(auto_write = TRUE) +#> options(mc.cores = parallel::detectCores()) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores()) ``` diff --git a/_pkgdown.yml b/_pkgdown.yml index 8b140f4..13d40c9 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -25,7 +25,6 @@ reference: - poststratify - plot_rhats - dgirt_plot - - expand_rownames navbar: left: diff --git a/appveyor.yml b/appveyor.yml index 2ead4c1..b8495f7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -19,19 +19,11 @@ environment: USE_RTOOLS: true matrix: - - R_VERSION: devel - GCC_PATH: mingw_32 - - - R_VERSION: devel - R_ARCH: x64 - GCC_PATH: mingw_64 - - R_VERSION: release R_ARCH: x64 - - R_VERSION: oldrel - RTOOLS_VERSION: 32 - CRAN: http://cran.rstudio.com + - R_VERSION: release + R_ARCH: i386 build_script: - travis-tool.sh install_deps diff --git a/data/toy_dgirt_in.rda b/data/toy_dgirt_in.rda index 6f91c2f..a70e44e 100644 Binary files a/data/toy_dgirt_in.rda and b/data/toy_dgirt_in.rda differ diff --git a/data/toy_dgirtfit.rda b/data/toy_dgirtfit.rda index db35c41..f310ddc 100644 Binary files a/data/toy_dgirtfit.rda and b/data/toy_dgirtfit.rda differ diff --git a/docs/articles/abortion_attitudes.html b/docs/articles/abortion_attitudes.html index 9bfbb4d..2ead0c7 100644 --- a/docs/articles/abortion_attitudes.html +++ b/docs/articles/abortion_attitudes.html @@ -66,11 +66,11 @@
This vignette demonstrates estimation of public attitudes toward abortion from responses to a single survey item, using the dynamic multi-level regression and post-stratification (MRP) model implemented in dgmrp()
.
This vignette demonstrates estimation of public attitudes toward abortion from responses to a single survey item, using the dynamic multi-level regression and post-stratification (MRP) model implemented in dgmrp()
.
shape()
prepares input data for use with the modeling functions dgirt()
and dgmrp()
. Here we use the included opinion
dataset.
shape()
prepares input data for use with the modeling functions dgirt()
and dgmrp()
. Here we use the included opinion
dataset.
dgirt_in_abortion <- shape(opinion, item_names = "abortion", time_name = "year",
geo_name = "state", group_names = "race3", geo_filter = c("CA", "GA", "LA",
"MA"), id_vars = "source")
@@ -110,14 +110,14 @@
#> Constants:
#> Q T P N G H D
#> 1 5 5 60 12 1 1
get_n()
and get_item_n()
give response counts.
get_n(dgirt_in_abortion, by = "state")
+get_n()
and get_item_n()
give response counts.
+get_n(dgirt_in_abortion, by = "state")
#> state n
#> 1: CA 14248
#> 2: GA 4547
#> 3: LA 1658
#> 4: MA 2554
-get_item_n(dgirt_in_abortion, by = "year")
+get_item_n(dgirt_in_abortion, by = "year")
#> year abortion
#> 1: 2006 5275
#> 2: 2007 1690
@@ -128,9 +128,9 @@
Fit a model
-dgmrp()
fits a dynamic multi-level regression and post-stratification (MRP) model to data processed by shape()
. Here, we’ll use it to estimate public attitudes toward abortion over time, for the groups defined by state
and race3
. (Specifically, by their Cartesian product.)
-Under the hood, dgmrp()
uses RStan for MCMC sampling, and arguments can be passed to RStan’s stan()
via the ...
argument of dgmrp()
. This is almost always desirable. Here, we specify the number of sampler iterations, chains, and cores.
-dgmrp_out_abortion <- dgmrp(dgirt_in_abortion, iter = 1500, chains = 4, cores =
+dgmrp()
fits a dynamic multi-level regression and post-stratification (MRP) model to data processed by shape()
. Here, we’ll use it to estimate public attitudes toward abortion over time, for the groups defined by state
and race3
. (Specifically, by their Cartesian product.)
+Under the hood, dgmrp()
uses RStan for MCMC sampling, and arguments can be passed to RStan’s stan()
via the ...
argument of dgmrp()
. This is almost always desirable. Here, we specify the number of sampler iterations, chains, and cores.
+dgmrp_out_abortion <- dgmrp(dgirt_in_abortion, iter = 1500, chains = 4, cores =
4, seed = 42)
The model results are held in a dgmrp_fit
object. Methods from RStan like extract()
are available if needed because dgmrp_fit
is a subclass of stanfit
. But dgo provides its own methods for typical post-estimation tasks.
@@ -240,7 +240,8 @@