0.2.11

jamesdunham · Oct 28, 2017 · 367ade3 · 367ade3
2 parents 2df18b4 + d0eecbc
commit 367ade3
Show file tree

Hide file tree

Showing 63 changed files with 1,293 additions and 765 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -15,3 +15,4 @@
 ^readme-cache/
 ^vignettes/
 ^wercker\.yml$
+^_pkgdown\.yml$
diff --git a/.travis.yml b/.travis.yml
@@ -3,16 +3,6 @@ cache: packages
 matrix:
   include:
 
-    - os: linux
-      dist: precise
-      r: release
-    - os: linux
-      dist: precise
-      r: devel
-    - os: linux
-      dist: precise
-      r: oldrel
-
     - os: linux
       dist: trusty
       r: release
@@ -30,10 +20,6 @@ matrix:
       osx_image: xcode8.3
       latex: false
       r: release
-    # - os: osx
-    #   osx_image: xcode8.3
-    #   latex: false
-    #   r: devel
     - os: osx
       osx_image: xcode8.3
       latex: false
@@ -43,9 +29,6 @@ matrix:
     - os: osx
       latex: false
       r: release
-    # - os: osx
-    #   latex: false
-    #   r: devel
     - os: osx
       latex: false
       r: oldrel

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dgo
 Title: Dynamic Estimation of Group-Level Opinion
-Version: 0.2.10
-Date: 2017-05-29
+Version: 0.2.11
+Date: 2017-10-26
 Description: Fit dynamic group-level IRT and MRP models from individual or
     aggregated item response data. This package handles common preprocessing
     tasks and extends functions for inspecting results, poststratification, and
@@ -43,7 +43,6 @@ Collate:
     'class-dgmrp_fit.r'
     'dgirt.r'
     'dichotomize_item_responses.r'
-    'expand_rownames.r'
     'methods-control.r'
     'methods-dgirtfit-plot.r'
     'methods-dgirtfit-poststratify.r'

diff --git a/Makefile b/Makefile
@@ -25,18 +25,17 @@ build-cran:
 	$(R) CMD build . --no-resave-data --no-manual
 
 check: 
-	$(R) CMD check $(PKG)_$(VERSION).tar.gz
+	$(R) CMD check $(BINARY)
 
 check-cran: 
-	$(R) CMD check --as-cran $(PKG)_$(VERSION).tar.gz
+	$(R) CMD check --as-cran $(BINARY)
 
-check-quick $(PKG)_$(VERSION).tar.gz:
+check-quick $(BINARY):
 	$(R) $(R_ARGS) CMD build . 
-	$(R) CMD check $(PKG)_$(VERSION).tar.gz
+	$(R) CMD check $(BINARY)
 
-install: $(PKG)_$(VERSION).tar.gz
-	$(R) CMD INSTALL --no-multiarch --with-keep.source \
-	  $(PKG)_$(VERSION).tar.gz
+install: $(BINARY)
+	$(R) CMD INSTALL --no-multiarch --with-keep.source $(BINARY)
 
 install-code:
 	$(R) CMD INSTALL --no-multiarch --with-keep.source --no-docs .

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,7 +3,6 @@
 S3method(as.data.frame,dgo_fit)
 export(dgirt)
 export(dgmrp)
-export(expand_rownames)
 export(plot_rhats)
 export(shape)
 export(summarize)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,15 @@
+## dgo 0.2.11
+
+* Add poststratification over posterior samples (closes #21).
+* `shape()` now accepts aggregated item response data unaccompanied by
+  individual-level item response data. The `item_data` and `item_names`
+  arguments are no longer required.
+* Add a `max_raked_weight` argument to `shape()` for trimming raked weights.
+  Note that trimming occurs before raked weights are rescaled to have mean 1,
+  and the rescaled weights can be larger than `max_raked_weight`.
+* Remove the unused function `expand_rownames()`.
+* Bugfixes.
+
 ## dgo 0.2.10
 
 * Remove Rcpp dependency by rewriting `dichotomize()` in R.

diff --git a/R/aggregate_item_responses.r b/R/aggregate_item_responses.r
@@ -23,78 +23,90 @@ make_group_counts <- function(item_data, aggregate_data, ctrl) {
   # Because of how DGIRT Stan code iterates over the data, the result must be
   # ordered by time, item, and then group. The order of the grouping variables
   # doesn't matter.
-  gt_names <- attr(item_data, "gt_items")
-  item_data[, c("n_responses") := list(rowSums(!is.na(.SD))),
-            .SDcols = gt_names]
-  if (!length(ctrl@weight_name)) {
-    item_data[, weight := 1L]
-    ctrl@weight_name <- "weight"
-  }
-  item_data[, c("def") := lapply(.SD, calc_design_effects),
-            .SDcols = ctrl@weight_name,
-            by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
-
-  # get design-effect-adjusted nonmissing response counts by group and item
-  item_n <- item_data[, lapply(.SD, count_items, get("n_responses"), get("def")),
-                      .SDcols = c(gt_names),
-                      by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
-  # append _n_grp to the response count columns
-  item_n_vars <- paste0(gt_names, "_n_grp")
-  names(item_n) <- replace(names(item_n), match(gt_names, names(item_n)), item_n_vars)
-  data.table::setkeyv(item_n, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
-  drop_cols <- setdiff(names(item_n), c(key(item_n), item_n_vars))
-  if (length(drop_cols)) {
-    item_n[, c(drop_cols) := NULL]
-  }
+  if (length(item_data)) {
+    gt_names <- attr(item_data, "gt_items")
+    item_data[, c("n_responses") := list(rowSums(!is.na(.SD))),
+              .SDcols = gt_names]
+    if (!length(ctrl@weight_name)) {
+      item_data[, weight := 1L]
+      ctrl@weight_name <- "weight"
+    }
+    item_data[, c("def") := lapply(.SD, calc_design_effects),
+              .SDcols = ctrl@weight_name,
+              by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
 
-  # get mean ystar
-  item_data[, c("adj_weight") := get(ctrl@weight_name) / get("n_responses")]
-  item_means <- item_data[, lapply(.SD, function(x) weighted.mean(x, .SD$adj_weight, na.rm = TRUE)),
-                          .SDcols = c(gt_names, "adj_weight"),
-                          by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
-  # append _mean to the mean response columns
-  item_mean_vars <- paste0(gt_names, "_mean")
-  names(item_means) <- replace(names(item_means), match(gt_names, names(item_means)), item_mean_vars)
-  data.table::setkeyv(item_means, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
-  drop_cols <- setdiff(names(item_means), c(key(item_means), item_mean_vars))
-  if (length(drop_cols)) {
-    item_means[, c(drop_cols) := NULL]
-  }
+    # get design-effect-adjusted nonmissing response counts by group and item
+    item_n <- item_data[, lapply(.SD, count_items, get("n_responses"), get("def")),
+                        .SDcols = c(gt_names),
+                        by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
+    # append _n_grp to the response count columns
+    item_n_vars <- paste0(gt_names, "_n_grp")
+    names(item_n) <- replace(names(item_n), match(gt_names, names(item_n)), item_n_vars)
+    data.table::setkeyv(item_n, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
+    drop_cols <- setdiff(names(item_n), c(key(item_n), item_n_vars))
+    if (length(drop_cols)) {
+      item_n[, c(drop_cols) := NULL]
+    }
+
+    # get mean ystar
+    item_data[, c("adj_weight") := get(ctrl@weight_name) / get("n_responses")]
+    item_means <- item_data[, lapply(.SD, function(x) weighted.mean(x, .SD$adj_weight, na.rm = TRUE)),
+                            .SDcols = c(gt_names, "adj_weight"),
+                            by = c(ctrl@geo_name, ctrl@group_names, ctrl@time_name)]
+    # append _mean to the mean response columns
+    item_mean_vars <- paste0(gt_names, "_mean")
+    names(item_means) <- replace(names(item_means), match(gt_names, names(item_means)), item_mean_vars)
+    data.table::setkeyv(item_means, c(ctrl@time_name, ctrl@geo_name, ctrl@group_names))
+    drop_cols <- setdiff(names(item_means), c(key(item_means), item_mean_vars))
+    if (length(drop_cols)) {
+      item_means[, c(drop_cols) := NULL]
+    }
+
+    # join response counts with means
+    count_means <- item_n[item_means]
+    count_means <- count_means[, c(ctrl@time_name, ctrl@geo_name,
+                                     ctrl@group_names, item_mean_vars,
+                                     item_n_vars), with = FALSE]
 
-  # join response counts with means
-  count_means <- item_n[item_means]
-  count_means <- count_means[, c(ctrl@time_name, ctrl@geo_name,
-                                   ctrl@group_names, item_mean_vars,
-                                   item_n_vars), with = FALSE]
-
-  # the group success count for an item is the product of its count and mean
-  item_s_vars <- paste0(gt_names, "_s_grp")
-  count_means[, c(item_s_vars) := round(count_means[, (item_mean_vars), with = FALSE] *
-                                         count_means[, (item_n_vars), with = FALSE], 0)]
-  count_means <- count_means[, -grep("_mean$", names(count_means)), with = FALSE]
-
-
-  # we want a long table of successes (s_grp) and trials (n_grp) by group and
-  # item; items need to move from columns to rows
-  melted <- melt(count_means, id.vars = c(ctrl@time_name, ctrl@geo_name,
-                                           ctrl@group_names),
-                 variable.name = "item")
-  melted[, c("variable") := list(gsub(".*([sn]_grp)$", "\\1", get("item")))]
-  melted[, c("item") := list(gsub("(.*)_[sn]_grp$", "\\1", get("item")))]
-  f <- as.formula(paste0(paste(ctrl@time_name, ctrl@geo_name,
-                               paste(ctrl@group_names, collapse = " + "),
-                               "item", sep = "+"), " ~ variable"))
-  counts <- data.table::dcast.data.table(melted, f, drop = FALSE, fill = 0L)
+    # the group success count for an item is the product of its count and mean
+    item_s_vars <- paste0(gt_names, "_s_grp")
+    count_means[, c(item_s_vars) := round(count_means[, (item_mean_vars), with = FALSE] *
+                                           count_means[, (item_n_vars), with = FALSE], 0)]
+    count_means <- count_means[, -grep("_mean$", names(count_means)), with = FALSE]
+
+
+    # we want a long table of successes (s_grp) and trials (n_grp) by group and
+    # item; items need to move from columns to rows
+    melted <- melt(count_means, id.vars = c(ctrl@time_name, ctrl@geo_name,
+                                             ctrl@group_names),
+                   variable.name = "item")
+    melted[, c("variable") := list(gsub(".*([sn]_grp)$", "\\1", get("item")))]
+    melted[, c("item") := list(gsub("(.*)_[sn]_grp$", "\\1", get("item")))]
+    f <- as.formula(paste0(paste(ctrl@time_name, ctrl@geo_name,
+                                 paste(ctrl@group_names, collapse = " + "),
+                                 "item", sep = "+"), " ~ variable"))
+    counts <- data.table::dcast.data.table(melted, f, drop = FALSE, fill = 0L)
+  }
 
   # include aggregates, if any
-  if (length(aggregate_data) && nrow(aggregate_data) > 0) {
+  if (length(item_data) && length(aggregate_data) && nrow(aggregate_data) > 0) {
+    # invariant: we have both individual- and aggregate-level item responses
     counts <- data.table::rbindlist(list(counts, aggregate_data), use.names =
                                     TRUE)
     message("Added ", length(ctrl@aggregate_item_names), " items from aggregate data.")
-    data.table::setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names,
-                                  ctrl@geo_name))
+  } else if (length(aggregate_data) && nrow(aggregate_data) > 0) {
+    # invariant: we have only aggregate-level item responses
+    # aggregate_data is already in the expected format
+    counts <- aggregate_data
+    message("Using ", length(ctrl@aggregate_item_names), " items from aggregate data.")
+  } else if (!length(item_data)) {
+    # invariant: we unexpectedly have neither individual- nor aggregate-level data
+    stop("can't proceed with neither item_data nor aggregate_data")
   }
 
+  data.table::setkeyv(counts, c(ctrl@time_name, "item", ctrl@group_names,
+      ctrl@geo_name))
+
   # include unobserved cells
   all_groups = expand.grid(c(setNames(list(unique(counts[[ctrl@geo_name]])), ctrl@geo_name),
                              setNames(list(ctrl@time_filter), ctrl@time_name),

diff --git a/R/assertions.r b/R/assertions.r
@@ -41,7 +41,7 @@ has_all_names <- function(table, names, suggestion = NULL) {
 }
 
 assertthat::on_failure(has_all_names) <- function(call, env) {
-  paste0("not all ", call$names, " are names in ", deparse(call$table))
+  paste0("not all of ", deparse(call$names), " are names in ", deparse(call$table))
 }
 
 all_strings <- function(x) {

diff --git a/R/class-control.r b/R/class-control.r
@@ -1,6 +1,5 @@
 setClass("Control",
-         slots = list(# item data
-                      item_names = "character",
+         slots = list(item_names = "ANY",
                       time_name = "character",
                       geo_name = "character",
                       group_names = "ANY",
@@ -22,13 +21,21 @@ setClass("Control",
                       weight_name = "ANY",
                       proportion_name = "character",
                       rake_names = "character",
+                      max_raked_weight = "ANY",
                       # modeling options
-                      constant_item = "logical"),
+                      constant_item = "logical",
+                      # indicators for state
+                      has_individual_data = "ANY",
+                      has_aggregate_data = "ANY",
+                      has_target_data = "ANY",
+                      has_modifier_data = "ANY"),
          validity = function(object) {
            if (!length(object@time_name) == 1L)
              "\"time_name\" should be a single variable name"
            else if (!length(object@geo_name) == 1L)
              "\"geo_name\" should be a single variable name"
+           else if (length(object@item_names) && !is.character(object@item_names))
+             "if specified \"item_names\" should give variable names in a character vector"
            else if (length(object@survey_name) && length(object@survey_name) != 1L)
              "if specified \"survey_name\" should be a single variable name"
            else if (length(object@survey_name) && !is.character(object@survey_name))
@@ -64,14 +71,17 @@ setClass("Control",
            else if (!length(object@constant_item) == 1L &&
                     is.logical(object@constant_item))
              "\"constant_item\" should be a single logical value"
-          # else if (length(unique(object@time_filter)) == 1L)
-          #   "if specified \"time_filter\" should give at least two time periods"
            else if (length(unique(object@geo_filter)) == 1L)
              "if specified \"geo_filter\" should give at least two local geographic areas"
            else if (length(object@min_survey_filter) != 1L || object@min_survey_filter <= 0L)
              "\"min_survey_filter\" should be a positive integer"
            else if (!length(object@min_t_filter) == 1L && object@min_t_filter > 0L)
              "\"min_t_filter\" should be a positive integer"
+           else if (length(object@max_raked_weight) &&
+             (length(object@max_raked_weight) > 1 |
+               !is.numeric(object@max_raked_weight))) {
+             "if specified \"max_raked_weight\" should be a single number"
+           }
            else 
              TRUE
          })