diff --git a/DESCRIPTION b/DESCRIPTION index 5d469d4..77974f4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,25 +1,19 @@ Type: Package Package: retroharmonize Title: Ex Post Survey Data Harmonization -Version: 0.2.5.002 -Date: 2022-09-24 +Version: 0.2.5.003 +Date: 2023-12-01 Authors@R: c( - person("Daniel", "Antal", , "daniel.antal@ceemid.eu", role = c("aut", "cre"), + person(given = "Daniel", + family = "Antal", + email = "daniel.antal@dataobservatory.eu", + role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7513-6760")), person(given = "Marta", - family = "Kolczynska", - role = c("ctb"), - email = "mkolczynska@gmail.com", - comment = c(ORCID = "0000-0003-4981-0437")), - person(given = "Pyry", - family = "Kantanen", - role = "ctb", - comment = c(ORCID = "0000-0003-2853-2765")), - person(given = "Leo", - family = " Lahti", - role = "ctb", - comment = c(ORCID = "0000-0001-5537-637X") - ) + family = "Kolczynska", + role = c("ctb"), + email = "mkolczynska@gmail.com", + comment = c(ORCID = "0000-0003-4981-0437")) ) Maintainer: Daniel Antal Description: Assist in reproducible retrospective (ex-post) harmonization @@ -69,6 +63,6 @@ Config/testthat/edition: 3 Encoding: UTF-8 Language: en-US LazyData: true -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 X-schema.org-isPartOf: http://ropengov.org/ X-schema.org-keywords: ropengov diff --git a/R/collect_val_labels.R b/R/collect_val_labels.R index e408fe3..9b96cd4 100644 --- a/R/collect_val_labels.R +++ b/R/collect_val_labels.R @@ -1,4 +1,4 @@ -#' Collect labels from metadata file +#' @title Collect labels from metadata file #' #' @param metadata A metadata data frame created by #' \code{\link{metadata_create}}. diff --git a/R/create_codebook.R b/R/create_codebook.R index 9317eca..500edf9 100644 --- a/R/create_codebook.R +++ b/R/create_codebook.R @@ -83,32 +83,32 @@ create_codebook <- function ( metadata = NULL, if ( n_labelled_numeric > 0 ) { # These area cases when the labels are of class numeric valid_labelled_numeric <- metadata_labelled_numeric %>% - filter ( grepl( "labelled", .data$class_orig )) %>% + filter ( grepl( "labelled", class_orig )) %>% select ( all_of(c("entry", "id", "filename", "var_name_orig", "var_label_orig", "valid_labels"))) %>% - unnest_longer( .data$valid_labels) %>% + unnest_longer( valid_labels) %>% rlang::set_names ( c("entry", "id", "filename", "var_name_orig","var_label_orig", "val_code_orig", "val_label_orig")) %>% mutate ( # This is the valid observation range label_range = "valid", - val_code_orig = as.character(.data$val_code_orig)) + val_code_orig = as.character(val_code_orig)) na_labelled_numeric <- metadata[num_labels ,] %>% - filter ( grepl( "labelled", .data$class_orig )) %>% + filter ( grepl( "labelled", class_orig )) %>% select ( all_of(c("entry", "id", "filename", "var_name_orig", "var_label_orig", "na_labels"))) %>% - unnest_longer( .data$na_labels) %>% + unnest_longer( na_labels) %>% purrr::set_names ( c("entry", "id", "filename", "var_name_orig", "var_label_orig", "val_code_orig", "val_label_orig")) %>% mutate ( # This is the missing observation range label_range = "missing") %>% - filter ( !is.na(.data$val_code_orig) ) %>% - mutate ( val_code_orig = as.character(.data$val_code_orig) ) + filter ( !is.na(val_code_orig) ) %>% + mutate ( val_code_orig = as.character(val_code_orig) ) num_labels <- valid_labelled_numeric %>% dplyr::bind_rows ( na_labelled_numeric ) %>% - dplyr::arrange( .data$entry, .data$val_code_orig ) %>% + dplyr::arrange( entry, val_code_orig ) %>% left_join ( metadata %>% select ( any_of(c("entry", "id", "filename", "na_range", "n_labels", "n_valid_labels", "n_na_labels", user_vars))), @@ -126,34 +126,34 @@ create_codebook <- function ( metadata = NULL, if ( n_labelled_character > 0) { # These area cases when the na_labels are of class character valid_labelled_character <- metadata_labelled_character %>% - filter ( grepl( "labelled", .data$class_orig )) %>% + filter ( grepl( "labelled", class_orig )) %>% select ( all_of(c("entry", "id", "filename", "var_name_orig", "var_label_orig", "valid_labels"))) %>% - unnest_longer( .data$valid_labels) %>% + unnest_longer( valid_labels) %>% rlang::set_names ( c("entry", "id", "filename", "var_name_orig","var_label_orig", "val_code_orig", "val_label_orig")) %>% mutate ( # This is the valid observation range label_range = "valid") %>% - mutate ( val_code_orig = as.character(.data$val_code_orig) ) + mutate ( val_code_orig = as.character(val_code_orig) ) na_labelled_character <- metadata[char_labels ,] %>% - filter ( grepl( "labelled", .data$class_orig )) %>% + filter ( grepl( "labelled", class_orig )) %>% select ( all_of(c("entry", "id", "filename", "var_name_orig", "var_label_orig", "na_labels"))) %>% - unnest_longer( .data$na_labels) %>% + unnest_longer( na_labels) %>% purrr::set_names ( c("entry", "id", "filename", "var_name_orig", "var_label_orig", "val_code_orig", "val_label_orig")) %>% mutate ( # This is the missing observation range label_range = "missing") %>% - filter ( !is.na(.data$val_code_orig)) %>% - mutate ( val_code_orig = as.character(.data$val_code_orig) ) + filter ( !is.na(val_code_orig)) %>% + mutate ( val_code_orig = as.character(val_code_orig) ) char_labels <- valid_labelled_character %>% dplyr::bind_rows ( na_labelled_character ) %>% - dplyr::arrange( .data$entry, .data$val_code_orig ) %>% + dplyr::arrange( entry, val_code_orig ) %>% left_join ( metadata %>% select ( any_of(c("entry", "id", "filename", "na_range", "n_labels", "n_valid_labels", "n_na_labels", user_vars))), @@ -178,14 +178,14 @@ create_codebook <- function ( metadata = NULL, left_join ( user_data[0,], by = "entry" ) } else if ( n_labelled_character == 0 ) { num_labels %>% - dplyr::arrange (.data$entry) + dplyr::arrange (entry) } else if ( n_labelled_numeric == 0 ) { char_labels %>% - dplyr::arrange (.data$entry) + dplyr::arrange (entry) } else { num_labels %>% bind_rows ( char_labels) %>% - dplyr::arrange (.data$entry) + dplyr::arrange (entry) } } diff --git a/R/crosswalk.R b/R/crosswalk.R index fffba5d..3cf35b5 100644 --- a/R/crosswalk.R +++ b/R/crosswalk.R @@ -25,7 +25,6 @@ #' data frames, where the variable names, and optionally the variable labels, and the missing #' value range is harmonized (the same names, labels, codes are used.) #' @importFrom dplyr filter select mutate distinct_all relocate across everything -#' @importFrom rlang .data #' @importFrom assertthat assert_that #' @family harmonization functions #' @examples @@ -90,7 +89,7 @@ crosswalk_surveys <- function(crosswalk_table, msg = "selection must have rows") select_to_harmonize <- selection %>% - filter ( !is.na(.data$val_label_orig) ) + filter ( !is.na(val_label_orig) ) vars_to_harmonize <- unique(select_to_harmonize$var_name_target) @@ -102,7 +101,7 @@ crosswalk_surveys <- function(crosswalk_table, for ( this_var in vars_to_harmonize ) { correspondence_table <- select_to_harmonize %>% - filter ( .data$var_name_target == this_var ) + filter ( var_name_target == this_var ) assert_that(is.numeric(correspondence_table$val_numeric_target), msg = "Error in relabel_survey: 'val_numeric_target' must be a numeric vector") @@ -138,15 +137,15 @@ crosswalk_surveys <- function(crosswalk_table, subset_survey <- function(this_survey) { survey_id <- attr(this_survey, "id") - assertthat::assert_that(length(survey_id)>0, - msg = "Error in subset_survey(): survey_id has 0 length.") + assert_that(length(survey_id)>0, + msg = "Error in subset_survey(): survey_id has 0 length.") tmp <- this_survey %>% mutate ( id = survey_id ) %>% - relocate ( .data$id, .before = everything()) + relocate ( id, .before = everything()) selection <- crosswalk_table %>% - filter ( .data$id == survey_id ) %>% + filter ( id == survey_id ) %>% distinct_all() @@ -322,8 +321,8 @@ crosswalk_table_create <- function(metadata) { if (nrow(metadata)==1) { fn_labels(x=metadata[1,]) } else { - ctable_list <- lapply ( 1:nrow(metadata), function(x) fn_labels(metadata[x,]) ) - ctable <- suppressMessages(purrr::reduce ( ctable_list, full_join )) + ctable_list <- lapply (1:nrow(metadata), function(x) fn_labels(metadata[x,])) + ctable <- suppressMessages(purrr::reduce(ctable_list, full_join)) ctable } } @@ -332,7 +331,6 @@ crosswalk_table_create <- function(metadata) { #' @rdname crosswalk_table_create #' @param ctable A table to validate if it is a crosswalk table. #' @importFrom dplyr tally group_by across filter -#' @importFrom rlang .data #' @family metadata functions #' @export @@ -351,8 +349,8 @@ is.crosswalk_table <- function(ctable) { distinct_all() %>% group_by ( across(c("var_name_target", "id"))) %>% tally() %>% - filter ( .data$n>1) %>% - select (.data$var_name_target ) %>% + filter ( n>1) %>% + select (var_name_target ) %>% unlist() error_msg <- paste(unique(duplicates), collapse = ', ') diff --git a/R/document_survey_item.R b/R/document_survey_item.R index 32348c9..b934d90 100644 --- a/R/document_survey_item.R +++ b/R/document_survey_item.R @@ -72,8 +72,8 @@ document_survey_item <- function(x) { tbl_length <- nrow(coding) list ( - code_table = dplyr::bind_cols(coding, labelling) %>% - mutate ( missing = ifelse (.data$values %in% attr(x, "na_values"), + code_table = bind_cols(coding, labelling) %>% + mutate ( missing = ifelse (values %in% attr(x, "na_values"), TRUE, FALSE)), history_var_name = c( c("name" = original_x_name ), diff --git a/R/harmonize_survey_variables.R b/R/harmonize_survey_variables.R index 38b20f6..cda363c 100644 --- a/R/harmonize_survey_variables.R +++ b/R/harmonize_survey_variables.R @@ -65,15 +65,15 @@ harmonize_survey_variables <- function( crosswalk_table, new_names <- tibble( var_name_orig = names(this_survey)) %>% left_join ( crosswalk_table %>% - filter (.data$id == survey_id) %>% - select ( .data$var_name_orig, .data$var_name_target ) %>% + filter (id == survey_id) %>% + select ( var_name_orig, var_name_target ) %>% distinct_all(), by = "var_name_orig", ) %>% - mutate ( var_name_target = ifelse (.data$var_name_orig == "rowid", + mutate ( var_name_target = ifelse (var_name_orig == "rowid", yes = "rowid", - no = .data$var_name_target)) %>% - select ( .data$var_name_target ) %>% unlist() %>% as.character() + no = var_name_target)) %>% + select ( var_name_target ) %>% unlist() %>% as.character() rlang::set_names(this_survey, nm = new_names ) @@ -96,15 +96,15 @@ harmonize_survey_variables <- function( crosswalk_table, new_names <- tibble( var_name_orig = names(this_survey)) %>% left_join ( crosswalk_table %>% - filter (.data$id == survey_id) %>% - select ( .data$var_name_orig, .data$var_name_target ) %>% + filter (id == survey_id) %>% + select ( var_name_orig, var_name_target ) %>% distinct_all(), by = "var_name_orig", ) %>% - mutate ( var_name_target = ifelse (.data$var_name_orig == "rowid", + mutate ( var_name_target = ifelse (var_name_orig == "rowid", yes = "rowid", - no = .data$var_name_target)) %>% - select ( .data$var_name_target ) %>% unlist() %>% as.character() + no = var_name_target)) %>% + select ( var_name_target ) %>% unlist() %>% as.character() this_survey <- rlang::set_names(this_survey, nm = new_names ) saveRDS(this_survey, file = file.path(export_path, x), version = 2 ) diff --git a/R/metadata_create.R b/R/metadata_create.R index b82d995..2419a10 100644 --- a/R/metadata_create.R +++ b/R/metadata_create.R @@ -90,7 +90,6 @@ metadata_waves_create <- function(survey_list) { #' @importFrom labelled na_values na_range val_labels var_label #' @importFrom purrr map #' @importFrom assertthat assert_that -#' @importFrom rlang .data #' @family metadata functions #' @return A nested data frame with metadata and the range of #' labels, na_values and the na_range itself. @@ -219,14 +218,14 @@ metadata_survey_create <- function(survey) { return_df <- metadata %>% left_join ( range_df %>% - group_by ( .data$var_name_orig ) %>% + group_by ( var_name_orig ) %>% tidyr::nest(), by = "var_name_orig") %>% tidyr::unnest ( cols = "data" ) %>% ungroup() %>% - mutate ( n_na_labels = as.numeric(.data$n_na_labels), - n_valid_labels = as.numeric(.data$n_valid_labels), - n_labels = as.numeric(.data$n_labels)) %>% + mutate ( n_na_labels = as.numeric(n_na_labels), + n_valid_labels = as.numeric(n_valid_labels), + n_labels = as.numeric(n_labels)) %>% as.data.frame() change_label_to_empty <- function() { @@ -247,7 +246,7 @@ metadata_survey_create <- function(survey) { no = return_df$na_labels ) return_df %>% - select ( -.data$label_type ) + select ( -label_type ) } diff --git a/R/pull_survey.R b/R/pull_survey.R index 79a88d3..cafca8e 100644 --- a/R/pull_survey.R +++ b/R/pull_survey.R @@ -1,6 +1,6 @@ -#' Pull a survey from a survey list +#' @title Pull a survey from a survey list #' -#' Pull a survey by survey code or id. +#' @description Pull a survey by survey code or id. #' #' @param survey_list A list of surveys #' @param id The id of the requested survey. If \code{NULL} use diff --git a/R/read_spss.R b/R/read_spss.R index 2038498..c19b124 100644 --- a/R/read_spss.R +++ b/R/read_spss.R @@ -18,6 +18,7 @@ #' \code{tibble::\link[tibble:as_tibble]{as_tibble}} for details. #' @inheritParams read_rds #' @importFrom haven read_spss read_sav write_sav is.labelled +#' @importFrom assertthat assert_that #' @importFrom tibble rowid_to_column as_tibble #' @importFrom fs path_ext_remove path_file is_file #' @importFrom labelled var_label @@ -58,7 +59,7 @@ read_spss <- function(file, source_file_info <- valid_file_info(file) - safely_read_haven_spss <- purrr::safely(.f = haven::read_spss) + safely_read_haven_spss <- safely(.f = haven::read_spss) tmp <- safely_read_haven_spss (file = file, user_na = user_na, @@ -78,13 +79,13 @@ read_spss <- function(file, all_vars <- names(tmp) - assertthat::assert_that(length(all_vars)>0, - msg = "The SPSS file has no names.") + assert_that(length(all_vars)>0, + msg = "The SPSS file has no names.") - filename <- fs::path_file(file) + filename <- path_file(file) if ( is.null(id) ) { - id <- fs::path_ext_remove ( filename ) + id <- path_ext_remove(filename) } if ( is.null(doi)) { @@ -161,7 +162,7 @@ read_spss <- function(file, return_survey <- survey (return_df, id=id, filename=filename, doi=doi) object_size <- as.numeric(object.size(as_tibble(return_df))) - attr(return_survey, "object_size") <- object_size + attr(return_survey, "object_size") <- object_size attr(return_survey, "source_file_size") <- source_file_info$size return_survey diff --git a/R/read_surveys.R b/R/read_surveys.R index 3a3b88d..39cddb1 100644 --- a/R/read_surveys.R +++ b/R/read_surveys.R @@ -23,6 +23,7 @@ #' information is recorded for a reproducible workflow. #' @importFrom purrr safely #' @importFrom fs path_file +#' @importFrom assertthat assert_that #' @examples #' file1 <- system.file( #' "examples", "ZA7576.rds", package = "retroharmonize") @@ -38,11 +39,24 @@ read_surveys <- function ( survey_paths, .f = NULL, export_path = NULL ) { + import_file_vector <- survey_paths + existing_files <- which(file.exists(import_file_vector)) + not_existing_files <- which(! file.exists(import_file_vector)) - import_file_list <- survey_paths + if ( length(existing_files)==0) { + stop ("None of the files on read_surveys(survey_paths=...) exist.") + } - return_survey_list <- lapply ( import_file_list, function(x) read_survey(x, .f, export_path)) - + if (length(not_existing_files)>0) { + missing_files <- paste(import_file_vector[not_existing_files], collapse = ";\n") + warning("Some files on 'survey_pahts' do not exist:\n", missing_files) + } + + import_file_vector <- import_file_vector[existing_files] + + return_survey_list <- lapply ( import_file_vector, + function(x) read_survey(x, .f, export_path) + ) return_survey_list } diff --git a/R/retroharmonize-package.R b/R/retroharmonize-package.R new file mode 100644 index 0000000..a65cf64 --- /dev/null +++ b/R/retroharmonize-package.R @@ -0,0 +1,6 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +## usethis namespace: end +NULL diff --git a/R/subset_surveys.R b/R/subset_surveys.R index e429c6d..91b8d47 100644 --- a/R/subset_surveys.R +++ b/R/subset_surveys.R @@ -39,8 +39,8 @@ #' ) #' #' subset_surveys(survey_list = example_surveys, -#' subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"), -#' subset_name = "subset_example") +#' subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"), +#' subset_name = "subset_example") #' @export subset_surveys <- function ( survey_list, @@ -64,7 +64,10 @@ subset_surveys <- function ( survey_list, } else { subset_from_files <- TRUE } - if ( !is.null(survey_paths) ) validate_survey_files(survey_paths) + if ( !is.null(survey_paths) ) { + validate_survey_files(survey_paths) + } + if ( !is.null(crosswalk_table)) { is.crosswalk_table(crosswalk_table) if (!is.null(survey_paths)) { @@ -102,12 +105,14 @@ subset_surveys <- function ( survey_list, this_path <- files_to_subset[x] this_id <- crosswalk_table %>% filter ( - .data$filename == fs::path_file(this_path) - ) %>% distinct(.data$id) %>% unlist() %>% as.character() + filename == fs::path_file(this_path) + ) %>% distinct(id) %>% + unlist() %>% + as.character() subset_vars <- crosswalk_table %>% - filter ( .data$id == this_id ) %>% - select ( .data$var_name_orig ) %>% + filter ( id == this_id ) %>% + select ( var_name_orig ) %>% unlist() %>% as.character() %>% unique() @@ -164,8 +169,8 @@ subset_surveys <- function ( survey_list, this_survey <- survey_list[[x]] subset_vars <- crosswalk_table %>% - filter ( .data$id == attr(this_survey, "id")) %>% - select ( .data$var_name_orig ) %>% + filter ( id == attr(this_survey, "id")) %>% + select ( var_name_orig ) %>% unlist() %>% as.character() %>% unique() diff --git a/R/utils.R b/R/utils.R index 68e4256..9d06073 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,8 +5,10 @@ cat_line <- function(...) { # identical to haven:::cat_line cat(paste0(..., "\n", collapse = "")) } + #' @keywords internal is.labelled_spss <- function (x) inherits(x, "haven_labelled_spss") + #' @keywords internal is.labelled <- function (x) inherits(x, "haven_labelled") diff --git a/man/retroharmonize-package.Rd b/man/retroharmonize-package.Rd new file mode 100644 index 0000000..4e8ea9f --- /dev/null +++ b/man/retroharmonize-package.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/retroharmonize-package.R +\docType{package} +\name{retroharmonize-package} +\alias{retroharmonize} +\alias{retroharmonize-package} +\title{retroharmonize: Ex Post Survey Data Harmonization} +\description{ +\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} + +Assist in reproducible retrospective (ex-post) harmonization of data, particularly individual level survey data, by providing tools for organizing metadata, standardizing the coding of variables, and variable names and value labels, including missing values, and documenting the data transformations, with the help of comprehensive s3 classes. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://retroharmonize.dataobservatory.eu/} + \item Report bugs at \url{https://github.com/rOpenGov/retroharmonize/issues/} +} + +} +\author{ +\strong{Maintainer}: Daniel Antal \email{daniel.antal@dataobservatory.eu} (\href{https://orcid.org/0000-0001-7513-6760}{ORCID}) + +Other contributors: +\itemize{ + \item Marta Kolczynska \email{mkolczynska@gmail.com} (\href{https://orcid.org/0000-0003-4981-0437}{ORCID}) [contributor] +} + +} +\keyword{internal} diff --git a/man/subset_surveys.Rd b/man/subset_surveys.Rd index 9f7446d..cf6afd1 100644 --- a/man/subset_surveys.Rd +++ b/man/subset_surveys.Rd @@ -78,7 +78,7 @@ example_surveys <- read_surveys( ) subset_surveys(survey_list = example_surveys, - subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"), - subset_name = "subset_example") + subset_vars = c("rowid", "isocntry", "qa10_1", "qa14_1"), + subset_name = "subset_example") } \concept{subsetting function} diff --git a/tests/testthat/retroh_int.txt b/tests/testthat/retroh_int.txt index 56be782..12745cc 100644 --- a/tests/testthat/retroh_int.txt +++ b/tests/testthat/retroh_int.txt @@ -1,14 +1,14 @@ > pillar::pillar(x1) - 1 [Good] - 2 - 3 - 4 - 5 - 6 - 7 - 8 [Bad] - 9 (NA) - 10 (NA) + 1 [Good] + 2 + 3 + 4 + 5 + 6 + 7 + 8 [Bad] + 9 (NA) +10 (NA) diff --git a/tests/testthat/retroh_int_shaft.txt b/tests/testthat/retroh_int_shaft.txt index f90c407..0245242 100644 --- a/tests/testthat/retroh_int_shaft.txt +++ b/tests/testthat/retroh_int_shaft.txt @@ -1,16 +1,16 @@ > pillar::pillar_shaft(tibble(v1 = x1)) # A tibble: 10 x 1 - v1 + v1 - 1 1 [Good] - 2 2 - 3 3 - 4 4 - 5 5 - 6 6 - 7 7 - 8 8 [Bad] - 9 9 (NA) -10 10 (NA) + 1 1 [Good] + 2 2 + 3 3 + 4 4 + 5 5 + 6 6 + 7 7 + 8 8 [Bad] + 9 9 (NA) +10 10 (NA) diff --git a/tests/testthat/test-codebook_create.R b/tests/testthat/test-codebook_create.R index fd4a62e..e69de29 100644 --- a/tests/testthat/test-codebook_create.R +++ b/tests/testthat/test-codebook_create.R @@ -1,55 +0,0 @@ -metadata <- metadata_create ( - survey_list = read_rds ( - system.file("examples", "ZA7576.rds", - package = "retroharmonize") - )) - -metadata_2 <- metadata -metadata_2$user_var <- paste0(1:nrow(metadata), "_user") - -names (metadata ) -test_codebook <- create_codebook ( metadata ) -test_codebook_2 <- create_codebook ( metadata = metadata_2 ) - -names ( test_codebook_2) - -examples_dir <- system.file("examples", package = "retroharmonize") -survey_list <- dir(examples_dir)[grepl("\\.rds", dir(examples_dir))] - -example_surveys <- read_surveys( - file.path( examples_dir, survey_list), - export_path = NULL) - -test_survey_codebook <- create_codebook (survey = example_surveys[[1]]) - -test_that("Deprecation warning is given", { - expect_warning(codebook_waves_create ( waves = example_surveys )) -}) - -waves_codebook <- codebook_surveys_create ( survey_list = example_surveys ) - -test_that("correct codebook structure is returned", { - expect_true ( "user_var" %in% names(test_codebook_2)) - expect_true(all(names (test_survey_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) - expect_true(all(names (test_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) - expect_true(all(names (waves_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) -}) - - -test_that("correct codebook contents are returned", { - expect_true ( - ## all files are present in the codebook - all (survey_list %in% unique ( waves_codebook$filename)) - ) - expect_true ( - # The label range is correctly identified - all ( c("valid", "missing") %in% unique(waves_codebook$label_range)) - ) - }) - -test_that("error handling", { - expect_error ( create_codebook( survey = data.frame()) ) - expect_error ( create_codebook(metadata = NULL, survey = NULL )) -}) - - diff --git a/tests/testthat/test-create_codebook.R b/tests/testthat/test-create_codebook.R new file mode 100644 index 0000000..bd5de66 --- /dev/null +++ b/tests/testthat/test-create_codebook.R @@ -0,0 +1,55 @@ +metadata <- metadata_create ( + survey_list = read_rds ( + system.file("examples", "ZA7576.rds", + package = "retroharmonize") + )) + +metadata_2 <- metadata +metadata_2$user_var <- paste0(1:nrow(metadata), "_user") + +names (metadata ) +test_codebook <- create_codebook ( metadata ) +test_codebook_2 <- create_codebook ( metadata = metadata_2 ) + +names ( test_codebook_2) + +examples_dir <- system.file("examples", package = "retroharmonize") +survey_list <- dir(examples_dir)[grepl("\\.rds", dir(examples_dir))] + +example_surveys <- read_surveys( + file.path( examples_dir, survey_list), + export_path = NULL) + +test_survey_codebook <- create_codebook (survey = example_surveys[[1]]) + +test_that("Deprecation warning is given", { + expect_warning(codebook_waves_create ( waves = example_surveys )) +}) + +waves_codebook <- codebook_surveys_create ( survey_list = example_surveys ) + +test_that("correct codebook structure is returned", { + expect_true ( "user_var" %in% names(test_codebook_2)) + expect_true(all(names (test_survey_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) + expect_true(all(names (test_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) + expect_true(all(names (waves_codebook)[! names ( test_codebook ) %in% names(metadata)] %in% c("entry","val_code_orig","val_label_orig","label_range"))) +}) + + +test_that("correct codebook contents are returned", { + expect_true ( + ## all files are present in the codebook + all (survey_list %in% unique ( waves_codebook$filename)) + ) + expect_true ( + # The label range is correctly identified + all ( c("valid", "missing") %in% unique(waves_codebook$label_range)) + ) +}) + +test_that("error handling", { + expect_error ( create_codebook( survey = data.frame()) ) + expect_error ( create_codebook(metadata = NULL, survey = NULL )) +}) + + diff --git a/tests/testthat/test-harmonize_survey_variables.R b/tests/testthat/test-harmonize_survey_variables.R index dcf2be0..d7167de 100644 --- a/tests/testthat/test-harmonize_survey_variables.R +++ b/tests/testthat/test-harmonize_survey_variables.R @@ -1,2 +1,25 @@ +examples_dir <- system.file("examples", package = "retroharmonize") +survey_list <- dir(examples_dir)[grepl("\\.rds", dir(examples_dir))] +example_surveys <- read_surveys( + file.path( examples_dir, survey_list), + export_path = NULL) -## Please refer to test-crosswalk \ No newline at end of file +documented_surveys <- metadata_create(example_surveys) +documented_surveys <- documented_surveys[ + documented_surveys$var_name_orig %in% c( "rowid", "isocntry", "w1", "qd3_4", + "qd3_8" , "qd7.4", "qd7.8", "qd6.4", "qd6.8"), + ] +crosswalk_table <- crosswalk_table_create ( metadata = documented_surveys ) + +freedom_table <- crosswalk_table[ + which(crosswalk_table$var_name_target %in% c("rowid", "freedom")), + ] + +harmonized <- harmonize_survey_variables(crosswalk_table = freedom_table, + subset_name = 'freedom', + survey_list = example_surveys ) + + +test_that("harmonize_survey_variables() subsets and renames as expected", { + expect_true(is.list(harmonized)) +}) diff --git a/tests/testthat/test-read_surveys.R b/tests/testthat/test-read_surveys.R index 7b23aae..543d017 100644 --- a/tests/testthat/test-read_surveys.R +++ b/tests/testthat/test-read_surveys.R @@ -5,20 +5,20 @@ my_rds_files <- file.path(examples_dir, dir( examples_dir)[grepl(".rds", example_surveys <- read_surveys(survey_paths = my_rds_files) -test_that("Correct file is read", { +test_that("read_surveys() reads the correct file", { expect_equal(attr(read_survey( file_path = my_rds_files[1]), "filename"), fs::path_file(my_rds_files[1])) }) +test_that("read_surveys() error if file does not exists", { + expect_error(read_surveys(tempfile(), .f = "read_csv", export_path = NULL)) +}) + -test_that("All files are read", { +test_that("read_surveys() reads all files", { expect_equal(length(example_surveys), 3) }) -wrong_files <- c(file.path(examples_dir), "no_file.rds", example_surveys) -test_that("exception handling works", { - expect_error(read_surveys(wrong_files)) -}) \ No newline at end of file diff --git a/tests/testthat/test-subset_save_surveys.R b/tests/testthat/test-subset_save_surveys.R index d47f2d3..3a08641 100644 --- a/tests/testthat/test-subset_save_surveys.R +++ b/tests/testthat/test-subset_save_surveys.R @@ -70,10 +70,10 @@ test_that("saving and subsetting", { test_that("saving and subsetting (not on CRAN)", { skip_on_cran() expect_true(ncol(readRDS ( file.path(tempdir(), "ZA7576_tested.rds"))) == 3) - expect_error(save_surveys ( crosswalk_table = ctable, - subset_name = "tested", - import_path = NULL, - export_path = NULL)) + expect_error(save_surveys( crosswalk_table = ctable, + subset_name = "tested", + import_path = NULL, + export_path = NULL)) }) diff --git a/tests/testthat/test-subset_surveys.R b/tests/testthat/test-subset_surveys.R new file mode 100644 index 0000000..4b91d42 --- /dev/null +++ b/tests/testthat/test-subset_surveys.R @@ -0,0 +1,21 @@ +test_survey <- read_rds ( + file = system.file("examples", "ZA7576.rds", + package = "retroharmonize") +) + +test_metadata <- metadata_create ( test_survey ) +test_metadata <- test_metadata[c(1,7,18),] +ctable_2 <- crosswalk_table_create(test_metadata) +ctable_2$var_name_target <- ifelse(ctable_2$var_name_orig == "qa14_3", + "trust_ecb", + ctable_2$var_name_orig) + +subsetted <- subset_surveys (crosswalk_table = ctable_2, + subset_name = "tested", + survey_list = test_survey, + import_path = NULL) + + +test_that("multiplication works", { + expect_true(is.survey(subsetted[[1]])) +}) diff --git a/vignettes/cap.Rmd b/vignettes/cap.Rmd index d1b6b99..f919f04 100644 --- a/vignettes/cap.Rmd +++ b/vignettes/cap.Rmd @@ -99,12 +99,10 @@ From the metadata description, we select the post-stratification weight variable ```{r weightvars} weight_variables <- cap_metadata %>% - filter ( .data$var_name_orig %in% c("isocntry", "wex", "wextra", "v47", "v7", "w1") | - .data$var_label_orig %in% c("w_1_weight_result_from_target", - "w_3_weight_special_germany", - "weight_result_from_traget_united_germany", - "w_4_weight_special_united_kingdom", - "weight_result_from_traget_united_kingdom")) + filter ( + var_name_orig %in% c("isocntry", "wex", "wextra", "v47", "v7", "w1") | var_label_orig %in% c("w_1_weight_result_from_target", + "w_3_weight_special_germany", "weight_result_from_traget_united_germany", "w_4_weight_special_united_kingdom", "weight_result_from_traget_united_kingdom") + ) ``` A *schema crosswalk* is a table that shows equivalent elements (or "fields") in more than one structured data source. With `crosswalk_table_create()` we first create an empty schema crosswalk, then we fill up the empty schema with values. Researchers who feel more comfortable working in a spreadsheet application can create a similar crosswalk table in Excel, Numbers, or OpenOffice, and import the data from a `csv` or any tabular file. @@ -119,14 +117,16 @@ weigthing_crosswalk_table <- crosswalk_table_create( # Define the new, harmonized variable names var_name_target = case_when ( # grepl("weight_result_from_target", .data$val_label_target) ~ "w1", [this is the issue] - .data$var_name_orig %in% c("wex", "wextra", "v47") ~ 'wex', - .data$var_name_orig %in% c("w1", "v8") ~ "w1", - .data$var_name_orig %in% c("w3a", "v12") ~ "w_de", - .data$var_name_orig %in% c("w4a", "v10") ~ "w_uk", - .data$var_name_orig == "rowid" ~ 'rowid', # do not forget to keep the unique row IDs + var_name_orig %in% c("wex", "wextra", "v47") ~ 'wex', + var_name_orig %in% c("w1", "v8") ~ "w1", + var_name_orig %in% c("w3a", "v12") ~ "w_de", + var_name_orig %in% c("w4a", "v10") ~ "w_uk", + var_name_orig == "rowid" ~ 'rowid', # do not forget to keep the unique row IDs TRUE ~ "geo"), # Define the target R class for working with these variables. - class_target = ifelse(.data$var_name_target %in% c("geo", "v47"), "factor", "numeric") + class_target = ifelse(var_name_target %in% c("geo", "v47"), + yes = "factor", + no = "numeric") ) %>% select ( -all_of(c("val_numeric_orig", "val_numeric_target", "val_label_orig", "val_label_target")) @@ -164,8 +164,8 @@ weight_vars <- weight_vars %>% country_code == "GB" ~ w_uk, # UK = Great Britain + Northern Ireland TRUE ~ w1 )) %>% mutate (year_survey = case_when( - .data$id == "ZA4529_v3-0-1" ~ '2007', - .data$id == "ZA5688_v6-0-0" ~ '2013' + id == "ZA4529_v3-0-1" ~ '2007', + id == "ZA5688_v6-0-0" ~ '2013' )) %>% mutate (year_survey = as.factor(.data$year_survey)) ``` @@ -174,7 +174,8 @@ weight_vars <- weight_vars %>% ```{r printweigthvars} weight_vars <- weight_vars %>% - select ( all_of(c("rowid", "country_code", "geo", "w", "w1", "wex", "id")) ) + select ( all_of(c("rowid", "country_code", "geo", + "w", "w1", "wex", "id")) ) set.seed(2022) weight_vars %>% sample_n(6) ```