diff --git a/DESCRIPTION b/DESCRIPTION index e39cc6c..cd49460 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,13 @@ Package: ECOTOXr Type: Package Title: Download and Extract Data from US EPA's ECOTOX Database -Version: 0.1.1 -Date: 2021-10-04 +Version: 0.2.0 +Date: 2022-11-17 Authors@R: c(person("Pepijn", "de Vries", role = c("aut", "cre", "dtc"), - email = "pepijn.devries@outlook.com")) + email = "pepijn.devries@outlook.com", + comment = list(ORCID = "0000-0002-7961-6646"))) Author: - Pepijn de Vries [aut, cre, dtc] + Pepijn de Vries [aut, cre, dtc] (0000-0002-7961-6646) Maintainer: Pepijn de Vries Description: The US EPA ECOTOX database is a freely available database with a treasure of aquatic and terrestrial ecotoxicological data. @@ -19,13 +20,22 @@ Depends: RSQLite Imports: crayon, + dbplyr, dplyr, + httr, + purrr, rappdirs, readr, + rlang, rvest, stringr, + tibble, + tidyr, + tidyselect, utils -Suggests: +Suggests: + DBI, + standartox, testthat (>= 3.0.0), webchem URL: diff --git a/NAMESPACE b/NAMESPACE index cd80a46..e860a07 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,18 +1,45 @@ -# Generated by roxygen2: do not edit by hand - -export(build_ecotox_sqlite) -export(check_ecotox_availability) -export(cite_ecotox) -export(dbConnectEcotox) -export(dbDisconnectEcotox) -export(download_ecotox_data) -export(get_ecotox_info) -export(get_ecotox_path) -export(get_ecotox_sqlite_file) -export(list_ecotox_fields) -export(search_ecotox) -export(search_query_ecotox) -importFrom(RSQLite,dbConnect) -importFrom(RSQLite,dbDisconnect) -importFrom(RSQLite,dbExecute) -importFrom(RSQLite,dbWriteTable) +# Generated by roxygen2: do not edit by hand + +S3method("[",cas) +S3method("[<-",cas) +S3method("[[",cas) +S3method("[[<-",cas) +S3method(as.character,cas) +S3method(as.data.frame,cas) +S3method(as.double,cas) +S3method(as.integer,cas) +S3method(as.list,cas) +S3method(c,cas) +S3method(format,cas) +S3method(print,cas) +export("%>%") +export(as.cas) +export(build_ecotox_sqlite) +export(cas) +export(check_ecotox_availability) +export(cite_ecotox) +export(dbConnectEcotox) +export(dbDisconnectEcotox) +export(download_ecotox_data) +export(get_ecotox_info) +export(get_ecotox_path) +export(get_ecotox_sqlite_file) +export(get_ecotox_url) +export(is.cas) +export(list_ecotox_fields) +export(search_ecotox) +export(search_ecotox_lazy) +export(search_query_ecotox) +export(show.cas) +importFrom(RSQLite,dbConnect) +importFrom(RSQLite,dbDisconnect) +importFrom(RSQLite,dbExecute) +importFrom(RSQLite,dbWriteTable) +importFrom(dplyr,"%>%") +importFrom(dplyr,collect) +importFrom(dplyr,inner_join) +importFrom(dplyr,left_join) +importFrom(dplyr,select) +importFrom(dplyr,sql) +importFrom(dplyr,tbl) +importFrom(rlang,":=") diff --git a/NEWS b/NEWS index 445c55a..845b7db 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,19 @@ +ECOTOXr v0.2.0 (Release date: ***********TODO) +============= + + * Major changes: + + * Modified searching routines to make advantage of + the sql parser and optimisers implemented in + the package 'dplyr'. Performance (i.e. speed) of the + search routines have improved considerably. + * Added support for handling Chemical Abstracts + Service (CAS) numbers. + + * Several minor adjustments and corrections to code + and manual. These include fixes to address notes + from CRAN checks. + ECOTOXr v0.1.1 (Release date: 2021-10-04) ============= diff --git a/R/ECOTOXr.r b/R/ECOTOXr.r index 2808b41..7d31fd2 100644 --- a/R/ECOTOXr.r +++ b/R/ECOTOXr.r @@ -18,7 +18,8 @@ #' First download a copy of the complete EPA database. This can be done by calling \code{\link{download_ecotox_data}}. #' This may not always work on all machines as R does not always accept the website SSL certificate from the EPA. #' In those cases the zipped archive with the database files can be downloaded manually with a different (more -#' forgiving) browser. The files from the zip archive can be extracted to a location of choice. +#' forgiving) browser. The files from the zip archive can be extracted to a location of choice. Alternatively, +#' the user could try to use \code{\link{download_ecotox_data}(ssl_verifypeer = 0L)} when the download URL is trusted. #' } #' \item{ #' Next, an SQLite database needs to be build from the downloaded files. This will be done automatically when diff --git a/R/cas_handlers.r b/R/cas_handlers.r new file mode 100644 index 0000000..2682964 --- /dev/null +++ b/R/cas_handlers.r @@ -0,0 +1,279 @@ +#' Functions for handling chemical abstract service (CAS) registry numbers +#' +#' Functions for handling chemical abstract service (CAS) registry numbers +#' +#' In the database \href{https://en.wikipedia.org/wiki/Chemical_Abstracts_Service}{CAS registry} numbers are stored +#' as text (type \code{character}). As CAS numbers can consist of a maximum of 10 digits (plus two hyphens) this means +#' that each CAS number can consume up to 12 bytes of memory or disk space. By storing the data numerically, only +#' 5 bytes are required. These functions provide the means to handle CAS registry numbers and coerce from and to +#' different formats and types. +#' @param x Object from which data needs to be extracted or replaced, or needs to be coerced into a specific +#' format. For nearly all of the functions documented here, this needs to be an object of the S3 class 'cas', +#' which can be created with \code{as.cas}. For \code{as.cas}, \code{x} can be a \code{character} (CAS registry number +#' with or without hyphenation) or a \code{numeric} value. Note that \code{as.cas} will only accept correctly +#' formatted and valid CAS registry numbers. +#' @param i Index specifying element(s) to extract or replace. See also \code{\link[base:Extract]{Extract}}. +#' @param value A replacement value, can be anything that can be converted into an S3 cas-class object with \code{as.cas}. +#' @param length A non-negative \code{integer} specifying the desired length. Double values will be coerced to +#' \code{integer}: supplying an argument of length other than one is an error. +#' @param hyphenate A \code{logical} value indicating whether the formatted CAS number needs to be hyphenated. +#' Default is \code{TRUE}. +#' @param ... Arguments passed to other functions +#' @return Functions \code{cas}, \code{c} and \code{as.cas} return S3 class 'cas' objects. Coercion functions +#' (starting with 'as') return the object as specified by their respective function names (i.e., \code{integer}, +#' \code{double}, \code{character}, \code{list} and \code{data.frame}). The \code{show.cas} and \code{print} functions +#' also return formatted \code{charater}s. The function \code{is.cas} will return a single \code{logical} value, +#' indicating whether \code{x} is a valid S3 cas-class object. The square brackets return the selected index/indices, +#' or the \code{vector} of cas objects where the selected elements are replaced by \code{value}. +#' @rdname cas +#' @name cas +#' @examples +#' ## This will generate a vector of cas objects containing 10 +#' ## fictive (0-00-0), but valid registry numbers: +#' cas(10) +#' +#' ## This is a cas-object: +#' is.cas(cas(0L)) +#' +#' ## This is not a cas-object: +#' is.cas(0L) +#' +#' ## Three different ways of creating a cas object from +#' ## Benzene's CAS registry number (the result is the same) +#' as.cas("71-43-2") +#' as.cas("71432") +#' as.cas(71432L) +#' +#' ## This is one way of creating a vector with multiple CAS registry numbers: +#' cas_data <- as.cas(c("64175", "71432", "58082")) +#' +#' ## This is how you select a specific element(s) from the vector: +#' cas_data[2:3] +#' cas_data[[2]] +#' +#' ## You can also replace specific elements in the vector: +#' cas_data[1] <- "7440-23-5" +#' cas_data[[2]] <- "129-00-0" +#' +#' ## You can format CAS numbers with or without hyphens: +#' format(cas_data, TRUE) +#' format(cas_data, FALSE) +#' +#' ## The same can be achieved using as.character +#' as.character(cas_data, TRUE) +#' as.character(cas_data, FALSE) +#' +#' ## There are also show and print methods available: +#' show(cas_data) +#' print(cas_data) +#' +#' ## Numeric values can be obtained from CAS using as.numeric, as.double or as.integer +#' as.numeric(cas_data) +#' +#' ## Be careful, however. Some CAS numbers cannot be represented by R's 32 bit integers +#' ## and will produce NA's. This will work OK: +#' huge_cas <- as.cas("9999999-99-5") +#' +#' \dontrun{ +#' ## This will not: +#' as.integer(huge_cas) +#' } +#' +#' ## The trick applied by this package is that the final +#' ## validation digit is stored separately as attribute: +#' unclass(huge_cas) +#' +#' ## This is how cas objects can be concatenated: +#' cas_data <- c(huge_cas, cas_data) +#' +#' ## This will create a data.frame +#' as.data.frame(cas_data) +#' +#' ## This will create a list: +#' as.list(cas_data) +#' @author Pepijn de Vries +#' @export +cas <- function(length = 0L) { + structure ( + integer(length), ## The registry number is stored as integer without the final digit (=checksum) + checksum = raw(length), ## last digit of CAS number, which serves as a checksum, stored as raw value + class = "cas" + ) +} + +#' @rdname cas +#' @name is.cas +#' @export +is.cas <- function(x) { + if (!(class(x) %in% "cas")) return(F) + checksums <- attributes(x)$checksum + if (length(checksums) != length(x)) stop("Each CAS registry in the vector needs a checksum") + validate <- outer(unclass(x), 0:9, function(x, y) { + floor(x/(10^y)) %% 10 + }) + validate <- apply(validate, 1, function(x) { + x <- sum(seq_along(x)*x) %% 10 + }) + return(all(validate == as.numeric(checksums))) +} + +#' @rdname cas +#' @name as.cas +#' @export +as.cas <- function(x) { + if (is.cas(x)) return(x) + x <- as.character(x) + is_hyphenated <- stringr::str_sub(x, -2, -2) == "-" & stringr::str_sub(x, -5, -5) == "-" + x[is_hyphenated] <- paste0( + stringr::str_sub(x[is_hyphenated], 1, -6), + stringr::str_sub(x[is_hyphenated], -4, -3), + stringr::str_sub(x[is_hyphenated], -1, -1) + ) + if (any(!grepl("^[0-9]+$", x))) stop("CAS numbers can only contain hyphens at correct positions and numeric characters otherwise...") + registry <- as.integer(stringr::str_sub(x, 1, -2)) + registry[is.na(registry)] <- 0L + attributes(registry)$checksum <- as.raw(as.integer(stringr::str_sub(x, -1, -1))) + class(registry) <- "cas" + if (!is.cas(registry)) stop("Input contains invalid CAS numbers") + registry +} + +#' @rdname cas +#' @name [[.cas +#' @export +`[[.cas` <- function(x, i) { + attribs <- attributes(x) + attribs$checksum <- attribs$checksum[[i]] + attribs$names <- attribs$names[[i]] + x <- unclass(x) + x <- x[[i]] + attributes(x) <- attribs + x +} + +#' @rdname cas +#' @name [.cas +#' @export +`[.cas` <- function(x, i) { + attribs <- attributes(x) + attribs$checksum <- attribs$checksum[i] + attribs$names <- attribs$names[i] + x <- unclass(x) + x <- x[i] + attributes(x) <- attribs + x +} + +#' @rdname cas +#' @name [[<-.cas +#' @export +`[[<-.cas` <- function(x, i, value) { + value <- as.cas(value) + attribs <- attributes(x) + attribs$checksum[[i]] <- attributes(value)$checksum + attribs$names[[i]] <- attributes(value)$names + x <- unclass(x) + x[[i]] <- unclass(value) + attributes(x) <- attribs + x +} + +#' @rdname cas +#' @name [<-.cas +#' @export +`[<-.cas` <- function(x, i, value) { + value <- as.cas(value) + attribs <- attributes(x) + attribs$checksum[i] <- attributes(value)$checksum + attribs$names[i] <- attributes(value)$names + x <- unclass(x) + x[i] <- unclass(value) + attributes(x) <- attribs + x +} + +#' @rdname cas +#' @name format.cas +#' @export +format.cas <- function(x, hyphenate = TRUE, ...) { + checksums <- attributes(x)$checksum + x <- unclass(x) + repp <- x + repp[repp == 0] <- 1 + repp <- ceiling(2 - log10(repp)) + repp[repp < 0] <- 0 + x <- paste0(strrep("0", repp), x) + sprintf("%s%s%s%s%01i", + stringr::str_sub(x, 1, -3), + ifelse(hyphenate, "-", ""), + stringr::str_sub(x, -2, -1), + ifelse(hyphenate, "-", ""), + as.numeric(checksums) + ) +} + +#' @rdname cas +#' @name as.character.cas +#' @export +as.character.cas <- function(x, ...) { + format(x, ...) +} + +#' @rdname cas +#' @name show.cas +#' @export +show.cas <- function(x, ...) { + format(x, ...) +} + +#' @rdname cas +#' @name print.cas +#' @export +print.cas <- function(x, ...) { + if (length(x) == 0) + cat("cas(0)\n") else print(format.cas(x), ...) +} + +#' @rdname cas +#' @name as.list.cas +#' @export +as.list.cas <- function(x, ...) { + lapply(seq_along(x), function(i) x[i]) +} + +#' @rdname cas +#' @name as.double.cas +#' @export +as.double.cas <- function(x, ...) { + as.double(as.integer.cas(x, ...), ...) +} + +#' @rdname cas +#' @name as.integer.cas +#' @export +as.integer.cas <- function(x, ...) { + checksums <- as.integer(attributes(x)$checksum, ...) + x <- 10L*unclass(x) + attributes(x) <- NULL + x + checksums +} + +#' @rdname cas +#' @name c.cas +#' @export +c.cas <- function(...) { + result <- list(...) + result <- lapply(result, as.cas) + checksums <- do.call(c, lapply(result, function(x) attributes(x)$checksum)) + result <- do.call(c, lapply(result, function(x) unclass(x))) + class(result) <- "cas" + attributes(result)$checksum <- checksums + result +} + +#' @rdname cas +#' @name as.data.frame.cas +#' @export +as.data.frame.cas <- function(...) { + as.data.frame(tibble::tibble(...)) +} diff --git a/R/database_access.r b/R/database_access.r index 8426127..dc756a2 100644 --- a/R/database_access.r +++ b/R/database_access.r @@ -142,8 +142,11 @@ get_ecotox_info <- function(path = get_ecotox_path(), version) { #' This can be useful when specifying a \code{\link{search_ecotox}}, to identify which fields #' are available from the database, for searching and output. #' @param which A \code{character} string that specifies which fields to return. Can be any of: -#' '\code{default}': returns default output field names; '\code{all}': returns all fields; or -#' '\code{full}': returns all except fields from table 'dose_response_details'. +#' '\code{default}': returns default output field names; '\code{all}': returns all fields; +#' '\code{extended}': returns all fields of the default tables; or +#' '\code{full}': returns all fields except those from tables 'chemical_carriers', +#' 'media_characteristics', 'doses', 'dose_responses', +#' 'dose_response_details', 'dose_response_links' and 'dose_stat_method_codes'. #' @param include_table A \code{logical} value indicating whether the table name should be included #' as prefix. Default is \code{TRUE}. #' @return Returns a \code{vector} of type \code{character} containing the field names from the ECOTOX database. @@ -156,16 +159,20 @@ get_ecotox_info <- function(path = get_ecotox_path(), version) { #' ## All fields that are available from the ECOTOX database: #' list_ecotox_fields("all") #' -#' ## All except fields from the table 'dose_response_details' +#' ## All except fields from the tables 'chemical_carriers', 'media_characteristics', 'doses', +#' ## 'dose_responses', 'dose_response_details', 'dose_response_links' and 'dose_stat_method_codes' #' ## that are available from the ECOTOX database: #' list_ecotox_fields("full") #' @author Pepijn de Vries #' @export -list_ecotox_fields <- function(which = c("default", "full", "all"), include_table = TRUE) { +list_ecotox_fields <- function(which = c("default", "extended", "full", "all"), include_table = TRUE) { which <- match.arg(which) result <- .db_specs$field_name - if (include_table) result <- paste(.db_specs$table, result, sep = ".") - if (which == "default") result <- result[.db_specs$default_output] - if (which == "full") result <- result[.db_specs$table != "dose_response_details"] + if (include_table) result <- paste(.db_specs$table, result, sep = ".") + if (which == "default") result <- result[.db_specs$default_output] + if (which == "extended") result <- result[.db_specs$table %in% unique(.db_specs$table[.db_specs$default_output])] + if (which == "full") result <- result[ + !(.db_specs$table %in% c("chemical_carriers", "media_characteristics", "doses", "dose_response_details", + "dose_response_links", "dose_stat_method_codes"))] return(result) } diff --git a/R/helpers.r b/R/helpers.r index 141c07b..27164d8 100644 --- a/R/helpers.r +++ b/R/helpers.r @@ -12,3 +12,380 @@ stop("No local database located. Download data first by calling 'download_ecotox_data()'") } else return(test) } + +.search_ecotox_lazy_get_result_ids <- function(search, dbcon) { + # Declare variables to pass CRAN checks + .data <- field <- terms <- table_mod <- NULL + + search_result <- + lapply(search, as.data.frame, strings.as.factors = F) %>% + dplyr::bind_rows(.id = "field") %>% + dplyr::mutate( + table = lapply(field, function(x) { + switch( + x, + result_id = "results", + test_id = "tests", + .db_specs$table[.db_specs$field_name %in% x]) + }) + ) %>% + tidyr::unnest("table") %>% + dplyr::group_by(dplyr::across(dplyr::any_of(c("table", "field", "method")))) %>% + dplyr::summarise( + where = { + method <- tryCatch({.data$method[[1]]}, error = function(e) "contains") + if (method == "exact") { + sprintf("`%s` COLLATE NOCASE IN ('%s')", field[[1]], paste(.data$terms, collapse = "', '")) + } else if(method == "contains"){ + paste(sprintf("`%s` LIKE '%%%s%%'", field, terms), collapse = " OR ") + } else { + stop("Sorry, specified search method is not implemented.") + } + }) %>% + dplyr::group_by(table) %>% + dplyr::group_map(~{ + dplyr::tibble( + tbl = list(tbl(dbcon, .x$table[[1]]) %>% dplyr::filter(!!sql(paste(sprintf("(%s)", .x$where), collapse = " AND ")))), + table = .x$table[[1]] + )}, .keep = T) %>% + dplyr::bind_rows() %>% + dplyr::group_by(table_mod = gsub("_synonyms", "", table)) %>% + dplyr::summarise( + table = table_mod[[1]], + tbl = + if (length(tbl) > 1) { + tbl %>% + purrr::map(~{select(., dplyr::any_of("species_number"))}) %>% + purrr::reduce(~{dplyr::union(.x, .y)}) %>% + list() + } else { + tbl + }) %>% + dplyr::group_by(dplyr::across(dplyr::any_of("table"))) %>% + dplyr::group_map(~{ + #keep linking tables until result_ids are obtained + my_tab <- ..1$tbl[[1]] + count <- 0 + repeat { + count <- count + 1 + if (count > 5) stop("Sorry could not build a query for the search you have specified.") + cur_fields <- colnames(my_tab) + if (any(c("result_id", "test_id") %in% cur_fields)) break + referring <- .db_specs[.db_specs$table %in% c("results", "tests") & + .db_specs$foreign_key %in% sprintf("%s(%s)", ..2$table, cur_fields),,drop = F] + if (nrow(referring) == 1) { + foreign <- unlist(regmatches(referring$foreign_key, gregexpr("(?<=\\().+(?=\\))", referring$foreign_key, perl = T))) + my_tab <- + my_tab %>% + left_join(tbl(dbcon, referring$table), by = structure(referring$field_name, names = foreign)) + } else { + stop("Sorry could not build a query for the search you have specified.") + } + } + dplyr::tibble(tbl = list(my_tab %>% select(dplyr::any_of(c("result_id", "test_id"))))) + }) %>% + dplyr::bind_rows() %>% + dplyr::group_by(result_available = tbl %>% + purrr::map(~ paste(sort(match(colnames(.), c("test_id", "result_id"))), collapse = "_")) %>% + unlist()) %>% + dplyr::group_map(~{ + switch( + .y$result_available, + `1` = { + purrr::reduce(.x$tbl, ~{inner_join(.x, .y, by = "test_id")}) %>% + left_join(tbl(dbcon, "results"), by = "test_id") %>% + select(dplyr::all_of(c("test_id", "result_id"))) + }, + `2` = { + purrr::reduce(.x$tbl, ~{inner_join(.x, .y, by = "result_id")}) %>% + left_join(tbl(dbcon, "results"), by = "result_id") %>% + select(dplyr::all_of(c("test_id", "result_id"))) + }, + `1_2` = { + purrr::reduce(.x$tbl, ~{inner_join(.x, .y, by = c("test_id", "result_id"))}) + }) + }) %>% + purrr::reduce(~{inner_join(.x, .y, by = c("test_id", "result_id"))}) + + return(search_result) +} + +.search_ecotox_lazy_append_fields <- function(dbcon, search_result, output_fields, compute, ...) { + con <- search_result[["src"]]$con + output_fields <- as.data.frame(do.call(rbind, strsplit(output_fields, ".", fixed = T)), stringsAsFactors = F) + names(output_fields) <- c("table", "field") + ## Duplicated field names need to be renamed to avoid ambiguous results, keep track of those in 'new_field' + output_fields$new_field <- output_fields$field + output_fields_tables <- unique(output_fields$table) + ## Find out if there are any field names duplicated which are not used as joining keys: + duplicated_fields <- unique(sort(.db_specs$field_name[duplicated(.db_specs$field_name)])) + duplicated_fields <- duplicated_fields[ + duplicated_fields %in% + .db_specs$field_name[.db_specs$primary_key == "" & .db_specs$foreign_key == ""] + ] + + .process_lookups <- function(result, parent_tab, prefix = "") { + REPLACE <- NULL # This is used in dplyr routines below but are not globally bound + join_params <- list() + + foreigns <- gsub("\\(code\\)", "", + unique(.db_specs$foreign_key[.db_specs$table == parent_tab & endsWith(.db_specs$foreign_key, "(code)")])) + tabs <- .db_specs[ + (.db_specs$table == parent_tab & .db_specs$field_name %in% output_fields$field & + endsWith(.db_specs$foreign_key, "(code)")) | + .db_specs$table == parent_tab & + grepl(paste0(foreigns[foreigns %in% output_fields$table], "\\(",collapse="|"), .db_specs$foreign_key),, + drop = F] + tabs$foreign_tab <- gsub("\\(code\\)", "", tabs$foreign_key) + + .db_specs[ + .db_specs$table == parent_tab & + startsWith(.db_specs$foreign_key, paste0(foreigns[foreigns %in% output_fields$table], "(")),,drop=F] + if (nrow(tabs) > 0) { + for (tab in unique(tabs$foreign_tab)) { + flds <- tabs$field_name[tabs$foreign_tab == tab] + for (fld in flds) { + get_idx <- function(f) {output_fields$table == tab & output_fields$field == f} + if (any(gsub(prefix, "", output_fields$new_field[get_idx("code")]) != output_fields$field[get_idx("code")])) { + output_fields <<- dplyr::bind_rows( + output_fields, + data.frame(field = "code", table = tab, new_field = paste0(prefix, fld)) + ) + } else { + output_fields$new_field[get_idx("code")] <<- paste0(prefix, fld) + } + + descr <- paste0(fld, "_description_") + if (any(gsub(prefix, "", output_fields$new_field[get_idx("description")]) != + output_fields$field[get_idx("description")])) { + output_fields <<- dplyr::bind_rows( + output_fields, + data.frame(field = "description", table = tab, new_field = paste0(prefix, descr)) + ) + } else { + output_fields$new_field[get_idx("description")] <<- paste0(prefix, descr) + } + + if (nrow(output_fields[output_fields$table == tab,]) > 0) { + join_params[[length(join_params) + 1]] <- + list( + left_by = paste0(prefix, fld, "_temp"), + right_table = tab, + right_by = "code", + right_select = structure("description", names = paste0(prefix, descr)) + ) + } + } + } + } + if (length(join_params) > 0) { + ## Create temp fields where forward slash is removed for joining with lookup values + lefties <- join_params %>% purrr::map_chr(~.$left_by) + lefties_repl <- lefties %>% purrr::map(~rlang::expr(REPLACE(!!rlang::sym(gsub("_temp$", "", .)), "/", ""))) + names(lefties_repl) <- lefties + result <- result %>% dplyr::mutate(!!!lefties_repl) + + for (i in seq_along(join_params)) { + jp <- join_params[[i]] + result <- + result %>% + left_join(tbl(dbcon, jp$right_table) %>% dplyr::rename(!!!jp$right_select), + by = structure(jp$right_by, names = jp$left_by)) + } + result <- result %>% select(-dplyr::any_of(lefties)) + if (compute) result <- result %>% dplyr::compute() + } + return(result) + } + + result <- search_result + if (compute) result <- result %>% dplyr::compute() + ## start with results table as this is needed for obtaining dose link ids + + results_output <- union(output_fields$field[output_fields$table == "results"], "result_id") + dose_tables <- c("dose_responses", "dose_response_details", "doses", "dose_response_links", "dose_stat_method_codes") + if (any(dose_tables %in% output_fields_tables)) { + + out_rename <- output_fields$table %in% dose_tables & !output_fields$field %in% c("test_id", "result_id") + output_fields$new_field[out_rename] <- paste0("dose_link_", output_fields$field[out_rename]) + result <- + result %>% + left_join(tbl(con, "dose_response_links"), "result_id") %>% + left_join(tbl(con, "dose_responses"), c("dose_resp_id", "test_id")) %>% + left_join(tbl(con, "dose_response_details"), "dose_resp_id") %>% + left_join(tbl(con, "doses"), c("dose_id", "test_id")) %>% + dplyr::rename_with(~paste0("dose_link_", .), !dplyr::any_of(c("test_id", "result_id"))) %>% + .process_lookups("dose_response_links") %>% + .process_lookups("doses", "dose_link_") + } + + if (any(c("media_characteristics", "organic_matter_type_codes") %in% output_fields_tables)) { + foreigns <- .db_specs$field_name[.db_specs$table == "media_characteristics" & .db_specs$foreign_key != ""] + result <- + result %>% + left_join(tbl(con, "media_characteristics"), "result_id") %>% + select(dplyr::any_of(union( + colnames(result), + union( + foreigns, + output_fields$field[output_fields$table == "media_characteristics"] + ) + ))) %>% + .process_lookups("media_characteristics") + } + + result_foreigns <- data.frame( + table = gsub("\\(.*?\\)", "", + .db_specs$foreign_key[.db_specs$foreign_key != "" & .db_specs$table == "results"]), + field = .db_specs$field_name[.db_specs$foreign_key != "" & .db_specs$table == "results"] + ) %>% dplyr::filter(!table %in% c("tests", "results")) + + results_output <- union(results_output, c("result_id", result_foreigns$field)) + result <- result %>% + left_join(tbl(con, "results") %>% + select(dplyr::any_of(union(c("result_id", "test_id"), results_output))) %>% + dplyr::rename_with(function(x) paste0("result_", x), + dplyr::any_of(c("created_date", "modified_date", "additional_comments"))), + c("result_id", "test_id")) + output_fields$new_field[output_fields$table == "results" & output_fields$field == "created_date"] <- + "result_created_date" + output_fields$new_field[output_fields$table == "results" & output_fields$field == "modified_date"] <- + "result_modified_date" + output_fields$new_field[output_fields$table == "results" & output_fields$field == "additional_comments"] <- + "result_additional_comments" + + if (compute) result <- result %>% dplyr::compute() + + result <- result %>% .process_lookups("results") + + ## continue with linking all requested tables to the test data + + if ("chemical_carriers" %in% output_fields_tables) { + car_sel <- output_fields$table == "chemical_carriers" & !output_fields$field %in% c("carrier_id", "test_id") + output_fields$new_field[car_sel] <- paste0("carrier_", output_fields$new_field[car_sel]) + result <- + result %>% + left_join(tbl(con, "chemical_carriers") %>% + dplyr::rename_with(~paste0("carrier_", .), !dplyr::any_of(c("carrier_id", "test_id"))), + "test_id") %>% + select(dplyr::any_of(union( + colnames(result), + output_fields$new_field[output_fields$table == "chemical_carriers"] + ))) + } + + test_foreigns <- data.frame( + table = gsub("\\(.*?\\)", "", + .db_specs$foreign_key[.db_specs$foreign_key != "" & .db_specs$table == "tests"]), + field = .db_specs$field_name[.db_specs$foreign_key != "" & .db_specs$table == "tests"] + ) %>% dplyr::filter(!table %in% c("tests", "results")) + + test_foreigns <- test_foreigns[test_foreigns$table %in% output_fields$table,,drop = F] + test_foreigns$foreign <- .db_specs$field_name[.db_specs$primary_key != ""][ + match(test_foreigns$table, .db_specs$table[.db_specs$primary_key != ""])] + + required_test_fields <- + union(output_fields$field[output_fields$table == "tests"], test_foreigns$field) + + result <- result %>% + left_join(tbl(con, "tests") %>% + select(dplyr::any_of(union("test_id", required_test_fields))) %>% + dplyr::rename_with(function(x) paste0("test_", x), + dplyr::any_of(c("created_date", "modified_date", "additional_comments"))), + "test_id") + output_fields$new_field[output_fields$table == "tests" & output_fields$field == "created_date"] <- + "test_created_date" + output_fields$new_field[output_fields$table == "tests" & output_fields$field == "modified_date"] <- + "test_modified_date" + output_fields$new_field[output_fields$table == "tests" & output_fields$field == "additional_comments"] <- + "test_additional_comments" + + if (any(c("species", "species_synonyms") %in% output_fields_tables)) { + species_fields <- output_fields$field[output_fields$table == "species"] + species_syn_fields <- output_fields$field[output_fields$table == "species_synonyms"] + result <- result %>% + left_join( + tbl(con, "species") %>% select(union(species_fields, "species_number")), + "species_number") + if (length(species_syn_fields) > 0) { + output_fields$new_field[output_fields$table == "species_synonyms" & output_fields$field == "latin_name"] <- + "species_synonyms" + spec_syns <- + tbl(con, "species_synonyms") %>% + select(union(species_syn_fields, "species_number")) + if ("species_synonyms" %in% output_fields$table) { + spec_syns <- spec_syns %>% dplyr::rename(species_synonyms = "latin_name") + } + result <- result %>% left_join(spec_syns, "species_number") + } + } + + if (!is.null(list(...)$group_by_results) && list(...)$group_by_results && "species_synonyms" %in% colnames(result)) { + result <- result %>% + dplyr::group_by(dplyr::across(!dplyr::any_of("species_synonyms"))) %>% + dplyr::summarise(dplyr::across(dplyr::any_of("species_synonyms"), ~GROUP_CONCAT(., "|")), .groups = "keep") %>% + dplyr::ungroup() + } + + if ("chemicals" %in% output_fields$table) { + output_fields$new_field[output_fields$table == "chemicals" & output_fields$field == "cas_number"] <- + "test_cas" + output_fields$new_field[output_fields$table == "chemicals" & output_fields$field == "chemical_name"] <- + "test_chemical" + output_fields$new_field[output_fields$table == "chemicals" & output_fields$field == "ecotox_group"] <- + "test_chemical_group" + result <- result %>% + left_join( + tbl(con, "chemicals") %>% + select(union("cas_number", output_fields$field[output_fields$table == "chemicals"])) %>% + dplyr::rename_with( + function(x){ + c("test_cas", "test_chemical", "test_chemical_group")[match(x, c("cas_number", "chemical_name", "ecotox_group"))] + }, dplyr::any_of(c("cas_number", "chemical_name", "ecotox_group"))), + "test_cas") + } + + if ("references" %in% output_fields$table) { + result <- result %>% + left_join( + tbl(con, "references") %>% + select(union("reference_number", output_fields$field[output_fields$table == "references"])), + "reference_number" + ) + } + + ## join remaining lookup values + result <- result %>% .process_lookups("tests") + + renamed <- output_fields$field != output_fields$new_field + message( + crayon::white( + paste( + sprintf("'%s.%s' was renamed '%s'", + output_fields$table[renamed], + output_fields$field[renamed], + output_fields$new_field[renamed]), + collapse = "\n") + ) + ) + + result <- result %>% select(dplyr::any_of(union(unique(output_fields$new_field), "result_id"))) + return(result) +} + +.group_nest_results <- function(search_result) { + if (nrow(search_result) > 0) { + group_by <- + search_result %>% + dplyr::group_by(dplyr::across(dplyr::any_of("result_id"))) %>% + dplyr::summarise_all(~all(is.na(.)) || all(.[[1]] == .)) %>% + dplyr::ungroup() %>% + select(tidyselect::vars_select_helpers$where(~ all(.))) %>% + colnames() + if (all(colnames(search_result) %in% group_by)) return (search_result) else{ + search_result <- + search_result %>% + dplyr::group_nest(dplyr::across(dplyr::any_of(group_by)), .key = "nested_data") + } + } else search_result +} diff --git a/R/imports.r b/R/imports.r index 7a9f950..9f7114a 100644 --- a/R/imports.r +++ b/R/imports.r @@ -9,4 +9,19 @@ } #' @importFrom RSQLite dbExecute dbConnect dbDisconnect dbWriteTable +#' @importFrom dplyr collect left_join inner_join select sql tbl +#' @importFrom rlang := +NULL + +#' Objects exported from other packages +#' +#' Objects imported and exported from other packages. See original documentation for more details. +#' +#' \describe{ +#' \item{dplyr}{\code{\link[dplyr:reexports]{\%>\%}}} +#' } +#' @importFrom dplyr %>% +#' @export %>% +#' @name %>% +#' @rdname reexports NULL diff --git a/R/init.r b/R/init.r index c17330a..6eb1c1a 100644 --- a/R/init.r +++ b/R/init.r @@ -1,3 +1,33 @@ +#' Get ECOTOX download URL from EPA website +#' +#' This function downloads the webpage at \url{https://cfpub.epa.gov/ecotox/index.cfm}. It then searches for the +#' download link for the complete ECOTOX database and extract its URL. +#' +#' This function is called by \code{\link{download_ecotox_data}} which tries to download the file from the resulting +#' URL. On some machines this fails due to issues with the SSL certificate. The user can try to download the file +#' by using this URL in a different browser (or on a different machine). Alternatively, the user could try to use +#' \code{\link{download_ecotox_data}(ssl_verifypeer = 0L)} when the download URL is trusted. +#' @param ... arguments passed on to \code{\link[httr]{GET}} +#' @return Returns a \code{character} string containing the download URL of the latest version of the EPA ECOTOX +#' database. +#' @rdname get_ecotox_url +#' @name get_ecotox_url +#' @examples +#' \dontrun{ +#' get_ecotox_url() +#' } +#' @author Pepijn de Vries +#' @export +get_ecotox_url <- function(...) { + link <- httr::GET("https://cfpub.epa.gov/ecotox/index.cfm", ...) + link <- rvest::read_html(link) + link <- rvest::html_elements(link, "a.ascii-link") + link <- rvest::html_attr(link, "href") + link <- link[!is.na(link) & endsWith(link, ".zip")] + if (length(link) == 0) stop("Could not find ASCII download link...") + return(link) +} + #' Check whether a ECOTOX database exists locally #' #' Tests whether a local copy of the US EPA ECOTOX database exists in \code{\link{get_ecotox_path}}. @@ -39,7 +69,8 @@ check_ecotox_availability <- function(target = get_ecotox_path()) { #' Obtain the local path to where the ECOTOX database is (or will be) placed. #' #' It can be useful to know where the database is located on your disk. This function -#' returns the location as provided by \code{\link[rappdirs]{app_dir}}. +#' returns the location as provided by \code{\link[rappdirs]{app_dir}}, or as +#' specified by you using \code{options(ECOTOXr_path = "mypath")}. #' #' @param path When you have a copy of the database somewhere other than the default #' directory (\code{\link{get_ecotox_path}()}), you can provide the path here. @@ -61,7 +92,7 @@ check_ecotox_availability <- function(target = get_ecotox_path()) { #' @author Pepijn de Vries #' @export get_ecotox_path <- function() { - rappdirs::app_dir("ECOTOXr")$cache() + getOption("ECOTOXr_path", rappdirs::app_dir("ECOTOXr")$cache()) } #' Download and extract ECOTOX database files and compose database @@ -69,17 +100,19 @@ get_ecotox_path <- function() { #' In order for this package to fully function, a local copy of the ECOTOX database needs to be build. #' This function will download the required data and build the database. #' -#' This function will attempt to find the latest download url for the ECOTOX database from the EPA website. -#' When found it will attempt to download the zipped archive containing all required data. This data is than +#' This function will attempt to find the latest download url for the ECOTOX database from the +#' \href{https://cfpub.epa.gov/ecotox/index.cfm}{EPA website} (see \code{\link{get_ecotox_url}()}). +#' When found it will attempt to download the zipped archive containing all required data. This data is then #' extracted and a local copy of the database is build. #' #' Use '\code{\link{suppressMessages}}' to suppress the progress report. #' @section Known issues: -#' On some machines this function fails to connect to the database download URL from the EPA website due to missing +#' On some machines this function fails to connect to the database download URL from the +#' \href{https://cfpub.epa.gov/ecotox/index.cfm}{EPA website} due to missing #' SSL certificates. Unfortunately, there is no easy fix for this in this package. A work around is to download and #' unzip the file manually using a different machine or browser that is less strict with SSL certificates. You can #' then call \code{\link{build_ecotox_sqlite}()} and point the \code{source} location to the manually extracted zip -#' archive. +#' archive. For this purpose \code{\link{get_ecotox_url}()} can be used. #' #' @param target Target directory where the files will be downloaded and the database compiled. Default is #' \code{\link{get_ecotox_path}()}. @@ -89,6 +122,9 @@ get_ecotox_path <- function() { #' @param ask There are several steps in which files are (potentially) overwritten or deleted. In those cases #' the user is asked on the command line what to do in those cases. Set this parameter to \code{FALSE} in order #' to continue without warning and asking. +#' @param ... Arguments passed on to \code{\link[httr]{GET}}. When this function fails with the error: "Peer +#' certificate cannot be authenticated with given CA certificates", you could try to rerun the function with +#' the option \code{ssl_verifypeer = 0L}. Only do so when you trust the indicated URL. #' @return Returns \code{NULL} invisibly. #' @rdname download_ecotox_data #' @name download_ecotox_data @@ -99,7 +135,7 @@ get_ecotox_path <- function() { #' } #' @author Pepijn de Vries #' @export -download_ecotox_data <- function(target = get_ecotox_path(), write_log = TRUE, ask = TRUE) { +download_ecotox_data <- function(target = get_ecotox_path(), write_log = TRUE, ask = TRUE, ...) { avail <- check_ecotox_availability() if (avail && ask) { cat(sprintf("A local database already exists (%s).", paste(attributes(avail)$file$database, collapse = ", "))) @@ -112,14 +148,8 @@ download_ecotox_data <- function(target = get_ecotox_path(), write_log = TRUE, a if (!dir.exists(target)) dir.create(target, recursive = T) ## Obtain download link from EPA website: message(crayon::white("Obtaining download link from EPA website... ")) - con <- url("https://cfpub.epa.gov/ecotox/index.cfm") - link <- rvest::read_html(con) - link <- rvest::html_nodes(link, "a.ascii-link") - link <- rvest::html_attr(link, "href") - link <- link[!is.na(link) & endsWith(link, ".zip")] + link <- get_ecotox_url(...) dest_path <- file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)) - closeAllConnections() - if (length(link) == 0) stop("Could not find ASCII download link...") message(crayon::green("Done\n")) proceed.download <- T if (file.exists(dest_path) && ask) { @@ -128,20 +158,14 @@ download_ecotox_data <- function(target = get_ecotox_path(), write_log = TRUE, a } if (proceed.download) { message(crayon::white(sprintf("Start downloading ECOTOX data from %s...\n", link))) - con <- url(link, "rb") - dest <- file(gsub(".zip", ".incomplete.download", dest_path, fixed = T), "wb") - mb <- 0 - repeat { - read <- readBin(con, "raw", 1024*1024) ## download in 1MB chunks. - writeBin(read, dest) - mb <- mb + 1 - message(crayon::white(sprintf("\r%i MB downloaded...", mb)), appendLF = F) - if (length(read) == 0) break - } - closeAllConnections() + httr::GET(link, httr::config( + noprogress = 0L, + progressfunction = function(down, up) { + message(crayon::white(sprintf("\r%0.1f MB downloaded...", down[2]/(1024*1024))), appendLF = F) + TRUE + }), httr::write_disk(dest_path, overwrite = TRUE)) message(crayon::green(" Done\n")) } - file.rename(gsub(".zip", ".incomplete.download", dest_path, fixed = T), dest_path) ## create bib-file for later reference con <- file(gsub(".zip", "_cit.txt", dest_path), "w+") @@ -160,14 +184,15 @@ download_ecotox_data <- function(target = get_ecotox_path(), write_log = TRUE, a if (dir.exists(extr.path)) { test.files <- list.files(extr.path) if (length(test.files) >= 12 && any(test.files == "chemical_carriers.txt") && ask) { - cat("EXtracted zip files already appear to exist.\n") + cat("Extracted zip files already appear to exist.\n") prompt <- readline(prompt = "Continue unzipping and overwriting these files (y/n)? ") proceed.unzip <- startsWith("Y", toupper(prompt)) } } if (proceed.unzip) { message(crayon::white("Extracting downloaded zip file... ")) - utils::unzip(file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)), exdir = target) + utils::unzip(file.path(target, utils::tail(unlist(strsplit(link, "/")), 1)), + exdir = file.path(target, gsub(".zip", "", basename(link)))) message(crayon::green("Done\n")) if (ask && startsWith("Y", toupper(readline(prompt = "Done extracting zip file, remove it to save disk space (y/n)? ")))) { @@ -247,6 +272,7 @@ build_ecotox_sqlite <- function(source, destination = get_ecotox_path(), write_l ## Loop the text file tables and add them to the sqlite database 1 by 1 i <- 0 + by(.db_specs, .db_specs$table, function(tab) { i <<- i + 1 message(crayon::white(sprintf("Adding '%s' table (%i/%i) to database:\n", @@ -277,7 +303,7 @@ build_ecotox_sqlite <- function(source, destination = get_ecotox_path(), write_l lines.read <- 1 ## Copy tables in 50000 line fragments to database, to avoid memory issues frag.size <- 50000 - message(crayon::white(sprintf("\r 0 lines (incl. header) added of '%s' added to database", tab$table[[1]])), + message(crayon::white(sprintf("\r 0 lines (incl. header) of '%s' added to database", tab$table[[1]])), appendLF = F) repeat { if (is.null(head)) { @@ -289,23 +315,43 @@ build_ecotox_sqlite <- function(source, destination = get_ecotox_path(), write_l ## Replace pipe-characters with dashes when they are between brackets "("and ")", ## These should not be interpreted as table separators and will mess up the table.read call body <- stringr::str_replace_all(body, "(?<=\\().+?(?=\\))", function(x){ - if (grepl("[\\(/]", x)) return(x) ## there should not be another opening bracket or forward slash! in that case leave as is + ## there should not be another opening bracket, double pipe or forward slash! in that case leave as is + if (grepl("[\\(/]", x) || grepl("||", x, fixed = T)) return(x) gsub("[|]", "-", x) }) lines.read <- lines.read + length(body) - table.frag <- utils::read.table(text = c(head, body[1:1]), - sep = "|", header = T, quote = "", comment.char = "", - stringsAsFactors = F, strip.white = F) + ## Join lines when number of pipes is to small (probably caused by unintended linefeed) + count_pipes <- unlist(lapply(regmatches(body, gregexpr("[|]", body)), length)) + join_lines <- which(count_pipes < length(regmatches(head, gregexpr("[|]", head))[[1]])) + if (length(join_lines) > 0) { + body <- c(body[-join_lines], paste(body[join_lines], collapse = " ")) + } ## strip.white is set to F, as they occur in primary keys! table.frag <- utils::read.table(text = c(head, body), sep = "|", header = T, quote = "", comment.char = "", stringsAsFactors = F, strip.white = F) - RSQLite::dbWriteTable(dbcon, tab$table[[1]], table.frag, append = T) - message(crayon::white(sprintf("\r %i lines (incl. header) added of '%s' added to database", lines.read, tab$table[[1]])), + missing_cols <- tab$field_name[!tab$field_name %in% colnames(table.frag)] + unexpected_cols <- colnames(table.frag)[!colnames(table.frag) %in% tab$field_name] + if (length(unexpected_cols) > 0) + message(stringr::str_pad( + sprintf("\r Ignoring unexpected column(s) '%s' in '%s'", paste(unexpected_cols, collapse = "', '"), + tab$table[[1]]), + width = 80, "right") + ) + if (length(missing_cols) > 0) + message(stringr::str_pad( + sprintf("\r Missing column(s) '%s' in '%s'", paste(missing_cols, collapse = "', '"), + tab$table[[1]]), + width = 80, "right") + ) + RSQLite::dbWriteTable(dbcon, tab$table[[1]], + table.frag[,setdiff(tab$field_name, missing_cols), drop = F], append = T) + + message(crayon::white(sprintf("\r %i lines (incl. header) of '%s' added to database", lines.read, tab$table[[1]])), appendLF = F) if (length(body) < testsize) break } diff --git a/R/sysdata.rda b/R/sysdata.rda index 3c891c8..a76e0d5 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/wrappers.r b/R/wrappers.r index fdb0e23..fdbeba1 100644 --- a/R/wrappers.r +++ b/R/wrappers.r @@ -27,7 +27,7 @@ #' #' Each element in that list should contain another list with at least one element named 'terms'. This should #' contain a \code{vector} of \code{character} strings with search terms. Optionally, a second element -#' named 'method' can be provided which should be set to either '\code{contain}' (default, when missing) or +#' named 'method' can be provided which should be set to either '\code{contains}' (default, when missing) or #' '\code{exact}'. In the first case the query will match any record in the indicated field that contains #' the search term. In case of '\code{exact}' it will only return exact matches. Note that searches are #' not case sensitive, but are picky with special (accented) characters. While building the local database @@ -49,7 +49,8 @@ #' @param output_fields A \code{vector} of \code{character} strings indicating which field names (table headers) #' should be included in the output. By default \code{\link{list_ecotox_fields}("default")} is used. Use #' \code{\link{list_ecotox_fields}("all")} to list all available fields. -#' @param group_by_results Ecological test results are generally the most informative element in the ECOTOX +#' @param group_by_results +#' Ecological test results are generally the most informative element in the ECOTOX #' database. Therefore, this search function returns a table with unique results in each row. #' #' However, some tables in the database (such as 'chemical_carriers' and 'dose_responses') have a one to many @@ -58,8 +59,15 @@ #' #' By default the search results are grouped by test results. As a result not all doses or chemical carriers may #' be displayed in the output. Set the \code{group_by_results} parameter to \code{FALSE} in order to force SQLite -#' to output all data (all carriers and doses). But beware that test results may be duplicated in those cases. -#' @param ... Arguments passed to \code{\link{dbConnectEcotox}}. You can use this when the database +#' to output all data (e.g., all carriers). But beware that test results may be duplicated in those cases. +#' @param compute The ECOTOXr package tries to construct database queries as lazy as possible. Meaning that R +#' moves as much of the heavy lifting as possible to the database. When your search becomes complicated (e.g., when +#' including many output fields), you may run into trouble and hit the SQL parser limits. In those cases you can set +#' this parameter to \code{TRUE}. Database queries are then computed in the process of joining tables. This is generally +#' slower. Alternatively, you could try to include less output fields in order to simplify the query. +#' @param as_data_frame \code{logical} value indicating whether the result should be converted into a \code{data.frame} +#' (default is \code{TRUE}). When set to \code{FALSE} the data will be returned as a \code{\link[dplyr:tibble]{tbl_df}}. +#' @param ... Arguments passed to \code{\link{dbConnectEcotox}} and other functions. You can use this when the database #' is not located at the default path (\code{\link{get_ecotox_path}()}). #' @return In case of \code{search_query_ecotox}, a \code{character} string containing an SQL #' query is returned. This query is built based on the provided search terms and options. @@ -90,7 +98,7 @@ #' method = "exact" #' ) #' ) -#' ## numbers in result each represent a unique test id from the database +#' ## rows in result each represent a unique test id from the database #' result <- search_ecotox(search) #' query <- search_query_ecotox(search) #' cat(query) @@ -100,195 +108,52 @@ #' } #' @author Pepijn de Vries #' @export -search_ecotox <- function(search, output_fields = list_ecotox_fields("default"), group_by_results = TRUE, ...) { - search <- search_query_ecotox(search, output_fields, group_by_results) - dbcon <- dbConnectEcotox(...) - query <- RSQLite::dbGetQuery(dbcon, search) - dbDisconnectEcotox(dbcon) - return(.add_tags(query, attributes(dbcon)$database_file)) +search_ecotox <- function(search, output_fields = list_ecotox_fields("default"), + group_by_results = TRUE, compute = FALSE, as_data_frame = TRUE, ...) { + temp_field <- if (!"results.result_id" %in% output_fields) "results.result_id" else NULL + if (any(startsWith(output_fields, "dose_responses.")) && !"dose_responses.dose_resp_id" %in% output_fields) + temp_field <- c(temp_field, "dose_responses.dose_resp_id") + search_result <- search_ecotox_lazy(search, c(output_fields, temp_field), compute, group_by_results = group_by_results) + database_file <- attributes(search_result)$database_file + dbcon <- search_result[["src"]]$con + search_result <- search_result %>% collect() + dbDisconnect(dbcon) + ## group by result_id if requested + + if (group_by_results) search_result <- .group_nest_results(search_result) + + ## remove temporary fields + if (!is.null(temp_field)) + search_result <- + search_result %>% + select(!dplyr::any_of(gsub("^.*?[.]", "", temp_field))) + if (as_data_frame) search_result <- search_result %>% as.data.frame() + return(.add_tags(search_result, database_file)) } #' @rdname search_ecotox -#' @name search_query_ecotox +#' @name search_ecotox_lazy #' @export -search_query_ecotox <- function(search, output_fields = list_ecotox_fields("default"), group_by_results = TRUE) { +search_ecotox_lazy <- function(search, output_fields = list_ecotox_fields("default"), + compute = FALSE, ...) { ignored_fields <- !(output_fields %in% list_ecotox_fields("all")) if (any(ignored_fields)) warning(sprintf("The following fields are unknown and ignored: %s.", paste(output_fields[ignored_fields], collapse =", "))) output_fields <- output_fields[!ignored_fields] - if (!any(grepl("^results.", output_fields))) { - warning("Output fields should contain at least 1 field from table 'results'. Adding 'test_id'.") - output_fields <- c("results.test_id") - } - - ## identify key fields that are required for joining tables - db_links <- cbind(.db_specs, - do.call(rbind, lapply(strsplit(.db_specs$foreign_key, "\\(|\\)"), function(x) { - if (length(x) < 2) return(data.frame(foreign_table = "", foreign_field = "")) else - return(data.frame(foreign_table = x[1], foreign_field = x[2])) - }))) - db_links$is_key <- db_links$primary_key == "PRIMARY KEY" | db_links$foreign_key != "" | db_links$foreign_table != "" - key_output_fields <- .db_specs[db_links$is_key | paste(.db_specs$table, .db_specs$field_name, sep = ".") %in% output_fields,,drop = F] - output_fields <- .db_specs[paste(.db_specs$table, .db_specs$field_name, sep = ".") %in% output_fields,,drop = F] - - if (!is.list(search)) stop("Parameter 'search' needs to be a list!") - if (!all(unlist(lapply(search, is.list)))) stop("Each element of parameter 'search' should contain a list") - if (any(duplicated(names(search)))) stop("You have used duplicated search fields. Use each field only once in your search!") - search.tables <- do.call(rbind, lapply(names(search), function(fn) { - tables <- unique(.db_specs$table[.db_specs$field_name %in% fn]) - if (length(tables) == 0) stop(sprintf("Unknown search field: %s", fn)) - if (fn == "test_id") tables <- "tests" - x <- search[[fn]] - if (!all(names(x) %in% c("terms", "method"))) stop("Each search field can only contain two elements: 'terms' and 'method'.") - method <- match.arg(x[["method"]], c("exact", "contains")) - wildcard <- ifelse(method == "contains", "%", "") - collapse <- ifelse(method == "contains", " OR ", ", ") - prefix <- ifelse(method == "contains", sprintf("\"%s\" LIKE ", fn), "") - if (typeof(x$terms) != "character" || length(x$terms) == 0) stop("Provide at least 1 search term (type 'character')") - - terms <- paste(sprintf("%s\"%s%s%s\"", - prefix, - wildcard, - x$terms, - wildcard), - collapse = collapse) - if (method == "exact") { - terms <- sprintf("\"%s\" COLLATE NOCASE IN (%s)", fn, terms) - } - return (data.frame(table = tables, - terms = terms, - method = method)) - })) - search.tables <- rbind(search.tables, - data.frame(table = unique(c("results", "tests", with(output_fields, table[!(table %in% search.tables$table)]))), - terms = "", method = "")) - search.tables$id <- sprintf("search%03i", seq_len(nrow(search.tables))) - search.tables$select <- unlist(lapply(seq_len(nrow(search.tables)), function(i) { - out <- key_output_fields[key_output_fields$table == search.tables$table[i],,drop = F] - paste(paste(search.tables$id[i], sprintf("\"%s\"", out$field_name), sep = "."), collapse = ", ") - })) - search.tables$query <- - with(search.tables, - sprintf("SELECT %s FROM \"%s\" AS %s%s", - select, - table, - id, - ifelse(terms != "", sprintf(" WHERE %s", terms), "") - ) - ) - ## species and species_synonyms need to be combined before we can continue - if (any(search.tables$table == "species_synonyms")) { - sp_id <- search.tables$table == "species" - ss_id <- search.tables$table == "species_synonyms" - sp <- search.tables[sp_id,] - select <- gsub(sp$id, "syns", sp$select) - q <- search.tables$query[ss_id] - q <- sprintf("SELECT %s FROM species AS syns INNER JOIN (%s) USING(species_number)", - select, - q) - search.tables$query[sp_id] <- sprintf("SELECT * FROM (%s UNION ALL %s) AS spec", search.tables$query[sp_id], q) - search.tables$id[sp_id] <- "syns" - search.tables$select[sp_id] <- select - search.tables <- search.tables[!ss_id,] - } - search.tables$linked_to <- "" - search.tables$linked_by <- "" - search.tables$linked_from <- "" - j <- 1 - for (tab in search.tables$table[!(search.tables$table %in% c("results", "tests"))]) { - repeat { - i <- which(search.tables$table == tab) - links <- subset(db_links, (db_links$table == tab & db_links$foreign_table != "") | db_links$foreign_table == tab) - exclude <- c("chemical_carriers", "doses", "dose_responses", "dose_response_details") - exclude <- exclude[!(exclude %in% output_fields$table)] - links <- subset(links, !links$table %in% exclude) - inverselink <- subset(links, links$table == tab & links$field_name %in% c("test_id", "result_id")) - if (nrow(inverselink) > 0) { - search.tables$linked_to[i] <- inverselink$foreign_table - search.tables$linked_by[i] <- inverselink$foreign_field - search.tables$linked_from[i] <- inverselink$field_name - break - } else { - links <- links[1,] - if (links$table %in% c("results", "tests")) { - search.tables$linked_to[i] <- links$table - search.tables$linked_by[i] <- links$field_name - search.tables$linked_from[i] <- links$foreign_field - break - } - temp_sel <- db_links$field_name[db_links$table == links$table] - search.tables$select[i] <- gsub(search.tables$id[i], sprintf("target%03i", j), search.tables$select[i]) - search.tables$id[i] <- sprintf("target%03i", j) - search.tables$select[i] <- paste(c(search.tables$select[i], - sprintf("%s.\"%s\"", sprintf("source%03i", j), temp_sel)), collapse = ", ") - search.tables$query[i] <- - sprintf("SELECT %s FROM \"%s\" AS source%03i\nLEFT JOIN (%s) target%03i ON source%03i.%s = target%03i.\"%s\"", - search.tables$select[i], - links$table, - j, - search.tables$q[i], - j, j, - links$field_name, - j, - links$foreign_field) - if (tab == links$table) stop("Can't build an SQL query using these parameters.") - tab <- links$table - } - } - j <- j + 1 - } - tests.query.tabs <- subset(search.tables, search.tables$linked_to == "tests") - tests.query.withs <- paste0("WITH ", paste(sprintf("%s AS (\n%s\n)", tests.query.tabs$id, tests.query.tabs$query), collapse = ",\n")) - tests.query.select <- unique(sprintf("\"%s\"", key_output_fields$field_name[key_output_fields$table == "tests"])) - - results.query.tabs <- subset(search.tables, search.tables$table == "results") - results.query.where <- results.query.tabs$terms[results.query.tabs$terms != ""] - tests_from_results <- sprintf("tests.test_id IN (SELECT DISTINCT test_id FROM results WHERE %s)", results.query.where) - - tests.query.where <- subset(search.tables, search.tables$table == "tests" & search.tables$terms != "") - tests.query.where <- paste( - sprintf("(%s)", - c(if (length(tests_from_results) > 0) tests_from_results else NULL, - if (length(tests.query.where$terms) > 0) tests.query.where$terms else NULL, - with(subset(tests.query.tabs, tests.query.tabs$terms != ""), - sprintf("tests.\"%s\" IN (SELECT \"%s\" FROM \"%s\")", - linked_by, linked_from, id)))), - collapse = " AND ") - tests.query.tabs$extrawhere <- rep("", nrow(tests.query.tabs)) - tests.query.tabs$extrawhere <- sprintf(" WHERE \"%s\".\"%s\" IN (SELECT DISTINCT tests_agg.\"%s\" FROM tests_agg)", - tests.query.tabs$table, - tests.query.tabs$linked_from, - tests.query.tabs$linked_by) - tests.query <- sprintf("%s\nSELECT %s FROM tests%s%s\n", - tests.query.withs, - paste(tests.query.select, collapse = ", "), - ifelse(tests.query.where == "", "", " WHERE "), - tests.query.where) - - tests.query <- paste0("WITH tests_agg AS (", tests.query, - ")\nSELECT * FROM tests_agg\n", - paste(sprintf("LEFT JOIN (SELECT * FROM \"%s\"%s) AS %s ON tests_agg.\"%s\" = %s.\"%s\"", - tests.query.tabs$table, - tests.query.tabs$extrawhere, - tests.query.tabs$id, - tests.query.tabs$linked_by, - tests.query.tabs$id, - tests.query.tabs$linked_from), collapse = "\n")) - results.query.where <- paste( - c(sprintf("(%s)", results.query.where), - "results.test_id IN (SELECT DISTINCT test_id FROM tests_agg)"), - collapse = " AND ") - results.query <- sprintf(paste0("WITH tests_agg AS (%s)\n", - "SELECT * FROM (SELECT DISTINCT * FROM results WHERE %s)\n", - "INNER JOIN (SELECT * FROM tests_agg) USING(test_id)%s"), - tests.query, - results.query.where, - ifelse(group_by_results, "GROUP BY result_id", "") - ) - - results.query <- sprintf("SELECT %s FROM\n(%s)", - paste(sprintf("\"%s\"", unique(output_fields$field_name)), collapse = ", "), - results.query - ) - return(.add_tags(results.query)) + ## Note that the database connection is opened here, but not closed. It's the end-users responsibility + ## to close the connection when no longer required. + dbcon <- dbConnectEcotox(...) + search_result <- .search_ecotox_lazy_get_result_ids(search, dbcon) + search_result <- .search_ecotox_lazy_append_fields(dbcon, search_result, output_fields, compute, ...) + return(.add_tags(search_result, attributes(dbcon)$database_file)) } + +#' @rdname search_ecotox +#' @name search_query_ecotox +#' @export +search_query_ecotox <- function(search, output_fields = list_ecotox_fields("default"), ...) { + search_result <- search_ecotox_lazy(search, output_fields, ...) + database_file <- attributes(search_result)$database_file + search_result <- search_result %>% dbplyr::sql_render() + return(.add_tags(search_result, database_file)) +} \ No newline at end of file diff --git a/man/ECOTOXr.Rd b/man/ECOTOXr.Rd index 1601020..37f9338 100644 --- a/man/ECOTOXr.Rd +++ b/man/ECOTOXr.Rd @@ -25,7 +25,8 @@ in order to use the package to its full capacity. First download a copy of the complete EPA database. This can be done by calling \code{\link{download_ecotox_data}}. This may not always work on all machines as R does not always accept the website SSL certificate from the EPA. In those cases the zipped archive with the database files can be downloaded manually with a different (more -forgiving) browser. The files from the zip archive can be extracted to a location of choice. +forgiving) browser. The files from the zip archive can be extracted to a location of choice. Alternatively, +the user could try to use \code{\link{download_ecotox_data}(ssl_verifypeer = 0L)} when the download URL is trusted. } \item{ Next, an SQLite database needs to be build from the downloaded files. This will be done automatically when diff --git a/man/build_ecotox_sqlite.Rd b/man/build_ecotox_sqlite.Rd index 5f61cf0..e1908e5 100644 --- a/man/build_ecotox_sqlite.Rd +++ b/man/build_ecotox_sqlite.Rd @@ -43,6 +43,8 @@ This problem only seems to occur for characters that are listed as 'control char consequences for reproducibility, but only if you build search queries that look for such special characters. It is therefore advised to stick to common (non-accented) alpha-numerical characters in your searches, for the sake of reproducibility. + +Use '\code{\link{suppressMessages}}' to suppress the progress report. } \examples{ \dontrun{ @@ -55,7 +57,8 @@ if (test) { dir <- gsub(".sqlite", "", files$database, fixed = T) path <- files$path if (dir.exists(file.path(path, dir))) { - build_ecotox_sqlite(source = file.path(path, dir), destination = get_ecotox_path()) + ## This will build the database in your temp directory: + build_ecotox_sqlite(source = file.path(path, dir), destination = tempdir()) } } } diff --git a/man/cas.Rd b/man/cas.Rd new file mode 100644 index 0000000..c53dad6 --- /dev/null +++ b/man/cas.Rd @@ -0,0 +1,158 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cas_handlers.r +\name{cas} +\alias{cas} +\alias{is.cas} +\alias{as.cas} +\alias{[[.cas} +\alias{[.cas} +\alias{[[<-.cas} +\alias{[<-.cas} +\alias{format.cas} +\alias{as.character.cas} +\alias{show.cas} +\alias{print.cas} +\alias{as.list.cas} +\alias{as.double.cas} +\alias{as.integer.cas} +\alias{c.cas} +\alias{as.data.frame.cas} +\title{Functions for handling chemical abstract service (CAS) registry numbers} +\usage{ +cas(length = 0L) + +is.cas(x) + +as.cas(x) + +\method{[[}{cas}(x, i) + +\method{[}{cas}(x, i) + +\method{[[}{cas}(x, i) <- value + +\method{[}{cas}(x, i) <- value + +\method{format}{cas}(x, hyphenate = TRUE, ...) + +\method{as.character}{cas}(x, ...) + +show.cas(x, ...) + +\method{print}{cas}(x, ...) + +\method{as.list}{cas}(x, ...) + +\method{as.double}{cas}(x, ...) + +\method{as.integer}{cas}(x, ...) + +\method{c}{cas}(...) + +\method{as.data.frame}{cas}(...) +} +\arguments{ +\item{length}{A non-negative \code{integer} specifying the desired length. Double values will be coerced to +\code{integer}: supplying an argument of length other than one is an error.} + +\item{x}{Object from which data needs to be extracted or replaced, or needs to be coerced into a specific +format. For nearly all of the functions documented here, this needs to be an object of the S3 class 'cas', +which can be created with \code{as.cas}. For \code{as.cas}, \code{x} can be a \code{character} (CAS registry number +with or without hyphenation) or a \code{numeric} value. Note that \code{as.cas} will only accept correctly +formatted and valid CAS registry numbers.} + +\item{i}{Index specifying element(s) to extract or replace. See also \code{\link[base:Extract]{Extract}}.} + +\item{value}{A replacement value, can be anything that can be converted into an S3 cas-class object with \code{as.cas}.} + +\item{hyphenate}{A \code{logical} value indicating whether the formatted CAS number needs to be hyphenated. +Default is \code{TRUE}.} + +\item{...}{Arguments passed to other functions} +} +\value{ +Functions \code{cas}, \code{c} and \code{as.cas} return S3 class 'cas' objects. Coercion functions +(starting with 'as') return the object as specified by their respective function names (i.e., \code{integer}, +\code{double}, \code{character}, \code{list} and \code{data.frame}). The \code{show.cas} and \code{print} functions +also return formatted \code{charater}s. The function \code{is.cas} will return a single \code{logical} value, +indicating whether \code{x} is a valid S3 cas-class object. The square brackets return the selected index/indices, +or the \code{vector} of cas objects where the selected elements are replaced by \code{value}. +} +\description{ +Functions for handling chemical abstract service (CAS) registry numbers +} +\details{ +In the database \href{https://en.wikipedia.org/wiki/Chemical_Abstracts_Service}{CAS registry} numbers are stored +as text (type \code{character}). As CAS numbers can consist of a maximum of 10 digits (plus two hyphens) this means +that each CAS number can consume up to 12 bytes of memory or disk space. By storing the data numerically, only +5 bytes are required. These functions provide the means to handle CAS registry numbers and coerce from and to +different formats and types. +} +\examples{ +## This will generate a vector of cas objects containing 10 +## fictive (0-00-0), but valid registry numbers: +cas(10) + +## This is a cas-object: +is.cas(cas(0L)) + +## This is not a cas-object: +is.cas(0L) + +## Three different ways of creating a cas object from +## Benzene's CAS registry number (the result is the same) +as.cas("71-43-2") +as.cas("71432") +as.cas(71432L) + +## This is one way of creating a vector with multiple CAS registry numbers: +cas_data <- as.cas(c("64175", "71432", "58082")) + +## This is how you select a specific element(s) from the vector: +cas_data[2:3] +cas_data[[2]] + +## You can also replace specific elements in the vector: +cas_data[1] <- "7440-23-5" +cas_data[[2]] <- "129-00-0" + +## You can format CAS numbers with or without hyphens: +format(cas_data, TRUE) +format(cas_data, FALSE) + +## The same can be achieved using as.character +as.character(cas_data, TRUE) +as.character(cas_data, FALSE) + +## There are also show and print methods available: +show(cas_data) +print(cas_data) + +## Numeric values can be obtained from CAS using as.numeric, as.double or as.integer +as.numeric(cas_data) + +## Be careful, however. Some CAS numbers cannot be represented by R's 32 bit integers +## and will produce NA's. This will work OK: +huge_cas <- as.cas("9999999-99-5") + +\dontrun{ +## This will not: +as.integer(huge_cas) +} + +## The trick applied by this package is that the final +## validation digit is stored separately as attribute: +unclass(huge_cas) + +## This is how cas objects can be concatenated: +cas_data <- c(huge_cas, cas_data) + +## This will create a data.frame +as.data.frame(cas_data) + +## This will create a list: +as.list(cas_data) +} +\author{ +Pepijn de Vries +} diff --git a/man/download_ecotox_data.Rd b/man/download_ecotox_data.Rd index 2ad2d4f..5b55dcd 100644 --- a/man/download_ecotox_data.Rd +++ b/man/download_ecotox_data.Rd @@ -4,7 +4,12 @@ \alias{download_ecotox_data} \title{Download and extract ECOTOX database files and compose database} \usage{ -download_ecotox_data(target = get_ecotox_path(), write_log = TRUE, ask = TRUE) +download_ecotox_data( + target = get_ecotox_path(), + write_log = TRUE, + ask = TRUE, + ... +) } \arguments{ \item{target}{Target directory where the files will be downloaded and the database compiled. Default is @@ -16,6 +21,10 @@ download_ecotox_data(target = get_ecotox_path(), write_log = TRUE, ask = TRUE) \item{ask}{There are several steps in which files are (potentially) overwritten or deleted. In those cases the user is asked on the command line what to do in those cases. Set this parameter to \code{FALSE} in order to continue without warning and asking.} + +\item{...}{Arguments passed on to \code{\link[httr]{GET}}. When this function fails with the error: "Peer +certificate cannot be authenticated with given CA certificates", you could try to rerun the function with +the option \code{ssl_verifypeer = 0L}. Only do so when you trust the indicated URL.} } \value{ Returns \code{NULL} invisibly. @@ -25,22 +34,27 @@ In order for this package to fully function, a local copy of the ECOTOX database This function will download the required data and build the database. } \details{ -This function will attempt to find the latest download url for the ECOTOX database from the EPA website. -When found it will attempt to download the zipped archive containing all required data. This data is than +This function will attempt to find the latest download url for the ECOTOX database from the +\href{https://cfpub.epa.gov/ecotox/index.cfm}{EPA website} (see \code{\link{get_ecotox_url}()}). +When found it will attempt to download the zipped archive containing all required data. This data is then extracted and a local copy of the database is build. + +Use '\code{\link{suppressMessages}}' to suppress the progress report. } \section{Known issues}{ -On some machines this function fails to connect to the database download URL from the EPA website due to missing +On some machines this function fails to connect to the database download URL from the +\href{https://cfpub.epa.gov/ecotox/index.cfm}{EPA website} due to missing SSL certificates. Unfortunately, there is no easy fix for this in this package. A work around is to download and unzip the file manually using a different machine or browser that is less strict with SSL certificates. You can then call \code{\link{build_ecotox_sqlite}()} and point the \code{source} location to the manually extracted zip -archive. +archive. For this purpose \code{\link{get_ecotox_url}()} can be used. } \examples{ \dontrun{ -download_ecotox_data() +## This will download and build the database in your temp dir: +download_ecotox_data(tempdir()) } } \author{ diff --git a/man/get_ecotox_url.Rd b/man/get_ecotox_url.Rd new file mode 100644 index 0000000..2f4c88e --- /dev/null +++ b/man/get_ecotox_url.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/init.r +\name{get_ecotox_url} +\alias{get_ecotox_url} +\title{Get ECOTOX download URL from EPA website} +\usage{ +get_ecotox_url(...) +} +\arguments{ +\item{...}{arguments passed on to \code{\link[httr]{GET}}} +} +\value{ +Returns a \code{character} string containing the download URL of the latest version of the EPA ECOTOX +database. +} +\description{ +This function downloads the webpage at \url{https://cfpub.epa.gov/ecotox/index.cfm}. It then searches for the +download link for the complete ECOTOX database and extract its URL. +} +\details{ +This function is called by \code{\link{download_ecotox_data}} which tries to download the file from the resulting +URL. On some machines this fails due to issues with the SSL certificate. The user can try to download the file +by using this URL in a different browser (or on a different machine). Alternatively, the user could try to use +\code{\link{download_ecotox_data}(ssl_verifypeer = 0L)} when the download URL is trusted. +} +\examples{ +\dontrun{ +get_ecotox_url() +} +} +\author{ +Pepijn de Vries +} diff --git a/man/get_path.Rd b/man/get_path.Rd index 407857c..f5bd4bb 100644 --- a/man/get_path.Rd +++ b/man/get_path.Rd @@ -27,7 +27,8 @@ Obtain the local path to where the ECOTOX database is (or will be) placed. } \details{ It can be useful to know where the database is located on your disk. This function -returns the location as provided by \code{\link[rappdirs]{app_dir}}. +returns the location as provided by \code{\link[rappdirs]{app_dir}}, or as +specified by you using \code{options(ECOTOXr_path = "mypath")}. } \examples{ get_ecotox_path() diff --git a/man/list_ecotox_fields.Rd b/man/list_ecotox_fields.Rd index 05a65b4..034fded 100644 --- a/man/list_ecotox_fields.Rd +++ b/man/list_ecotox_fields.Rd @@ -4,12 +4,18 @@ \alias{list_ecotox_fields} \title{List the field names that are available from the ECOTOX database} \usage{ -list_ecotox_fields(which = c("default", "full", "all"), include_table = TRUE) +list_ecotox_fields( + which = c("default", "extended", "full", "all"), + include_table = TRUE +) } \arguments{ \item{which}{A \code{character} string that specifies which fields to return. Can be any of: -'\code{default}': returns default output field names; '\code{all}': returns all fields; or -'\code{full}': returns all except fields from table 'dose_response_details'.} +'\code{default}': returns default output field names; '\code{all}': returns all fields; +'\code{extended}': returns all fields of the default tables; or +'\code{full}': returns all fields except those from tables 'chemical_carriers', +'media_characteristics', 'doses', 'dose_responses', +'dose_response_details', 'dose_response_links' and 'dose_stat_method_codes'.} \item{include_table}{A \code{logical} value indicating whether the table name should be included as prefix. Default is \code{TRUE}.} @@ -31,9 +37,10 @@ list_ecotox_fields("default") ## All fields that are available from the ECOTOX database: list_ecotox_fields("all") -## All except fields from the table 'dose_response_details' +## All except fields from the tables 'chemical_carriers', 'media_characteristics', 'doses', +## 'dose_responses', 'dose_response_details', 'dose_response_links' and 'dose_stat_method_codes' ## that are available from the ECOTOX database: -list_ecotox_fields("all") +list_ecotox_fields("full") } \author{ Pepijn de Vries diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 0000000..a8f8239 --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/imports.r +\name{\%>\%} +\alias{\%>\%} +\title{Objects exported from other packages} +\description{ +Objects imported and exported from other packages. See original documentation for more details. +} +\details{ +\describe{ +\item{dplyr}{\code{\link[dplyr:reexports]{\%>\%}}} +} +} diff --git a/man/search_ecotox.Rd b/man/search_ecotox.Rd index 9749a83..8ef94b8 100644 --- a/man/search_ecotox.Rd +++ b/man/search_ecotox.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/wrappers.r \name{search_ecotox} \alias{search_ecotox} +\alias{search_ecotox_lazy} \alias{search_query_ecotox} \title{Search and retrieve toxicity records from the database} \usage{ @@ -9,14 +10,19 @@ search_ecotox( search, output_fields = list_ecotox_fields("default"), group_by_results = TRUE, + compute = FALSE, + as_data_frame = TRUE, ... ) -search_query_ecotox( +search_ecotox_lazy( search, output_fields = list_ecotox_fields("default"), - group_by_results = TRUE + compute = FALSE, + ... ) + +search_query_ecotox(search, output_fields = list_ecotox_fields("default"), ...) } \arguments{ \item{search}{A named \code{list} containing the search terms. The names of the elements should refer to @@ -25,7 +31,7 @@ obtain a list of available field names. Each element in that list should contain another list with at least one element named 'terms'. This should contain a \code{vector} of \code{character} strings with search terms. Optionally, a second element -named 'method' can be provided which should be set to either '\code{contain}' (default, when missing) or +named 'method' can be provided which should be set to either '\code{contains}' (default, when missing) or '\code{exact}'. In the first case the query will match any record in the indicated field that contains the search term. In case of '\code{exact}' it will only return exact matches. Note that searches are not case sensitive, but are picky with special (accented) characters. While building the local database @@ -58,9 +64,18 @@ similarly, multiple doses can also be linked to a single test result. By default the search results are grouped by test results. As a result not all doses or chemical carriers may be displayed in the output. Set the \code{group_by_results} parameter to \code{FALSE} in order to force SQLite -to output all data (all carriers and doses). But beware that test results may be duplicated in those cases.} +to output all data (e.g., all carriers). But beware that test results may be duplicated in those cases.} + +\item{compute}{The ECOTOXr package tries to construct database queries as lazy as possible. Meaning that R +moves as much of the heavy lifting as possible to the database. When your search becomes complicated (e.g., when +including many output fields), you may run into trouble and hit the SQL parser limits. In those cases you can set +this parameter to \code{TRUE}. Database queries are then computed in the process of joining tables. This is generally +slower. Alternatively, you could try to include less output fields in order to simplify the query.} + +\item{as_data_frame}{\code{logical} value indicating whether the result should be converted into a \code{data.frame} +(default is \code{TRUE}). When set to \code{FALSE} the data will be returned as a \code{\link[dplyr:tibble]{tbl_df}}.} -\item{...}{Arguments passed to \code{\link{dbConnectEcotox}}. You can use this when the database +\item{...}{Arguments passed to \code{\link{dbConnectEcotox}} and other functions. You can use this when the database is not located at the default path (\code{\link{get_ecotox_path}()}).} } \value{ @@ -102,7 +117,7 @@ CRAN and document the package and database version that are used to generate spe \examples{ \dontrun{ ## let's find the ids of all ecotox tests on species -## where latin names contain either of 2 specific genus names and +## where Latin names contain either of 2 specific genus names and ## where they were exposed to the chemical benzene if (check_ecotox_availability()) { search <- @@ -116,7 +131,7 @@ if (check_ecotox_availability()) { method = "exact" ) ) - ## numbers in result each represent a unique test id from the database + ## rows in result each represent a unique test id from the database result <- search_ecotox(search) query <- search_query_ecotox(search) cat(query) diff --git a/tests/testthat/test_cas.r b/tests/testthat/test_cas.r new file mode 100644 index 0000000..42cac05 --- /dev/null +++ b/tests/testthat/test_cas.r @@ -0,0 +1,56 @@ +test_that("Function cas works and generates the correct length", { + expect_length(cas(10), 10) +}) + +test_that("Converting either a numerical or a character to cas, results in the same object", { + expect_identical(as.cas(71432L), as.cas("71432")) +}) + +test_that("Hyphenation can be omitted in CAS", { + expect_identical(as.cas("71-43-2"), as.cas("71432")) +}) + +test_that("Concatenating cas objects works correctly", { + expect_equal(length(c(as.cas(71432L), "71432")), 2) +}) + +test_that("is.cas doesn't throw errors", { + expect_true(is.cas(c(as.cas(71432L), "71432"))) +}) + +test_that("EXtract and replace methods don't throw errors for cas objects", { + expect_true({ + cas_data <- cas(100) + cas_data[4:10] + cas_data[[50]] + cas_data[3:4] <- as.cas(rep("71432", 2)) + cas_data[[50]] <- as.cas(11) + TRUE + }) +}) + +test_that("as.data.frame.cas doesn't throw errors", { + expect_true({ + "data.frame" %in% class(as.data.frame(c(cas(10), as.cas("71432")))) + }) +}) + +test_that("as.integer.cas returns an integer", { + expect_type(as.integer(as.cas("71432")), "integer") +}) + +test_that("as.double.cas returns a double", { + expect_type(as.double(as.cas("71432")), "double") +}) + +test_that("as.character.cas returns a character", { + expect_type(as.character(as.cas("71432")), "character") +}) + +test_that("as.list.cas returns a list", { + expect_type(as.list(as.cas(rep("71432", 4))), "list") +}) + +test_that("as.cas returns a cas", { + expect_s3_class(as.cas("71432"), "cas") +}) \ No newline at end of file diff --git a/tests/testthat/test_that.r b/tests/testthat/test_that.r index 767f475..328b192 100644 --- a/tests/testthat/test_that.r +++ b/tests/testthat/test_that.r @@ -5,22 +5,28 @@ check_db <- function() { } simple_search1 <- if (check_ecotox_availability()) { - suppressWarnings(search_ecotox( + suppressMessages(suppressWarnings(search_ecotox( list(latin_name = list(terms = "Daphnia magna"), chemical_name = list(terms = "benzene")), - c(list_ecotox_fields(), "results.result_id", "results.test_id", "tests.reference_number"))) + c(list_ecotox_fields(), "results.result_id", "results.test_id", "tests.reference_number"), + compute = TRUE))) } else NULL simple_search2 <- if (check_ecotox_availability()) { - suppressWarnings({search_ecotox(list(test_id = list(terms = "1")))}) + suppressMessages(suppressWarnings({search_ecotox(list(test_id = list(terms = "1", method = "exact")))})) } else NULL simple_search3 <- if (check_ecotox_availability()) { - suppressWarnings({search_ecotox(list(latin_name = list(terms = "perdix perdix"), test_cas = list(terms="1336363")), - c(list_ecotox_fields(), "results.result_id", "results.test_id", "tests.reference_number"))}) + suppressMessages(suppressWarnings({search_ecotox(list(latin_name = list(terms = "perdix perdix"), + test_cas = list(terms = "1336363")), + c(list_ecotox_fields(), "results.result_id", "results.test_id", "tests.reference_number"))})) +} else NULL + +search_q <- if (check_ecotox_availability()) { + suppressMessages(suppressWarnings({search_query_ecotox(list(test_id = list(terms = "1", method = "exact")))})) } else NULL throws_errors <- function(expression) { - result <- F + result <- FALSE tryCatch(expression, error = function(e) {result <<- T}, warning = function(w) {invisible(NULL)}) result } @@ -142,12 +148,11 @@ test_that("A simple search results in expected table", { expect_true({ ## Compare result with anticipated ids: all( - simple_search1$test_id %in% - c("1020021", "1020022", "1020023", "1022155", "1031085", "1031086", "1031087", "1031088", "1031196", "1031197", - "1064409", "1064410", "1064411", "1072942", "1072943", "1072944", "1083684", "1083685", "1083686", "1098939", - "1098940", "1098941", "1098942", "1098943", "1098944", "1098945", "1098946", "1098947", "1098948", "1098949", - "1098950", "1125798", "1136665", "1136666", "1142641", "1152541", "1185661", "1185662", "1185663", "1187783", - "1189253", "1237724", "2113979", "2114101", "2194929") + c("1020021", "1020022", "1020023", "1022155", "1031085", "1031086", "1031087", "1031088", "1031196", "1031197", + "1064409", "1072942", "1072943", "1072944", "1083684", "1083685", "1083686", "1098939", "1098940", "1098941", + "1098942", "1098943", "1098944", "1098945", "1098946", "1098947", "1098948", "1098949", "1098950", "1125798", + "1136665", "1136666", "1142641", "1152541", "1185661", "1185662", "1185663", "1187783", "1189253", "1237724", + "2113979", "2114101", "2194929") %in% simple_search1$test_id ) }) }) @@ -176,7 +181,7 @@ test_that("A simple search results in unique result ids", { ################################# ################################# -test_that("A simple when there is a reference number there is a publication year.", { +test_that("In a simple search test that there is a publication year when there is a reference number.", { check_db() expect_false({ any(is.na(simple_search1$publication_year) & !is.na(simple_search1$reference_number)) @@ -230,8 +235,7 @@ test_that("Default field names are fewer than all field names", { test_that("A simple search query returns a single element of type character", { check_db() expect_true({ - search <- search_query_ecotox(list(test_id = list(terms = "1"))) - length(search == 1) && typeof(search) == "character" + length(search_q) == 1 && typeof(search_q) == "character" }) }) @@ -291,3 +295,4 @@ test_that("When multiple doses are linked to a result, no duplicates are returne test_that("get_ecotox_info doesn't throw an error.", { expect_false({ throws_errors(get_ecotox_info()) }) }) +