|
| 1 | +#' Functions for handling chemical abstract service (CAS) registry numbers |
| 2 | +#' |
| 3 | +#' Functions for handling chemical abstract service (CAS) registry numbers |
| 4 | +#' |
| 5 | +#' In the database \href{https://en.wikipedia.org/wiki/Chemical_Abstracts_Service}{CAS registry} numbers are stored |
| 6 | +#' as text (type \code{character}). As CAS numbers can consist of a maximum of 10 digits (plus two hyphens) this means |
| 7 | +#' that each CAS number can consume up to 12 bytes of memory or disk space. By storing the data numerically, only |
| 8 | +#' 5 bytes are required. These functions provide the means to handle CAS registry numbers and coerce from and to |
| 9 | +#' different formats and types. |
| 10 | +#' @param x Object from which data needs to be extracted or replaced, or needs to be coerced into a specific |
| 11 | +#' format. For nearly all of the functions documented here, this needs to be an object of the S3 class 'cas', |
| 12 | +#' which can be created with \code{as.cas}. For \code{as.cas}, \code{x} can be a \code{character} (CAS registry number |
| 13 | +#' with or without hyphenation) or a \code{numeric} value. Note that \code{as.cas} will only accept correctly |
| 14 | +#' formatted and valid CAS registry numbers. |
| 15 | +#' @param i Index specifying element(s) to extract or replace. See also \code{\link[base:Extract]{Extract}}. |
| 16 | +#' @param value A replacement value, can be anything that can be converted into an S3 cas-class object with \code{as.cas}. |
| 17 | +#' @param length A non-negative \code{integer} specifying the desired length. Double values will be coerced to |
| 18 | +#' \code{integer}: supplying an argument of length other than one is an error. |
| 19 | +#' @param hyphenate A \code{logical} value indicating whether the formatted CAS number needs to be hyphenated. |
| 20 | +#' Default is \code{TRUE}. |
| 21 | +#' @param ... Arguments passed to other functions |
| 22 | +#' @return Functions \code{cas}, \code{c} and \code{as.cas} return S3 class 'cas' objects. Coercion functions |
| 23 | +#' (starting with 'as') return the object as specified by their respective function names (i.e., \code{integer}, |
| 24 | +#' \code{double}, \code{character}, \code{list} and \code{data.frame}). The \code{show.cas} and \code{print} functions |
| 25 | +#' also return formatted \code{charater}s. The function \code{is.cas} will return a single \code{logical} value, |
| 26 | +#' indicating whether \code{x} is a valid S3 cas-class object. The square brackets return the selected index/indices, |
| 27 | +#' or the \code{vector} of cas objects where the selected elements are replaced by \code{value}. |
| 28 | +#' @rdname cas |
| 29 | +#' @name cas |
| 30 | +#' @examples |
| 31 | +#' ## This will generate a vector of cas objects containing 10 |
| 32 | +#' ## fictive (0-00-0), but valid registry numbers: |
| 33 | +#' cas(10) |
| 34 | +#' |
| 35 | +#' ## This is a cas-object: |
| 36 | +#' is.cas(cas(0L)) |
| 37 | +#' |
| 38 | +#' ## This is not a cas-object: |
| 39 | +#' is.cas(0L) |
| 40 | +#' |
| 41 | +#' ## Three different ways of creating a cas object from |
| 42 | +#' ## Benzene's CAS registry number (the result is the same) |
| 43 | +#' as.cas("71-43-2") |
| 44 | +#' as.cas("71432") |
| 45 | +#' as.cas(71432L) |
| 46 | +#' |
| 47 | +#' ## This is one way of creating a vector with multiple CAS registry numbers: |
| 48 | +#' cas_data <- as.cas(c("64175", "71432", "58082")) |
| 49 | +#' |
| 50 | +#' ## This is how you select a specific element(s) from the vector: |
| 51 | +#' cas_data[2:3] |
| 52 | +#' cas_data[[2]] |
| 53 | +#' |
| 54 | +#' ## You can also replace specific elements in the vector: |
| 55 | +#' cas_data[1] <- "7440-23-5" |
| 56 | +#' cas_data[[2]] <- "129-00-0" |
| 57 | +#' |
| 58 | +#' ## You can format CAS numbers with or without hyphens: |
| 59 | +#' format(cas_data, TRUE) |
| 60 | +#' format(cas_data, FALSE) |
| 61 | +#' |
| 62 | +#' ## The same can be achieved using as.character |
| 63 | +#' as.character(cas_data, TRUE) |
| 64 | +#' as.character(cas_data, FALSE) |
| 65 | +#' |
| 66 | +#' ## There are also show and print methods available: |
| 67 | +#' show(cas_data) |
| 68 | +#' print(cas_data) |
| 69 | +#' |
| 70 | +#' ## Numeric values can be obtained from CAS using as.numeric, as.double or as.integer |
| 71 | +#' as.numeric(cas_data) |
| 72 | +#' |
| 73 | +#' ## Be careful, however. Some CAS numbers cannot be represented by R's 32 bit integers |
| 74 | +#' ## and will produce NA's. This will work OK: |
| 75 | +#' huge_cas <- as.cas("9999999-99-5") |
| 76 | +#' |
| 77 | +#' \dontrun{ |
| 78 | +#' ## This will not: |
| 79 | +#' as.integer(huge_cas) |
| 80 | +#' } |
| 81 | +#' |
| 82 | +#' ## The trick applied by this package is that the final |
| 83 | +#' ## validation digit is stored separately as attribute: |
| 84 | +#' unclass(huge_cas) |
| 85 | +#' |
| 86 | +#' ## This is how cas objects can be concatenated: |
| 87 | +#' cas_data <- c(huge_cas, cas_data) |
| 88 | +#' |
| 89 | +#' ## This will create a data.frame |
| 90 | +#' as.data.frame(cas_data) |
| 91 | +#' |
| 92 | +#' ## This will create a list: |
| 93 | +#' as.list(cas_data) |
| 94 | +#' @author Pepijn de Vries |
| 95 | +#' @export |
| 96 | +cas <- function(length = 0L) { |
| 97 | + structure ( |
| 98 | + integer(length), ## The registry number is stored as integer without the final digit (=checksum) |
| 99 | + checksum = raw(length), ## last digit of CAS number, which serves as a checksum, stored as raw value |
| 100 | + class = "cas" |
| 101 | + ) |
| 102 | +} |
| 103 | + |
| 104 | +#' @rdname cas |
| 105 | +#' @name is.cas |
| 106 | +#' @export |
| 107 | +is.cas <- function(x) { |
| 108 | + if (!(class(x) %in% "cas")) return(F) |
| 109 | + checksums <- attributes(x)$checksum |
| 110 | + if (length(checksums) != length(x)) stop("Each CAS registry in the vector needs a checksum") |
| 111 | + validate <- outer(unclass(x), 0:9, function(x, y) { |
| 112 | + floor(x/(10^y)) %% 10 |
| 113 | + }) |
| 114 | + validate <- apply(validate, 1, function(x) { |
| 115 | + x <- sum(seq_along(x)*x) %% 10 |
| 116 | + }) |
| 117 | + return(all(validate == as.numeric(checksums))) |
| 118 | +} |
| 119 | + |
| 120 | +#' @rdname cas |
| 121 | +#' @name as.cas |
| 122 | +#' @export |
| 123 | +as.cas <- function(x) { |
| 124 | + if (is.cas(x)) return(x) |
| 125 | + x <- as.character(x) |
| 126 | + is_hyphenated <- stringr::str_sub(x, -2, -2) == "-" & stringr::str_sub(x, -5, -5) == "-" |
| 127 | + x[is_hyphenated] <- paste0( |
| 128 | + stringr::str_sub(x[is_hyphenated], 1, -6), |
| 129 | + stringr::str_sub(x[is_hyphenated], -4, -3), |
| 130 | + stringr::str_sub(x[is_hyphenated], -1, -1) |
| 131 | + ) |
| 132 | + if (any(!grepl("^[0-9]+$", x))) stop("CAS numbers can only contain hyphens at correct positions and numeric characters otherwise...") |
| 133 | + registry <- as.integer(stringr::str_sub(x, 1, -2)) |
| 134 | + registry[is.na(registry)] <- 0L |
| 135 | + attributes(registry)$checksum <- as.raw(as.integer(stringr::str_sub(x, -1, -1))) |
| 136 | + class(registry) <- "cas" |
| 137 | + if (!is.cas(registry)) stop("Input contains invalid CAS numbers") |
| 138 | + registry |
| 139 | +} |
| 140 | + |
| 141 | +#' @rdname cas |
| 142 | +#' @name [[.cas |
| 143 | +#' @export |
| 144 | +`[[.cas` <- function(x, i) { |
| 145 | + attribs <- attributes(x) |
| 146 | + attribs$checksum <- attribs$checksum[[i]] |
| 147 | + attribs$names <- attribs$names[[i]] |
| 148 | + x <- unclass(x) |
| 149 | + x <- x[[i]] |
| 150 | + attributes(x) <- attribs |
| 151 | + x |
| 152 | +} |
| 153 | + |
| 154 | +#' @rdname cas |
| 155 | +#' @name [.cas |
| 156 | +#' @export |
| 157 | +`[.cas` <- function(x, i) { |
| 158 | + attribs <- attributes(x) |
| 159 | + attribs$checksum <- attribs$checksum[i] |
| 160 | + attribs$names <- attribs$names[i] |
| 161 | + x <- unclass(x) |
| 162 | + x <- x[i] |
| 163 | + attributes(x) <- attribs |
| 164 | + x |
| 165 | +} |
| 166 | + |
| 167 | +#' @rdname cas |
| 168 | +#' @name [[<-.cas |
| 169 | +#' @export |
| 170 | +`[[<-.cas` <- function(x, i, value) { |
| 171 | + value <- as.cas(value) |
| 172 | + attribs <- attributes(x) |
| 173 | + attribs$checksum[[i]] <- attributes(value)$checksum |
| 174 | + attribs$names[[i]] <- attributes(value)$names |
| 175 | + x <- unclass(x) |
| 176 | + x[[i]] <- unclass(value) |
| 177 | + attributes(x) <- attribs |
| 178 | + x |
| 179 | +} |
| 180 | + |
| 181 | +#' @rdname cas |
| 182 | +#' @name [<-.cas |
| 183 | +#' @export |
| 184 | +`[<-.cas` <- function(x, i, value) { |
| 185 | + value <- as.cas(value) |
| 186 | + attribs <- attributes(x) |
| 187 | + attribs$checksum[i] <- attributes(value)$checksum |
| 188 | + attribs$names[i] <- attributes(value)$names |
| 189 | + x <- unclass(x) |
| 190 | + x[i] <- unclass(value) |
| 191 | + attributes(x) <- attribs |
| 192 | + x |
| 193 | +} |
| 194 | + |
| 195 | +#' @rdname cas |
| 196 | +#' @name format.cas |
| 197 | +#' @export |
| 198 | +format.cas <- function(x, hyphenate = TRUE, ...) { |
| 199 | + checksums <- attributes(x)$checksum |
| 200 | + x <- unclass(x) |
| 201 | + repp <- x |
| 202 | + repp[repp == 0] <- 1 |
| 203 | + repp <- ceiling(2 - log10(repp)) |
| 204 | + repp[repp < 0] <- 0 |
| 205 | + x <- paste0(strrep("0", repp), x) |
| 206 | + sprintf("%s%s%s%s%01i", |
| 207 | + stringr::str_sub(x, 1, -3), |
| 208 | + ifelse(hyphenate, "-", ""), |
| 209 | + stringr::str_sub(x, -2, -1), |
| 210 | + ifelse(hyphenate, "-", ""), |
| 211 | + as.numeric(checksums) |
| 212 | + ) |
| 213 | +} |
| 214 | + |
| 215 | +#' @rdname cas |
| 216 | +#' @name as.character.cas |
| 217 | +#' @export |
| 218 | +as.character.cas <- function(x, ...) { |
| 219 | + format(x, ...) |
| 220 | +} |
| 221 | + |
| 222 | +#' @rdname cas |
| 223 | +#' @name show.cas |
| 224 | +#' @export |
| 225 | +show.cas <- function(x, ...) { |
| 226 | + format(x, ...) |
| 227 | +} |
| 228 | + |
| 229 | +#' @rdname cas |
| 230 | +#' @name print.cas |
| 231 | +#' @export |
| 232 | +print.cas <- function(x, ...) { |
| 233 | + if (length(x) == 0) |
| 234 | + cat("cas(0)\n") else print(format.cas(x), ...) |
| 235 | +} |
| 236 | + |
| 237 | +#' @rdname cas |
| 238 | +#' @name as.list.cas |
| 239 | +#' @export |
| 240 | +as.list.cas <- function(x, ...) { |
| 241 | + lapply(seq_along(x), function(i) x[i]) |
| 242 | +} |
| 243 | + |
| 244 | +#' @rdname cas |
| 245 | +#' @name as.double.cas |
| 246 | +#' @export |
| 247 | +as.double.cas <- function(x, ...) { |
| 248 | + as.double(as.integer.cas(x, ...), ...) |
| 249 | +} |
| 250 | + |
| 251 | +#' @rdname cas |
| 252 | +#' @name as.integer.cas |
| 253 | +#' @export |
| 254 | +as.integer.cas <- function(x, ...) { |
| 255 | + checksums <- as.integer(attributes(x)$checksum, ...) |
| 256 | + x <- 10L*unclass(x) |
| 257 | + attributes(x) <- NULL |
| 258 | + x + checksums |
| 259 | +} |
| 260 | + |
| 261 | +#' @rdname cas |
| 262 | +#' @name c.cas |
| 263 | +#' @export |
| 264 | +c.cas <- function(...) { |
| 265 | + result <- list(...) |
| 266 | + result <- lapply(result, as.cas) |
| 267 | + checksums <- do.call(c, lapply(result, function(x) attributes(x)$checksum)) |
| 268 | + result <- do.call(c, lapply(result, function(x) unclass(x))) |
| 269 | + class(result) <- "cas" |
| 270 | + attributes(result)$checksum <- checksums |
| 271 | + result |
| 272 | +} |
| 273 | + |
| 274 | +#' @rdname cas |
| 275 | +#' @name as.data.frame.cas |
| 276 | +#' @export |
| 277 | +as.data.frame.cas <- function(...) { |
| 278 | + as.data.frame(tibble::tibble(...)) |
| 279 | +} |
0 commit comments