Skip to content

Commit 33c5336

Browse files
cas_handlers was missing in original commit
1 parent dfbdcae commit 33c5336

File tree

1 file changed

+279
-0
lines changed

1 file changed

+279
-0
lines changed

R/cas_handlers.r

+279
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
#' Functions for handling chemical abstract service (CAS) registry numbers
2+
#'
3+
#' Functions for handling chemical abstract service (CAS) registry numbers
4+
#'
5+
#' In the database \href{https://en.wikipedia.org/wiki/Chemical_Abstracts_Service}{CAS registry} numbers are stored
6+
#' as text (type \code{character}). As CAS numbers can consist of a maximum of 10 digits (plus two hyphens) this means
7+
#' that each CAS number can consume up to 12 bytes of memory or disk space. By storing the data numerically, only
8+
#' 5 bytes are required. These functions provide the means to handle CAS registry numbers and coerce from and to
9+
#' different formats and types.
10+
#' @param x Object from which data needs to be extracted or replaced, or needs to be coerced into a specific
11+
#' format. For nearly all of the functions documented here, this needs to be an object of the S3 class 'cas',
12+
#' which can be created with \code{as.cas}. For \code{as.cas}, \code{x} can be a \code{character} (CAS registry number
13+
#' with or without hyphenation) or a \code{numeric} value. Note that \code{as.cas} will only accept correctly
14+
#' formatted and valid CAS registry numbers.
15+
#' @param i Index specifying element(s) to extract or replace. See also \code{\link[base:Extract]{Extract}}.
16+
#' @param value A replacement value, can be anything that can be converted into an S3 cas-class object with \code{as.cas}.
17+
#' @param length A non-negative \code{integer} specifying the desired length. Double values will be coerced to
18+
#' \code{integer}: supplying an argument of length other than one is an error.
19+
#' @param hyphenate A \code{logical} value indicating whether the formatted CAS number needs to be hyphenated.
20+
#' Default is \code{TRUE}.
21+
#' @param ... Arguments passed to other functions
22+
#' @return Functions \code{cas}, \code{c} and \code{as.cas} return S3 class 'cas' objects. Coercion functions
23+
#' (starting with 'as') return the object as specified by their respective function names (i.e., \code{integer},
24+
#' \code{double}, \code{character}, \code{list} and \code{data.frame}). The \code{show.cas} and \code{print} functions
25+
#' also return formatted \code{charater}s. The function \code{is.cas} will return a single \code{logical} value,
26+
#' indicating whether \code{x} is a valid S3 cas-class object. The square brackets return the selected index/indices,
27+
#' or the \code{vector} of cas objects where the selected elements are replaced by \code{value}.
28+
#' @rdname cas
29+
#' @name cas
30+
#' @examples
31+
#' ## This will generate a vector of cas objects containing 10
32+
#' ## fictive (0-00-0), but valid registry numbers:
33+
#' cas(10)
34+
#'
35+
#' ## This is a cas-object:
36+
#' is.cas(cas(0L))
37+
#'
38+
#' ## This is not a cas-object:
39+
#' is.cas(0L)
40+
#'
41+
#' ## Three different ways of creating a cas object from
42+
#' ## Benzene's CAS registry number (the result is the same)
43+
#' as.cas("71-43-2")
44+
#' as.cas("71432")
45+
#' as.cas(71432L)
46+
#'
47+
#' ## This is one way of creating a vector with multiple CAS registry numbers:
48+
#' cas_data <- as.cas(c("64175", "71432", "58082"))
49+
#'
50+
#' ## This is how you select a specific element(s) from the vector:
51+
#' cas_data[2:3]
52+
#' cas_data[[2]]
53+
#'
54+
#' ## You can also replace specific elements in the vector:
55+
#' cas_data[1] <- "7440-23-5"
56+
#' cas_data[[2]] <- "129-00-0"
57+
#'
58+
#' ## You can format CAS numbers with or without hyphens:
59+
#' format(cas_data, TRUE)
60+
#' format(cas_data, FALSE)
61+
#'
62+
#' ## The same can be achieved using as.character
63+
#' as.character(cas_data, TRUE)
64+
#' as.character(cas_data, FALSE)
65+
#'
66+
#' ## There are also show and print methods available:
67+
#' show(cas_data)
68+
#' print(cas_data)
69+
#'
70+
#' ## Numeric values can be obtained from CAS using as.numeric, as.double or as.integer
71+
#' as.numeric(cas_data)
72+
#'
73+
#' ## Be careful, however. Some CAS numbers cannot be represented by R's 32 bit integers
74+
#' ## and will produce NA's. This will work OK:
75+
#' huge_cas <- as.cas("9999999-99-5")
76+
#'
77+
#' \dontrun{
78+
#' ## This will not:
79+
#' as.integer(huge_cas)
80+
#' }
81+
#'
82+
#' ## The trick applied by this package is that the final
83+
#' ## validation digit is stored separately as attribute:
84+
#' unclass(huge_cas)
85+
#'
86+
#' ## This is how cas objects can be concatenated:
87+
#' cas_data <- c(huge_cas, cas_data)
88+
#'
89+
#' ## This will create a data.frame
90+
#' as.data.frame(cas_data)
91+
#'
92+
#' ## This will create a list:
93+
#' as.list(cas_data)
94+
#' @author Pepijn de Vries
95+
#' @export
96+
cas <- function(length = 0L) {
97+
structure (
98+
integer(length), ## The registry number is stored as integer without the final digit (=checksum)
99+
checksum = raw(length), ## last digit of CAS number, which serves as a checksum, stored as raw value
100+
class = "cas"
101+
)
102+
}
103+
104+
#' @rdname cas
105+
#' @name is.cas
106+
#' @export
107+
is.cas <- function(x) {
108+
if (!(class(x) %in% "cas")) return(F)
109+
checksums <- attributes(x)$checksum
110+
if (length(checksums) != length(x)) stop("Each CAS registry in the vector needs a checksum")
111+
validate <- outer(unclass(x), 0:9, function(x, y) {
112+
floor(x/(10^y)) %% 10
113+
})
114+
validate <- apply(validate, 1, function(x) {
115+
x <- sum(seq_along(x)*x) %% 10
116+
})
117+
return(all(validate == as.numeric(checksums)))
118+
}
119+
120+
#' @rdname cas
121+
#' @name as.cas
122+
#' @export
123+
as.cas <- function(x) {
124+
if (is.cas(x)) return(x)
125+
x <- as.character(x)
126+
is_hyphenated <- stringr::str_sub(x, -2, -2) == "-" & stringr::str_sub(x, -5, -5) == "-"
127+
x[is_hyphenated] <- paste0(
128+
stringr::str_sub(x[is_hyphenated], 1, -6),
129+
stringr::str_sub(x[is_hyphenated], -4, -3),
130+
stringr::str_sub(x[is_hyphenated], -1, -1)
131+
)
132+
if (any(!grepl("^[0-9]+$", x))) stop("CAS numbers can only contain hyphens at correct positions and numeric characters otherwise...")
133+
registry <- as.integer(stringr::str_sub(x, 1, -2))
134+
registry[is.na(registry)] <- 0L
135+
attributes(registry)$checksum <- as.raw(as.integer(stringr::str_sub(x, -1, -1)))
136+
class(registry) <- "cas"
137+
if (!is.cas(registry)) stop("Input contains invalid CAS numbers")
138+
registry
139+
}
140+
141+
#' @rdname cas
142+
#' @name [[.cas
143+
#' @export
144+
`[[.cas` <- function(x, i) {
145+
attribs <- attributes(x)
146+
attribs$checksum <- attribs$checksum[[i]]
147+
attribs$names <- attribs$names[[i]]
148+
x <- unclass(x)
149+
x <- x[[i]]
150+
attributes(x) <- attribs
151+
x
152+
}
153+
154+
#' @rdname cas
155+
#' @name [.cas
156+
#' @export
157+
`[.cas` <- function(x, i) {
158+
attribs <- attributes(x)
159+
attribs$checksum <- attribs$checksum[i]
160+
attribs$names <- attribs$names[i]
161+
x <- unclass(x)
162+
x <- x[i]
163+
attributes(x) <- attribs
164+
x
165+
}
166+
167+
#' @rdname cas
168+
#' @name [[<-.cas
169+
#' @export
170+
`[[<-.cas` <- function(x, i, value) {
171+
value <- as.cas(value)
172+
attribs <- attributes(x)
173+
attribs$checksum[[i]] <- attributes(value)$checksum
174+
attribs$names[[i]] <- attributes(value)$names
175+
x <- unclass(x)
176+
x[[i]] <- unclass(value)
177+
attributes(x) <- attribs
178+
x
179+
}
180+
181+
#' @rdname cas
182+
#' @name [<-.cas
183+
#' @export
184+
`[<-.cas` <- function(x, i, value) {
185+
value <- as.cas(value)
186+
attribs <- attributes(x)
187+
attribs$checksum[i] <- attributes(value)$checksum
188+
attribs$names[i] <- attributes(value)$names
189+
x <- unclass(x)
190+
x[i] <- unclass(value)
191+
attributes(x) <- attribs
192+
x
193+
}
194+
195+
#' @rdname cas
196+
#' @name format.cas
197+
#' @export
198+
format.cas <- function(x, hyphenate = TRUE, ...) {
199+
checksums <- attributes(x)$checksum
200+
x <- unclass(x)
201+
repp <- x
202+
repp[repp == 0] <- 1
203+
repp <- ceiling(2 - log10(repp))
204+
repp[repp < 0] <- 0
205+
x <- paste0(strrep("0", repp), x)
206+
sprintf("%s%s%s%s%01i",
207+
stringr::str_sub(x, 1, -3),
208+
ifelse(hyphenate, "-", ""),
209+
stringr::str_sub(x, -2, -1),
210+
ifelse(hyphenate, "-", ""),
211+
as.numeric(checksums)
212+
)
213+
}
214+
215+
#' @rdname cas
216+
#' @name as.character.cas
217+
#' @export
218+
as.character.cas <- function(x, ...) {
219+
format(x, ...)
220+
}
221+
222+
#' @rdname cas
223+
#' @name show.cas
224+
#' @export
225+
show.cas <- function(x, ...) {
226+
format(x, ...)
227+
}
228+
229+
#' @rdname cas
230+
#' @name print.cas
231+
#' @export
232+
print.cas <- function(x, ...) {
233+
if (length(x) == 0)
234+
cat("cas(0)\n") else print(format.cas(x), ...)
235+
}
236+
237+
#' @rdname cas
238+
#' @name as.list.cas
239+
#' @export
240+
as.list.cas <- function(x, ...) {
241+
lapply(seq_along(x), function(i) x[i])
242+
}
243+
244+
#' @rdname cas
245+
#' @name as.double.cas
246+
#' @export
247+
as.double.cas <- function(x, ...) {
248+
as.double(as.integer.cas(x, ...), ...)
249+
}
250+
251+
#' @rdname cas
252+
#' @name as.integer.cas
253+
#' @export
254+
as.integer.cas <- function(x, ...) {
255+
checksums <- as.integer(attributes(x)$checksum, ...)
256+
x <- 10L*unclass(x)
257+
attributes(x) <- NULL
258+
x + checksums
259+
}
260+
261+
#' @rdname cas
262+
#' @name c.cas
263+
#' @export
264+
c.cas <- function(...) {
265+
result <- list(...)
266+
result <- lapply(result, as.cas)
267+
checksums <- do.call(c, lapply(result, function(x) attributes(x)$checksum))
268+
result <- do.call(c, lapply(result, function(x) unclass(x)))
269+
class(result) <- "cas"
270+
attributes(result)$checksum <- checksums
271+
result
272+
}
273+
274+
#' @rdname cas
275+
#' @name as.data.frame.cas
276+
#' @export
277+
as.data.frame.cas <- function(...) {
278+
as.data.frame(tibble::tibble(...))
279+
}

0 commit comments

Comments
 (0)