diff --git a/.gitignore b/.gitignore index aaa7aaec..1f0278ca 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,7 @@ po/*~ # RStudio Connect folder rsconnect/ + +.DS_Store +.ipynb_checkpoints/ +*.ipynb \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 6f410979..9dee7e14 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -69,6 +69,7 @@ Suggests: knitr, reticulate (>= 1.36.1), hdf5r (>= 1.3.11), + pizzarr, rmarkdown, S4Vectors, SeuratObject, diff --git a/NAMESPACE b/NAMESPACE index 71bb554d..001ea935 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,9 @@ export(from_Seurat) export(from_SingleCellExperiment) export(generate_dataset) export(read_h5ad) +export(read_zarr) export(write_h5ad) +export(write_zarr) importFrom(Matrix,as.matrix) importFrom(Matrix,sparseMatrix) importFrom(Matrix,t) diff --git a/R/Seurat.R b/R/Seurat.R index 3445abfa..8e5ac625 100644 --- a/R/Seurat.R +++ b/R/Seurat.R @@ -121,7 +121,7 @@ to_Seurat <- function(obj) { # nolint from_Seurat <- function( # nolint end: object_name_linter seurat_obj, - output_class = c("InMemoryAnnData", "HDF5AnnData"), + output_class = c("InMemoryAnnData", "HDF5AnnData", "ZarrAnnData"), assay = NULL, X = "counts", ...) { diff --git a/R/SingleCellExperiment.R b/R/SingleCellExperiment.R index f5890b5c..36e81c10 100644 --- a/R/SingleCellExperiment.R +++ b/R/SingleCellExperiment.R @@ -117,7 +117,7 @@ to_SingleCellExperiment <- function(object) { # nolint from_SingleCellExperiment <- function( # nolint end: object_name_linter sce, - output_class = c("InMemory", "HDF5AnnData"), + output_class = c("InMemory", "HDF5AnnData", "ZarrAnnData"), ...) { stopifnot( inherits(sce, "SingleCellExperiment") diff --git a/R/ZarrAnnData.R b/R/ZarrAnnData.R new file mode 100644 index 00000000..0a1eb45c --- /dev/null +++ b/R/ZarrAnnData.R @@ -0,0 +1,412 @@ +#' @title ZarrAnnData +#' +#' @description +#' Implementation of a Zarr-based AnnData object. +#' @noRd +ZarrAnnData <- R6::R6Class("ZarrAnnData", # nolint + inherit = AbstractAnnData, + private = list( + zarr_store = NULL, + zarr_root = NULL, + .compression = NULL + ), + active = list( + #' @field X The X slot + X = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_X, status=done + read_zarr_element(private$zarr_store, "/X") + } else { + # trackstatus: class=ZarrAnnData, feature=set_X, status=done + value <- private$.validate_aligned_array( + value, + "X", + shape = c(self$n_obs(), self$n_vars()), + expected_rownames = rownames(self), + expected_colnames = colnames(self) + ) + write_zarr_element(value, private$zarr_store, "/X", private$.compression, overwrite = TRUE) + } + }, + #' @field layers The layers slot. Must be NULL or a named list + #' with with all elements having the dimensions consistent with + #' `obs` and `var`. + layers = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_layers, status=done + read_zarr_element(private$zarr_store, "layers") + } else { + # trackstatus: class=ZarrAnnData, feature=set_layers, status=done + value <- private$.validate_aligned_mapping( + value, + "layers", + c(self$n_obs(), self$n_vars()), + expected_rownames = rownames(self), + expected_colnames = colnames(self) + ) + write_zarr_element(value, private$zarr_store, "/layers", private$.compression, overwrite = TRUE) + } + }, + #' @field obsm The obsm slot. Must be `NULL` or a named list with + #' with all elements having the same number of rows as `obs`. + obsm = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obsm, status=done + read_zarr_element(private$zarr_store, "obsm") + } else { + # trackstatus: class=ZarrAnnData, feature=set_obsm, status=done + value <- private$.validate_aligned_mapping( + value, + "obsm", + c(self$n_obs()), + expected_rownames = rownames(self) + ) + write_zarr_element(value, private$zarr_store, "/obsm") + } + }, + #' @field varm The varm slot. Must be `NULL` or a named list with + #' with all elements having the same number of rows as `var`. + varm = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_varm, status=done + read_zarr_element(private$zarr_store, "varm") + } else { + # trackstatus: class=ZarrAnnData, feature=set_varm, status=done + value <- private$.validate_aligned_mapping( + value, + "varm", + c(self$n_vars()), + expected_rownames = colnames(self) + ) + write_zarr_element(value, private$zarr_store, "/varm") + } + }, + #' @field obsp The obsp slot. Must be `NULL` or a named list with + #' with all elements having the same number of rows and columns as `obs`. + obsp = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obsp, status=done + read_zarr_element(private$zarr_store, "obsp") + } else { + # trackstatus: class=ZarrAnnData, feature=set_obsp, status=done + value <- private$.validate_aligned_mapping( + value, + "obsp", + c(self$n_obs(), self$n_obs()), + expected_rownames = rownames(self), + expected_colnames = rownames(self) + ) + write_zarr_element(value, private$zarr_store, "/obsp") + } + }, + #' @field varp The varp slot. Must be `NULL` or a named list with + #' with all elements having the same number of rows and columns as `var`. + varp = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_varp, status=done + read_zarr_element(private$zarr_store, "varp") + } else { + # trackstatus: class=ZarrAnnData, feature=set_varp, status=done + value <- private$.validate_aligned_mapping( + value, + "varp", + c(self$n_vars(), self$n_vars()), + expected_rownames = colnames(self), + expected_colnames = colnames(self) + ) + write_zarr_element(value, private$zarr_store, "/varp") + } + }, + + #' @field obs The obs slot + obs = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obs, status=done + # TODO: shall we keep include_index = TRUE, or get rid of the argument ? + read_zarr_element(private$zarr_store, "/obs", include_index = TRUE) + } else { + # trackstatus: class=ZarrAnnData, feature=set_obs, status=done + value <- private$.validate_obsvar_dataframe(value, "obs") + write_zarr_element( + value, + private$zarr_store, + "/obs", + private$.compression, + overwrite = TRUE + ) + } + }, + #' @field var The var slot + var = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_var, status=done + # TODO: shall we keep include_index = TRUE, or get rid of the argument ? + read_zarr_element(private$zarr_store, "/var", include_index = TRUE) + } else { + # trackstatus: class=ZarrAnnData, feature=set_var, status=done + value <- private$.validate_obsvar_dataframe(value, "var") + write_zarr_element( + value, + private$zarr_store, + "/var", + overwrite = TRUE + ) + } + }, + #' @field obs_names Names of observations + obs_names = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_obs_names, status=done + rownames(self$obs) + } else { + # trackstatus: class=ZarrAnnData, feature=set_obs_names, status=done + rownames(self$obs) <- value + } + }, + #' @field var_names Names of variables + var_names = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_var_names, status=done + rownames(self$var) + } else { + # trackstatus: class=ZarrAnnData, feature=set_var_names, status=done + rownames(self$var) <- value + } + }, + #' @field uns The uns slot. Must be `NULL` or a named list. + uns = function(value) { + if (missing(value)) { + # trackstatus: class=ZarrAnnData, feature=get_uns, status=done + read_zarr_element(private$zarr_store, "uns") + } else { + # trackstatus: class=ZarrAnnData, feature=set_uns, status=done + value <- private$.validate_named_list(value, "uns") + write_zarr_element(value, private$zarr_store, "/uns") + } + } + ), + public = list( + #' @description ZarrAnnData constructor + #' + #' @param store The Zarr store instance. + #' @param X Either `NULL` or a observation × variable matrix with + #' dimensions consistent with `obs` and `var`. + #' @param layers Either `NULL` or a named list, where each element is an + #' observation × variable matrix with dimensions consistent with `obs` and + #' `var`. + #' @param obs Either `NULL` or a `data.frame` with columns containing + #' information about observations. If `NULL`, an `n_obs`×0 data frame will + #' automatically be generated. + #' @param var Either `NULL` or a `data.frame` with columns containing + #' information about variables. If `NULL`, an `n_vars`×0 data frame will + #' automatically be generated. + #' @param obsm The obsm slot is used to store multi-dimensional annotation + #' arrays. It must be either `NULL` or a named list, where each element is a + #' matrix with `n_obs` rows and an arbitrary number of columns. + #' @param varm The varm slot is used to store multi-dimensional annotation + #' arrays. It must be either `NULL` or a named list, where each element is a + #' matrix with `n_vars` rows and an arbitrary number of columns. + #' @param obsp The obsp slot is used to store sparse multi-dimensional + #' annotation arrays. It must be either `NULL` or a named list, where each + #' element is a sparse matrix where each dimension has length `n_obs`. + #' @param varp The varp slot is used to store sparse multi-dimensional + #' annotation arrays. It must be either `NULL` or a named list, where each + #' element is a sparse matrix where each dimension has length `n_vars`. + #' @param uns The uns slot is used to store unstructured annotation. It must + #' be either `NULL` or a named list. + #' @param compression The compression algorithm to use when writing + #' Zarr arrays. Can be one of `"none"`, `"gzip"` or `"lzf"`. Defaults to + #' `"none"`. + #' + #' @details + #' The constructor creates a new Zarr AnnData interface object. This can + #' either be used to either connect to an existing `.zarr` store or to + #' populate an empty one. In both cases, any additional slots provided will be + #' set on the created object. This will cause data to be overwritten if the + #' file already exists. + initialize = function(store, + X = NULL, + obs = NULL, + var = NULL, + layers = NULL, + obsm = NULL, + varm = NULL, + obsp = NULL, + varp = NULL, + uns = NULL, + shape = NULL, + mode = c("r", "r+", "a", "w", "w-", "x"), + compression = c("none", "gzip", "lzf")) { + if (!requireNamespace("pizzarr", quietly = TRUE)) { + stop("The Zarr interface requires the 'pizzarr' package to be installed") + } + + # check arguments + compression <- match.arg(compression) + mode <- match.arg(mode) + + # store compression for later use + private$.compression <- compression + + root <- pizzarr::zarr_open_group(store, path = "/") + if (length(root$get_attrs()$to_list()) == 0) { + + # store private values + private$zarr_store <- store + + # Determine initial obs and var + shape <- get_shape(obs, var, X, shape) + obs <- get_initial_obs(obs, X, shape) + var <- get_initial_var(var, X, shape) + + # Create an empty Zarr + write_empty_zarr(store, obs, var, compression) + + # set other slots + if (!is.null(X)) { + self$X <- X + } + if (!is.null(layers)) { + self$layers <- layers + } + if (!is.null(obsm)) { + self$obsm <- obsm + } + if (!is.null(varm)) { + self$varm <- varm + } + if (!is.null(obsp)) { + self$obsp <- obsp + } + if (!is.null(varp)) { + self$varp <- varp + } + if (!is.null(uns)) { + self$uns <- uns + } + } else { + + # get root + root <- pizzarr::zarr_open_group(store, path = "/") + + # Check the file is a valid AnnData format + attrs <- root$get_attrs()$to_list() + + if (!all(c("encoding-type", "encoding-version") %in% names(attrs))) { + stop( + "H5AD encoding information is missing. ", + "This file may have been created with Python anndata<0.8.0." + ) + } + + # Set the file path + private$zarr_store <- store + private$zarr_root <- root + + # assert other arguments are NULL + if (!is.null(obs)) { + stop("obs must be NULL when loading an existing zarr store") + } + if (!is.null(var)) { + stop("var must be NULL when loading an existing zarr store") + } + if (!is.null(X)) { + stop("X must be NULL when loading an existing zarr store") + } + if (!is.null(layers)) { + stop("layers must be NULL when loading an existing zarr store") + } + if (!is.null(obsm)) { + stop("obsm must be NULL when loading an existing zarr store") + } + if (!is.null(varm)) { + stop("varm must be NULL when loading an existing zarr store") + } + if (!is.null(obsp)) { + stop("obsp must be NULL when loading an existing zarr store") + } + if (!is.null(varp)) { + stop("varp must be NULL when loading an existing zarr store") + } + if (!is.null(uns)) { + stop("uns must be NULL when loading an existing zarr store") + } + } + }, + + #' @description Number of observations in the AnnData object + n_obs = function() { + nrow(self$obs) + }, + + #' @description Number of variables in the AnnData object + n_vars = function() { + nrow(self$var) + } + ) +) + +#' Convert an AnnData object to a ZarrAnnData object +#' +#' This function takes an AnnData object and converts it to a ZarrAnnData +#' object, loading all fields into memory. +#' +#' @param adata An AnnData object to be converted to ZarrAnnData. +#' @param store The Zarr store. +#' @param compression The compression algorithm to use when writing +#' Zarr arrays. Can be one of `"none"`, `"gzip"` or `"lzf"`. Defaults to +#' `"none"`. +#' @param mode The mode to open the Zarr store. +#' +#' * `a` creates a new file or opens an existing one for read/write. +#' * `r` opens an existing file for reading. +#' * `r+` opens an existing file for read/write. +#' * `w` creates a file, truncating any existing ones. +#' * `w-`/`x` are synonyms, creating a file and failing if it already exists. +#' +#' @return a ZarrAnnData object with the same data as the input AnnData +#' object. +#' +#' @noRd +#' +#' @examples +#' ad <- AnnData( +#' X = matrix(1:5, 3L, 5L), +#' layers = list( +#' A = matrix(5:1, 3L, 5L), +#' B = matrix(letters[1:5], 3L, 5L) +#' ), +#' obs = data.frame(cell = 1:3), +#' var = data.frame(gene = 1:5), +#' obs_names = LETTERS[1:3], +#' var_names = letters[1:5] +#' ) +#' to_ZarrAnnData(ad, "test.zarr") +#' # remove store directory +#' unlink("test.zarr", recursive = TRUE) +# nolint start: object_name_linter +to_ZarrAnnData <- function( + # nolint end: object_name_linter + adata, + store, + compression = c("none", "gzip", "lzf"), + mode = c("w-", "r", "r+", "a", "w", "x")) { + stopifnot( + inherits(adata, "AbstractAnnData") + ) + ZarrAnnData$new( + store = store, + X = adata$X, + obs = adata$obs, + var = adata$var, + obsm = adata$obsm, + varm = adata$varm, + layers = adata$layers, + obsp = adata$obsp, + varp = adata$varp, + uns = adata$uns, + compression = compression, + shape = adata$shape(), + mode = mode + ) +} diff --git a/R/anndata_constructors.R b/R/anndata_constructors.R index 54d21ec7..b5646ebf 100644 --- a/R/anndata_constructors.R +++ b/R/anndata_constructors.R @@ -4,7 +4,8 @@ anndata_constructors <- function() { list( "HDF5AnnData" = HDF5AnnData, - "InMemoryAnnData" = InMemoryAnnData + "InMemoryAnnData" = InMemoryAnnData, + "ZarrAnnData" = ZarrAnnData ) } @@ -14,7 +15,7 @@ anndata_constructors <- function() { #' or `"InMemoryAnnData"`. #' #' @noRd -get_anndata_constructor <- function(class = c("HDF5AnnData", "InMemoryAnnData")) { +get_anndata_constructor <- function(class = c("HDF5AnnData", "ZarrAnnData", "InMemoryAnnData")) { # TODO: also support directly passing the correct class? class <- match.arg(class) anndata_constructors()[[class]] diff --git a/R/read_zarr.R b/R/read_zarr.R new file mode 100644 index 00000000..825f4eda --- /dev/null +++ b/R/read_zarr.R @@ -0,0 +1,50 @@ +#' Read Zarr +#' +#' Read data from a Zarr store +#' +#' @param path Path to the Zarr store to read +#' @param to The type of object to return. Must be one of: "InMemoryAnnData", +#' "HDF5AnnData", "SingleCellExperiment", "Seurat" +#' @param ... Extra arguments provided to `adata$to_SingleCellExperiment()` or +#' `adata$to_Seurat()`. See [AnnData()] for more information on the arguments of +#' these functions. Note: update this documentation when +#' [`r-lib/roxygen2#955`](https://github.com/r-lib/roxygen2/issues/955) is resolved. +#' +#' @return The object specified by `to` +#' @export +#' +#' @examples +#' file <- system.file("extdata", "example.zarr", package = "anndataR") +#' store <- pizzarr::DirectoryStore$new(file) +#' +#' # Read the Zarr store as a SingleCellExperiment object +#' if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { +#' sce <- read_zarr(store, to = "SingleCellExperiment") +#' } +#' +#' # Read the Zarr store as a Seurat object +#' if (requireNamespace("SeuratObject", quietly = TRUE)) { +#' seurat <- read_zarr(store, to = "Seurat") +#' } +read_zarr <- function( + path, + to = c("InMemoryAnnData", "HDF5AnnData", "SingleCellExperiment", "Seurat", "ZarrAnnData"), + ...) { + to <- match.arg(to) + + adata <- ZarrAnnData$new(path) + + fun <- switch(to, + "SingleCellExperiment" = to_SingleCellExperiment, + "Seurat" = to_Seurat, + "InMemoryAnnData" = to_InMemoryAnnData, + "HDF5AnnData" = to_HDF5AnnData, + "ZarrAnnData" = NULL + ) + + if (!is.null(fun)) { + fun(adata, ...) + } else { + adata + } +} diff --git a/R/read_zarr_helpers.R b/R/read_zarr_helpers.R new file mode 100644 index 00000000..b9ef85f0 --- /dev/null +++ b/R/read_zarr_helpers.R @@ -0,0 +1,508 @@ +#' Read Zarr encoding +#' +#' Read the encoding and version of an element in a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' +#' @return A named list with names type and version +#' +#' @noRd +read_zarr_encoding <- function(store, name, stop_on_error = TRUE) { + # Path can be to array or group + g <- pizzarr::zarr_open(store, path = name) + attrs <- g$get_attrs()$to_list() + + if (!all(c("encoding-type", "encoding-version") %in% names(attrs))) { + if (stop_on_error) { + stop( + "Encoding attributes not found for element '", name, "' " + ) + } else { + return(NULL) + } + } + + list( + type = attrs[["encoding-type"]], + version = attrs[["encoding-version"]] + ) +} + +#' Read Zarr element +#' +#' Read an element from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param type The encoding type of the element to read +#' @param version The encoding version of the element to read +#' @param stop_on_error Whether to stop on error or generate a warning instead +#' @param ... Extra arguments passed to individual reading functions +#' +#' @details +#' Encoding is automatically determined from the element using +#' `read_zarr_encoding` and used to select the appropriate reading function. +#' +#' @return Value depending on the encoding +#' +#' @noRd +read_zarr_element <- function(store, name, type = NULL, version = NULL, stop_on_error = FALSE, ...) { + if (is.null(type)) { + encoding_list <- read_zarr_encoding(store, name, stop_on_error = stop_on_error) + if (is.null(encoding_list)) { + if (stop_on_error) { + stop("No encoding info found for element '", name, "'") + } else { + warning("No encoding found for element '", name, "'") + return(NULL) + } + } + type <- encoding_list$type + version <- encoding_list$version + } + + read_fun <- switch(type, + "array" = read_zarr_dense_array, + "rec-array" = read_zarr_rec_array, + "csr_matrix" = read_zarr_csr_matrix, + "csc_matrix" = read_zarr_csc_matrix, + "dataframe" = read_zarr_data_frame, + "dict" = read_zarr_mapping, + "string" = read_zarr_string_scalar, + "numeric-scalar" = read_zarr_numeric_scalar, + "categorical" = read_zarr_categorical, + "string-array" = read_zarr_string_array, + "nullable-integer" = read_zarr_nullable_integer, + "nullable-boolean" = read_zarr_nullable_boolean, + stop( + "No function for reading H5AD encoding '", type, + "' for element '", name, "'" + ) + ) + + tryCatch( + { + read_fun(store = store, name = name, version = version, ...) + }, + error = function(e) { + message <- paste0( + "Error reading element '", name, "' of type '", type, "':\n", + conditionMessage(e) + ) + if (stop_on_error) { + stop(message) + } else { + warning(message) + return(NULL) + } + } + ) +} + +read_zarr_array <- function(store, name) { + zarr_arr <- pizzarr::zarr_open_array(store, path = name) + nested_arr <- zarr_arr$get_item("...") + return(nested_arr$data) +} + +#' Read Zarr dense array +#' +#' Read a dense array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a matrix or a vector if 1D +#' +#' @noRd +read_zarr_dense_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + # Extract the NestedArray contents as a base R array. + darr <- read_zarr_array(store, name) + + + # TODO: ideally, native = TRUE should take care of the row order and column order, + # but it doesn't + # If the dense array is a 1D matrix, convert to vector + if (length(dim(darr)) == 1) { + darr <- as.vector(darr) + } + darr +} + +read_zarr_csr_matrix <- function(store, name, version) { + read_zarr_sparse_array( + store = store, + name = name, + version = version, + type = "csr_matrix" + ) +} + +read_zarr_csc_matrix <- function(store, name, version) { + read_zarr_sparse_array( + store = store, + name = name, + version = version, + type = "csc_matrix" + ) +} + +#' Read Zarr sparse array +#' +#' Read a sparse array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' @param type Type of the sparse matrix, either "csr_matrix" or "csc_matrix" +#' +#' @return a sparse matrix/DelayedArray???, or a vector if 1D +#' @importFrom Matrix sparseMatrix +#' +#' @noRd +read_zarr_sparse_array <- function(store, name, version = "0.1.0", + type = c("csr_matrix", "csc_matrix")) { + version <- match.arg(version) + type <- match.arg(type) + + g <- pizzarr::zarr_open_group(store, path = name) + + data <- as.vector(read_zarr_array(store, paste0(name, "/data"))) + indices <- as.vector(read_zarr_array(store, paste0(name, "/indices"))) + indptr <- as.vector(read_zarr_array(store, paste0(name, "/indptr"))) + shape <- as.vector(unlist(g$get_attrs()$to_list()$shape, use.names = FALSE)) + + if (type == "csc_matrix") { + mtx <- Matrix::sparseMatrix( + i = indices, + p = indptr, + x = data, + dims = shape, + repr = "C", + index1 = FALSE + ) + } else if (type == "csr_matrix") { + mtx <- Matrix::sparseMatrix( + j = indices, + p = indptr, + x = data, + dims = shape, + repr = "R", + index1 = FALSE + ) + } + + mtx +} + +#' Read Zarr recarray +#' +#' Read a recarray from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @details +#' A "record array" (recarray) is a Python NumPy array type that contains +#' "fields" that can be indexed using attributes (similar to columns in a +#' spreadsheet). See https://numpy.org/doc/stable/reference/generated/numpy.recarray.html +#' for details. +#' +#' They are used by **scanpy** to score marker gene testing results. +#' +#' @return a named list of 1D arrays +#' +#' @noRd +read_zarr_rec_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + stop("Reading recarrays is not yet implemented") +} + +#' Read Zarr nullable boolean +#' +#' Read a nullable boolean from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a boolean vector +#' +#' @noRd +read_zarr_nullable_boolean <- function(store, name, version = "0.1.0") { + as.logical(read_zarr_nullable(store, name, version)) +} + +#' Read Zarr nullable integer +#' +#' Read a nullable integer from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return an integer vector +#' +#' @noRd +read_zarr_nullable_integer <- function(store, name, version = "0.1.0") { + as.integer(read_zarr_nullable(store, name, version)) +} + +#' Read Zarr nullable +#' +#' Read a nullable vector (boolean or integer) from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a nullable vector +#' +#' @noRd +read_zarr_nullable <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + + mask <- read_zarr_array(store, paste0(name, "/mask")) + values <- read_zarr_array(store, paste0(name, "/values")) + + # Get values and set missing + element <- values + element[mask] <- NA + + return(element) +} + +#' Read Zarr string array +#' +#' Read a string array from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a character vector/matrix +#' +#' @noRd +read_zarr_string_array <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + # reads in transposed + string_array <- read_zarr_array(store, name) + + # If the array is 1D, convert to vector + if (length(dim(string_array)) == 1) { + string_array <- as.vector(string_array) + } + + string_array +} + +#' Read Zarr categorical +#' +#' Read a categorical from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a factor +#' +#' @noRd +read_zarr_categorical <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + codes <- read_zarr_array(store, paste0(name, "/codes")) + categories <- read_zarr_array(store, paste0(name, "/categories")) + + # Get codes and convert to 1-based indexing + codes <- codes + 1 + + if (!length(dim(codes)) == 1) { + stop("There is currently no support for multidimensional categorical arrays") + } + + # Set missing values + codes[codes == 0] <- NA + + levels <- categories + + g <- pizzarr::zarr_open_group(store, path = name) + + attributes <- g$get_attrs()$to_list() + ordered <- attributes[["ordered"]] + if (is.null(ordered) || is.na(ordered)) { + # This version of {rhdf5} doesn't yet support ENUM type attributes so we + # can't tell if the categorical should be ordered, + # see https://github.com/grimbough/rhdf5/issues/125 + warning( + "Unable to determine if categorical '", name, + "' is ordered, assuming it isn't" + ) + + ordered <- FALSE + } + + factor(codes, labels = levels, ordered = ordered) +} + +#' Read Zarr string scalar +#' +#' Read a string scalar from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a character vector of length 1 +#' +#' @noRd +read_zarr_string_scalar <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + scalar <- as.character(read_zarr_array(store, name)) + return(scalar) +} + +#' Read Zarr numeric scalar +#' +#' Read a numeric scalar from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a numeric vector of length 1 +#' +#' @noRd +read_zarr_numeric_scalar <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + scalar <- as.numeric(read_zarr_array(store, name)) + return(scalar) +} + +#' Read Zarr mapping +#' +#' Read a mapping from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return a named list +#' +#' @noRd +read_zarr_mapping <- function(store, name, version = "0.1.0") { + version <- match.arg(version) + + g <- pizzarr::zarr_open(store) + columns <- g$get_store()$listdir(name) + + # Omit Zarr metadata files from the list of columns. + columns <- columns[!columns %in% c(".zgroup", ".zattrs", ".zarray")] + + read_zarr_collection(store, name, columns) +} + +#' Read Zarr data frame +#' +#' Read a data frame from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' @param include_index Whether or not to include the index as a column +#' +#' @details +#' If `include_index == TRUE` the index stored in the Zarr store is added as a +#' column to output `data.frame` using the defined index name as the column +#' name and this is set as an attribute. If `include_index == FALSE` the index +#' is not provided in the output. In either case row names are not set. +#' +#' @return a data.frame +#' +#' @noRd +read_zarr_data_frame <- function(store, name, include_index = TRUE, + version = "0.2.0") { + version <- match.arg(version) + + g <- pizzarr::zarr_open_group(store, path = name) + + attributes <- g$get_attrs()$to_list() + index_name <- attributes$`_index` + column_order <- attributes$`column-order` + + columns <- read_zarr_collection(store, name, column_order) + + if (length(columns) == 0) { + index <- read_zarr_data_frame_index(store, name) + df <- data.frame(row.names = seq_along(index)) + } else { + df <- data.frame(columns) + } + + if (isTRUE(include_index)) { + index <- read_zarr_data_frame_index(store, name) + + # The default index name is not allowed as a column name so adjust it + if (index_name == "_index") { + rownames(df) <- index + } + + } + + df +} + +#' Read Zarr data frame index +#' +#' Read the index of a data frame from a Zarr store +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param version Encoding version of the element to read +#' +#' @return an object containing the index +#' +#' @noRd +read_zarr_data_frame_index <- function(store, name, version = "0.2.0") { + version <- match.arg(version) + + g <- pizzarr::zarr_open_group(store, path = name) + + attributes <- g$get_attrs()$to_list() + index_name <- attributes$`_index` + + read_zarr_element(store, file.path(name, index_name)) +} + +#' Read multiple Zarr datatypes +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param column_order Vector of item names (in order) +#' +#' @return a named list +#' +#' @noRd +read_zarr_collection <- function(store, name, column_order) { + columns <- list() + for (col_name in column_order) { + new_name <- paste0(name, "/", col_name) + tryCatch({ + encoding <- read_zarr_encoding(store, new_name) + columns[[col_name]] <- read_zarr_element( + store = store, + name = new_name, + type = encoding$type, + version = encoding$version + ) + }, error = function(cond) { + warning("Not reading file '", new_name, "' in collection") + }) + } + columns +} diff --git a/R/write_zarr.R b/R/write_zarr.R new file mode 100644 index 00000000..667df8fb --- /dev/null +++ b/R/write_zarr.R @@ -0,0 +1,101 @@ +#' Write Zarr +#' +#' Write a Zarr store +#' +#' @param object The object to write, either a "SingleCellExperiment" or a +#' "Seurat" object +#' @param path Path of the file (or zarr store) to write to +#' @param compression The compression algorithm to use when writing +#' Zarr arrays. Can be one of `"none"`, `"gzip"` or `"lzf"`. Defaults to +#' `"none"`. +#' @param mode The mode to open the Zarr store. +#' +#' * `a` creates a new file or opens an existing one for read/write. +#' * `r+` opens an existing file for read/write. +#' * `w` creates a file, truncating any existing ones +#' * `w-`/`x` are synonyms creating a file and failing if it already exists. +#' +#' @return `path` invisibly +#' @export +#' +#' @examples +#' adata <- AnnData( +#' X = matrix(1:5, 3L, 5L), +#' layers = list( +#' A = matrix(5:1, 3L, 5L), +#' B = matrix(letters[1:5], 3L, 5L) +#' ), +#' obs = data.frame(row.names = LETTERS[1:3], cell = 1:3), +#' var = data.frame(row.names = letters[1:5], gene = 1:5) +#' ) +#' store <- pizzarr::MemoryStore$new() +#' write_zarr(adata, store) +#' +#' # Write a SingleCellExperiment as a Zarr store +#' if (requireNamespace("SingleCellExperiment", quietly = TRUE)) { +#' ncells <- 100 +#' counts <- matrix(rpois(20000, 5), ncol = ncells) +#' logcounts <- log2(counts + 1) +#' +#' pca <- matrix(runif(ncells * 5), ncells) +#' tsne <- matrix(rnorm(ncells * 2), ncells) +#' +#' sce <- SingleCellExperiment::SingleCellExperiment( +#' assays = list(counts = counts, logcounts = logcounts), +#' reducedDims = list(PCA = pca, tSNE = tsne) +#' ) +#' +#' store <- pizzarr::MemoryStore$new() +#' write_zarr(sce, store) +#' } +#' +#' # Write a Seurat as a Zarr store +#' if (requireNamespace("SeuratObject", quietly = TRUE)) { +#' # TODO: uncomment this code when the seurat converter is fixed +#' # counts <- matrix(1:15, 3L, 5L) +#' # dimnames(counts) <- list( +#' # letters[1:3], +#' # LETTERS[1:5] +#' # ) +#' # gene.metadata <- data.frame( +#' # row.names = LETTERS[1:5], +#' # gene = 1:5 +#' # ) +#' # obj <- SeuratObject::CreateSeuratObject(counts, meta.data = gene.metadata) +#' # cell.metadata <- data.frame( +#' # row.names = letters[1:3], +#' # cell = 1:3 +#' # ) +#' # obj <- SeuratObject::AddMetaData(obj, cell.metadata) +#' # +#' # store <- pizzarr::MemoryStore$new() +#' # write_zarr(obj, store) +#' } +write_zarr <- function( + object, + path, + compression = c("none", "gzip", "lzf"), + mode = c("w-", "r", "r+", "a", "w", "x")) { + mode <- match.arg(mode) + if (inherits(object, "SingleCellExperiment")) { + from_SingleCellExperiment( + object, + output_class = "ZarrAnnData", + store = path, + compression = compression, + mode = mode + ) + } else if (inherits(object, "Seurat")) { + from_Seurat( + object, + output_class = "ZarrAnnData", + store = path, + compression = compression, + mode = mode + ) + } else if (inherits(object, "AbstractAnnData")) { + to_ZarrAnnData(object, path, compression = compression, mode = mode) + } else { + stop("Unable to write object of class: ", class(object)) + } +} diff --git a/R/write_zarr_helpers.R b/R/write_zarr_helpers.R new file mode 100644 index 00000000..a69135dc --- /dev/null +++ b/R/write_zarr_helpers.R @@ -0,0 +1,579 @@ +#' Write Zarr element +#' +#' Write an element to a Zarr store +#' +#' @param value The value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' #' @param stop_on_error Whether to stop on error or generate a warning instead +#' @param ... Additional arguments passed to writing functions +#' +#' @noRd +#' +#' @details +#' `write_zarr_element()` should always be used instead of any of the specific +#' writing functions as it contains additional boilerplate to make sure +#' elements are written correctly. +write_zarr_element <- function(value, store, name, compression = c("none", "gzip", "lzf"), stop_on_error = FALSE, ...) { # nolint + compression <- match.arg(compression) + + # Delete the path if it already exists + # TODO: https://github.com/keller-mark/pizzarr/issues/69 + + # Sparse matrices + write_fun <- + if (inherits(value, "sparseMatrix")) { # Sparse matrices + write_zarr_sparse_array + } else if (is.factor(value)) { # Categoricals + write_zarr_categorical + } else if (is.list(value)) { # Lists and data frames + if (is.data.frame(value)) { + write_zarr_data_frame + } else { + write_zarr_mapping + } + } else if (is.character(value)) { # Character values + if (length(value) == 1 && !is.matrix(value)) { + write_zarr_string_scalar + } else { + write_zarr_string_array + } + } else if (is.numeric(value) || inherits(value, "denseMatrix")) { # Numeric values + if (length(value) == 1 && !is.matrix(value)) { + write_zarr_numeric_scalar + } else if (is.integer(value) && any(is.na(value))) { + write_zarr_nullable_integer + } else { + write_zarr_dense_array + } + } else if (is.logical(value)) { # Logical values + if (any(is.na(value))) { + write_zarr_nullable_boolean + } else if (length(value) == 1) { + # Single Booleans should be written as numeric scalars + write_zarr_numeric_scalar + } else { + write_zarr_dense_array + } + } else { # Fail if unknown + stop("Writing '", class(value), "' objects to Zarr stores is not supported") + } + + + tryCatch( + { + write_fun(value = value, store = store, name = name, compression = compression, ...) + }, + error = function(e) { + message <- paste0( + "Could not write element '", name, "' of type '", class(value), "':\n", + conditionMessage(e) + ) + if (stop_on_error) { + stop(message) + } else { + warning(message) + return(NULL) + } + } + ) +} + +#' Write Zarr encoding +#' +#' Write Zarr encoding attributes to an element in a Zarr store +#' +#' @noRd +#' +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param encoding The encoding type to set +#' @param version The encoding version to set +write_zarr_encoding <- function(store, name, encoding, version) { + g <- pizzarr::zarr_open(store, path = name) + attrs <- g$get_attrs() + + attrs$set_item("encoding-type", encoding) + attrs$set_item("encoding-version", version) +} + +#' Write Zarr dense array +#' +#' Write a dense array to a Zarr store +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +#' +#' @noRd +write_zarr_dense_array <- function(value, + store, + name, + compression, + version = "0.2.0", + chunks = TRUE, + overwrite = FALSE) { + version <- match.arg(version) + + zarr_write_compressed(store, name, value, compression, chunks = chunks, overwrite = overwrite) + + # Write attributes + write_zarr_encoding(store, name, "array", version) +} + +#' Write Zarr sparse array +#' +#' Write a sparse array to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_sparse_array <- function(value, + store, + name, + compression, + version = "0.1.0", + overwrite = FALSE) { + version <- match.arg(version) + + # check types + stopifnot(inherits(value, "sparseMatrix")) + + if (inherits(value, "RsparseMatrix")) { + type <- "csr_matrix" + indices_attr <- "j" + } else if (inherits(value, "CsparseMatrix")) { + type <- "csc_matrix" + indices_attr <- "i" + } else { + stop( + "Unsupported matrix format in ", name, ".", + "Supported formats are RsparseMatrix and CsparseMatrix", + "(and objects that inherit from those)." + ) + } + + # Write sparse matrix + g <- pizzarr::zarr_open_group(store, path = name) + zarr_write_compressed(store, paste0(name, "/indices"), attr(value, indices_attr), compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/indptr"), value@p, compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/data"), value@x, compression, overwrite = overwrite) + + # Add encoding + write_zarr_encoding(store, name, type, version) + + # Write shape attribute + g$get_attrs()$set_item("shape", dim(value)) +} + +#' Write Zarr nullable boolean +#' +#' Write a nullable boolean to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_nullable_boolean <- function(value, store, name, compression, version = "0.1.0", overwrite = FALSE) { + # write mask and values + pizzarr::zarr_open_group(store, path = name) + value_no_na <- value + value_no_na[is.na(value_no_na)] <- FALSE + + zarr_write_compressed(store, paste0(name, "/values"), value_no_na, compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/mask"), is.na(value), compression, overwrite = overwrite) + + # Write attributes + write_zarr_encoding(store, name, "nullable-boolean", version) +} + +#' Write Zarr nullable integer +#' +#' Write a nullable integer to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_nullable_integer <- function(value, store, name, compression, version = "0.1.0", overwrite = FALSE) { + # write mask and values + pizzarr::zarr_open_group(store, path = name) + value_no_na <- value + value_no_na[is.na(value_no_na)] <- -1L + + zarr_write_compressed(store, paste0(name, "/values"), value_no_na, compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/mask"), is.na(value), compression, overwrite = overwrite) + + # Write attributes + write_zarr_encoding(store, name, "nullable-integer", version) +} + +#' Write Zarr string array +#' +#' Write a string array to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_string_array <- function(value, + store, + name, + compression, + version = "0.2.0", + overwrite = FALSE) { + + if (!is.null(dim(value))) { + dims <- dim(value) + } else { + dims <- length(value) + } + + object_codec <- pizzarr::VLenUtf8Codec$new() + data <- array(data = value, dim = dims) + # TODO: existing _index does not allow overwriting, so shall we keep overwrite=TRUE here ? + pizzarr::zarr_create_array(data, + store = store, path = name, dtype = "|O", + object_codec = object_codec, shape = dims, overwrite = TRUE) + + write_zarr_encoding(store, name, "string-array", version) +} + +#' Write Zarr categorical +#' +#' Write a categorical to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_categorical <- function(value, + store, + name, + compression, + version = "0.2.0", + overwrite = FALSE) { + pizzarr::zarr_open_group(store, path = name) + zarr_write_compressed(store, paste0(name, "/categories"), levels(value), compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/codes"), as.integer(value), compression, overwrite = overwrite) + zarr_write_compressed(store, paste0(name, "/ordered"), is.ordered(value), compression, overwrite = overwrite) + + write_zarr_encoding(store, name, "categorical", version) +} + +#' Write Zarr string scalar +#' +#' Write a string scalar to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_string_scalar <- function(value, + store, + name, + compression, + version = "0.2.0", + overwrite = FALSE) { + # Write scalar + object_codec <- pizzarr::VLenUtf8Codec$new() + value <- array(data = value, dim = 1) + pizzarr::zarr_create_array(value, + store = store, + path = name, + dtype = "|O", + object_codec = object_codec, + shape = 1, + overwrite = overwrite) + + # Write attributes + write_zarr_encoding(store, name, "string", version) +} + +#' Write Zarr numeric scalar +#' +#' Write a numeric scalar to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_numeric_scalar <- function(value, + store, + name, + compression, + version = "0.2.0", + overwrite = FALSE) { + # Write scalar + zarr_write_compressed(store, name, value, compression, overwrite = overwrite) + + # Write attributes + write_zarr_encoding(store, name, "numeric-scalar", version) +} + +#' Write Zarr mapping +#' +#' Write a mapping to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version Encoding version of the element to write +write_zarr_mapping <- function(value, store, name, compression, version = "0.1.0", overwrite = FALSE) { + pizzarr::zarr_open_group(store, path = name) + + # Write mapping elements + for (key in names(value)) { + write_zarr_element(value[[key]], store, paste0(name, "/", key), compression, overwrite = overwrite) + } + + write_zarr_encoding(store, name, "dict", version) +} + +#' Write Zarr data frame +#' +#' Write a data frame to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param index The index to write. Can either be a vector of length equal to +#' the number of rows in `values` or a single character string giving the name +#' of a column in `values`. If `NULL` then `rownames(value)` is used. +#' @param version Encoding version of the element to write +write_zarr_data_frame <- function(value, store, name, compression, index = NULL, + version = "0.2.0", overwrite = FALSE) { + g <- pizzarr::zarr_open_group(store, path = name) + write_zarr_encoding(store, name, "dataframe", version) + + if (is.null(index)) { + index_name <- "_index" + index_value <- rownames(value) + } else if (length(index) == nrow(value)) { + index_name <- "_index" + index_value <- index + } else if (length(index) == 1 && index %in% colnames(value)) { + index_name <- index + index_value <- value[[index_name]] + value[[index_name]] <- NULL + } else { + stop( + "index must be a vector with length `nrow(value)` or a single character", + "string giving the name of a column in `value`" + ) + } + + # Write index + write_zarr_data_frame_index(index_value, store, name, compression, index_name, overwrite = overwrite) + + # Write data frame columns + for (col in colnames(value)) { + write_zarr_element(value[[col]], store, paste0(name, "/", col), compression, overwrite = overwrite) + } + + # Write additional data frame attributes + col_order <- colnames(value) + col_order <- col_order[col_order != index_name] + # If there are no columns other than the index we set column order to an + # empty numeric vector + if (length(col_order) == 0) { + col_order <- numeric() + } + + g$get_attrs()$set_item("column-order", col_order) +} + +#' Write Zarr data frame index +#' +#' Write an for a data frame to a Zarr store +#' +#' @noRd +#' +#' @param value Value to write. Must be a vector to the same length as the data +#' frame. +#' @param store A Zarr store instance +#' @param name Name of the element within the Zarr store containing the data +#' frame +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param index_name Name of the data frame column storing the index +write_zarr_data_frame_index <- function(value, store, name, compression, index_name, overwrite = FALSE) { + if (!zarr_path_exists(store, name)) { + g <- pizzarr::zarr_open_group(store, path = name) + write_zarr_encoding(store, name, "dataframe", "0.2.0") + } + + encoding <- read_zarr_encoding(store, name) + if (encoding$type != "dataframe") { + stop("'", name, "' in '", store, "' is not a data frame") + } + + # Write index columns + write_zarr_element(value, store, paste0(name, "/", index_name), overwrite = overwrite) + + # Write data frame index attribute + g <- pizzarr::zarr_open_group(store, path = name) + g$get_attrs()$set_item("_index", index_name) +} + +#' Write empty Zarr +#' +#' Write a new empty Zarr store +#' +#' @noRd +#' +#' @param store Path to the Zarr store to write +#' @param obs Data frame with observations +#' @param var Data frame with variables +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' @param version The anndata on-disk format version to write +write_empty_zarr <- function(store, obs, var, compression, version = "0.1.0", overwrite = FALSE) { + pizzarr::zarr_open_group(store, path = "/") + write_zarr_encoding(store, "/", "anndata", "0.1.0") + + write_zarr_element(obs, store, "/obs", compression, overwrite = overwrite) + write_zarr_element(var, store, "/var", compression, overwrite = overwrite) + + pizzarr::zarr_open_group(store, path = "layers") + write_zarr_encoding(store, "/layers", "dict", "0.1.0") + + pizzarr::zarr_open_group(store, path = "obsm") + write_zarr_encoding(store, "/obsm", "dict", "0.1.0") + + pizzarr::zarr_open_group(store, path = "obsp") + write_zarr_encoding(store, "/obsp", "dict", "0.1.0") + + pizzarr::zarr_open_group(store, path = "uns") + write_zarr_encoding(store, "/uns", "dict", "0.1.0") + + pizzarr::zarr_open_group(store, path = "varm") + write_zarr_encoding(store, "/varm", "dict", "0.1.0") + + pizzarr::zarr_open_group(store, path = "varp") + write_zarr_encoding(store, "/varp", "dict", "0.1.0") +} + +#' Zarr path exists +#' +#' Check that a path in Zarr exists +#' +#' @noRd +#' +#' @param store Path to a Zarr store +#' @param target_path The path within the store to test for +#' +#' @return Whether the `target_path` exists in `store` +zarr_path_exists <- function(store, target_path) { + store <- pizzarr::zarr_open(store, path = "") + result <- tryCatch({ + if (store$contains_item(target_path)) { + # This should work for DirectoryStore but not yet for MemoryStore. + return(TRUE) + } + # Fall back to use get_item. + # This should work for any store but can fail if DirectoryStore tries to read a directory as a file. + store$get_item(target_path) + return(TRUE) + }, error = function(cond) { + if (pizzarr::is_key_error(cond)) { + return(FALSE) + } + stop(cond) + }, warnings = function(w) {}) + return(result) +} + +#' Zarr write compressed +#' +#' Write Zarr dataset with chosen compression (can be none) +#' +#' @noRd +#' +#' @param store Path to a Zarr store +#' @param name Name of the element within the Zarr store containing the data +#' frame +#' @param value Value to write. Must be a vector to the same length as the data +#' frame. +#' @param compression The compression to use when writing the element. Can be +#' one of `"none"`, `"gzip"` or `"lzf"`. Defaults to `"none"`. +#' +#' @return Whether the `path` exists in `file` +zarr_write_compressed <- function(store, + name, + value, + compression = c("none", "gzip", "lzf"), + chunks = TRUE, + overwrite = FALSE) { + compression <- match.arg(compression) + if (!is.null(dim(value))) { + dims <- dim(value) + } else { + dims <- length(value) + } + + object_codec <- NA + if (is.integer(value)) { + dtype <- "