Merge pull request #1797 from lldelisle/fix_annotations

Be more flexible for annotation
stuart-lab · Oct 21, 2024 · 4fe0f8b · 4fe0f8b
2 parents f870f90 + 50670ef
commit 4fe0f8b
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 7 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: Signac
 Title: Analysis of Single-Cell Chromatin Data
-Version: 1.14.9000
+Version: 1.14.9001
 Date: 2024-10-21
 Authors@R: c(
   person(given = 'Tim', family = 'Stuart', email = '[email protected]', role = c('aut', 'cre'), comment = c(ORCID = '0000-0002-3044-0897')),

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 Other changes:
 
 * Improve error messages for `FindMotifs()` ([#1788](https://github.com/stuart-lab/signac/issues/1788))
+* Add documentation about the required format for gene annotations, and ensure this format is present when creating the assay ([#1797](https://github.com/stuart-lab/signac/pull/1797); [@lldelisle](https://github.com/lldelisle))
 
 # Signac 1.14.0
 

diff --git a/R/objects.R b/R/objects.R
@@ -156,7 +156,14 @@ ChromatinAssay <- setClass(
 #' information about the genome used. Alternatively, the name of a UCSC genome
 #' can be provided and the sequence information will be downloaded from UCSC.
 #' @param annotation A set of \code{\link[GenomicRanges]{GRanges}} containing
-#' annotations for the genome used
+#' annotations for the genome used. It must have the following metadata columns:
+#' \itemize{
+#'   \item{tx_id or transcript_id: Transcript ID}
+#'   \item{gene_name: Gene name}
+#'   \item{gene_id: Gene ID}
+#'   \item{gene_biotype: Gene biotype (e.g. "protein_coding", "lincRNA")}
+#'   \item{type: Annotation type (e.g. "exon", "gap")}
+#' }
 #' @param bias A Tn5 integration bias matrix
 #' @param positionEnrichment A named list of matrices containing positional
 #' signal enrichment information for each cell. Should be a cell x position
@@ -173,6 +180,7 @@ ChromatinAssay <- setClass(
 #' @importFrom SeuratObject CreateAssayObject
 #' @importFrom Matrix rowSums colSums
 #' @importFrom GenomicRanges isDisjoint
+#' @importFrom S4Vectors mcols
 #' @concept assay
 #'
 #' @export
@@ -217,6 +225,12 @@ CreateChromatinAssay <- function(
   if (!is.null(x = annotation) & !inherits(x = annotation, what = "GRanges")) {
     stop("Annotation must be a GRanges object.")
   }
+  # The annotation is optional (NULL is accepted by the class check above),
+  # and mcols() has no method for NULL, so only validate the metadata
+  # columns when an annotation was actually supplied.
+  if (!is.null(x = annotation)) {
+    annotation.cols <- colnames(x = mcols(x = annotation))
+    # the transcript ID may be stored under either of these two names
+    if (!any(c("tx_id", "transcript_id") %in% annotation.cols)) {
+      stop("Annotation must have transcript id stored in `tx_id` or `transcript_id`.")
+    }
+    # every one of the gene-level columns must be present
+    required.cols <- c("gene_name", "gene_id", "gene_biotype", "type")
+    if (!all(required.cols %in% annotation.cols)) {
+      stop("Annotation must have `gene_name`, `gene_id`, `gene_biotype` and `type`.")
+    }
+  }
   # remove low-count cells
   ncount.cell <- colSums(x = data.use > 0)
   data.use <- data.use[, ncount.cell >= min.features]
@@ -349,7 +363,14 @@ CreateChromatinAssay <- function(
 #' @param seqinfo A \code{\link[GenomeInfoDb]{Seqinfo}} object containing basic
 #' information about the genome used. Alternatively, the name of a UCSC genome
 #' can be provided and the sequence information will be downloaded from UCSC.
-#' @param annotation Genomic annotation
+#' @param annotation Genomic annotation. It must have the following metadata columns:
+#' \itemize{
+#'   \item{tx_id or transcript_id: Transcript ID}
+#'   \item{gene_name: Gene name}
+#'   \item{gene_id: Gene ID}
+#'   \item{gene_biotype: Gene biotype (e.g. "protein_coding", "lincRNA")}
+#'   \item{type: Annotation type (e.g. "exon", "gap")}
+#' }
 #' @param motifs A \code{\link{Motif}} object
 #' @param fragments A list of \code{\link{Fragment}} objects
 #' @param bias Tn5 integration bias matrix
@@ -790,6 +811,7 @@ RenameCells.Fragment <- function(object, new.names, ...) {
 #' @importFrom SeuratObject SetAssayData
 #' @importFrom GenomeInfoDb genome Seqinfo
 #' @importFrom lifecycle deprecated is_present
+#' @importFrom S4Vectors mcols
 #' @method SetAssayData ChromatinAssay
 #' @concept assay
 #' @export
@@ -867,9 +889,18 @@ SetAssayData.ChromatinAssay <- function(
     annotation.genome <- unique(x = genome(x = new.data))
     if (!is.null(x = current.genome)) {
       if (!is.na(x = annotation.genome) &
-          (current.genome != annotation.genome)) {
+        (current.genome != annotation.genome)) {
         stop("Annotation genome does not match genome of the object")
-        }
+      }
+    }
+    if (!any(c("tx_id", "transcript_id") %in% colnames(x = mcols(x = new.data)))) {
+      stop("Annotation must have transcript id stored in `tx_id` or `transcript_id`.")
+    }
+    if (any(!c("gene_name", "gene_id", "gene_biotype", "type") %in% colnames(x = mcols(x = new.data)))) {
+      stop("Annotation must have `gene_name`, `gene_id`, `gene_biotype` and `type`.")
+    }
+    if (!"tx_id" %in% colnames(x = mcols(x = new.data))) {
+      new.data$tx_id <- new.data$transcript_id
     }
     methods::slot(object = object, name = layer) <- new.data
   } else if (layer == "bias") {

diff --git a/man/CreateChromatinAssay.Rd b/man/CreateChromatinAssay.Rd
diff --git a/man/as.ChromatinAssay.Rd b/man/as.ChromatinAssay.Rd