Merge branch 'develop' of github.com:stuart-lab/signac-private into d…

…evelop
stuart-lab · Jun 10, 2024 · f483dff · f483dff
2 parents 0d4f276 + 22884f8
commit f483dff
Show file tree

Hide file tree

Showing 7 changed files with 220 additions and 10 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: Signac
 Title: Analysis of Single-Cell Chromatin Data
-Version: 1.13.9001
-Date: 2024-04-17
+Version: 1.13.9002
+Date: 2024-06-06
 Authors@R: c(
   person(given = 'Tim', family = 'Stuart', email = '[email protected]', role = c('aut', 'cre'), comment = c(ORCID = '0000-0002-3044-0897')),
   person(given = 'Avi', family = 'Srivastava', email = '[email protected]', role = 'aut', comment = c(ORCID = '0000-0001-9798-2079')),

diff --git a/NAMESPACE b/NAMESPACE
@@ -174,6 +174,7 @@ export(RunChromVAR)
 export(RunSVD)
 export(RunTFIDF)
 export(SetMotifData)
+export(SortIdents)
 export(SplitFragments)
 export(StringToGRanges)
 export(SubsetMatrix)
@@ -320,6 +321,7 @@ importFrom(SeuratObject,GetAssayData)
 importFrom(SeuratObject,Idents)
 importFrom(SeuratObject,Key)
 importFrom(SeuratObject,LayerData)
+importFrom(SeuratObject,Layers)
 importFrom(SeuratObject,Project)
 importFrom(SeuratObject,RenameCells)
 importFrom(SeuratObject,RowMergeSparseMatrices)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,12 @@
 # Unreleased
 
+New features:
+
+* Added `SortIdents()` function to automatically order cell metadata according
+to similarity ([@JavenTyr](https://github.com/JavenTyr))
+
+Other changes:
+
 * Change to SeuratObject v5.0.2 dependency
 * Increase R dependency to 4.1
 * Deprecate `CoverageBrowser()`

diff --git a/R/utilities.R b/R/utilities.R
@@ -180,6 +180,136 @@ CellsPerGroup <- function(
   return(lut)
 }
 
+#' Sorts cell metadata variable by similarity using hierarchical clustering
+#'
+#' Compute distance matrix from a feature/variable matrix and 
+#' perform hierarchical clustering to order variables (for example, cell types)
+#' according to their similarity. 
+#'
+#' @param object A Seurat object containing single-cell data.
+#' @param layer The layer of the data to use (default is "data").
+#' @param assay Name of assay to use. If NULL, use the default assay
+#' @param label Metadata attribute to sort. If NULL, 
+#' uses the active identities.
+#' @param dendrogram Logical, whether to plot the dendrogram (default is FALSE).
+#' @param method The distance method to use for hierarchical clustering
+#' (default is 'euclidean', other options from dist{stats} are 'maximum',
+#' 'manhattan', 'canberra', 'binary' and 'minkowski').
+#' @param verbose Display messages
+#' 
+#' @return The Seurat object with metadata variable reordered by similarity.
+#' If the metadata variable was a character vector, it will be converted to a
+#' factor and the factor levels set according to the similarity ordering. If
+#' active identities were used (label=NULL), the levels will be updated according
+#' to similarity ordering.
+#' 
+#' @examples
+#' atac_small$test <- sample(1:10, ncol(atac_small), replace = TRUE)
+#' atac_small <- SortIdents(object = atac_small, label = 'test')
+#' print(levels(atac_small$test))
+#'
+#' @importFrom stats dist hclust
+#' @importFrom SeuratObject Layers Idents Idents<- DefaultAssay LayerData
+#
+#' @export
+SortIdents <- function(
+    object,
+    layer = "data",
+    assay = NULL,
+    label = NULL,
+    dendrogram = FALSE,
+    method = 'euclidean',
+    verbose = TRUE
+){
+  assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
+  if (!(layer %in% Layers(object = object[[assay]]))) {
+    stop("Requested layer is not present in ", assay)
+  }
+  allowed.methods <- c("euclidean", "maximum", "manhattan",
+                       "canberra", "binary", "minkowski")
+  if (!(method %in% allowed.methods)) {
+    stop("Selected method must be one of: ",
+         paste(allowed.methods, collapse = ", "))
+  }
+  feat_cell_matrix <- LayerData(object, layer = layer, assay = assay)
+
+  if (is.null(x = label)) {
+    cell_types <- Idents(object = object)
+    uniq_cell_types = unique(x = cell_types)
+  } else {
+    if (length(x = label) > 1) {
+      stop("Label must be a single character vector or NULL")
+    }
+    if (!(label %in% colnames(x = object[[]]))) {
+      stop("Requested metadata '", label, "' not present in object")
+    }
+    cell_types <- object[[label]]
+    uniq_cell_types = unique(x = cell_types[, 1])
+  }
+
+  if (length(x = uniq_cell_types) / ncol(x = object) > 0.7) {
+    stop("Most cells have a different value for the requested metadata variable.
+           Are you sure this is a categorical variable?")
+  }
+  if (length(x = uniq_cell_types) < 3) {
+    stop("Must have more than three different variables")
+  }
+
+  # Initialize a one-hot matrix with rows representing cells 
+  # and columns representing cell types
+  one_hot_matrix <- matrix(data = 0,
+                           nrow = ncol(x = feat_cell_matrix),
+                           ncol = length(x = uniq_cell_types),
+                           dimnames = list(colnames(x = feat_cell_matrix),
+                                           uniq_cell_types)
+  )
+
+  # Fill in the one-hot matrix
+  for (i in seq_along(along.with = uniq_cell_types)) {
+    # convert to character in case the level of a factor is returned
+    cell_type <- as.character(x = uniq_cell_types[i])
+    one_hot_matrix[cell_types == cell_type, i] <- 1
+  }
+  cell_type_counts = colSums(x = one_hot_matrix)
+
+  if (verbose) {
+    message("Creating pseudobulk profiles for ", ncol(x = one_hot_matrix),
+            " variables across ", nrow(x = feat_cell_matrix), " features")
+  }
+
+  # Aggregate (sum) features by label
+  # Normalize by number of cells per label
+  bulk_matrix <- sweep(x = feat_cell_matrix %*% one_hot_matrix, MARGIN = 2,
+                       cell_type_counts, FUN = "/")
+
+  # Calculate distance matrix and perform hierarchical clustering
+  if (verbose) {
+    message("Computing ", method, " distance between pseudobulk profiles")
+  }
+  distance_matrix <- dist(x = t(x = bulk_matrix), method = method)
+  if (verbose) {
+    message("Clustering distance matrix")
+  }
+  hc <- hclust(d = distance_matrix)
+
+  if (dendrogram){
+    plot(hc, main = paste0("Assay: ", assay, "   Layer: ", layer), 
+         xlab = SetIfNull(x = label, y = "Idents"),
+         sub = "", cex = 0.9)
+  }
+
+  ordered_cell_types <- uniq_cell_types[hc$order]
+  if (is.null(x = label)) {
+    Idents(object = object) <- factor(x = Idents(object = object), 
+                                      levels = ordered_cell_types)
+  } else {
+    object[[label]] <- factor(x = object[[label]][, 1], 
+                              levels = ordered_cell_types)
+  }
+
+  return(object)
+}
+
 #' Closest Feature
 #'
 #' Find the closest feature to a given set of genomic regions

diff --git a/man/SortIdents.Rd b/man/SortIdents.Rd
diff --git a/tests/testthat/test-utilities.R b/tests/testthat/test-utilities.R
@@ -15,6 +15,22 @@ test_that("CellsPerGroup works", {
   )
 })
 
+test_that("SortIdents works",{
+  set.seed(1)
+  atac_small$test <- sample(1:10, ncol(atac_small), replace = TRUE)
+  atac_small <- SortIdents(object = atac_small, label = "test")
+  expect_equal(
+    object = levels(atac_small$test),
+    expected = c("10", "6", "3", "7", "9", "5", "4", "8", "1", "2")
+  )
+  Idents(atac_small) <- sample(1:10, ncol(atac_small), replace = TRUE)
+  atac_small <- SortIdents(object = atac_small)
+  expect_equal(
+    object = levels(Idents(atac_small)),
+    expected = c("1", "9", "10", "2", "3", "5", "4", "7", "6", "8")
+  )
+})
+
 test_that("GRanges conversion works", {
   correct_string <- c("chr1-1-10", "chr2-12-3121")
   correct_ranges <- GRanges(

diff --git a/vignettes/pbmc_vignette.Rmd b/vignettes/pbmc_vignette.Rmd
@@ -717,15 +717,18 @@ important information including gene annotation, peak coordinates, and genomic
 links (if they're stored in the object). See the
 [visualization vignette](visualization.html) for more information.
 
-We can then visualize the DA peaks open in CD4 naive cells and CD14 monocytes, 
-near some key marker genes for these cell types, CD4 and FCN1 respectively.
-Here we'll highlight the DA peaks regions in grey.
+For plotting purposes, it's nice to have related cell types grouped together. We
+can automatically sort the plotting order according to similarities across the 
+annotated cell types by running the `SortIdents()` function:    
 
 ```{r message=FALSE, warning=FALSE}
-# set plotting order
-levels(pbmc) <- sort(levels(pbmc))
+pbmc <- SortIdents(pbmc)
 ```
 
+We can then visualize the DA peaks open in CD4 naive cells and CD14 monocytes, 
+near some key marker genes for these cell types, CD4 and LYZ respectively.
+Here we'll highlight the DA peaks regions in grey.
+
 ```{r message=FALSE, warning=FALSE, out.width='90%', fig.height=6}
 # find DA peaks overlapping gene of interest
 regions_highlight <- subsetByOverlaps(StringToGRanges(open_cd4naive), LookupGeneCoords(pbmc, "CD4"))
@@ -740,14 +743,14 @@ CoveragePlot(
 ```
 
 ```{r message=FALSE, warning=FALSE, out.width='90%', fig.height=6}
-regions_highlight <- subsetByOverlaps(StringToGRanges(open_cd14mono), LookupGeneCoords(pbmc, "FCN1"))
+regions_highlight <- subsetByOverlaps(StringToGRanges(open_cd14mono), LookupGeneCoords(pbmc, "LYZ"))
 
 CoveragePlot(
   object = pbmc,
-  region = "FCN1",
+  region = "LYZ",
   region.highlight = regions_highlight,
   extend.upstream = 1000,
-  extend.downstream = 1000
+  extend.downstream = 5000
 )
 ```