added functions: plot_MST(), plot_exprDist()

JCVenterInstitute · Apr 19, 2022 · 088b1c2 · 088b1c2
1 parent d9f0159
commit 088b1c2
Show file tree

Hide file tree

Showing 13 changed files with 145 additions and 19 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,7 +16,7 @@ Authors@R: c(
            email = "[email protected]")
     )
 Description: FR-Match is a cell type cluster mapping algorithm for single cell RNA sequencing (scRNAseq) data. It is based on a statistical test called Friedman-Rafsky (FR) test, which is a multivariate generalization of nonparametric two-sample test. This package also provides visualization tools for the implemented method.
-Depends: R (>= 4.0.0), shiny (>= 1.2.0), SingleCellExperiment
+Depends: R (>= 4.0.0), shiny (>= 1.2.0), SingleCellExperiment, pbmcapply
 Imports: methods, S4Vectors, SummarizedExperiment, Seurat, scmap, lsa, igraph, ade4, tibble, dplyr, tidyr, forcats, magrittr, pheatmap, RColorBrewer, ggplot2, gridExtra, viridis
 Suggests: 
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,9 +8,11 @@ export(make_data_object)
 export(normalization)
 export(plot_FRmatch)
 export(plot_FRmatch_cell2cluster)
+export(plot_MST)
 export(plot_bi_FRmatch)
 export(plot_clusterSize)
 export(plot_cluster_by_markers)
+export(plot_exprDist)
 export(plot_nonzero)
 export(predict_most_similar_cluster)
 export(runShiny)

diff --git a/R/FRmatch.R b/R/FRmatch.R
@@ -13,11 +13,11 @@
 #' See details in \code{\link[FRmatch]{sce.example}}.
 # #' @param imputation INACTIVE. Logical variable indicating if to impute expression zero values for the reference experiment. Default: \code{FALSE}.
 # #' See details in \code{\link[FRmatch]{impute_dropout}}.
-#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 #' of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).
 #' @param method Methods for the FR test. Default: \code{method="subsampling"} is to iteratively subsample equal number of cells (i.e. subsample size)
 #' from the query and reference clusters, and then perform the FR test. Option: \code{method="none"} is the FR test with no modification.
-#' @param subsamp.size,subsamp.iter,subsamp.seed Subsample size, number of iterations, and random seed for \code{method="subsampling"}. YMMV.
+#' @param subsamp.size,subsamp.iter,subsamp.seed Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.
 #' @param numCores Number of cores for parallel computing.
 #' Default: \code{NULL}, use the maximum number of cores detected by \code{\link[parallel]{detectCores}} if not specified (an integer).
 #' @param prefix Prefix names for query and reference clusters. Default: \code{prefix=c("query.", "ref.")}.
@@ -49,8 +49,8 @@
 #' @export
 
 FRmatch <- function(sce.query, sce.ref, #imputation=FALSE,
-                    filter.size=10, filter.fscore=NULL, #filtering clusters
-                    method="subsampling", subsamp.size=5, subsamp.iter=1000, subsamp.seed=1, #subsampling
+                    filter.size=5, filter.fscore=NULL, #filtering clusters
+                    method="subsampling", subsamp.size=20, subsamp.iter=1000, subsamp.seed=1, #subsampling
                     numCores=NULL, prefix=c("query.", "ref."),
                     verbose=1, return.all=FALSE, ...){
 
@@ -75,8 +75,10 @@ FRmatch <- function(sce.query, sce.ref, #imputation=FALSE,
   # }
 
   ## extract info from sce.objects
-  querydat <- assay(sce.query) #matrix
-  refdat <- assay(sce.ref)
+  # querydat <- assay(sce.query) #matrix
+  # refdat <- assay(sce.ref)
+  querydat <- logcounts(sce.query) #matrix
+  refdat <- logcounts(sce.ref)
   membership.query <- colData(sce.query)$cluster_membership
   membership.ref <- colData(sce.ref)$cluster_membership
   order.query <- sce.query@metadata$cluster_order

diff --git a/R/FRmatch_cell2cluster.R b/R/FRmatch_cell2cluster.R
@@ -9,7 +9,7 @@
 #' See details in \code{\link[FRmatch]{sce.example}}.
 # #' @param imputation INACTIVE. Logical variable indicating if to impute expression zero values for the reference experiment. Default: \code{FALSE}.
 # #' See details in \code{\link[FRmatch]{impute_dropout}}.
-#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 #' of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).
 #' @param subsamp.size,subsamp.iter,subsamp.seed Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.
 #' @param numCores Number of cores for parallel computing.
@@ -53,8 +53,8 @@
 #' @export
 
 FRmatch_cell2cluster <- function(sce.query, sce.ref, #imputation=FALSE,
-                                 filter.size=10, filter.fscore=NULL, #filtering clusters
-                                 subsamp.size=5, subsamp.iter=2000, subsamp.seed=1, #subsampling
+                                 filter.size=5, filter.fscore=NULL, #filtering clusters
+                                 subsamp.size=10, subsamp.iter=2000, subsamp.seed=1, #subsampling
                                  numCores=NULL, prefix=c("query.", "ref."),
                                  verbose=1, ...){
 

diff --git a/R/plot_FRmatch_cell2cluster.R b/R/plot_FRmatch_cell2cluster.R
@@ -21,7 +21,7 @@
 
 plot_FRmatch_cell2cluster <- function(rst.cell2cluster, type="match.prop", p.adj.method="BH", sig.level=0.1,
                                       reorder=TRUE, return.value=FALSE,
-                                      filename=NA, width=NULL, height=NULL){
+                                      main=NULL, filename=NA, width=NULL, height=NULL){
 
   ## calculate adjusted p-values
   pmat <- rst.cell2cluster$pmat
@@ -52,12 +52,14 @@ plot_FRmatch_cell2cluster <- function(rst.cell2cluster, type="match.prop", p.adj
       mutate(match=factor(match, levels = rev(c(clusterNames.ref, "unassigned"))))
 
     ## plot
+    if(is.null(main)) main <- "FR-Match cell-to-cluster"
     g <- ggplot(long.tab.match.prop, aes(x=query.cluster, y=match, size=Prop, fill=Prop)) +
       geom_point(alpha=0.7, shape=21, color="black") +
       scale_size_continuous(range = c(0, 10)) +
       scale_fill_viridis(option="D", guide = "legend") +
       scale_y_discrete(drop=FALSE) + #show all ref clusters even if no match
-      theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+      theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+      ggtitle(main)
     ## save plot or plot on device
     if(is.null(width)) width <- ncol(tab.match.prop)*.2+.5
     if(is.null(height)) height <- nrow(tab.match.prop)*.2

diff --git a/R/plot_MST.R b/R/plot_MST.R
@@ -0,0 +1,25 @@
+
+#' Plot minimum spanning tree (MST)
+#'
+#' This function is a wrapper function for plotting MST of two interested clusters.
+#'
+#' @param sce.query,sce.ref Query and reference data objects.
+#' @param query.cluster,ref.cluster Query and reference cluster names to plot.
+#' @param nsamp Number of randomly selected cells to plot for large cluster. Default: 30.
+#' @param ... Additional arguments passed to \code{\link[FRmatch]{FRtest}}.
+#'
+#' @return MST plot and FR-test result in console.
+#'
+#' @export
+
+plot_MST <- function(sce.query, sce.ref, query.cluster, ref.cluster, nsamp=30, ...){
+  ind.query <- sce.query@colData$cluster_membership==query.cluster
+  ind.query.sub <- sample(1:sum(ind.query), min(nsamp,sum(ind.query)))
+  samp1 <- assay(sce.query)[,ind.query][,ind.query.sub]
+
+  ind.ref <- sce.ref@colData$cluster_membership==ref.cluster
+  ind.ref.sub <- sample(1:sum(ind.ref), min(nsamp,sum(ind.ref)))
+  samp2 <- assay(sce.ref)[,ind.ref][,ind.ref.sub]
+
+  FRtest(samp1, samp2, plot.MST=T, label.names=c(query.cluster,ref.cluster))
+}
diff --git a/R/plot_cluster_by_markers.R b/R/plot_cluster_by_markers.R
@@ -27,8 +27,13 @@ plot_cluster_by_markers <- function(sce.E1, sce.E2=NULL, cluster.name, nsamp=30,
   if(!cluster.name %in% unique(colData(sce.query)$cluster_membership)){
     stop(paste(cluster.name, "is not found in the plotting data object. \n"))}
 
+  ## REORDER clusters according to the given order if available
+  if(!is.null(sce.ref@metadata$cluster_order)){
+    sce.ref@metadata$cluster_marker_info %<>% arrange(match(clusterName, sce.ref@metadata$cluster_order))}
+
   ## reference marker genes
   markergenes <- unique(sce.ref@metadata$cluster_marker_info$markerGene) #marker genes in ORDER!!!
+  if(is.null(markergenes)) markergenes <- rownames(sce.ref)[rowData(sce.ref)$marker_gene==1] #if metadat is not available
   ## cells of query cluster
   col.query <- colData(sce.query)$cluster_membership==cluster.name
 

diff --git a/R/plot_exprDist.R b/R/plot_exprDist.R
@@ -0,0 +1,33 @@
+
+#' Gene expression data distribution plot
+#'
+#' This function plots the expression data distributions of the two single cell datasets (e.g. query and reference) to be compared.
+#'
+#' @param sce.E1,sce.E2 Data objects, namely E1 and E2.
+#' @param name.E1,name.E2 Customized names for E1 and E2. Default: \code{"E1"} and \code{"E2"}, respectively.
+#' @param breaks,xlim,ylim Plotting parameters passed to histogram plot.
+#' @param filename File name if to save the plot. Default: \code{NA}, not to save the plot.
+#' @param width,height Width and height for saved plot.
+#'
+#' @export
+
+plot_exprDist <- function(sce.E1, sce.E2, name.E1="E1", name.E2="E2",
+                          breaks=20, xlim=c(0,10), ylim=c(0,1.7),
+                          filename=NA, width=10, height=5){
+  ## to save pdf
+  if(!is.na(filename)){pdf(filename, width=width, height=height)}
+
+  ## plot
+  par(mfrow=c(1,2), mar=c(3,4,3,2))
+  hist(logcounts(sce.E1), freq=F, xlab="",
+       breaks=breaks, xlim=xlim, ylim=ylim, main=name.E1)
+  ss <- summary(as.vector(logcounts(sce.E1)))
+  legend("topright", paste(names(ss),"=", round(ss,3)), bty="n")
+  hist(logcounts(sce.E2), freq=F, xlab="",
+       breaks=breaks, xlim=xlim, ylim=ylim, main=name.E2)
+  ss <- summary(as.vector(logcounts(sce.E2)))
+  legend("topright", paste(names(ss),"=", round(ss,3)), bty="n")
+
+  ## to close pdf
+  if(!is.na(filename)){dev.off()}
+}
diff --git a/man/FRmatch.Rd b/man/FRmatch.Rd
diff --git a/man/FRmatch_cell2cluster.Rd b/man/FRmatch_cell2cluster.Rd
diff --git a/man/plot_FRmatch_cell2cluster.Rd b/man/plot_FRmatch_cell2cluster.Rd
diff --git a/man/plot_MST.Rd b/man/plot_MST.Rd
diff --git a/man/plot_exprDist.Rd b/man/plot_exprDist.Rd