From 088b1c29da5a7561e51aedea3237de4d10b4ebc1 Mon Sep 17 00:00:00 2001
From: Yun Zhang <zhangy@jcvi.org>
Date: Tue, 19 Apr 2022 08:48:20 -0700
Subject: [PATCH] added functions: plot_MST(), plot_exprDist()

---
 DESCRIPTION                      |  2 +-
 NAMESPACE                        |  2 ++
 R/FRmatch.R                      | 14 ++++++++------
 R/FRmatch_cell2cluster.R         |  6 +++---
 R/plot_FRmatch_cell2cluster.R    |  6 ++++--
 R/plot_MST.R                     | 25 ++++++++++++++++++++++++
 R/plot_cluster_by_markers.R      |  5 +++++
 R/plot_exprDist.R                | 33 ++++++++++++++++++++++++++++++++
 man/FRmatch.Rd                   |  8 ++++----
 man/FRmatch_cell2cluster.Rd      |  6 +++---
 man/plot_FRmatch_cell2cluster.Rd |  1 +
 man/plot_MST.Rd                  | 23 ++++++++++++++++++++++
 man/plot_exprDist.Rd             | 33 ++++++++++++++++++++++++++++++++
 13 files changed, 145 insertions(+), 19 deletions(-)
 create mode 100644 R/plot_MST.R
 create mode 100644 R/plot_exprDist.R
 create mode 100644 man/plot_MST.Rd
 create mode 100644 man/plot_exprDist.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 3e9452e..616f703 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -16,7 +16,7 @@ Authors@R: c(
            email = "RScheuermann@jcvi.org")
     )
 Description: FR-Match is a cell type cluster mapping algorithm for single cell RNA sequencing (scRNAseq) data. It is based on a statistical test called Friedman-Rafsky (FR) test, which is a multivariate generalization of nonparametric two-sample test. This package also provides visualization tools for the implemented method.
-Depends: R (>= 4.0.0), shiny (>= 1.2.0), SingleCellExperiment
+Depends: R (>= 4.0.0), shiny (>= 1.2.0), SingleCellExperiment, pbmcapply
 Imports: methods, S4Vectors, SummarizedExperiment, Seurat, scmap, lsa, igraph, ade4, tibble, dplyr, tidyr, forcats, magrittr, pheatmap, RColorBrewer, ggplot2, gridExtra, viridis
 Suggests: 
     knitr,
diff --git a/NAMESPACE b/NAMESPACE
index a694af4..b5baac1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,9 +8,11 @@ export(make_data_object)
 export(normalization)
 export(plot_FRmatch)
 export(plot_FRmatch_cell2cluster)
+export(plot_MST)
 export(plot_bi_FRmatch)
 export(plot_clusterSize)
 export(plot_cluster_by_markers)
+export(plot_exprDist)
 export(plot_nonzero)
 export(predict_most_similar_cluster)
 export(runShiny)
diff --git a/R/FRmatch.R b/R/FRmatch.R
index b9e94bd..728ad21 100644
--- a/R/FRmatch.R
+++ b/R/FRmatch.R
@@ -13,11 +13,11 @@
 #' See details in \code{\link[FRmatch]{sce.example}}.
 # #' @param imputation INACTIVE. Logical variable indicating if to impute expression zero values for the reference experiment. Default: \code{FALSE}.
 # #' See details in \code{\link[FRmatch]{impute_dropout}}.
-#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 #' of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).
 #' @param method Methods for the FR test. Default: \code{method="subsampling"} is to iteratively subsample equal number of cells (i.e. subsample size)
 #' from the query and reference clusters, and then perform the FR test. Option: \code{method="none"} is the FR test with no modification.
-#' @param subsamp.size,subsamp.iter,subsamp.seed Subsample size, number of iterations, and random seed for \code{method="subsampling"}. YMMV.
+#' @param subsamp.size,subsamp.iter,subsamp.seed Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.
 #' @param numCores Number of cores for parallel computing.
 #' Default: \code{NULL}, use the maximum number of cores detected by \code{\link[parallel]{detectCores}} if not specified (an integer).
 #' @param prefix Prefix names for query and reference clusters. Default: \code{prefix=c("query.", "ref.")}.
@@ -49,8 +49,8 @@
 #' @export
 
 FRmatch <- function(sce.query, sce.ref, #imputation=FALSE,
-                    filter.size=10, filter.fscore=NULL, #filtering clusters
-                    method="subsampling", subsamp.size=5, subsamp.iter=1000, subsamp.seed=1, #subsampling
+                    filter.size=5, filter.fscore=NULL, #filtering clusters
+                    method="subsampling", subsamp.size=20, subsamp.iter=1000, subsamp.seed=1, #subsampling
                     numCores=NULL, prefix=c("query.", "ref."),
                     verbose=1, return.all=FALSE, ...){
 
@@ -75,8 +75,10 @@ FRmatch <- function(sce.query, sce.ref, #imputation=FALSE,
   # }
 
   ## extract info from sce.objects
-  querydat <- assay(sce.query) #matrix
-  refdat <- assay(sce.ref)
+  # querydat <- assay(sce.query) #matrix
+  # refdat <- assay(sce.ref)
+  querydat <- logcounts(sce.query) #matrix
+  refdat <- logcounts(sce.ref)
   membership.query <- colData(sce.query)$cluster_membership
   membership.ref <- colData(sce.ref)$cluster_membership
   order.query <- sce.query@metadata$cluster_order
diff --git a/R/FRmatch_cell2cluster.R b/R/FRmatch_cell2cluster.R
index c3bc824..6db7f6a 100644
--- a/R/FRmatch_cell2cluster.R
+++ b/R/FRmatch_cell2cluster.R
@@ -9,7 +9,7 @@
 #' See details in \code{\link[FRmatch]{sce.example}}.
 # #' @param imputation INACTIVE. Logical variable indicating if to impute expression zero values for the reference experiment. Default: \code{FALSE}.
 # #' See details in \code{\link[FRmatch]{impute_dropout}}.
-#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+#' @param filter.size,filter.fscore Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 #' of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).
 #' @param subsamp.size,subsamp.iter,subsamp.seed Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.
 #' @param numCores Number of cores for parallel computing.
@@ -53,8 +53,8 @@
 #' @export
 
 FRmatch_cell2cluster <- function(sce.query, sce.ref, #imputation=FALSE,
-                                 filter.size=10, filter.fscore=NULL, #filtering clusters
-                                 subsamp.size=5, subsamp.iter=2000, subsamp.seed=1, #subsampling
+                                 filter.size=5, filter.fscore=NULL, #filtering clusters
+                                 subsamp.size=10, subsamp.iter=2000, subsamp.seed=1, #subsampling
                                  numCores=NULL, prefix=c("query.", "ref."),
                                  verbose=1, ...){
 
diff --git a/R/plot_FRmatch_cell2cluster.R b/R/plot_FRmatch_cell2cluster.R
index 9ab303b..9d1e2b3 100644
--- a/R/plot_FRmatch_cell2cluster.R
+++ b/R/plot_FRmatch_cell2cluster.R
@@ -21,7 +21,7 @@
 
 plot_FRmatch_cell2cluster <- function(rst.cell2cluster, type="match.prop", p.adj.method="BH", sig.level=0.1,
                                       reorder=TRUE, return.value=FALSE,
-                                      filename=NA, width=NULL, height=NULL){
+                                      main=NULL, filename=NA, width=NULL, height=NULL){
 
   ## calculate adjusted p-values
   pmat <- rst.cell2cluster$pmat
@@ -52,12 +52,14 @@ plot_FRmatch_cell2cluster <- function(rst.cell2cluster, type="match.prop", p.adj
       mutate(match=factor(match, levels = rev(c(clusterNames.ref, "unassigned"))))
 
     ## plot
+    if(is.null(main)) main <- "FR-Match cell-to-cluster"
     g <- ggplot(long.tab.match.prop, aes(x=query.cluster, y=match, size=Prop, fill=Prop)) +
       geom_point(alpha=0.7, shape=21, color="black") +
       scale_size_continuous(range = c(0, 10)) +
       scale_fill_viridis(option="D", guide = "legend") +
       scale_y_discrete(drop=FALSE) + #show all ref clusters even if no match
-      theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+      theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
+      ggtitle(main)
     ## save plot or plot on device
     if(is.null(width)) width <- ncol(tab.match.prop)*.2+.5
     if(is.null(height)) height <- nrow(tab.match.prop)*.2
diff --git a/R/plot_MST.R b/R/plot_MST.R
new file mode 100644
index 0000000..ed695f2
--- /dev/null
+++ b/R/plot_MST.R
@@ -0,0 +1,25 @@
+
+#' Plot minimum spanning tree (MST)
+#'
+#' This function is a wrapper function for plotting the MST of two clusters of interest.
+#'
+#' @param sce.query,sce.ref Query and reference data objects.
+#' @param query.cluster,ref.cluster Query and reference cluster names to plot.
+#' @param nsamp Number of randomly selected cells to plot for large cluster. Default: 30.
+#' @param ... Additional arguments passed to \code{\link[FRmatch]{FRtest}}.
+#'
+#' @return MST plot and FR-test result in console.
+#'
+#' @export
+
+plot_MST <- function(sce.query, sce.ref, query.cluster, ref.cluster, nsamp=30, ...){
+  ## at most nsamp random cells of a cluster; which() skips NA labels, drop=FALSE keeps a matrix for a single cell
+  pick <- function(sce, cl){
+    ind <- which(sce@colData$cluster_membership==cl)
+    assay(sce)[, ind[sample.int(length(ind), min(nsamp,length(ind)))], drop=FALSE]
+  }
+  samp1 <- pick(sce.query, query.cluster) #query is sampled first, then reference
+  samp2 <- pick(sce.ref, ref.cluster)
+  ## forward ... so that the documented additional FRtest() arguments actually take effect
+  FRtest(samp1, samp2, plot.MST=TRUE, label.names=c(query.cluster,ref.cluster), ...)
+}
diff --git a/R/plot_cluster_by_markers.R b/R/plot_cluster_by_markers.R
index 539c11b..6cc9a1c 100644
--- a/R/plot_cluster_by_markers.R
+++ b/R/plot_cluster_by_markers.R
@@ -27,8 +27,13 @@ plot_cluster_by_markers <- function(sce.E1, sce.E2=NULL, cluster.name, nsamp=30,
   if(!cluster.name %in% unique(colData(sce.query)$cluster_membership)){
     stop(paste(cluster.name, "is not found in the plotting data object. \n"))}
 
+  ## REORDER clusters according to the given order if available
+  if(!is.null(sce.ref@metadata$cluster_order)){
+    sce.ref@metadata$cluster_marker_info %<>% arrange(match(clusterName, sce.ref@metadata$cluster_order))}
+
   ## reference marker genes
   markergenes <- unique(sce.ref@metadata$cluster_marker_info$markerGene) #marker genes in ORDER!!!
+  if(is.null(markergenes)) markergenes <- rownames(sce.ref)[rowData(sce.ref)$marker_gene==1] #if metadat is not available
   ## cells of query cluster
   col.query <- colData(sce.query)$cluster_membership==cluster.name
 
diff --git a/R/plot_exprDist.R b/R/plot_exprDist.R
new file mode 100644
index 0000000..b85acb9
--- /dev/null
+++ b/R/plot_exprDist.R
@@ -0,0 +1,33 @@
+
+#' Gene expression data distribution plot
+#'
+#' This function plots the expression data distributions of the two single cell datasets (e.g. query and reference) to be compared.
+#'
+#' @param sce.E1,sce.E2 Data objects, namely E1 and E2.
+#' @param name.E1,name.E2 Customized names for E1 and E2. Default: \code{"E1"} and \code{"E2"}, respectively.
+#' @param breaks,xlim,ylim Plotting parameters passed to histogram plot.
+#' @param filename File name if to save the plot. Default: \code{NA}, not to save the plot.
+#' @param width,height Width and height for saved plot.
+#'
+#' @export
+
+plot_exprDist <- function(sce.E1, sce.E2, name.E1="E1", name.E2="E2",
+                          breaks=20, xlim=c(0,10), ylim=c(0,1.7),
+                          filename=NA, width=10, height=5){
+  ## to save pdf
+  to.file <- !is.na(filename)
+  if(to.file){pdf(filename, width=width, height=height)}
+  ## two panels side by side; on exit (even after an error) close the pdf device,
+  ## or give the caller's device its original layout and margins back
+  old.par <- par(mfrow=c(1,2), mar=c(3,4,3,2))
+  on.exit(if(to.file) dev.off() else par(old.par), add=TRUE)
+
+  ## plot: density histogram with summary statistics as legend, one panel per dataset
+  for(i in 1:2){
+    vals <- as.vector(logcounts(list(sce.E1, sce.E2)[[i]])) #flatten once for hist() and summary()
+    ss <- summary(vals)
+    hist(vals, freq=FALSE, xlab="", breaks=breaks, xlim=xlim, ylim=ylim, main=list(name.E1, name.E2)[[i]])
+    legend("topright", paste(names(ss),"=", round(ss,3)), bty="n")
+  }
+  invisible(NULL) #called for its plotting side effect only
+}
diff --git a/man/FRmatch.Rd b/man/FRmatch.Rd
index 58d7a81..474636e 100644
--- a/man/FRmatch.Rd
+++ b/man/FRmatch.Rd
@@ -7,10 +7,10 @@
 FRmatch(
   sce.query,
   sce.ref,
-  filter.size = 10,
+  filter.size = 5,
   filter.fscore = NULL,
   method = "subsampling",
-  subsamp.size = 5,
+  subsamp.size = 20,
   subsamp.iter = 1000,
   subsamp.seed = 1,
   numCores = NULL,
@@ -27,13 +27,13 @@ See details in \code{\link[FRmatch]{sce.example}}.}
 \item{sce.ref}{Data object of the \link[SingleCellExperiment]{SingleCellExperiment} class for reference experiment.
 See details in \code{\link[FRmatch]{sce.example}}.}
 
-\item{filter.size, filter.fscore}{Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+\item{filter.size, filter.fscore}{Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).}
 
 \item{method}{Methods for the FR test. Default: \code{method="subsampling"} is to iteratively subsample equal number of cells (i.e. subsample size)
 from the query and reference clusters, and then perform the FR test. Option: \code{method="none"} is the FR test with no modification.}
 
-\item{subsamp.size, subsamp.iter, subsamp.seed}{Subsample size, number of iterations, and random seed for \code{method="subsampling"}. YMMV.}
+\item{subsamp.size, subsamp.iter, subsamp.seed}{Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.}
 
 \item{numCores}{Number of cores for parallel computing.
 Default: \code{NULL}, use the maximum number of cores detected by \code{\link[parallel]{detectCores}} if not specified (an integer).}
diff --git a/man/FRmatch_cell2cluster.Rd b/man/FRmatch_cell2cluster.Rd
index fe917c7..d83cbe9 100644
--- a/man/FRmatch_cell2cluster.Rd
+++ b/man/FRmatch_cell2cluster.Rd
@@ -7,9 +7,9 @@
 FRmatch_cell2cluster(
   sce.query,
   sce.ref,
-  filter.size = 10,
+  filter.size = 5,
   filter.fscore = NULL,
-  subsamp.size = 5,
+  subsamp.size = 10,
   subsamp.iter = 2000,
   subsamp.seed = 1,
   numCores = NULL,
@@ -25,7 +25,7 @@ See details in \code{\link[FRmatch]{sce.example}}.}
 \item{sce.ref}{Data object of the \link[SingleCellExperiment]{SingleCellExperiment} class for reference experiment.
 See details in \code{\link[FRmatch]{sce.example}}.}
 
-\item{filter.size, filter.fscore}{Filtering small/poor-quality clusters. Default: \code{filter.size=10}, filter based on the number
+\item{filter.size, filter.fscore}{Filtering small/poor-quality clusters. Default: \code{filter.size=5}, filter based on the number
 of cells per cluster; \code{filter.fscore=NULL}, filter based on the F-beta score associated with the cell cluster if available (numeric value).}
 
 \item{subsamp.size, subsamp.iter, subsamp.seed}{Iterative subsampling size, number of iterations, and random seed for iterations. YMMV.}
diff --git a/man/plot_FRmatch_cell2cluster.Rd b/man/plot_FRmatch_cell2cluster.Rd
index 496cd86..ce6611e 100644
--- a/man/plot_FRmatch_cell2cluster.Rd
+++ b/man/plot_FRmatch_cell2cluster.Rd
@@ -11,6 +11,7 @@ plot_FRmatch_cell2cluster(
   sig.level = 0.1,
   reorder = TRUE,
   return.value = FALSE,
+  main = NULL,
   filename = NA,
   width = NULL,
   height = NULL
diff --git a/man/plot_MST.Rd b/man/plot_MST.Rd
new file mode 100644
index 0000000..a710008
--- /dev/null
+++ b/man/plot_MST.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_MST.R
+\name{plot_MST}
+\alias{plot_MST}
+\title{Plot minimum spanning tree (MST)}
+\usage{
+plot_MST(sce.query, sce.ref, query.cluster, ref.cluster, nsamp = 30, ...)
+}
+\arguments{
+\item{sce.query, sce.ref}{Query and reference data objects.}
+
+\item{query.cluster, ref.cluster}{Query and reference cluster names to plot.}
+
+\item{nsamp}{Number of randomly selected cells to plot for large cluster. Default: 30.}
+
+\item{...}{Additional arguments passed to \code{\link[FRmatch]{FRtest}}.}
+}
+\value{
+MST plot and FR-test result in console.
+}
+\description{
+This function is a wrapper function for plotting the MST of two clusters of interest.
+}
diff --git a/man/plot_exprDist.Rd b/man/plot_exprDist.Rd
new file mode 100644
index 0000000..b3be530
--- /dev/null
+++ b/man/plot_exprDist.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_exprDist.R
+\name{plot_exprDist}
+\alias{plot_exprDist}
+\title{Gene expression data distribution plot}
+\usage{
+plot_exprDist(
+  sce.E1,
+  sce.E2,
+  name.E1 = "E1",
+  name.E2 = "E2",
+  breaks = 20,
+  xlim = c(0, 10),
+  ylim = c(0, 1.7),
+  filename = NA,
+  width = 10,
+  height = 5
+)
+}
+\arguments{
+\item{sce.E1, sce.E2}{Data objects, namely E1 and E2.}
+
+\item{name.E1, name.E2}{Customized names for E1 and E2. Default: \code{"E1"} and \code{"E2"}, respectively.}
+
+\item{breaks, xlim, ylim}{Plotting parameters passed to histogram plot.}
+
+\item{filename}{File name if to save the plot. Default: \code{NA}, not to save the plot.}
+
+\item{width, height}{Width and height for saved plot.}
+}
+\description{
+This function plots the expression data distributions of the two single cell datasets (e.g. query and reference) to be compared.
+}