From e4f91d54737fa54e4a0811608312fa8232ebc738 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 10:12:44 -0800
Subject: [PATCH 01/21] Add script `R/simulate.null.R`

---
 R/simulate.null.R | 93 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 R/simulate.null.R

diff --git a/R/simulate.null.R b/R/simulate.null.R
new file mode 100644
index 0000000..c43c52b
--- /dev/null
+++ b/R/simulate.null.R
@@ -0,0 +1,93 @@
+#' Simulate from a null distribution
+#'
+#' Simulate transcripts from a specified null distribution.
+#'
+#' @param data A matrix or data frame from which to sample transcripts.
+#' @param distributions A numeric vector consisting of codes corresponding to the optimal distribution of `x` as returned by, e.g., `identify.bic.optimal.data.distribution()`.  The value of `distributions[i]` corresponds to the code for the optimal distribution of the transcript in `data[i, ]`.  Possible values are
+#' * 1 = normal,
+#' * 2 = log-normal,
+#' * 3 = exponential, and
+#' * 4 = gamma.
+#' @param num.null The number of simulated transcripts to generate.
+#'
+#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns.  Row names are retained, but column names are not.
+#'
+#' @noRd
+simulate.null <- function(
+    data,
+    distributions,
+    num.null = 10000
+    ) {
+    # Sample `num.null` indices from `data`.  The corresponding
+    # transcripts will constitute our simulated null data set.
+    sampled.indices <- sample(
+        x = nrow(data),
+        size = num.null,
+        replace = TRUE
+        );
+    # For each sampled index, generate a null transcript.
+    simulated.null <- future.apply::future_lapply(
+        X = sampled.indices,
+        FUN = function(i) {
+            # Select the corresponding transcript and the code for its
+            # optimal distribution.
+            x <- data[i, ];
+            distribution <- distributions[i];
+            # Ensure the values in `x` are strictly positive.
+            add.minimum.value <- least.significant.digit(x);
+            x.nozero <- x + add.minimum.value;
+            # Apply 5% trimming.
+            x.trim <- trim.sample(x);
+            x.nozero.trim <- trim.sample(x.trim);
+            # Generate null values according to the optimal
+            # distribution for this transcript.
+            if (1 == distribution) {
+                norm.mean <- mean(x.nozero.trim);
+                norm.sd <- sd(x.trim);
+                simulated.null <- rtnorm(
+                    n = length(x),
+                    mean = norm.mean,
+                    sd = norm.sd,
+                    a = 0
+                    );
+                }
+            else if (2 == distribution) {
+                mean.log <- mean(x.nozero.trim);
+                sd.log <- sd(x.nozero.trim);
+                m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+                sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+                simulated.null <- rlnorm(
+                    n = length(x),
+                    meanlog = m2,
+                    sdlog = sd2
+                    );
+                }
+            else if (3 == distribution) {
+                exp.rate <- 1 / mean(x.nozero.trim);
+                simulated.null <- rexp(
+                    n = length(x),
+                    rate = exp.rate
+                    );
+                }
+            else if (4 == distribution) {
+                mean.gamma <- mean(x.nozero.trim);
+                sd.gamma <- sd(x.nozero.trim);
+                gamma.shape <- (mean.gamma/sd.gamma)^2;
+                gamma.rate <- mean.gamma/(sd.gamma^2);
+                simulated.null <- rgamma(
+                    n = length(x),
+                    shape = gamma.shape,
+                    rate = gamma.rate
+                    );
+                }
+            },
+        future.seed = TRUE
+        );
+    simulated.null <- do.call(
+        what = rbind,
+        args = simulated.null
+        );
+    rownames(simulated.null) <- rownames(data)[sampled.indices];
+    simulated.null;
+    }
+

From c7e20797d4fa5dd65af2ed4be8f8e187bc3067aa Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 10:23:09 -0800
Subject: [PATCH 02/21] Qualify namespace in `R/simulate.null.R`

Qualify namespace in function calls in `R/simulate.null.R`.
---
 R/simulate.null.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index c43c52b..b7b14f9 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -43,8 +43,8 @@ simulate.null <- function(
             # distribution for this transcript.
             if (1 == distribution) {
                 norm.mean <- mean(x.nozero.trim);
-                norm.sd <- sd(x.trim);
-                simulated.null <- rtnorm(
+                norm.sd <- stats::sd(x.trim);
+                simulated.null <- extraDistr::rtnorm(
                     n = length(x),
                     mean = norm.mean,
                     sd = norm.sd,
@@ -53,10 +53,10 @@ simulate.null <- function(
                 }
             else if (2 == distribution) {
                 mean.log <- mean(x.nozero.trim);
-                sd.log <- sd(x.nozero.trim);
+                sd.log <- stats::sd(x.nozero.trim);
                 m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
                 sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
-                simulated.null <- rlnorm(
+                simulated.null <- stats::rlnorm(
                     n = length(x),
                     meanlog = m2,
                     sdlog = sd2
@@ -64,17 +64,17 @@ simulate.null <- function(
                 }
             else if (3 == distribution) {
                 exp.rate <- 1 / mean(x.nozero.trim);
-                simulated.null <- rexp(
+                simulated.null <- stats::rexp(
                     n = length(x),
                     rate = exp.rate
                     );
                 }
             else if (4 == distribution) {
                 mean.gamma <- mean(x.nozero.trim);
-                sd.gamma <- sd(x.nozero.trim);
+                sd.gamma <- stats::sd(x.nozero.trim);
                 gamma.shape <- (mean.gamma/sd.gamma)^2;
                 gamma.rate <- mean.gamma/(sd.gamma^2);
-                simulated.null <- rgamma(
+                simulated.null <- stats::rgamma(
                     n = length(x),
                     shape = gamma.shape,
                     rate = gamma.rate

From ca68c79b8dcab7d0e6325c3208a5f2bf586fe2a3 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 11:35:44 -0800
Subject: [PATCH 03/21] Add example to function `simulate.null()`

Add example to function `simulate.null()`, and update the description
of the return value.
---
 R/simulate.null.R | 57 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index b7b14f9..42b5707 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -10,7 +10,61 @@
 #' * 4 = gamma.
 #' @param num.null The number of simulated transcripts to generate.
 #'
-#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns.  Row names are retained, but column names are not.
+#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns.  Row names are retained from `data` and correspond to the row names of the transcripts used to simulate each row.  Column names are not retained.
+#'
+#' @examples
+#' # Prepare fake data.
+#' set.seed(1234);
+#' n <- 25;
+#' x1 <- rnorm(
+#'     n = n,
+#'     mean = 5,
+#'     sd = 2
+#'     );
+#' x2 <- rlnorm(
+#'     n = n,
+#'     meanlog = 0,
+#'     sdlog = 0.5
+#'     );
+#' x3 <- rexp(
+#'     n = n,
+#'     rate = 0.2
+#'     );
+#' x4 <- rexp(
+#'     n = n,
+#'     rate = 0.1
+#'     );
+#' x5 <- rgamma(
+#'     n = n,
+#'     shape = 2,
+#'     scale = 2
+#'     );
+#' x6 <- rgamma(
+#'     n = n,
+#'     shape = 4,
+#'     scale = 0.5
+#'     );
+#' x <- rbind(x1, x2, x3, x4, x5, x6);
+#' rownames(x) <- letters[seq_len(nrow(x))];
+#' colnames(x) <- paste(
+#'      'Sample',
+#'      seq_len(ncol(x)),
+#'      sep = '.'
+#'      );
+#' 
+#' # Identify optimal distributions for each transcript.
+#' optimal.distribution.x <- apply(
+#'     X = x,
+#'     MARGIN = 1,
+#'     FUN = identify.bic.optimal.data.distribution
+#'     );
+#' 
+#' # Simulate from null distributions.
+#' simulated.x <- simulate.null(
+#'     data = x,
+#'     distributions = optimal.distribution.x,
+#'     num.null = 100
+#'     );
 #'
 #' @noRd
 simulate.null <- function(
@@ -90,4 +144,3 @@ simulate.null <- function(
     rownames(simulated.null) <- rownames(data)[sampled.indices];
     simulated.null;
     }
-

From c06dbe19acda5b0e14428abfda94644a662eef3b Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 11:44:05 -0800
Subject: [PATCH 04/21] Add `num.null` as parameter to `detect.outliers()`

Add `num.null` as a parameter to the function `detect.outliers()` to
give users control over how many null transcripts to simulate.
---
 R/detect.outliers.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 5264775..300563d 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -1,9 +1,10 @@
 #' Detect outliers
 #'
 #' @param data A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns.  Transcript identifiers should be stored as `rownames(data)`.
+#' @param num.null The number of transcripts to generate when simulating from null distributions.
 #'
 #' @export
-detect.outliers <- function(data) {
+detect.outliers <- function(data, num.null) {
     # Determine which of the normal, log-normal, exponential, or gamma
     # distributions provides the best fit to each row of values in
     # `data`.

From 82f55096dc96c49393467d0f0cff06c30045b058 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 11:45:58 -0800
Subject: [PATCH 05/21] Simulate null transcripts in `detect.outliers()`

Add code to simulate null transcripts in the function
`detect.outliers()`, and include the resulting matrix in the list of
values to return.
---
 R/detect.outliers.R | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 300563d..f0b6138 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -129,6 +129,14 @@ detect.outliers <- function(data, num.null) {
         num.allowed.NA = 0
         );
 
+    # Generate a matrix of null transcripts by simulating from their
+    # respective optimal distributions.
+    null.data <- simulate.null(
+        data = data,
+        distributions = optimal.distribution.data,
+        num.null = num.null
+        );
+
     list(
         optimal.distribution.data = optimal.distribution.data,
         optimal.distribution.residuals = optimal.distribution.residuals,
@@ -143,6 +151,7 @@ detect.outliers <- function(data, num.null) {
         cosine.similarity = cosine.similarity,
         observed.5method = observed.5method,
         observed.5method.ranks = observed.5method.ranks,
-        observed.5method.rank.product = observed.5method.rank.product
+        observed.5method.rank.product = observed.5method.rank.product,
+        null.data = null.data
         );
     }

From 1fc1f91ac7f9992c1eadd6980d61d38ec324ea1f Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 14:04:35 -0800
Subject: [PATCH 06/21] Use `truncnorm::rtruncnorm()` in `simulate.null()`

Use `truncnorm::rtruncnorm()` rather than `extraDistr::rtnorm()` in
`simulate.null()`.  The two functions appear to be equivalent, and
doing so avoids adding the package extraDistr as a dependency.
---
 R/simulate.null.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index 42b5707..c216f43 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -98,7 +98,7 @@ simulate.null <- function(
             if (1 == distribution) {
                 norm.mean <- mean(x.nozero.trim);
                 norm.sd <- stats::sd(x.trim);
-                simulated.null <- extraDistr::rtnorm(
+                simulated.null <- truncnorm::rtruncnorm(
                     n = length(x),
                     mean = norm.mean,
                     sd = norm.sd,

From d2383b87382cafd33f4cec5019bbd7de75fac0b8 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 14:06:40 -0800
Subject: [PATCH 07/21] Update documentation for `detect.outliers()`

---
 man/detect.outliers.Rd | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/man/detect.outliers.Rd b/man/detect.outliers.Rd
index 454da50..5bec01c 100644
--- a/man/detect.outliers.Rd
+++ b/man/detect.outliers.Rd
@@ -4,10 +4,12 @@
 \alias{detect.outliers}
 \title{Detect outliers}
 \usage{
-detect.outliers(data)
+detect.outliers(data, num.null)
 }
 \arguments{
 \item{data}{A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns.  Transcript identifiers should be stored as \code{rownames(data)}.}
+
+\item{num.null}{The number of transcripts to generate when simulating from null distributions.}
 }
 \description{
 Detect outliers

From d1dbfe7fe4f1dfa14d6d9227b06d685308a5ed41 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Mon, 26 Feb 2024 14:58:31 -0800
Subject: [PATCH 08/21] Handle data frame input to `simulate.null()`

Add code to `simulate.null()` to avoid errors handling data frame
input.
---
 R/simulate.null.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index c216f43..f49ef60 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -85,7 +85,7 @@ simulate.null <- function(
         FUN = function(i) {
             # Select the corresponding transcript and the code for its
             # optimal distribution.
-            x <- data[i, ];
+            x <- as.numeric(data[i, ]);
             distribution <- distributions[i];
             # Ensure the values in `x` are strictly positive.
             add.minimum.value <- least.significant.digit(x);

From d2204808013264cf9f6ca86cd6090967f92d2a13 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Tue, 5 Mar 2024 11:38:46 -0800
Subject: [PATCH 09/21] Add script `R/add.residual.noise.R`

Add script `R/add.residual.noise.R` with first draft of function
`add.residual.noise()`.
---
 R/add.residual.noise.R | 70 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 R/add.residual.noise.R

diff --git a/R/add.residual.noise.R b/R/add.residual.noise.R
new file mode 100644
index 0000000..9777015
--- /dev/null
+++ b/R/add.residual.noise.R
@@ -0,0 +1,70 @@
+#' Add noise to residuals
+#'
+#' Add additional noise to residuals based on a specified null distribution.
+#'
+#' @param x A numeric vector.
+#' @param distribution  A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`.  Possible values are
+#' * 1 = normal,
+#' * 2 = log-normal,
+#' * 3 = exponential, and
+#' * 4 = gamma.
+#'
+#' @return A numeric vector of the same length as `x`.
+#'
+#' @noRd
+add.residual.noise <- function(x, distribution) {
+    # Add a minimum value to ensure the values in `x` are strictly
+    # positive.
+    add.minimum.value <- least.significant.digit(x)
+    if (min(x) < 0) {
+        x.nozero <- x - min(x) + add.minimum.value;
+        }
+    else {
+        x.nozero <- x + add.minimum.value;
+        }
+    # Generate null values from the distribution coded by
+    # `distribution`.
+    if (1 == distribution) {
+        norm.mean <- mean(x.nozero);
+        norm.sd <- stats::sd(x.nozero);
+        simulated.noise <- truncnorm::rtruncnorm(
+            n = length(x),
+            mean = norm.mean,
+            sd = norm.sd,
+            a = 0
+            );
+        }
+    else if (2 == distribution) {
+        mean.log <- mean(x.nozero);
+        sd.log <- stats::sd(x.nozero);
+        m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+        simulated.noise <- stats::rlnorm(
+            n = length(x),
+            meanlog = m2,
+            sdlog = sd2
+            );
+        }
+    else if (3 == distribution) {
+        exp.rate <- 1 / mean(x.nozero);
+        simulated.noise <- stats::rexp(
+            n = length(x),
+            rate = exp.rate
+            );
+        }
+    else if (4 == distribution) {
+        mean.gamma <- mean(x.nozero);
+        sd.gamma <- stats::sd(x.nozero);
+        gamma.shape <- (mean.gamma / sd.gamma)^2;
+        gamma.rate <- mean.gamma / (sd.gamma^2);
+        simulated.noise <- stats::rgamma(
+            n = length(x),
+            shape = gamma.shape,
+            rate = gamma.rate
+            );
+        }
+    # Add the simulated noise to `x`, but make sure the result is
+    # positive.
+    print(simulated.noise);
+    abs(x + simulated.noise);
+    }

From 42fbd358c7f0b9fff765022f67b5c4ace322c926 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Wed, 6 Mar 2024 10:01:29 -0800
Subject: [PATCH 10/21] Draft alternative simulation function

Draft an alternative function definition, tentatively called
`simulate.null2()`, for simulating null transcripts.  This function
combines the computations from the original `simulate.null()` function
with those of the new `add.residual.noise()` function, with the
benefit being that we ultimately end up with one `num.null` by
`ncol(data)` matrix rather than two, thereby relieving memory
constraints.
---
 R/simulate.null.R | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index f49ef60..895b71c 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -144,3 +144,122 @@ simulate.null <- function(
     rownames(simulated.null) <- rownames(data)[sampled.indices];
     simulated.null;
     }
+
+
+simulate.null2 <- function(
+    x,
+    x.distribution,
+    r,
+    r.distribution
+    ) {
+    #
+    # Simulate transcripts
+    #
+    # Ensure the values in `x` are strictly positive.
+    add.minimum.value <- least.significant.digit(x);
+    x.nozero <- x + add.minimum.value;
+    # Apply 5% trimming.
+    x.trim <- trim.sample(x);
+    x.nozero.trim <- trim.sample(x.trim);
+    # Generate null values according to the optimal
+    # distribution for this transcript.
+    if (1 == x.distribution) {
+        norm.mean <- mean(x.nozero.trim);
+        norm.sd <- stats::sd(x.trim);
+        simulated.null <- truncnorm::rtruncnorm(
+            n = length(x),
+            mean = norm.mean,
+            sd = norm.sd,
+            a = 0
+            );
+        }
+    else if (2 == x.distribution) {
+        mean.log <- mean(x.nozero.trim);
+        sd.log <- stats::sd(x.nozero.trim);
+        m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+        simulated.null <- stats::rlnorm(
+            n = length(x),
+            meanlog = m2,
+            sdlog = sd2
+            );
+        }
+    else if (3 == x.distribution) {
+        exp.rate <- 1 / mean(x.nozero.trim);
+        simulated.null <- stats::rexp(
+            n = length(x),
+            rate = exp.rate
+            );
+        }
+    else if (4 == x.distribution) {
+        mean.gamma <- mean(x.nozero.trim);
+        sd.gamma <- stats::sd(x.nozero.trim);
+        gamma.shape <- (mean.gamma/sd.gamma)^2;
+        gamma.rate <- mean.gamma/(sd.gamma^2);
+        simulated.null <- stats::rgamma(
+            n = length(x),
+            shape = gamma.shape,
+            rate = gamma.rate
+            );
+        }
+    #
+    # Simulate noise
+    #
+    # Ensure the values in `r` are strictly positive.
+    add.minimum.value <- least.significant.digit(r)
+    if (min(r) < 0) {
+        r.nozero <- r - min(r) + add.minimum.value;
+        }
+    else {
+        r.nozero <- r + add.minimum.value;
+        }
+    # Generate null values from the distribution coded by
+    # `distribution`.
+    if (1 == r.distribution) {
+        norm.mean <- mean(r.nozero);
+        norm.sd <- stats::sd(r.nozero);
+        simulated.noise <- truncnorm::rtruncnorm(
+            n = length(x),
+            mean = norm.mean,
+            sd = norm.sd,
+            a = 0
+            );
+        }
+    else if (2 == r.distribution) {
+        mean.log <- mean(r.nozero);
+        sd.log <- stats::sd(r.nozero);
+        m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
+        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
+        simulated.noise <- stats::rlnorm(
+            n = length(x),
+            meanlog = m2,
+            sdlog = sd2
+            );
+        }
+    else if (3 == r.distribution) {
+        exp.rate <- 1 / mean(r.nozero);
+        simulated.noise <- stats::rexp(
+            n = length(x),
+            rate = exp.rate
+            );
+        }
+    else if (4 == r.distribution) {
+        mean.gamma <- mean(r.nozero);
+        sd.gamma <- stats::sd(r.nozero);
+        gamma.shape <- (mean.gamma / sd.gamma)^2;
+        gamma.rate <- mean.gamma / (sd.gamma^2);
+        simulated.noise <- stats::rgamma(
+            n = length(x),
+            shape = gamma.shape,
+            rate = gamma.rate
+            );
+        }
+    if (min(r) < 0) {
+        simulated.noise <- simulated.noise + min(r) - add.minimum.value;
+        }
+    else {
+        simulated.noise <- simulated.noise - add.minimum.value;
+        }
+    # Add the simulated noise to the simulated transcript.
+    abs(simulated.null + simulated.noise)
+    }

From 1eb6e1b0c91fe355def5fdba1f9b081e9d3c998b Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Wed, 6 Mar 2024 15:08:35 -0800
Subject: [PATCH 11/21] Replace original `simulate.null()` definition

Replace the original definition of the function `simulate.null()` with
the new definition.  Update documentation and examples.
---
 R/simulate.null.R | 151 ++++++++--------------------------------------
 1 file changed, 24 insertions(+), 127 deletions(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index 895b71c..e95c569 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -2,151 +2,48 @@
 #'
 #' Simulate transcripts from a specified null distribution.
 #'
-#' @param data A matrix or data frame from which to sample transcripts.
-#' @param distributions A numeric vector consisting of codes corresponding to the optimal distribution of `x` as returned by, e.g., `identify.bic.optimal.data.distribution()`.  The value of `distributions[i]` corresponds to the code for the optimal distribution of the transcript in `data[i, ]`.  Possible values are
+#' @param x A numeric vector of transcripts.
+#' @param x.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.data.distribution()`.  Possible values are
 #' * 1 = normal,
 #' * 2 = log-normal,
 #' * 3 = exponential, and
 #' * 4 = gamma.
-#' @param num.null The number of simulated transcripts to generate.
+#' @param r A numeric vector of residuals calculated for this transcript.
+#' @param r.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`.  Possible values are the same as those for `x.distribution`.
 #'
-#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns.  Row names are retained from `data` and correspond to the row names of the transcripts used to simulate each row.  Column names are not retained.
+#' @return A numeric vector of the same length as `x`.  Names are not retained.
 #'
 #' @examples
 #' # Prepare fake data.
 #' set.seed(1234);
-#' n <- 25;
-#' x1 <- rnorm(
-#'     n = n,
-#'     mean = 5,
-#'     sd = 2
-#'     );
-#' x2 <- rlnorm(
-#'     n = n,
-#'     meanlog = 0,
-#'     sdlog = 0.5
-#'     );
-#' x3 <- rexp(
-#'     n = n,
-#'     rate = 0.2
-#'     );
-#' x4 <- rexp(
-#'     n = n,
-#'     rate = 0.1
-#'     );
-#' x5 <- rgamma(
-#'     n = n,
+#' x <- rgamma(
+#'     n = 20,
 #'     shape = 2,
 #'     scale = 2
 #'     );
-#' x6 <- rgamma(
-#'     n = n,
-#'     shape = 4,
-#'     scale = 0.5
+#' names(x) <- paste('Sample', seq_along(x), sep = '.');
+#' x.dist <- identify.bic.optimal.data.distribution(
+#'     x = x
 #'     );
-#' x <- rbind(x1, x2, x3, x4, x5, x6);
-#' rownames(x) <- letters[seq_len(nrow(x))];
-#' colnames(x) <- paste(
-#'      'Sample',
-#'      seq_len(ncol(x)),
-#'      sep = '.'
-#'      );
-#' 
-#' # Identify optimal distributions for each transcript.
-#' optimal.distribution.x <- apply(
-#'     X = x,
-#'     MARGIN = 1,
-#'     FUN = identify.bic.optimal.data.distribution
+#' r <- calculate.residuals(
+#'     x = x,
+#'     distribution = x.dist
 #'     );
-#' 
-#' # Simulate from null distributions.
-#' simulated.x <- simulate.null(
-#'     data = x,
-#'     distributions = optimal.distribution.x,
-#'     num.null = 100
+#' r.trimmed <- trim.sample(
+#'     x = r
+#'     );
+#' r.dist <- identify.bic.optimal.residuals.distribution(
+#'     x = r.trimmed
+#'     );
+#' null <- simulate.null(
+#'     x = x,
+#'     x.distribution = x.dist,
+#'     r = r.trimmed,
+#'     r.distribution = r.dist
 #'     );
 #'
 #' @noRd
 simulate.null <- function(
-    data,
-    distributions,
-    num.null = 10000
-    ) {
-    # Sample `num.null` indices from `data`.  The corresponding
-    # transcripts will constitute our simulated null data set.
-    sampled.indices <- sample(
-        x = nrow(data),
-        size = num.null,
-        replace = TRUE
-        );
-    # For each sampled index, generate a null transcript.
-    simulated.null <- future.apply::future_lapply(
-        X = sampled.indices,
-        FUN = function(i) {
-            # Select the corresponding transcript and the code for its
-            # optimal distribution.
-            x <- as.numeric(data[i, ]);
-            distribution <- distributions[i];
-            # Ensure the values in `x` are strictly positive.
-            add.minimum.value <- least.significant.digit(x);
-            x.nozero <- x + add.minimum.value;
-            # Apply 5% trimming.
-            x.trim <- trim.sample(x);
-            x.nozero.trim <- trim.sample(x.trim);
-            # Generate null values according to the optimal
-            # distribution for this transcript.
-            if (1 == distribution) {
-                norm.mean <- mean(x.nozero.trim);
-                norm.sd <- stats::sd(x.trim);
-                simulated.null <- truncnorm::rtruncnorm(
-                    n = length(x),
-                    mean = norm.mean,
-                    sd = norm.sd,
-                    a = 0
-                    );
-                }
-            else if (2 == distribution) {
-                mean.log <- mean(x.nozero.trim);
-                sd.log <- stats::sd(x.nozero.trim);
-                m2 <-  log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
-                sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
-                simulated.null <- stats::rlnorm(
-                    n = length(x),
-                    meanlog = m2,
-                    sdlog = sd2
-                    );
-                }
-            else if (3 == distribution) {
-                exp.rate <- 1 / mean(x.nozero.trim);
-                simulated.null <- stats::rexp(
-                    n = length(x),
-                    rate = exp.rate
-                    );
-                }
-            else if (4 == distribution) {
-                mean.gamma <- mean(x.nozero.trim);
-                sd.gamma <- stats::sd(x.nozero.trim);
-                gamma.shape <- (mean.gamma/sd.gamma)^2;
-                gamma.rate <- mean.gamma/(sd.gamma^2);
-                simulated.null <- stats::rgamma(
-                    n = length(x),
-                    shape = gamma.shape,
-                    rate = gamma.rate
-                    );
-                }
-            },
-        future.seed = TRUE
-        );
-    simulated.null <- do.call(
-        what = rbind,
-        args = simulated.null
-        );
-    rownames(simulated.null) <- rownames(data)[sampled.indices];
-    simulated.null;
-    }
-
-
-simulate.null2 <- function(
     x,
     x.distribution,
     r,

From efcc36530e45167a76f3a1843b7a1d7473d28047 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 08:29:56 -0800
Subject: [PATCH 12/21] Use new `simulate.null()` in `detect.outliers()`

Use the new definition of the function `simulate.null()` in the main
`detect.outliers()` function.
---
 R/detect.outliers.R | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 28a5850..0ba3964 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -132,10 +132,26 @@ detect.outliers <- function(data, num.null) {
 
     # Generate a matrix of null transcripts by simulating from their
     # respective optimal distributions.
-    null.data <- simulate.null(
-        data = data,
-        distributions = optimal.distribution.data,
-        num.null = num.null
+    sampled.indices <- sample(
+        x = nrow(data),
+        size = num.null,
+        replace = TRUE
+        );
+    null.data <- future.apply::future_lapply(
+        X = sampled.indices,
+        FUN = function(i) {
+            simulate.null(
+                x = as.numeric(data[i, ]),
+                x.distribution = optimal.distribution.data[i],
+                r = as.numeric(observed.residuals.trimmed[i, ]),
+                r.distribution = optimal.distribution.residuals[i]
+                );
+            },
+        future.seed = TRUE
+        );
+    null.data <- do.call(
+        what = rbind,
+        args = null
         );
 
     list(

From 52fc9a01ed9697b28d8c9057da5fd6594664a099 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 08:32:12 -0800
Subject: [PATCH 13/21] Delete script `R/add.residual.noise.R`

Delete script `R/add.residual.noise.R`, whose functionality has been
incorporated into `R/simulate.null.R`.
---
 R/add.residual.noise.R | 70 ------------------------------------------
 1 file changed, 70 deletions(-)
 delete mode 100644 R/add.residual.noise.R

diff --git a/R/add.residual.noise.R b/R/add.residual.noise.R
deleted file mode 100644
index 9777015..0000000
--- a/R/add.residual.noise.R
+++ /dev/null
@@ -1,70 +0,0 @@
-#' Add noise to residuals
-#'
-#' Add additional noise to residuals based on a specified null distribution.
-#'
-#' @param x A numeric vector.
-#' @param distribution  A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`.  Possible values are
-#' * 1 = normal,
-#' * 2 = log-normal,
-#' * 3 = exponential, and
-#' * 4 = gamma.
-#'
-#' @return A numeric vector of the same length as `x`.
-#'
-#' @noRd
-add.residual.noise <- function(x, distribution) {
-    # Add a minimum value to ensure the values in `x` are strictly
-    # positive.
-    add.minimum.value <- least.significant.digit(x)
-    if (min(x) < 0) {
-        x.nozero <- x - min(x) + add.minimum.value;
-        }
-    else {
-        x.nozero <- x + add.minimum.value;
-        }
-    # Generate null values from the distribution coded by
-    # `distribution`.
-    if (1 == distribution) {
-        norm.mean <- mean(x.nozero);
-        norm.sd <- stats::sd(x.nozero);
-        simulated.noise <- truncnorm::rtruncnorm(
-            n = length(x),
-            mean = norm.mean,
-            sd = norm.sd,
-            a = 0
-            );
-        }
-    else if (2 == distribution) {
-        mean.log <- mean(x.nozero);
-        sd.log <- stats::sd(x.nozero);
-        m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
-        sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
-        simulated.noise <- stats::rlnorm(
-            n = length(x),
-            meanlog = m2,
-            sdlog = sd2
-            );
-        }
-    else if (3 == distribution) {
-        exp.rate <- 1 / mean(x.nozero);
-        simulated.noise <- stats::rexp(
-            n = length(x),
-            rate = exp.rate
-            );
-        }
-    else if (4 == distribution) {
-        mean.gamma <- mean(x.nozero);
-        sd.gamma <- stats::sd(x.nozero);
-        gamma.shape <- (mean.gamma / sd.gamma)^2;
-        gamma.rate <- mean.gamma / (sd.gamma^2);
-        simulated.noise <- stats::rgamma(
-            n = length(x),
-            shape = gamma.shape,
-            rate = gamma.rate
-            );
-        }
-    # Add the simulated noise to `x`, but make sure the result is
-    # positive.
-    print(simulated.noise);
-    abs(x + simulated.noise);
-    }

From 9a499813abc6257933453c35f3c89f292734c8c7 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 10:30:08 -0800
Subject: [PATCH 14/21] Fix typo in `R/detect.outliers.R`

---
 R/detect.outliers.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 0ba3964..9801a5e 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -151,7 +151,7 @@ detect.outliers <- function(data, num.null) {
         );
     null.data <- do.call(
         what = rbind,
-        args = null
+        args = null.data
         );
 
     list(

From c10be709fca0f52416045423c3cb7ae9a0ed7e5d Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 15:11:36 -0800
Subject: [PATCH 15/21] Add transcript names to `null.data`

Add transcript names to `null.data` in function `detect.outliers()`.
---
 R/detect.outliers.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 9801a5e..c84622f 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -153,6 +153,7 @@ detect.outliers <- function(data, num.null) {
         what = rbind,
         args = null.data
         );
+    rownames(null.data) <- rownames(data)[sampled.indices];
 
     list(
         optimal.distribution.data = optimal.distribution.data,

From 44eb93a4d6c059cb00d1b61cff073f9307436bf2 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 15:13:00 -0800
Subject: [PATCH 16/21] Determine distribution of null transcripts

Determine optimal distribution of each null transcript in function
`detect.outliers()`.
---
 R/detect.outliers.R | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index c84622f..a6523ed 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -154,6 +154,15 @@ detect.outliers <- function(data, num.null) {
         args = null.data
         );
     rownames(null.data) <- rownames(data)[sampled.indices];
+    # Determine which of the normal, log-normal, exponential, or gamma
+    # distributions provides the best fit to each row of values in
+    # `null.data`.
+    optimal.distribution.null.data <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = identify.bic.optimal.data.distribution,
+        future.seed = TRUE
+        );
 
     list(
         optimal.distribution.data = optimal.distribution.data,

From fa8541f9408c8677ee1c435213c39a9249a59aed Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 15:14:07 -0800
Subject: [PATCH 17/21] Calculate outlier statistics on null data

Calculate outlier statistics on null data in function
`detect.outliers()`.  To optimize memory use, overwrite the vectors
used for the observed data rather than create new vectors.
---
 R/detect.outliers.R | 65 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index a6523ed..7683236 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -164,6 +164,71 @@ detect.outliers <- function(data, num.null) {
         future.seed = TRUE
         );
 
+    # Compute quantities for outlier detection on the null data: (1)
+    # z-scores based on the mean / standard deviation, (2) z-scores
+    # based on the trimmed mean / trimmed standard deviation, (3)
+    # z-scores based on the median / median absolute deviation, and
+    # (4) the cluster assignment from k-means with two clusters.
+    data.mean <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'mean'
+        );
+    data.median <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'median'
+        );
+    data.trimmean <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'mean',
+        trim = 0.05
+        );
+    data.kmeans <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'kmeans',
+        nstart = 1000,
+        future.seed = TRUE
+        );
+    # Compute the ranges of the z-score statistics.
+    zrange.mean <- future.apply::future_apply(
+        X = data.mean,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    zrange.median <- future.apply::future_apply(
+        X = data.median,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    zrange.trimmean <- future.apply::future_apply(
+        X = data.trimmean,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    # Compute the k-means fraction.
+    fraction.kmeans <- future.apply::future_apply(
+        X = data.kmeans,
+        MARGIN = 2,
+        FUN = kmeans.fraction
+        );
+    # Compute the cosine similarity.
+    cosine.similarity <- future.apply::future_sapply(
+        X = seq_len(nrow(null.data)),
+        FUN = function(i) {
+            outlier.detection.cosine(
+                x = as.numeric(null.data[i, ]),
+                distribution = optimal.distribution.null.data[i]
+                );
+            }
+        );
+    names(cosine.similarity) <- rownames(null.data);
     list(
         optimal.distribution.data = optimal.distribution.data,
         optimal.distribution.residuals = optimal.distribution.residuals,

From 308db680bd3face8d65b375465f0213b1db33715 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 15:16:35 -0800
Subject: [PATCH 18/21] Calculate rank product on null transcripts

Combine outlier statistics from null transcripts into a matrix, and
calculate the rank product for each null transcript.
---
 R/detect.outliers.R | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 7683236..1e741d9 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -229,6 +229,25 @@ detect.outliers <- function(data, num.null) {
             }
         );
     names(cosine.similarity) <- rownames(null.data);
+    # Assemble the statistics from the five methods into a single
+    # matrix.
+    null.5method <- cbind(
+        zrange.mean = zrange.mean,
+        zrange.median = zrange.median,
+        zrange.trimmean = zrange.trimmean,
+        fraction.kmeans = fraction.kmeans,
+        cosine.similarity = cosine.similarity
+        );
+    # Assign ranks within each method.
+    null.5method.ranks <- outlier.rank(
+        outlier.statistics.matrix = null.5method
+        );
+    # Compute the rank product for each null transcript.
+    null.5method.rank.product <- outlier.rank.product(
+        ranks.matrix = null.5method.ranks,
+        num.allowed.NA = 0
+        );
+
     list(
         optimal.distribution.data = optimal.distribution.data,
         optimal.distribution.residuals = optimal.distribution.residuals,

From bd9cd23d6907e05895516706f4e18d52ec48b99a Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 7 Mar 2024 15:17:49 -0800
Subject: [PATCH 19/21] Export new objects in `detect.outliers()`

Add new objects to the list returned by the function
`detect.outliers()`.
---
 R/detect.outliers.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index 1e741d9..e206546 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -263,6 +263,10 @@ detect.outliers <- function(data, num.null) {
         observed.5method = observed.5method,
         observed.5method.ranks = observed.5method.ranks,
         observed.5method.rank.product = observed.5method.rank.product,
-        null.data = null.data
+        null.data = null.data,
+        optimal.distribution.null.data = optimal.distribution.null.data,
+        null.5method = null.5method,
+        null.5method.ranks = null.5method.ranks,
+        null.5method.rank.product = null.5method.rank.product
         );
     }

From a1e70b658e83baa4c4bdec7ed0260d81979d7dc3 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Tue, 12 Mar 2024 11:33:04 -0700
Subject: [PATCH 20/21] Don't rank null data

Don't compute ranks of null data in function `detect.outliers()`.
Rather, ranks are computed as each observed transcript is added to the
null data.
---
 R/detect.outliers.R | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/R/detect.outliers.R b/R/detect.outliers.R
index e206546..ed086fc 100644
--- a/R/detect.outliers.R
+++ b/R/detect.outliers.R
@@ -238,15 +238,6 @@ detect.outliers <- function(data, num.null) {
         fraction.kmeans = fraction.kmeans,
         cosine.similarity = cosine.similarity
         );
-    # Assign ranks within each method.
-    null.5method.ranks <- outlier.rank(
-        outlier.statistics.matrix = null.5method
-        );
-    # Compute the rank product for each null transcript.
-    null.5method.rank.product <- outlier.rank.product(
-        ranks.matrix = null.5method.ranks,
-        num.allowed.NA = 0
-        );
 
     list(
         optimal.distribution.data = optimal.distribution.data,
@@ -265,8 +256,6 @@ detect.outliers <- function(data, num.null) {
         observed.5method.rank.product = observed.5method.rank.product,
         null.data = null.data,
         optimal.distribution.null.data = optimal.distribution.null.data,
-        null.5method = null.5method,
-        null.5method.ranks = null.5method.ranks,
-        null.5method.rank.product = null.5method.rank.product
+        null.5method = null.5method
         );
     }

From a2f77fa663e3eceb280ebca3598e9d173c772ef7 Mon Sep 17 00:00:00 2001
From: John M Sahrmann <jsahrmann@mednet.ucla.edu>
Date: Thu, 21 Mar 2024 11:10:55 -0700
Subject: [PATCH 21/21] Fix style

---
 R/simulate.null.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/simulate.null.R b/R/simulate.null.R
index e95c569..7820c15 100644
--- a/R/simulate.null.R
+++ b/R/simulate.null.R
@@ -91,8 +91,8 @@ simulate.null <- function(
     else if (4 == x.distribution) {
         mean.gamma <- mean(x.nozero.trim);
         sd.gamma <- stats::sd(x.nozero.trim);
-        gamma.shape <- (mean.gamma/sd.gamma)^2;
-        gamma.rate <- mean.gamma/(sd.gamma^2);
+        gamma.shape <- (mean.gamma / sd.gamma)^2;
+        gamma.rate <- mean.gamma / (sd.gamma^2);
         simulated.null <- stats::rgamma(
             n = length(x),
             shape = gamma.shape,