uclahs-cds · jeeyunhan · Apr 1, 2024 · Feb 26, 2024 · Feb 26, 2024 · Feb 26, 2024
diff --git a/R/detect.outliers.R b/R/detect.outliers.R
@@ -1,9 +1,10 @@
 #' Detect outliers
 #'
 #' @param data A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns.  Transcript identifiers should be stored as `rownames(data)`.
+#' @param num.null The number of transcripts to generate when simulating from null distributions.
 #'
 #' @export
-detect.outliers <- function(data) {
+detect.outliers <- function(data, num.null) {
     # Determine which of the normal, log-normal, exponential, or gamma
     # distributions provides the best fit to each row of values in
     # `data`.
@@ -129,6 +130,115 @@ detect.outliers <- function(data) {
         num.allowed.NA = 0
         );
 
+    # Generate a matrix of null transcripts by simulating from their
+    # respective optimal distributions.
+    sampled.indices <- sample(
+        x = nrow(data),
+        size = num.null,
+        replace = TRUE
+        );
+    null.data <- future.apply::future_lapply(
+        X = sampled.indices,
+        FUN = function(i) {
+            simulate.null(
+                x = as.numeric(data[i, ]),
+                x.distribution = optimal.distribution.data[i],
+                r = as.numeric(observed.residuals.trimmed[i, ]),
+                r.distribution = optimal.distribution.residuals[i]
+                );
+            },
+        future.seed = TRUE
+        );
+    null.data <- do.call(
+        what = rbind,
+        args = null.data
+        );
+    rownames(null.data) <- rownames(data)[sampled.indices];
+    # Determine which of the normal, log-normal, exponential, or gamma
+    # distributions provides the best fit to each row of values in
+    # `null.data`.
+    optimal.distribution.null.data <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = identify.bic.optimal.data.distribution,
+        future.seed = TRUE
+        );
+
+    # Compute quantities for outlier detection on the null data: (1)
+    # z-scores based on the mean / standard deviation, (2) z-scores
+    # based on the trimmed mean / trimmed standard deviation, (3)
+    # z-scores based on the median / median absolute deviation, and
+    # (4) the cluster assignment from k-means with two clusters.
+    data.mean <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'mean'
+        );
+    data.median <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'median'
+        );
+    data.trimmean <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'mean',
+        trim = 0.05
+        );
+    data.kmeans <- future.apply::future_apply(
+        X = null.data,
+        MARGIN = 1,
+        FUN = quantify.outliers,
+        method = 'kmeans',
+        nstart = 1000,
+        future.seed = TRUE
+        );
+    # Compute the ranges of the z-score statistics.
+    zrange.mean <- future.apply::future_apply(
+        X = data.mean,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    zrange.median <- future.apply::future_apply(
+        X = data.median,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    zrange.trimmean <- future.apply::future_apply(
+        X = data.trimmean,
+        MARGIN = 2,
+        FUN = zrange
+        );
+    # Compute the k-means fraction.
+    fraction.kmeans <- future.apply::future_apply(
+        X = data.kmeans,
+        MARGIN = 2,
+        FUN = kmeans.fraction
+        );
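+    # Compute the cosine similarity of the null data.  NOTE(review): this assigns to `cosine.similarity`, a name the returned list below also reports -- confirm the observed values are not overwritten.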
+    cosine.similarity <- future.apply::future_sapply(
+        X = seq_len(nrow(null.data)),
+        FUN = function(i) {
+            outlier.detection.cosine(
+                x = as.numeric(null.data[i, ]),
+                distribution = optimal.distribution.null.data[i]
+                );
+            }
+        );
+    names(cosine.similarity) <- rownames(null.data);
+    # Assemble the statistics from the five methods into a single
+    # matrix.
+    null.5method <- cbind(
+        zrange.mean = zrange.mean,
+        zrange.median = zrange.median,
+        zrange.trimmean = zrange.trimmean,
+        fraction.kmeans = fraction.kmeans,
+        cosine.similarity = cosine.similarity
+        );
+
     list(
         optimal.distribution.data = optimal.distribution.data,
         optimal.distribution.residuals = optimal.distribution.residuals,
@@ -143,6 +253,9 @@ detect.outliers <- function(data) {
         cosine.similarity = cosine.similarity,
         observed.5method = observed.5method,
         observed.5method.ranks = observed.5method.ranks,
-        observed.5method.rank.product = observed.5method.rank.product
+        observed.5method.rank.product = observed.5method.rank.product,
+        null.data = null.data,
+        optimal.distribution.null.data = optimal.distribution.null.data,
+        null.5method = null.5method
         );
     }
diff --git a/R/simulate.null.R b/R/simulate.null.R
@@ -0,0 +1,158 @@
+#' Simulate from a null distribution
+#'
+#' Simulate a null transcript: values are drawn from the distribution
+#' coded by `x.distribution`, noise is drawn from the distribution
+#' coded by `r.distribution`, and the absolute value of their sum is
+#' returned.
+#'
+#' @param x A numeric vector of transcripts.
+#' @param x.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.data.distribution()`.  Possible values are
+#' * 1 = normal,
+#' * 2 = log-normal,
+#' * 3 = exponential, and
+#' * 4 = gamma.
+#' @param r A numeric vector of residuals calculated for this transcript.
+#' @param r.distribution A numeric code corresponding to the optimal distribution of `r` as returned by `identify.bic.optimal.residuals.distribution()`.  Possible values are the same as those for `x.distribution`.
+#'
+#' @return A numeric vector of the same length as `x`.  Names are not retained.  An error is raised if either distribution code is not one of the values listed above.
+#'
+#' @examples
+#' # Prepare fake data.
+#' set.seed(1234);
+#' x <- rgamma(
+#'     n = 20,
+#'     shape = 2,
+#'     scale = 2
+#'     );
+#' names(x) <- paste('Sample', seq_along(x), sep = '.');
+#' x.dist <- identify.bic.optimal.data.distribution(
+#'     x = x
+#'     );
+#' r <- calculate.residuals(
+#'     x = x,
+#'     distribution = x.dist
+#'     );
+#' r.trimmed <- trim.sample(
+#'     x = r
+#'     );
+#' r.dist <- identify.bic.optimal.residuals.distribution(
+#'     x = r.trimmed
+#'     );
+#' null <- simulate.null(
+#'     x = x,
+#'     x.distribution = x.dist,
+#'     r = r.trimmed,
+#'     r.distribution = r.dist
+#'     );
+#'
+#' @noRd
+simulate.null <- function(
+    x,
+    x.distribution,
+    r,
+    r.distribution
+    ) {
+    n.values <- length(x);
+    #
+    # Simulate transcripts
+    #
+    # Shift `x` by its least significant digit so that the values
+    # are strictly positive, then trim the shifted values with
+    # `trim.sample()` (5% trimming, assuming its defaults).
+    x.nozero <- x + least.significant.digit(x);
+    x.nozero.trim <- trim.sample(x.nozero);
+    # Generate null values according to the optimal distribution
+    # for this transcript.
+    simulated.null <- draw.from.coded.distribution(
+        n = n.values,
+        values = x.nozero.trim,
+        distribution = x.distribution
+        );
+    #
+    # Simulate noise
+    #
+    # Shift `r` so that its values are strictly positive: when `r`
+    # has negative values its minimum is moved to
+    # `add.minimum.value`, otherwise `add.minimum.value` is added.
+    add.minimum.value <- least.significant.digit(r);
+    r.minimum <- min(r);
+    r.has.negative <- r.minimum < 0;
+    if (r.has.negative) {
+        r.nozero <- r - r.minimum + add.minimum.value;
+        }
+    else {
+        r.nozero <- r + add.minimum.value;
+        }
+    # Generate noise from the distribution coded by `r.distribution`.
+    simulated.noise <- draw.from.coded.distribution(
+        n = n.values,
+        values = r.nozero,
+        distribution = r.distribution
+        );
+    # Undo the shift applied to `r` above.
+    if (r.has.negative) {
+        simulated.noise <- simulated.noise + r.minimum - add.minimum.value;
+        }
+    else {
+        simulated.noise <- simulated.noise - add.minimum.value;
+        }
+    # Add the simulated noise to the simulated transcript.
+    abs(simulated.null + simulated.noise)
+    }
+
+#' Draw values from a distribution matched to a sample's moments
+#'
+#' Helper for `simulate.null()`.  The mean and standard deviation of
+#' `values` are converted to the parameters of the distribution coded
+#' by `distribution`, and `n` values are drawn from it.
+#'
+#' @param n The number of values to draw.
+#' @param values A numeric vector whose mean and standard deviation parameterize the distribution.
+#' @param distribution A numeric code: 1 = normal (truncated below at zero), 2 = log-normal, 3 = exponential, 4 = gamma.  Any other value raises an error.
+#'
+#' @return A numeric vector of length `n`.
+#'
+#' @noRd
+draw.from.coded.distribution <- function(n, values, distribution) {
+    value.mean <- mean(values);
+    value.sd <- stats::sd(values);
+    if (1 == distribution) {
+        # Normal, truncated below at zero.
+        truncnorm::rtruncnorm(
+            n = n,
+            mean = value.mean,
+            sd = value.sd,
+            a = 0
+            )
+        }
+    else if (2 == distribution) {
+        # Log-normal: convert the mean and standard deviation of
+        # `values` to the log-scale `meanlog` and `sdlog`.
+        stats::rlnorm(
+            n = n,
+            meanlog = log(value.mean^2 / sqrt(value.sd^2 + value.mean^2)),
+            sdlog = sqrt(log(1 + (value.sd^2 / value.mean^2)))
+            )
+        }
+    else if (3 == distribution) {
+        # Exponential: the rate is the reciprocal of the mean.
+        stats::rexp(
+            n = n,
+            rate = 1 / value.mean
+            )
+        }
+    else if (4 == distribution) {
+        # Gamma: shape and rate chosen to match the mean and variance.
+        stats::rgamma(
+            n = n,
+            shape = (value.mean / value.sd)^2,
+            rate = value.mean / (value.sd^2)
+            )
+        }
+    else {
+        stop(
+            'Unsupported distribution code: ', distribution,
+            call. = FALSE
+            );
+        }
+    }
diff --git a/man/detect.outliers.Rd b/man/detect.outliers.Rd