From e4f91d54737fa54e4a0811608312fa8232ebc738 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 10:12:44 -0800 Subject: [PATCH 01/21] Add script `R/simulate.null.R` --- R/simulate.null.R | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 R/simulate.null.R diff --git a/R/simulate.null.R b/R/simulate.null.R new file mode 100644 index 0000000..c43c52b --- /dev/null +++ b/R/simulate.null.R @@ -0,0 +1,93 @@ +#' Simulate from a null distribution +#' +#' Simulate transcripts from a specified null distribution. +#' +#' @param data A matrix or data frame from which to sample transcripts. +#' @param distributions A numeric vector consisting of codes corresponding to the optimal distribution of `x` as returned by, e.g., `identify.bic.optimal.data.distribution()`. The value of `distributions[i]` corresponds to the code for the optimal distribution of the transcript in `data[i, ]`. Possible values are +#' * 1 = normal, +#' * 2 = log-normal, +#' * 3 = exponential, and +#' * 4 = gamma. +#' @param num.null The number of simulated transcripts to generate. +#' +#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns. Row names are retained, but column names are not. +#' +#' @noRd +simulate.null <- function( + data, + distributions, + num.null = 10000 + ) { + # Sample `num.null` indices from `data`. The corresponding + # transcripts will constitute our simulated null data set. + sampled.indices <- sample( + x = nrow(data), + size = num.null, + replace = TRUE + ); + # For each sampled index, generate a null transcript. + simulated.null <- future.apply::future_lapply( + X = sampled.indices, + FUN = function(i) { + # Select the corresponding transcript and the code for its + # optimal distribution. + x <- data[i, ]; + distribution <- distributions[i]; + # Ensure the values in `x` are strictly positive. + add.minimum.value <- least.significant.digit(x); + x.nozero <- x + add.minimum.value; + # Apply 5% trimming. + x.trim <- trim.sample(x); + x.nozero.trim <- trim.sample(x.trim); + # Generate null values according to the optimal + # distribution for this transcript. + if (1 == distribution) { + norm.mean <- mean(x.nozero.trim); + norm.sd <- sd(x.trim); + simulated.null <- rtnorm( + n = length(x), + mean = norm.mean, + sd = norm.sd, + a = 0 + ); + } + else if (2 == distribution) { + mean.log <- mean(x.nozero.trim); + sd.log <- sd(x.nozero.trim); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + simulated.null <- rlnorm( + n = length(x), + meanlog = m2, + sdlog = sd2 + ); + } + else if (3 == distribution) { + exp.rate <- 1 / mean(x.nozero.trim); + simulated.null <- rexp( + n = length(x), + rate = exp.rate + ); + } + else if (4 == distribution) { + mean.gamma <- mean(x.nozero.trim); + sd.gamma <- sd(x.nozero.trim); + gamma.shape <- (mean.gamma/sd.gamma)^2; + gamma.rate <- mean.gamma/(sd.gamma^2); + simulated.null <- rgamma( + n = length(x), + shape = gamma.shape, + rate = gamma.rate + ); + } + }, + future.seed = TRUE + ); + simulated.null <- do.call( + what = rbind, + args = simulated.null + ); + rownames(simulated.null) <- rownames(data)[sampled.indices]; + simulated.null; + } + From c7e20797d4fa5dd65af2ed4be8f8e187bc3067aa Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 10:23:09 -0800 Subject: [PATCH 02/21] Qualify namespace in `R/simulate.null.R` Qualify namespace in function calls in `R/simulate.null.R`. --- R/simulate.null.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index c43c52b..b7b14f9 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -43,8 +43,8 @@ simulate.null <- function( # distribution for this transcript. if (1 == distribution) { norm.mean <- mean(x.nozero.trim); - norm.sd <- sd(x.trim); - simulated.null <- rtnorm( + norm.sd <- stats::sd(x.trim); + simulated.null <- extraDistr::rtnorm( n = length(x), mean = norm.mean, sd = norm.sd, @@ -53,10 +53,10 @@ simulate.null <- function( } else if (2 == distribution) { mean.log <- mean(x.nozero.trim); - sd.log <- sd(x.nozero.trim); + sd.log <- stats::sd(x.nozero.trim); m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); - simulated.null <- rlnorm( + simulated.null <- stats::rlnorm( n = length(x), meanlog = m2, sdlog = sd2 @@ -64,17 +64,17 @@ simulate.null <- function( } else if (3 == distribution) { exp.rate <- 1 / mean(x.nozero.trim); - simulated.null <- rexp( + simulated.null <- stats::rexp( n = length(x), rate = exp.rate ); } else if (4 == distribution) { mean.gamma <- mean(x.nozero.trim); - sd.gamma <- sd(x.nozero.trim); + sd.gamma <- stats::sd(x.nozero.trim); gamma.shape <- (mean.gamma/sd.gamma)^2; gamma.rate <- mean.gamma/(sd.gamma^2); - simulated.null <- rgamma( + simulated.null <- stats::rgamma( n = length(x), shape = gamma.shape, rate = gamma.rate From ca68c79b8dcab7d0e6325c3208a5f2bf586fe2a3 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 11:35:44 -0800 Subject: [PATCH 03/21] Add example to function `simulate.null()` Add example to function `simulate.null()`, and update the description of the return value. --- R/simulate.null.R | 57 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index b7b14f9..42b5707 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -10,7 +10,61 @@ #' * 4 = gamma. #' @param num.null The number of simulated transcripts to generate. #' -#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns. Row names are retained, but column names are not. +#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns. Row names are retained from `data` and correspond to the row names of the transcripts used to simulate each row. Column names are not retained. +#' +#' @examples +#' # Prepare fake data. +#' set.seed(1234); +#' n <- 25; +#' x1 <- rnorm( +#' n = n, +#' mean = 5, +#' sd = 2 +#' ); +#' x2 <- rlnorm( +#' n = n, +#' meanlog = 0, +#' sdlog = 0.5 +#' ); +#' x3 <- rexp( +#' n = n, +#' rate = 0.2 +#' ); +#' x4 <- rexp( +#' n = n, +#' rate = 0.1 +#' ); +#' x5 <- rgamma( +#' n = n, +#' shape = 2, +#' scale = 2 +#' ); +#' x6 <- rgamma( +#' n = n, +#' shape = 4, +#' scale = 0.5 +#' ); +#' x <- rbind(x1, x2, x3, x4, x5, x6); +#' rownames(x) <- letters[seq_len(nrow(x))]; +#' colnames(x) <- paste( +#' 'Sample', +#' seq_len(ncol(x)), +#' sep = '.' +#' ); +#' +#' # Identify optimal distributions for each transcript. +#' optimal.distribution.x <- apply( +#' X = x, +#' MARGIN = 1, +#' FUN = identify.bic.optimal.data.distribution +#' ); +#' +#' # Simulate from null distributions. +#' simulated.x <- simulate.null( +#' data = x, +#' distributions = optimal.distribution.x, +#' num.null = 100 +#' ); #' #' @noRd simulate.null <- function( @@ -90,4 +144,3 @@ simulate.null <- function( rownames(simulated.null) <- rownames(data)[sampled.indices]; simulated.null; } - From c06dbe19acda5b0e14428abfda94644a662eef3b Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 11:44:05 -0800 Subject: [PATCH 04/21] Add `num.null` as parameter to `detect.outliers()` Add `num.null` as a parameter to the function `detect.outliers()` to give users control over how many null transcripts to simulate. --- R/detect.outliers.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 5264775..300563d 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -1,9 +1,10 @@ #' Detect outliers #' #' @param data A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns. Transcript identifiers should be stored as `rownames(data)`. +#' @param num.null The number of transcripts to generate when simulating from null distributions. #' #' @export -detect.outliers <- function(data) { +detect.outliers <- function(data, num.null) { # Determine which of the normal, log-normal, exponential, or gamma # distributions provides the best fit to each row of values in # `data`. From 82f55096dc96c49393467d0f0cff06c30045b058 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 11:45:58 -0800 Subject: [PATCH 05/21] Simulate null transcripts in `detect.outliers()` Add code to simulate null transcripts in the function `detect.outliers()`, and include the resulting matrix in the list of values to return. --- R/detect.outliers.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 300563d..f0b6138 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -129,6 +129,14 @@ detect.outliers <- function(data, num.null) { num.allowed.NA = 0 ); + # Generate a matrix of null transcripts by simulating from their + # respective optimal distributions. + null.data <- simulate.null( + data = data, + distributions = optimal.distribution.data, + num.null = num.null + ); + list( optimal.distribution.data = optimal.distribution.data, optimal.distribution.residuals = optimal.distribution.residuals, @@ -143,6 +151,7 @@ detect.outliers <- function(data, num.null) { cosine.similarity = cosine.similarity, observed.5method = observed.5method, observed.5method.ranks = observed.5method.ranks, - observed.5method.rank.product = observed.5method.rank.product + observed.5method.rank.product = observed.5method.rank.product, + null.data = null.data ); } From 1fc1f91ac7f9992c1eadd6980d61d38ec324ea1f Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 14:04:35 -0800 Subject: [PATCH 06/21] Use `truncnorm::rtruncnorm()` in `simulate.null()` Use `truncnorm::rtruncnorm()` rather than `extraDistr::rtnorm()` in `simulate.null()`. The two functions appear to be equivalent, and doing so avoids adding the package extraDistr as a dependency. --- R/simulate.null.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index 42b5707..c216f43 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -98,7 +98,7 @@ simulate.null <- function( if (1 == distribution) { norm.mean <- mean(x.nozero.trim); norm.sd <- stats::sd(x.trim); - simulated.null <- extraDistr::rtnorm( + simulated.null <- truncnorm::rtruncnorm( n = length(x), mean = norm.mean, sd = norm.sd, From d2383b87382cafd33f4cec5019bbd7de75fac0b8 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 14:06:40 -0800 Subject: [PATCH 07/21] Update documentation for `detect.outliers()` --- man/detect.outliers.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/detect.outliers.Rd b/man/detect.outliers.Rd index 454da50..5bec01c 100644 --- a/man/detect.outliers.Rd +++ b/man/detect.outliers.Rd @@ -4,10 +4,12 @@ \alias{detect.outliers} \title{Detect outliers} \usage{ -detect.outliers(data) +detect.outliers(data, num.null) } \arguments{ \item{data}{A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns. Transcript identifiers should be stored as \code{rownames(data)}.} + +\item{num.null}{The number of transcripts to generate when simulating from null distributions.} } \description{ Detect outliers From d1dbfe7fe4f1dfa14d6d9227b06d685308a5ed41 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Mon, 26 Feb 2024 14:58:31 -0800 Subject: [PATCH 08/21] Handle data frame input to `simulate.null()` Add code to `simulate.null()` to avoid errors handling data frame input. --- R/simulate.null.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index c216f43..f49ef60 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -85,7 +85,7 @@ simulate.null <- function( FUN = function(i) { # Select the corresponding transcript and the code for its # optimal distribution. - x <- data[i, ]; + x <- as.numeric(data[i, ]); distribution <- distributions[i]; # Ensure the values in `x` are strictly positive. add.minimum.value <- least.significant.digit(x); From d2204808013264cf9f6ca86cd6090967f92d2a13 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Tue, 5 Mar 2024 11:38:46 -0800 Subject: [PATCH 09/21] Add script `R/add.residual.noise.R` Add script `R/add.residual.noise.R` with first draft of function `add.residual.noise()`. --- R/add.residual.noise.R | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 R/add.residual.noise.R diff --git a/R/add.residual.noise.R b/R/add.residual.noise.R new file mode 100644 index 0000000..9777015 --- /dev/null +++ b/R/add.residual.noise.R @@ -0,0 +1,70 @@ +#' Add noise to residuals +#' +#' Add additional noise to residuals based on a specified null distribution. +#' +#' @param x A numeric vector. +#' @param distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`. Possible values are +#' * 1 = normal, +#' * 2 = log-normal, +#' * 3 = exponential, and +#' * 4 = gamma. +#' +#' @return A numeric vector of the same length as `x`. +#' +#' @noRd +add.residual.noise <- function(x, distribution) { + # Add a minimum value to ensure the values in `x` are strictly + # positive. + add.minimum.value <- least.significant.digit(x) + if (min(x) < 0) { + x.nozero <- x - min(x) + add.minimum.value; + } + else { + x.nozero <- x + add.minimum.value; + } + # Generate null values from the distribution coded by + # `distribution`. + if (1 == distribution) { + norm.mean <- mean(x.nozero); + norm.sd <- stats::sd(x.nozero); + simulated.noise <- truncnorm::rtruncnorm( + n = length(x), + mean = norm.mean, + sd = norm.sd, + a = 0 + ); + } + else if (2 == distribution) { + mean.log <- mean(x.nozero); + sd.log <- stats::sd(x.nozero); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + simulated.noise <- stats::rlnorm( + n = length(x), + meanlog = m2, + sdlog = sd2 + ); + } + else if (3 == distribution) { + exp.rate <- 1 / mean(x.nozero); + simulated.noise <- stats::rexp( + n = length(x), + rate = exp.rate + ); + } + else if (4 == distribution) { + mean.gamma <- mean(x.nozero); + sd.gamma <- stats::sd(x.nozero); + gamma.shape <- (mean.gamma / sd.gamma)^2; + gamma.rate <- mean.gamma / (sd.gamma^2); + simulated.noise <- stats::rgamma( + n = length(x), + shape = gamma.shape, + rate = gamma.rate + ); + } + # Add the simulated noise to `x`, but make sure the result is + # positive. + print(simulated.noise); + abs(x + simulated.noise); + } From 42fbd358c7f0b9fff765022f67b5c4ace322c926 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Wed, 6 Mar 2024 10:01:29 -0800 Subject: [PATCH 10/21] Draft alternative simulation function Draft an alternative function definition, tentatively called `simulate.null2()`, for simulating null transcripts. This function combines the computations from the original `simulate.null()` function with those of the new `add.residual.noise()` function, with the benefit being that we ultimately end up with one `num.null` by `ncol(data)` matrix rather than two, thereby relieving memory constraints. --- R/simulate.null.R | 119 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/R/simulate.null.R b/R/simulate.null.R index f49ef60..895b71c 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -144,3 +144,122 @@ simulate.null <- function( rownames(simulated.null) <- rownames(data)[sampled.indices]; simulated.null; } + + +simulate.null2 <- function( + x, + x.distribution, + r, + r.distribution + ) { + # + # Simulate transcripts + # + # Ensure the values in `x` are strictly positive. + add.minimum.value <- least.significant.digit(x); + x.nozero <- x + add.minimum.value; + # Apply 5% trimming. + x.trim <- trim.sample(x); + x.nozero.trim <- trim.sample(x.trim); + # Generate null values according to the optimal + # distribution for this transcript. + if (1 == x.distribution) { + norm.mean <- mean(x.nozero.trim); + norm.sd <- stats::sd(x.trim); + simulated.null <- truncnorm::rtruncnorm( + n = length(x), + mean = norm.mean, + sd = norm.sd, + a = 0 + ); + } + else if (2 == x.distribution) { + mean.log <- mean(x.nozero.trim); + sd.log <- stats::sd(x.nozero.trim); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + simulated.null <- stats::rlnorm( + n = length(x), + meanlog = m2, + sdlog = sd2 + ); + } + else if (3 == x.distribution) { + exp.rate <- 1 / mean(x.nozero.trim); + simulated.null <- stats::rexp( + n = length(x), + rate = exp.rate + ); + } + else if (4 == x.distribution) { + mean.gamma <- mean(x.nozero.trim); + sd.gamma <- stats::sd(x.nozero.trim); + gamma.shape <- (mean.gamma/sd.gamma)^2; + gamma.rate <- mean.gamma/(sd.gamma^2); + simulated.null <- stats::rgamma( + n = length(x), + shape = gamma.shape, + rate = gamma.rate + ); + } + # + # Simulate noise + # + # Ensure the values in `r` are strictly positive. + add.minimum.value <- least.significant.digit(r) + if (min(r) < 0) { + r.nozero <- r - min(r) + add.minimum.value; + } + else { + r.nozero <- r + add.minimum.value; + } + # Generate null values from the distribution coded by + # `distribution`. + if (1 == r.distribution) { + norm.mean <- mean(r.nozero); + norm.sd <- stats::sd(r.nozero); + simulated.noise <- truncnorm::rtruncnorm( + n = length(x), + mean = norm.mean, + sd = norm.sd, + a = 0 + ); + } + else if (2 == r.distribution) { + mean.log <- mean(r.nozero); + sd.log <- stats::sd(r.nozero); + m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); + sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); + simulated.noise <- stats::rlnorm( + n = length(x), + meanlog = m2, + sdlog = sd2 + ); + } + else if (3 == r.distribution) { + exp.rate <- 1 / mean(r.nozero); + simulated.noise <- stats::rexp( + n = length(x), + rate = exp.rate + ); + } + else if (4 == r.distribution) { + mean.gamma <- mean(r.nozero); + sd.gamma <- stats::sd(r.nozero); + gamma.shape <- (mean.gamma / sd.gamma)^2; + gamma.rate <- mean.gamma / (sd.gamma^2); + simulated.noise <- stats::rgamma( + n = length(x), + shape = gamma.shape, + rate = gamma.rate + ); + } + if (min(r) < 0) { + simulated.noise <- simulated.noise + min(r) - add.minimum.value; + } + else { + simulated.noise <- simulated.noise - add.minimum.value; + } + # Add the simulated noise to the simulated transcript. + abs(simulated.null + simulated.noise) + } From 1eb6e1b0c91fe355def5fdba1f9b081e9d3c998b Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Wed, 6 Mar 2024 15:08:35 -0800 Subject: [PATCH 11/21] Replace original `simulate.null()` definition Replace the original definition of the function `simulate.null()` with the new definition. Update documentation and examples. --- R/simulate.null.R | 151 ++++++++-------------------------------------- 1 file changed, 24 insertions(+), 127 deletions(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index 895b71c..e95c569 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -2,151 +2,48 @@ #' #' Simulate transcripts from a specified null distribution. #' -#' @param data A matrix or data frame from which to sample transcripts. -#' @param distributions A numeric vector consisting of codes corresponding to the optimal distribution of `x` as returned by, e.g., `identify.bic.optimal.data.distribution()`. The value of `distributions[i]` corresponds to the code for the optimal distribution of the transcript in `data[i, ]`. Possible values are +#' @param x A numeric vector of transcripts. +#' @param x.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.data.distribution()`. Possible values are #' * 1 = normal, #' * 2 = log-normal, #' * 3 = exponential, and #' * 4 = gamma. -#' @param num.null The number of simulated transcripts to generate. +#' @param r A numeric vector of residuals calculated for this transcript. +#' @param r.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`. Possible values are the same as those for `x.distribution`. #' -#' @return A matrix of simulated transcripts with `num.null` rows and `ncol(data)` columns. Row names are retained from `data` and correspond to the row names of the transcripts used to simulate each row. Column names are not retained. +#' @return A numeric vector of the same length as `x`. Names are not retained. #' #' @examples #' # Prepare fake data. #' set.seed(1234); -#' n <- 25; -#' x1 <- rnorm( -#' n = n, -#' mean = 5, -#' sd = 2 -#' ); -#' x2 <- rlnorm( -#' n = n, -#' meanlog = 0, -#' sdlog = 0.5 -#' ); -#' x3 <- rexp( -#' n = n, -#' rate = 0.2 -#' ); -#' x4 <- rexp( -#' n = n, -#' rate = 0.1 -#' ); -#' x5 <- rgamma( -#' n = n, +#' x <- rgamma( +#' n = 20, #' shape = 2, #' scale = 2 #' ); -#' x6 <- rgamma( -#' n = n, -#' shape = 4, -#' scale = 0.5 +#' names(x) <- paste('Sample', seq_along(x), sep = '.'); +#' x.dist <- identify.bic.optimal.data.distribution( +#' x = x #' ); -#' x <- rbind(x1, x2, x3, x4, x5, x6); -#' rownames(x) <- letters[seq_len(nrow(x))]; -#' colnames(x) <- paste( -#' 'Sample', -#' seq_len(ncol(x)), -#' sep = '.' -#' ); -#' -#' # Identify optimal distributions for each transcript. -#' optimal.distribution.x <- apply( -#' X = x, -#' MARGIN = 1, -#' FUN = identify.bic.optimal.data.distribution +#' r <- calculate.residuals( +#' x = x, +#' distribution = x.dist #' ); -#' -#' # Simulate from null distributions. -#' simulated.x <- simulate.null( -#' data = x, -#' distributions = optimal.distribution.x, -#' num.null = 100 +#' r.trimmed <- trim.sample( +#' x = r +#' ); +#' r.dist <- identify.bic.optimal.residuals.distribution( +#' x = r.trimmed +#' ); +#' null <- simulate.null( +#' x = x, +#' x.distribution = x.dist, +#' r = r.trimmed, +#' r.distribution = r.dist #' ); #' #' @noRd simulate.null <- function( - data, - distributions, - num.null = 10000 - ) { - # Sample `num.null` indices from `data`. The corresponding - # transcripts will constitute our simulated null data set. - sampled.indices <- sample( - x = nrow(data), - size = num.null, - replace = TRUE - ); - # For each sampled index, generate a null transcript. - simulated.null <- future.apply::future_lapply( - X = sampled.indices, - FUN = function(i) { - # Select the corresponding transcript and the code for its - # optimal distribution. - x <- as.numeric(data[i, ]); - distribution <- distributions[i]; - # Ensure the values in `x` are strictly positive. - add.minimum.value <- least.significant.digit(x); - x.nozero <- x + add.minimum.value; - # Apply 5% trimming. - x.trim <- trim.sample(x); - x.nozero.trim <- trim.sample(x.trim); - # Generate null values according to the optimal - # distribution for this transcript. - if (1 == distribution) { - norm.mean <- mean(x.nozero.trim); - norm.sd <- stats::sd(x.trim); - simulated.null <- truncnorm::rtruncnorm( - n = length(x), - mean = norm.mean, - sd = norm.sd, - a = 0 - ); - } - else if (2 == distribution) { - mean.log <- mean(x.nozero.trim); - sd.log <- stats::sd(x.nozero.trim); - m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); - sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); - simulated.null <- stats::rlnorm( - n = length(x), - meanlog = m2, - sdlog = sd2 - ); - } - else if (3 == distribution) { - exp.rate <- 1 / mean(x.nozero.trim); - simulated.null <- stats::rexp( - n = length(x), - rate = exp.rate - ); - } - else if (4 == distribution) { - mean.gamma <- mean(x.nozero.trim); - sd.gamma <- stats::sd(x.nozero.trim); - gamma.shape <- (mean.gamma/sd.gamma)^2; - gamma.rate <- mean.gamma/(sd.gamma^2); - simulated.null <- stats::rgamma( - n = length(x), - shape = gamma.shape, - rate = gamma.rate - ); - } - }, - future.seed = TRUE - ); - simulated.null <- do.call( - what = rbind, - args = simulated.null - ); - rownames(simulated.null) <- rownames(data)[sampled.indices]; - simulated.null; - } - - -simulate.null2 <- function( x, x.distribution, r, From efcc36530e45167a76f3a1843b7a1d7473d28047 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 08:29:56 -0800 Subject: [PATCH 12/21] Use new `simulate.null()` in `detect.outliers()` Use the new definition of the function `simulate.null()` in the main `detect.outliers()` function. --- R/detect.outliers.R | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 28a5850..0ba3964 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -132,10 +132,26 @@ detect.outliers <- function(data, num.null) { # Generate a matrix of null transcripts by simulating from their # respective optimal distributions. - null.data <- simulate.null( - data = data, - distributions = optimal.distribution.data, - num.null = num.null + sampled.indices <- sample( + x = nrow(data), + size = num.null, + replace = TRUE + ); + null.data <- future.apply::future_lapply( + X = sampled.indices, + FUN = function(i) { + simulate.null( + x = as.numeric(data[i, ]), + x.distribution = optimal.distribution.data[i], + r = as.numeric(observed.residuals.trimmed[i, ]), + r.distribution = optimal.distribution.residuals[i] + ); + }, + future.seed = TRUE + ); + null.data <- do.call( + what = rbind, + args = null ); list( From 52fc9a01ed9697b28d8c9057da5fd6594664a099 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 08:32:12 -0800 Subject: [PATCH 13/21] Delete script `R/add.residual.noise.R` Delete script `R/add.residual.noise.R`, whose functionality has been incorporated into `R/simulate.null.R`. --- R/add.residual.noise.R | 70 ------------------------------------------ 1 file changed, 70 deletions(-) delete mode 100644 R/add.residual.noise.R diff --git a/R/add.residual.noise.R b/R/add.residual.noise.R deleted file mode 100644 index 9777015..0000000 --- a/R/add.residual.noise.R +++ /dev/null @@ -1,70 +0,0 @@ -#' Add noise to residuals -#' -#' Add additional noise to residuals based on a specified null distribution. -#' -#' @param x A numeric vector. -#' @param distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`. Possible values are -#' * 1 = normal, -#' * 2 = log-normal, -#' * 3 = exponential, and -#' * 4 = gamma. -#' -#' @return A numeric vector of the same length as `x`. -#' -#' @noRd -add.residual.noise <- function(x, distribution) { - # Add a minimum value to ensure the values in `x` are strictly - # positive. - add.minimum.value <- least.significant.digit(x) - if (min(x) < 0) { - x.nozero <- x - min(x) + add.minimum.value; - } - else { - x.nozero <- x + add.minimum.value; - } - # Generate null values from the distribution coded by - # `distribution`. - if (1 == distribution) { - norm.mean <- mean(x.nozero); - norm.sd <- stats::sd(x.nozero); - simulated.noise <- truncnorm::rtruncnorm( - n = length(x), - mean = norm.mean, - sd = norm.sd, - a = 0 - ); - } - else if (2 == distribution) { - mean.log <- mean(x.nozero); - sd.log <- stats::sd(x.nozero); - m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2)); - sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2))); - simulated.noise <- stats::rlnorm( - n = length(x), - meanlog = m2, - sdlog = sd2 - ); - } - else if (3 == distribution) { - exp.rate <- 1 / mean(x.nozero); - simulated.noise <- stats::rexp( - n = length(x), - rate = exp.rate - ); - } - else if (4 == distribution) { - mean.gamma <- mean(x.nozero); - sd.gamma <- stats::sd(x.nozero); - gamma.shape <- (mean.gamma / sd.gamma)^2; - gamma.rate <- mean.gamma / (sd.gamma^2); - simulated.noise <- stats::rgamma( - n = length(x), - shape = gamma.shape, - rate = gamma.rate - ); - } - # Add the simulated noise to `x`, but make sure the result is - # positive. - print(simulated.noise); - abs(x + simulated.noise); - } From 9a499813abc6257933453c35f3c89f292734c8c7 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 10:30:08 -0800 Subject: [PATCH 14/21] Fix typo in `R/detect.outliers.R` --- R/detect.outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 0ba3964..9801a5e 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -151,7 +151,7 @@ detect.outliers <- function(data, num.null) { ); null.data <- do.call( what = rbind, - args = null + args = null.data ); list( From c10be709fca0f52416045423c3cb7ae9a0ed7e5d Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 15:11:36 -0800 Subject: [PATCH 15/21] Add transcript names to `null.data` Add transcript names to `null.data` in function `detect.outliers()`. --- R/detect.outliers.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 9801a5e..c84622f 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -153,6 +153,7 @@ detect.outliers <- function(data, num.null) { what = rbind, args = null.data ); + rownames(null.data) <- rownames(data)[sampled.indices]; list( optimal.distribution.data = optimal.distribution.data, From 44eb93a4d6c059cb00d1b61cff073f9307436bf2 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 15:13:00 -0800 Subject: [PATCH 16/21] Determine distribution of null transcripts Determine optimal distribution of each null transcript in function `detect.outliers()`. --- R/detect.outliers.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index c84622f..a6523ed 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -154,6 +154,15 @@ detect.outliers <- function(data, num.null) { args = null.data ); rownames(null.data) <- rownames(data)[sampled.indices]; + # Determine which of the normal, log-normal, exponential, or gamma + # distributions provides the best fit to each row of values in + # `null.data`. + optimal.distribution.null.data <- future.apply::future_apply( + X = null.data, + MARGIN = 1, + FUN = identify.bic.optimal.data.distribution, + future.seed = TRUE + ); list( optimal.distribution.data = optimal.distribution.data, From fa8541f9408c8677ee1c435213c39a9249a59aed Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 15:14:07 -0800 Subject: [PATCH 17/21] Calculate outlier statistics on null data Calculate outlier statistics on null data in function `detect.outliers()`. To optimize memory use, overwrite the vectors used for the observed data rather than create new vectors. --- R/detect.outliers.R | 65 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index a6523ed..7683236 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -164,6 +164,71 @@ detect.outliers <- function(data, num.null) { future.seed = TRUE ); + # Compute quantities for outlier detection on the null data: (1) + # z-scores based on the mean / standard deviation, (2) z-scores + # based on the trimmed mean / trimmed standard deviation, (3) + # z-scores based on the median / median absolute deviation, and + # (4) the cluster assignment from k-means with two clusters. + data.mean <- future.apply::future_apply( + X = null.data, + MARGIN = 1, + FUN = quantify.outliers, + method = 'mean' + ); + data.median <- future.apply::future_apply( + X = null.data, + MARGIN = 1, + FUN = quantify.outliers, + method = 'median' + ); + data.trimmean <- future.apply::future_apply( + X = null.data, + MARGIN = 1, + FUN = quantify.outliers, + method = 'mean', + trim = 0.05 + ); + data.kmeans <- future.apply::future_apply( + X = null.data, + MARGIN = 1, + FUN = quantify.outliers, + method = 'kmeans', + nstart = 1000, + future.seed = TRUE + ); + # Compute the ranges of the z-score statistics. + zrange.mean <- future.apply::future_apply( + X = data.mean, + MARGIN = 2, + FUN = zrange + ); + zrange.median <- future.apply::future_apply( + X = data.median, + MARGIN = 2, + FUN = zrange + ); + zrange.trimmean <- future.apply::future_apply( + X = data.trimmean, + MARGIN = 2, + FUN = zrange + ); + # Compute the k-means fraction. + fraction.kmeans <- future.apply::future_apply( + X = data.kmeans, + MARGIN = 2, + FUN = kmeans.fraction + ); + # Compute the cosine similarity. + cosine.similarity <- future.apply::future_sapply( + X = seq_len(nrow(null.data)), + FUN = function(i) { + outlier.detection.cosine( + x = as.numeric(null.data[i, ]), + distribution = optimal.distribution.null.data[i] + ); + } + ); + names(cosine.similarity) <- rownames(null.data); list( optimal.distribution.data = optimal.distribution.data, optimal.distribution.residuals = optimal.distribution.residuals, From 308db680bd3face8d65b375465f0213b1db33715 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 15:16:35 -0800 Subject: [PATCH 18/21] Calculate rank product on null transcripts Combine outlier statistics from null transcripts into a matrix, and calculate the rank product for each null transcript. --- R/detect.outliers.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 7683236..1e741d9 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -229,6 +229,25 @@ detect.outliers <- function(data, num.null) { } ); names(cosine.similarity) <- rownames(null.data); + # Assemble the statistics from the five methods into a single + # matrix. + null.5method <- cbind( + zrange.mean = zrange.mean, + zrange.median = zrange.median, + zrange.trimmean = zrange.trimmean, + fraction.kmeans = fraction.kmeans, + cosine.similarity = cosine.similarity + ); + # Assign ranks within each method. + null.5method.ranks <- outlier.rank( + outlier.statistics.matrix = null.5method + ); + # Compute the rank product for each null transcript. + null.5method.rank.product <- outlier.rank.product( + ranks.matrix = null.5method.ranks, + num.allowed.NA = 0 + ); + list( optimal.distribution.data = optimal.distribution.data, optimal.distribution.residuals = optimal.distribution.residuals, From bd9cd23d6907e05895516706f4e18d52ec48b99a Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 7 Mar 2024 15:17:49 -0800 Subject: [PATCH 19/21] Export new objects in `detect.outliers()` Add new objects to the list returned by the function `detect.outliers()`. --- R/detect.outliers.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index 1e741d9..e206546 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -263,6 +263,10 @@ detect.outliers <- function(data, num.null) { observed.5method = observed.5method, observed.5method.ranks = observed.5method.ranks, observed.5method.rank.product = observed.5method.rank.product, - null.data = null.data + null.data = null.data, + optimal.distribution.null.data = optimal.distribution.null.data, + null.5method = null.5method, + null.5method.ranks = null.5method.ranks, + null.5method.rank.product = null.5method.rank.product ); } From a1e70b658e83baa4c4bdec7ed0260d81979d7dc3 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Tue, 12 Mar 2024 11:33:04 -0700 Subject: [PATCH 20/21] Don't rank null data Don't compute ranks of null data in function `detect.outliers()`. Rather, ranks are computed as each observed transcript is added to the null data. --- R/detect.outliers.R | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/R/detect.outliers.R b/R/detect.outliers.R index e206546..ed086fc 100644 --- a/R/detect.outliers.R +++ b/R/detect.outliers.R @@ -238,15 +238,6 @@ detect.outliers <- function(data, num.null) { fraction.kmeans = fraction.kmeans, cosine.similarity = cosine.similarity ); - # Assign ranks within each method. - null.5method.ranks <- outlier.rank( - outlier.statistics.matrix = null.5method - ); - # Compute the rank product for each null transcript. - null.5method.rank.product <- outlier.rank.product( - ranks.matrix = null.5method.ranks, - num.allowed.NA = 0 - ); list( optimal.distribution.data = optimal.distribution.data, @@ -265,8 +256,6 @@ detect.outliers <- function(data, num.null) { observed.5method.rank.product = observed.5method.rank.product, null.data = null.data, optimal.distribution.null.data = optimal.distribution.null.data, - null.5method = null.5method, - null.5method.ranks = null.5method.ranks, - null.5method.rank.product = null.5method.rank.product + null.5method = null.5method ); } From a2f77fa663e3eceb280ebca3598e9d173c772ef7 Mon Sep 17 00:00:00 2001 From: John M Sahrmann Date: Thu, 21 Mar 2024 11:10:55 -0700 Subject: [PATCH 21/21] Fix style --- R/simulate.null.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/simulate.null.R b/R/simulate.null.R index e95c569..7820c15 100644 --- a/R/simulate.null.R +++ b/R/simulate.null.R @@ -91,8 +91,8 @@ simulate.null <- function( else if (4 == x.distribution) { mean.gamma <- mean(x.nozero.trim); sd.gamma <- stats::sd(x.nozero.trim); - gamma.shape <- (mean.gamma/sd.gamma)^2; - gamma.rate <- mean.gamma/(sd.gamma^2); + gamma.shape <- (mean.gamma / sd.gamma)^2; + gamma.rate <- mean.gamma / (sd.gamma^2); simulated.null <- stats::rgamma( n = length(x), shape = gamma.shape,