Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Define null data simulation function #6

Merged
merged 24 commits into from
Apr 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e4f91d5
Add script `R/simulate.null.R`
jsahrmann Feb 26, 2024
c7e2079
Qualify namespace in `R/simulate.null.R`
jsahrmann Feb 26, 2024
f546970
Merge branch 'jsahrmann-add-residuals-functions' into jsahrmann-defin…
jsahrmann Feb 26, 2024
ca68c79
Add example to function `simulate.null()`
jsahrmann Feb 26, 2024
c06dbe1
Add `num.null` as parameter to `detect.outliers()`
jsahrmann Feb 26, 2024
82f5509
Simulate null transcripts in `detect.outliers()`
jsahrmann Feb 26, 2024
1fc1f91
Use `truncnorm::rtruncnorm()` in `simulate.null()`
jsahrmann Feb 26, 2024
d2383b8
Update documentation for `detect.outliers()`
jsahrmann Feb 26, 2024
c6229ee
Merge branch 'jsahrmann-add-residuals-functions' into jsahrmann-defin…
jsahrmann Feb 26, 2024
d1dbfe7
Handle data frame input to `simulate.null()`
jsahrmann Feb 26, 2024
bc24b8e
Merge branch 'main' into jsahrmann-define-simulate-null
jsahrmann Mar 5, 2024
d220480
Add script `R/add.residual.noise.R`
jsahrmann Mar 5, 2024
42fbd35
Draft alternative simulation function
jsahrmann Mar 6, 2024
1eb6e1b
Replace original `simulate.null()` definition
jsahrmann Mar 6, 2024
efcc365
Use new `simulate.null()` in `detect.outliers()`
jsahrmann Mar 7, 2024
52fc9a0
Delete script `R/add.residual.noise.R`
jsahrmann Mar 7, 2024
9a49981
Fix typo in `R/detect.outliers.R`
jsahrmann Mar 7, 2024
c10be70
Add transcript names to `null.data`
jsahrmann Mar 7, 2024
44eb93a
Determine distribution of null transcripts
jsahrmann Mar 7, 2024
fa8541f
Calculate outlier statistics on null data
jsahrmann Mar 7, 2024
308db68
Calculate rank product on null transcripts
jsahrmann Mar 7, 2024
bd9cd23
Export new objects in `detect.outliers()`
jsahrmann Mar 7, 2024
a1e70b6
Don't rank null data
jsahrmann Mar 12, 2024
a2f77fa
Fix style
jsahrmann Mar 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 115 additions & 2 deletions R/detect.outliers.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#' Detect outliers
#'
#' @param data A matrix or data frame of FPKM values, organized with transcripts on rows and samples on columns. Transcript identifiers should be stored as `rownames(data)`.
#' @param num.null The number of transcripts to generate when simulating from null distributions.
#'
#' @export
detect.outliers <- function(data) {
detect.outliers <- function(data, num.null) {
# Determine which of the normal, log-normal, exponential, or gamma
# distributions provides the best fit to each row of values in
# `data`.
Expand Down Expand Up @@ -129,6 +130,115 @@ detect.outliers <- function(data) {
num.allowed.NA = 0
);

# Generate a matrix of null transcripts by simulating from their
# respective optimal distributions.
sampled.indices <- sample(
x = nrow(data),
size = num.null,
replace = TRUE
);
null.data <- future.apply::future_lapply(
X = sampled.indices,
FUN = function(i) {
simulate.null(
x = as.numeric(data[i, ]),
x.distribution = optimal.distribution.data[i],
r = as.numeric(observed.residuals.trimmed[i, ]),
r.distribution = optimal.distribution.residuals[i]
);
},
future.seed = TRUE
);
null.data <- do.call(
what = rbind,
args = null.data
);
rownames(null.data) <- rownames(data)[sampled.indices];
# Determine which of the normal, log-normal, exponential, or gamma
# distributions provides the best fit to each row of values in
# `null.data`.
optimal.distribution.null.data <- future.apply::future_apply(
X = null.data,
MARGIN = 1,
FUN = identify.bic.optimal.data.distribution,
future.seed = TRUE
);

# Compute quantities for outlier detection on the null data: (1)
# z-scores based on the mean / standard deviation, (2) z-scores
# based on the trimmed mean / trimmed standard deviation, (3)
# z-scores based on the median / median absolute deviation, and
# (4) the cluster assignment from k-means with two clusters.
data.mean <- future.apply::future_apply(
X = null.data,
MARGIN = 1,
FUN = quantify.outliers,
method = 'mean'
);
data.median <- future.apply::future_apply(
X = null.data,
MARGIN = 1,
FUN = quantify.outliers,
method = 'median'
);
data.trimmean <- future.apply::future_apply(
X = null.data,
MARGIN = 1,
FUN = quantify.outliers,
method = 'mean',
trim = 0.05
);
data.kmeans <- future.apply::future_apply(
X = null.data,
MARGIN = 1,
FUN = quantify.outliers,
method = 'kmeans',
nstart = 1000,
future.seed = TRUE
);
# Compute the ranges of the z-score statistics.
zrange.mean <- future.apply::future_apply(
X = data.mean,
MARGIN = 2,
FUN = zrange
);
zrange.median <- future.apply::future_apply(
X = data.median,
MARGIN = 2,
FUN = zrange
);
zrange.trimmean <- future.apply::future_apply(
X = data.trimmean,
MARGIN = 2,
FUN = zrange
);
# Compute the k-means fraction.
fraction.kmeans <- future.apply::future_apply(
X = data.kmeans,
MARGIN = 2,
FUN = kmeans.fraction
);
# Compute the cosine similarity.
cosine.similarity <- future.apply::future_sapply(
X = seq_len(nrow(null.data)),
FUN = function(i) {
outlier.detection.cosine(
x = as.numeric(null.data[i, ]),
distribution = optimal.distribution.null.data[i]
);
}
);
names(cosine.similarity) <- rownames(null.data);
# Assemble the statistics from the five methods into a single
# matrix.
null.5method <- cbind(
zrange.mean = zrange.mean,
zrange.median = zrange.median,
zrange.trimmean = zrange.trimmean,
fraction.kmeans = fraction.kmeans,
cosine.similarity = cosine.similarity
);

list(
optimal.distribution.data = optimal.distribution.data,
optimal.distribution.residuals = optimal.distribution.residuals,
Expand All @@ -143,6 +253,9 @@ detect.outliers <- function(data) {
cosine.similarity = cosine.similarity,
observed.5method = observed.5method,
observed.5method.ranks = observed.5method.ranks,
observed.5method.rank.product = observed.5method.rank.product
observed.5method.rank.product = observed.5method.rank.product,
null.data = null.data,
optimal.distribution.null.data = optimal.distribution.null.data,
null.5method = null.5method
);
}
162 changes: 162 additions & 0 deletions R/simulate.null.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#' Simulate from a null distribution
#'
#' Simulate transcripts from a specified null distribution.
#'
#' @param x A numeric vector of transcripts.
#' @param x.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.data.distribution()`. Possible values are
#' * 1 = normal,
#' * 2 = log-normal,
#' * 3 = exponential, and
#' * 4 = gamma.
#' @param r A numeric vector of residuals calculated for this transcript.
#' @param r.distribution A numeric code corresponding to the optimal distribution of `x` as returned by `identify.bic.optimal.residuals.distribution()`. Possible values are the same as those for `x.distribution`.
#'
#' @return A numeric vector of the same length as `x`. Names are not retained.
#'
#' @examples
#' # Prepare fake data.
#' set.seed(1234);
#' x <- rgamma(
#' n = 20,
#' shape = 2,
#' scale = 2
#' );
#' names(x) <- paste('Sample', seq_along(x), sep = '.');
#' x.dist <- identify.bic.optimal.data.distribution(
#' x = x
#' );
#' r <- calculate.residuals(
#' x = x,
#' distribution = x.dist
#' );
#' r.trimmed <- trim.sample(
#' x = r
#' );
#' r.dist <- identify.bic.optimal.residuals.distribution(
#' x = r.trimmed
#' );
#' null <- simulate.null(
#' x = x,
#' x.distribution = x.dist,
#' r = r.trimmed,
#' r.distribution = r.dist
#' );
#'
#' @noRd
simulate.null <- function(
x,
x.distribution,
r,
r.distribution
) {
#
# Simulate transcripts
#
# Ensure the values in `x` are strictly positive.
add.minimum.value <- least.significant.digit(x);
x.nozero <- x + add.minimum.value;
# Apply 5% trimming.
x.trim <- trim.sample(x);
x.nozero.trim <- trim.sample(x.trim);
# Generate null values according to the optimal
# distribution for this transcript.
if (1 == x.distribution) {
norm.mean <- mean(x.nozero.trim);
norm.sd <- stats::sd(x.trim);
simulated.null <- truncnorm::rtruncnorm(
n = length(x),
mean = norm.mean,
sd = norm.sd,
a = 0
);
}
else if (2 == x.distribution) {
mean.log <- mean(x.nozero.trim);
sd.log <- stats::sd(x.nozero.trim);
m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
simulated.null <- stats::rlnorm(
n = length(x),
meanlog = m2,
sdlog = sd2
);
}
else if (3 == x.distribution) {
exp.rate <- 1 / mean(x.nozero.trim);
simulated.null <- stats::rexp(
n = length(x),
rate = exp.rate
);
}
else if (4 == x.distribution) {
mean.gamma <- mean(x.nozero.trim);
sd.gamma <- stats::sd(x.nozero.trim);
gamma.shape <- (mean.gamma / sd.gamma)^2;
gamma.rate <- mean.gamma / (sd.gamma^2);
simulated.null <- stats::rgamma(
n = length(x),
shape = gamma.shape,
rate = gamma.rate
);
}
#
# Simulate noise
#
# Ensure the values in `r` are strictly positive.
add.minimum.value <- least.significant.digit(r)
if (min(r) < 0) {
r.nozero <- r - min(r) + add.minimum.value;
}
else {
r.nozero <- r + add.minimum.value;
}
# Generate null values from the distribution coded by
# `distribution`.
if (1 == r.distribution) {
norm.mean <- mean(r.nozero);
norm.sd <- stats::sd(r.nozero);
simulated.noise <- truncnorm::rtruncnorm(
n = length(x),
mean = norm.mean,
sd = norm.sd,
a = 0
);
}
else if (2 == r.distribution) {
mean.log <- mean(r.nozero);
sd.log <- stats::sd(r.nozero);
m2 <- log(mean.log^2 / sqrt(sd.log^2 + mean.log^2));
sd2 <- sqrt(log(1 + (sd.log^2 / mean.log^2)));
simulated.noise <- stats::rlnorm(
n = length(x),
meanlog = m2,
sdlog = sd2
);
}
else if (3 == r.distribution) {
exp.rate <- 1 / mean(r.nozero);
simulated.noise <- stats::rexp(
n = length(x),
rate = exp.rate
);
}
else if (4 == r.distribution) {
mean.gamma <- mean(r.nozero);
sd.gamma <- stats::sd(r.nozero);
gamma.shape <- (mean.gamma / sd.gamma)^2;
gamma.rate <- mean.gamma / (sd.gamma^2);
simulated.noise <- stats::rgamma(
n = length(x),
shape = gamma.shape,
rate = gamma.rate
);
}
if (min(r) < 0) {
simulated.noise <- simulated.noise + min(r) - add.minimum.value;
}
else {
simulated.noise <- simulated.noise - add.minimum.value;
}
# Add the simulated noise to the simulated transcript.
abs(simulated.null + simulated.noise)
}
4 changes: 3 additions & 1 deletion man/detect.outliers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading