Skip to content

Commit c3b2b0c

Browse files
scheidecalexatSL
andauthored
Add pre-processing vignette, new preProcessAdat() function (SomaLogic#134)
* Added pre-processing vignette - added pre-processing.Rmd to vignettes/articles/ - added entry to _pkgdown.yml - drafted sections on filtering features, filtering samples, data qc and transformations - added `calcOutlierMap()` and its print and plot S3 methods, along with `getOutlierIds()` for identifying sample level outliers * Added `preProcessAdat()` function - added new function `preProcessAdat()` to filter features, filter samples, generate data qc plots, and perform standard analyte RFU transformations * Improved adat ingest documentation in README - added comments to clarify filepath input to `read_adat()` example in README * Begin with adat for stat workflow articles - updated stat workflow articles with comments about how to read in the `example_data.adat` object - updated data preparation chunks to use the `preProcessAdat()` function for filtering features, filtering samples, and performing log10, center and scale transformations * Added unit tests for `preProcessAdat()` - added snapshot unit tests for preProcessAdat() messaging, print and qc plot output - added helper utility functions for saving plot snapshot output to testthat/helper.R --------- Co-authored-by: Alex Poole <[email protected]>
1 parent d566e61 commit c3b2b0c

27 files changed

+2171
-63
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Depends:
2929
Imports:
3030
cli,
3131
dplyr (>= 1.0.6),
32+
ggplot2,
3233
lifecycle (>= 1.0.0),
3334
magrittr (>= 2.0.1),
3435
methods,
@@ -37,7 +38,6 @@ Imports:
3738
tidyr (>= 1.1.3)
3839
Suggests:
3940
Biobase,
40-
ggplot2,
4141
knitr,
4242
purrr,
4343
recipes,

NAMESPACE

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ S3method(left_join,soma_adat)
4343
S3method(median,soma_adat)
4444
S3method(merge,soma_adat)
4545
S3method(mutate,soma_adat)
46+
S3method(plot,Map)
47+
S3method(plot,outlier_map)
4648
S3method(print,adat_summary)
49+
S3method(print,outlier_map)
4750
S3method(print,soma_adat)
4851
S3method(print,target_map)
4952
S3method(rename,soma_adat)
@@ -68,6 +71,7 @@ export(anti_join)
6871
export(antilog)
6972
export(apt2seqid)
7073
export(arrange)
74+
export(calcOutlierMap)
7175
export(calc_eLOD)
7276
export(checkSomaScanVersion)
7377
export(cleanNames)
@@ -83,6 +87,7 @@ export(getAnalytes)
8387
export(getFeatureData)
8488
export(getFeatures)
8589
export(getMeta)
90+
export(getOutlierIds)
8691
export(getSeqId)
8792
export(getSeqIdMatches)
8893
export(getSignalSpace)
@@ -111,6 +116,7 @@ export(merge_clin)
111116
export(mutate)
112117
export(parseHeader)
113118
export(pivotExpressionSet)
119+
export(preProcessAdat)
114120
export(read.adat)
115121
export(read_adat)
116122
export(read_annotations)
@@ -144,6 +150,7 @@ importFrom(dplyr,left_join)
144150
importFrom(dplyr,mutate)
145151
importFrom(dplyr,rename)
146152
importFrom(dplyr,right_join)
153+
importFrom(dplyr,row_number)
147154
importFrom(dplyr,sample_frac)
148155
importFrom(dplyr,sample_n)
149156
importFrom(dplyr,select)
@@ -153,6 +160,34 @@ importFrom(dplyr,slice_sample)
153160
importFrom(dplyr,starts_with)
154161
importFrom(dplyr,summarise)
155162
importFrom(dplyr,ungroup)
163+
importFrom(ggplot2,aes)
164+
importFrom(ggplot2,element_blank)
165+
importFrom(ggplot2,element_rect)
166+
importFrom(ggplot2,element_text)
167+
importFrom(ggplot2,facet_wrap)
168+
importFrom(ggplot2,geom_boxplot)
169+
importFrom(ggplot2,geom_hline)
170+
importFrom(ggplot2,geom_point)
171+
importFrom(ggplot2,geom_raster)
172+
importFrom(ggplot2,geom_vline)
173+
importFrom(ggplot2,ggplot)
174+
importFrom(ggplot2,ggsave)
175+
importFrom(ggplot2,guide_axis)
176+
importFrom(ggplot2,guide_colorbar)
177+
importFrom(ggplot2,guide_legend)
178+
importFrom(ggplot2,guides)
179+
importFrom(ggplot2,labs)
180+
importFrom(ggplot2,scale_color_manual)
181+
importFrom(ggplot2,scale_fill_gradientn)
182+
importFrom(ggplot2,scale_fill_manual)
183+
importFrom(ggplot2,scale_x_continuous)
184+
importFrom(ggplot2,scale_x_discrete)
185+
importFrom(ggplot2,scale_y_reverse)
186+
importFrom(ggplot2,sec_axis)
187+
importFrom(ggplot2,theme)
188+
importFrom(ggplot2,theme_bw)
189+
importFrom(ggplot2,unit)
190+
importFrom(ggplot2,ylim)
156191
importFrom(lifecycle,deprecate_soft)
157192
importFrom(lifecycle,deprecate_stop)
158193
importFrom(lifecycle,deprecate_warn)
@@ -173,6 +208,7 @@ importFrom(tibble,deframe)
173208
importFrom(tibble,enframe)
174209
importFrom(tibble,is_tibble)
175210
importFrom(tibble,tibble)
211+
importFrom(tidyr,gather)
176212
importFrom(tidyr,pivot_longer)
177213
importFrom(tidyr,separate)
178214
importFrom(tidyr,unite)

R/0-declare-global-variables.R

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,27 @@ utils::globalVariables(
1111
"AptName",
1212
"array_id",
1313
"blank_col",
14+
"ColCheck",
1415
"Dilution",
1516
"eLOD",
1617
"feature",
18+
"Group",
19+
"GroupCount",
20+
"Normalization Scale Factor",
21+
"Organism",
1722
"prefix",
23+
"Response",
24+
"RowCheck",
1825
"rn",
26+
"SampleId",
27+
"SampleType",
1928
"SeqId",
2029
"seqid",
21-
"value"
30+
"SubjectCount",
31+
"SubjectId",
32+
"TargetFullName",
33+
"Type",
34+
"value",
35+
"Value"
2236
)
2337
)

R/calcOutlierMap.R

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
#' Calculate MAD Outlier Map
2+
#'
3+
#' Calculate the median absolute deviation (statistical) outliers measurements
4+
#' and fold-change criteria from an ADAT. Two values are required for the
5+
#' calculation: median absolute deviation (MAD) and fold-change (FC). Outliers
6+
#' are determined based on the result of _both_ `6*MAD` and `x*FC` , where `x`
7+
#' is the number of fold changes defined.
8+
#'
9+
#' For the S3 plotting method, see [plot.Map()].
10+
#'
11+
#' @family Calc Map
12+
#' @param data A `soma_adat` object containing RFU feature data.
13+
#' @param anno_tbl An annotations table produced via [getAnalyteInfo()].
14+
#' Used to calculate analyte dilutions for the matrix column ordering.
15+
#' If `NULL`, a table is generated internally from `data` (if possible), and
16+
#' the analytes are plotted in dilution order.
17+
#' @param apt.order Character. How should the columns/features be ordered?
18+
#' Options include: by dilution mix ("dilution"), by median overall signal
19+
#' ("signal"), or as-is in `data` (default).
20+
#' @param sample.order Either a character string indicating the column name
21+
#' with entries to be used to order the data frame rows, or a numeric vector
22+
#' representing the order of the data frame rows. The
23+
#' default (`NULL`) leaves the row ordering as it is in `data`.
24+
#' @param fc.crit Integer. The fold change criterion to evaluate. Defaults to 5x.
25+
#' @return A list of class `c("outlier_map", "Map")` containing:
26+
#' \item{matrix}{A boolean matrix of `TRUE/FALSE` whether each sample is an
27+
#' outlier according the the stated criteria.}
28+
#' \item{x.lab}{A character string containing the plot x-axis label.}
29+
#' \item{title}{A character string containing the plot title.}
30+
#' \item{rows.by.freq}{A logical indicating if the samples are ordered
31+
#' by outlier frequency.}
32+
#' \item{class.tab}{A table containing the frequencies of each class if input
33+
#' `sample.order` is defined as a categorical variable.}
34+
#' \item{sample.order}{A numeric vector representing the order of the data
35+
#' frame rows.}
36+
#' \item{legend.sub}{A character string containing the plot legend subtitle.}
37+
#' @author Stu Field
38+
#' @examples
39+
#' om <- calcOutlierMap(example_data)
40+
#' class(om)
41+
#'
42+
#' # S3 print method
43+
#' om
44+
#'
45+
#' # `sample.order = "frequency"` orders samples by outlier frequency
46+
#' om <- calcOutlierMap(example_data, sample.order = "frequency")
47+
#' om$rows.by.freq
48+
#' om$sample.order
49+
#'
50+
#' # order samples by user specified indices
51+
#' om <- calcOutlierMap(example_data, sample.order = 192:1)
52+
#' om$sample.order
53+
#'
54+
#' # order samples field in Adat
55+
#' om <- calcOutlierMap(example_data, sample.order = "Sex")
56+
#' om$sample.order
57+
#' @importFrom stats median
58+
59+
#' @export
60+
calcOutlierMap <- function(data, anno_tbl = NULL,
61+
apt.order = c(NA, "dilution", "signal"),
62+
sample.order = NULL, fc.crit = 5) {
63+
64+
apt.order <- match.arg(apt.order)
65+
data <- .refactorData(data)
66+
sampleL <- length(sample.order)
67+
freq <- sampleL == 1L && tolower(sample.order) %in% "frequency"
68+
class_tab <- NA
69+
ord <- seq_len(nrow(data))
70+
ret <- list(matrix = matrix(0)) # placeholder: reserve position 1
71+
72+
if ( is.null(anno_tbl) ) {
73+
anno_tbl <- getAnalyteInfo(data)
74+
}
75+
76+
# Order of the rows in the Map
77+
if ( !is.null(sample.order) && !freq ) {
78+
if ( sampleL > 1L && is.numeric(sample.order) ) {
79+
if ( sampleL != nrow(data) ) {
80+
stop(
81+
"Incorrect number of row indices: ", value(nrow(data)),
82+
"rows vs. ", value(sampleL), " indices.", call. = FALSE
83+
)
84+
} else {
85+
data <- data[sample.order, ]
86+
ord <- sample.order
87+
ret$y.lab <- "Samples (User Specified Order)"
88+
}
89+
} else if ( sampleL == 1L && is.character(sample.order) ) {
90+
stopifnot(sample.order %in% names(data))
91+
ord <- order(data[[sample.order]])
92+
data <- data[ord, ]
93+
class_tab <- table(data[[sample.order]])
94+
ret$y.lab <- sprintf("Samples (by %s)", sample.order)
95+
} else {
96+
stop(
97+
"Something wrong with `sample.order =` argument: ",
98+
value(sample.order), call. = FALSE
99+
)
100+
}
101+
}
102+
103+
data_mat <- data.matrix(data[, getAnalytes(data)])
104+
105+
# calc statistical outliers matrix (boolean matrix of TRUE/FALSE)
106+
mat <- apply(data_mat, 2, function(.apt) {
107+
seq_along(.apt) %in% .getOutliers(.apt, fc.crit)
108+
})
109+
rownames(mat) <- rownames(data_mat) # rownames stripped by apply()
110+
111+
if ( sum(mat) == 0 ) {
112+
warning("No outliers detected in outlier map!", call. = FALSE)
113+
}
114+
115+
if ( freq ) {
116+
mat <- mat[ names(sort(rowSums(mat))), ]
117+
ret$y.lab <- "Samples Ordered by Outlier Frequency"
118+
}
119+
120+
if ( is.na(apt.order) ) {
121+
122+
ret$x.lab <- "Proteins Ordered in Adat"
123+
124+
} else if ( apt.order == "dilution" ) {
125+
126+
apt.dils <- .getDilList(anno_tbl)
127+
mat <- mat[, unlist(apt.dils)]
128+
ret$dil.nums <- lengths(apt.dils)
129+
ret$col.order <- "dilution"
130+
ret$x.lab <- sprintf("Dilution Ordered Proteins (%s)",
131+
paste(names(apt.dils), collapse = " | "))
132+
133+
} else if ( apt.order == "signal" ) {
134+
135+
signal.order <- sort(apply(data_mat, 2, stats::median))
136+
mat <- mat[, names(signal.order)]
137+
ret$col.order <- "signal"
138+
ret$x.lab <- "Proteins by Median Signal"
139+
140+
} else {
141+
stop("Problem with `apt.order =` argument: ",
142+
value(apt.order), call. = FALSE)
143+
}
144+
145+
ret$title <- paste0(
146+
"Outlier Map: | x - median(x) | > 6 * mad(x) & FC > ", fc.crit, "x"
147+
)
148+
ret$rows.by.freq <- freq
149+
ret$class.tab <- class_tab
150+
ret$sample.order <- ord
151+
ret$matrix <- mat
152+
ret$legend.sub <- "Proteins"
153+
invisible(
154+
structure(
155+
ret,
156+
class = c("outlier_map", "Map", "list")
157+
)
158+
)
159+
}
160+
161+
162+
#' @describeIn calcOutlierMap
163+
#' There is a S3 print method for class `"outlier_map"`.
164+
#' @param x An object of class `"outlier_map"`.
165+
#' @param ... Arguments for S3 print methods.
166+
#' @export
167+
print.outlier_map <- function(x, ...) {
168+
writeLines(
169+
cli_rule("SomaLogic Outlier Map", line = "double", line_col = "magenta")
170+
)
171+
key <- c(
172+
"Outlier Map dimensions",
173+
"Title",
174+
"Class Table",
175+
"Rows by Frequency",
176+
"Sample Order",
177+
"x-label",
178+
"Legend Sub-title") |> .pad(25)
179+
value <- c(
180+
.value(paste(dim(x$matrix), collapse = " x ")),
181+
.value(x$title),
182+
c(x$class.tab),
183+
x$rows.by.freq,
184+
.value(x$x.lab),
185+
.value(x$sample.order),
186+
.value(x$legend.sub)
187+
)
188+
writeLines(paste(" ", key, value))
189+
writeLines(cli_rule(line = "double", line_col = "green"))
190+
invisible(x)
191+
}
192+
193+
194+
#' S3 plot methods for class outlier_map
195+
#' @noRd
196+
#' @export
197+
plot.outlier_map <- function(x, ...) {
198+
NextMethod("plot", type = "outlier")
199+
}

R/getOutlierIds.R

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#' Get Flagged Ids From MAD Outlier Map
2+
#'
3+
#' Return the IDs of flagged samples for objects of the `outlier_map` class.
4+
#' Samples are flagged based on the percent analytes (RFU columns) for a given
5+
#' sample that were identified as outliers using the median absolute
6+
#' deviation (MAD).
7+
#'
8+
#' @family Calc Map
9+
#' @inheritParams plot.Map
10+
#' @param x An object of class:
11+
#' * `outlier_map` - from [calcOutlierMap()]
12+
#' @param data Optional. The data originally used to create the map `x`. If
13+
#' omitted, a single column data frame is returned.
14+
#' @param include Optional. Character vector of column name(s) in `data` to
15+
#' include in the resulting data frame. Ignored if `data = NULL`.
16+
#' @return A `data.frame` of the indices (`idx`) of flagged samples, along
17+
#' with any additional variables as specified by `include`.
18+
#' @author Caleb Scheidel
19+
#' @examples
20+
#' # flagged outliers
21+
#' # create a single sample outlier (12)
22+
#' out_adat <- example_data
23+
#' apts <- getAnalytes(out_adat)
24+
#' out_adat[12, apts] <- out_adat[12, apts] * 10
25+
#'
26+
#' om <- calcOutlierMap(out_adat)
27+
#' getOutlierIds(om, out_adat, flags = 0.05, include = c("Sex", "Subarray"))
28+
#' @export
29+
getOutlierIds <- function(x, flags = 0.05, data = NULL, include = NULL) {
30+
31+
if ( !inherits(x, "outlier_map") ) {
32+
stop("Input `x` object must be class `outlier_map`!",
33+
call. = FALSE)
34+
}
35+
36+
# ensure that flags value is between 0 & 1
37+
if ( flags < 0 || flags > 1 ) {
38+
stop("`flags =` argument must be between 0 and 1!", call. = FALSE)
39+
}
40+
41+
flagged <- which(rowSums(x$matrix) >= ncol(x$matrix) * flags) |> unname()
42+
df_idx <- data.frame(idx = flagged) # default 1-col df
43+
44+
if ( !length(flagged) ) {
45+
.todo("No observations were flagged at this flagging proportion: {.val {flags}}")
46+
}
47+
48+
if ( is.null(data) ) {
49+
df_idx
50+
} else {
51+
stopifnot(
52+
"The `data` argument must be a `data.frame` object." = is.data.frame(data),
53+
"All `include` must be in `data`." = all(include %in% names(data))
54+
)
55+
df <- as.data.frame(data) # strip soma_adat class
56+
cbind(
57+
df_idx,
58+
rm_rn(df[flagged, include, drop = FALSE]) # ensure no rn
59+
)
60+
}
61+
}

0 commit comments

Comments
 (0)