|
1 | 1 | #' @title Predict methylation subtype |
2 | 2 | #' @description Assign patients to four prostate cancer DNA methylation subtypes |
3 | 3 | #' @inheritParams validate.subtype.model.cpgs |
4 | | -#' @param impute.using.all.cpgs TRUE/FALSE indicating whether to impute missing values using all CpGs in `methy.data` or only the CpGs required by \link{subtype.model}. When TRUE, imputation will be slower and use more memory, but should be more accurate. |
| 4 | +#' @param subtype.model Which subtype model to use ('PAMR' or 'RF' for random forest). Although slower, we recommend 'RF' for its increased accuracy and intrinsic imputation for missing values. Further, if some of the required CpGs are completely missing, then you must use 'RF'. |
| 5 | +#' @param pamr.impute.using.all.cpgs If using `subtype.model = 'PAMR'`, should imputation be done using all CpGs in `methy.data` (TRUE) or only the CpGs required by \link{subtype.model.pamr} (FALSE). When TRUE, imputation will be slower and use more memory, but should be more accurate. |
| 6 | +#' @param seed integer seed used for imputation. |
5 | 7 | #' @export |
| 8 | +#' @return |
| 9 | +#' * `subtypes`: data.frame with the estimated subtypes and sample IDs (rownames of `methy.data`) |
| 10 | +#' * `validation`: output from \link{validate.subtype.model.cpgs} to check if `methy.data` contains the required CpGs and whether any CpGs have high missingness. |
6 | 11 | #' @examples |
7 | | -#'data('subtype.model'); |
8 | | -#' |
9 | 12 | #'### example CpG data |
10 | 13 | #'data('example.data'); |
11 | 14 | #' |
12 | 15 | #'subtypes <- estimate.subtypes(example.data); |
13 | | -#'head(subtypes); |
14 | | -estimate.subtypes <- function(methy.data, prop.missing.cutoff = 0.3, impute.using.all.cpgs = TRUE) { |
| 16 | +#' |
| 17 | +#'# estimated subtypes |
| 18 | +#'head(subtypes$subtypes); |
| 19 | +#' |
| 20 | +#'# validation results: |
| 21 | +#'# length(subtypes$validation$required.cpgs) |
| 22 | +#'# length(subtypes$validation$required.cpgs.with.high.missing) |
| 23 | +#'# length(subtypes$validation$missing.cpgs) |
| 24 | +estimate.subtypes <- function( |
| 25 | + methy.data, |
| 26 | + subtype.model = 'RF', |
| 27 | + prop.missing.cutoff = 0.3, |
| 28 | + pamr.impute.using.all.cpgs = TRUE, |
| 29 | + seed = 123 |
| 30 | + ) { |
| 31 | + set.seed(seed); |
| 32 | + stopifnot('subtype.model must be RF or PAMR' = subtype.model %in% c('PAMR', 'RF')); |
| 33 | + stopifnot('prop.missing.cutoff must be between 0 and 1' = prop.missing.cutoff >= 0 & prop.missing.cutoff <= 1); |
| 34 | + stopifnot('PAMR cannot handle CpGs that are 100% missing; consider using RF if some of the required CpGs are completely missing' = !(prop.missing.cutoff == 1 & subtype.model == 'PAMR')); |
| 35 | + |
15 | 36 | check <- validate.subtype.model.cpgs(methy.data, prop.missing.cutoff); |
16 | | - if (!check$val.passed) { |
17 | | - print('Error: methy.data has CpGs with high missingness that are required for predicting subtypes. See the returned results for more details.') |
18 | | - return(check); |
| 37 | + if (!check$val.passed & prop.missing.cutoff < 1) { |
| 38 | + message('Error: methy.data has CpGs with high missingness that are required for predicting subtypes. See the returned $validation results for more details. If you insist on predicting the subtypes despite the high missingness (which will decrease the accuracy of subtype assignment), consider using subtype.model = \'RF\' with prop.missing.cutoff = 1.'); |
| 39 | + return(list( |
| 40 | + subtypes = NULL, |
| 41 | + validation = check |
| 42 | + )); |
19 | 43 | } |
20 | | - # impute missing values |
21 | | - if (sum(is.na(methy.data)) == 0) { |
22 | | - methy.data.imp <- methy.data; |
23 | | - } else { |
24 | | - print('Starting imputation...'); |
25 | | - if (!impute.using.all.cpgs) { |
26 | | - methy.data <- methy.data[,check$required.cpgs, drop = FALSE]; |
| 44 | + |
| 45 | + ### PAMR |
| 46 | + if (subtype.model == 'PAMR') { |
| 47 | + # Impute missing values |
| 48 | + if (sum(is.na(methy.data)) == 0) { |
| 49 | + methy.data.imp <- methy.data; |
| 50 | + } else { |
| 51 | + message('Starting imputation...'); |
| 52 | + if (!pamr.impute.using.all.cpgs) { |
| 53 | + methy.data <- methy.data[,check$required.cpgs, drop = FALSE]; |
| 54 | + } |
| 55 | + base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data)); |
| 56 | + methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE); |
| 57 | + message('Finished imputation.'); |
27 | 58 | } |
28 | | - base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data)); |
29 | | - methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE); |
30 | | - print('Finished imputation.'); |
| 59 | + data(subtype.model.pamr, envir = environment()); |
| 60 | + methy.data.imp.sub <- methy.data.imp[,check$required.cpgs]; |
| 61 | + methy.data.imp.sub <- t(methy.data.imp.sub); |
| 62 | + |
| 63 | + subtypes <- pamr::pamr.predict( |
| 64 | + fit = subtype.model.pamr, |
| 65 | + newx = methy.data.imp.sub, # CpGs in rows, samples in columns |
| 66 | + type = 'class', |
| 67 | + threshold = 0 |
| 68 | + ); |
| 69 | + stopifnot(length(subtypes) == ncol(methy.data.imp.sub)); |
| 70 | + subtypes <- data.frame( |
| 71 | + subtype = subtypes, |
| 72 | + check.names = FALSE |
| 73 | + ); |
| 74 | + rownames(subtypes) <- colnames(methy.data.imp.sub); |
31 | 75 | } |
| 76 | + ### RF |
| 77 | + if (subtype.model == 'RF') { |
| 78 | + methy.data <- data.frame(methy.data, check.names = FALSE); |
| 79 | + data(subtype.model.rf, envir = environment()); |
| 80 | + subtype.cpgs <- colnames(subtype.model.rf$xvar); |
| 81 | + |
| 82 | + # for cpgs in subtype.cpgs that are not present in methy.data column names, add them as a new column of NAs. RF will impute them. |
| 83 | + missing.cpgs <- setdiff(subtype.cpgs, colnames(methy.data)); |
32 | 84 |
|
33 | | - # requireNamespace in order to get predict() S3 methods to work correctly |
34 | | - #requireNamespace('pamr', quietly = TRUE); |
35 | | - data(subtype.model, envir = environment()); |
36 | | - methy.data.imp.sub <- methy.data.imp[,check$required.cpgs]; |
37 | | - methy.data.imp.sub <- t(methy.data.imp.sub); |
38 | | - subtypes <- pamr::pamr.predict( |
39 | | - fit = subtype.model, |
40 | | - newx = methy.data.imp.sub, # CpGs in rows, samples in columns |
41 | | - type = 'class', |
42 | | - threshold = 0 |
43 | | - ); |
44 | | - stopifnot(length(subtypes) == ncol(methy.data.imp.sub)); |
45 | | - subtypes <- data.frame( |
46 | | - subtype = subtypes, |
47 | | - check.names = FALSE |
48 | | - ); |
49 | | - rownames(subtypes) <- colnames(methy.data.imp.sub); |
| 85 | + if (length(missing.cpgs) > 0) { |
| 86 | + message(sprintf( |
| 87 | + 'Warning: %d of %d required CpGs are missing from the data. See the $validation outcome for more details. Although random forest imputes missing values, having many CpGs that are missing may decrease accuracy of subtype assignment.', |
| 88 | + length(missing.cpgs), |
| 89 | + length(subtype.cpgs) |
| 90 | + )); |
| 91 | + for (cpg in missing.cpgs) { |
| 92 | + methy.data[,cpg] <- NA; |
| 93 | + }; |
| 94 | + } |
| 95 | + stopifnot(all(subtype.cpgs %in% colnames(methy.data))); |
| 96 | + methy.data <- methy.data[,subtype.cpgs, drop = FALSE]; |
| 97 | + stopifnot(all(colnames(methy.data) == subtype.cpgs)); |
| 98 | + |
| 99 | + subtypes <- predict( |
| 100 | + object = subtype.model.rf, |
| 101 | + newdata = methy.data, |
| 102 | + na.action = 'na.impute' |
| 103 | + ); |
| 104 | + stopifnot(length(subtypes$class) == nrow(methy.data)); |
| 105 | + subtypes <- data.frame( |
| 106 | + subtype = subtypes$class |
| 107 | + ); |
| 108 | + rownames(subtypes) <- rownames(methy.data); |
| 109 | + } |
50 | 110 |
|
51 | | - return(subtypes); |
| 111 | + return(list( |
| 112 | + subtypes = subtypes, |
| 113 | + validation = check |
| 114 | + )); |
52 | 115 | } |
0 commit comments