Skip to content

Commit b52c0e2

Browse files
authored
Merge pull request #31 from uclahs-cds/jarbet-add-RF-subtype-model
Add Random Forest model for assigning methylation subtype
2 parents 7d3c1b7 + 953fa77 commit b52c0e2

19 files changed

+304
-81
lines changed

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: PrCaMethy
22
Title: Prostate Cancer Methylation
3-
Version: 0.2.0
4-
Date: 2025-05-15
3+
Version: 1.0.0
4+
Date: 2025-05-22
55
Maintainer: Jaron Arbet <[email protected]>
66
Description: Resources for predicting clinical and molecular features using prostate cancer DNA methylation data.
77
Authors@R: c(
@@ -15,7 +15,8 @@ Imports:
1515
pamr,
1616
progress,
1717
progressr,
18-
randomForest
18+
randomForest,
19+
randomForestSRC
1920
Suggests:
2021
knitr,
2122
rmarkdown,

NEWS.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
## PrCaMethy 1.0.0 (2025-05-22)
2+
3+
### New Features
4+
5+
- Added a random forest model for assigning the 4 methylation subtypes (`subtype.model.rf`). Unlike `subtype.model.pamr` which requires all 5,486 subtype-defining CpGs to be measured, `subtype.model.rf` can handle missing CpGs through imputation (although ideally you should have as many of the CpGs as possible).
6+
7+
### Changed
8+
9+
- Renamed `subtype.model` to `subtype.model.pamr` since now the package has 2 models for assigning the methylation subtypes (`subtype.model.pamr` and `subtype.model.rf`). Calls to `estimate.subtypes()` are unaffected by the rename, but code that loads the dataset directly (e.g. `data('subtype.model')`) must switch to the new name.
10+
11+
### Breaking changes
12+
13+
- `estimate.subtypes()` now returns a list with 2 elements: `subtypes` and `validation` where the latter checks the validity of the input methylation data. Previously it only returned the subtypes data.frame. This is a minor breaking change.
14+
115
## PrCaMethy 0.2.0 (2025-05-15)
216

317
### New Features

R/estimate.subtypes.R

Lines changed: 99 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,115 @@
11
#' @title Predict methylation subtype
22
#' @description Assign patients to four prostate cancer DNA methylation subtypes
33
#' @inheritParams validate.subtype.model.cpgs
4-
#' @param impute.using.all.cpgs TRUE/FALSE indicating whether to impute missing values using all CpGs in `methy.data` or only the CpGs required by \link{subtype.model}. When TRUE, imputation will be slower and use more memory, but should be more accurate.
4+
#' @param subtype.model Which subtype model to use ('PAMR' or 'RF' for random forest). Although slower, we recommend 'RF' for its increased accuracy and intrinsic imputation for missing values. Further, if some of the required CpGs are completely missing, then you must use 'RF'.
5+
#' @param pamr.impute.using.all.cpgs If using `subtype.model = 'PAMR'`, should imputation be done using all CpGs in `methy.data` (TRUE) or only the CpGs required by \link{subtype.model.pamr} (FALSE). When TRUE, imputation will be slower and use more memory, but should be more accurate.
6+
#' @param seed integer seed used for imputation.
57
#' @export
8+
#' @return A list with two elements:
9+
#' * `subtypes`: data.frame with the estimated subtypes and sample IDs (rownames of `methy.data`)
10+
#' * `validation`: output from \link{validate.subtype.model.cpgs} to check if `methy.data` contains the required CpGs and whether any CpGs have high missingness.
611
#' @examples
7-
#'data('subtype.model');
8-
#'
912
#'### example CpG data
1013
#'data('example.data');
1114
#'
1215
#'subtypes <- estimate.subtypes(example.data);
13-
#'head(subtypes);
14-
estimate.subtypes <- function(methy.data, prop.missing.cutoff = 0.3, impute.using.all.cpgs = TRUE) {
16+
#'
17+
#'# estimated subtypes
18+
#'head(subtypes$subtypes);
19+
#'
20+
#'# validation results:
21+
#'# length(subtypes$validation$required.cpgs)
22+
#'# length(subtypes$validation$required.cpgs.with.high.missing)
23+
#'# length(subtypes$validation$missing.cpgs)
24+
estimate.subtypes <- function(
25+
methy.data,
26+
subtype.model = 'RF',
27+
prop.missing.cutoff = 0.3,
28+
pamr.impute.using.all.cpgs = TRUE,
29+
seed = 123
30+
) {
31+
set.seed(seed);
32+
stopifnot('subtype.model must be RF or PAMR' = subtype.model %in% c('PAMR', 'RF'));
33+
stopifnot('prop.missing.cutoff must be between 0 and 1' = prop.missing.cutoff >= 0 & prop.missing.cutoff <= 1);
34+
stopifnot('PAMR cannot handle CpGs that are 100% missing; consider using RF if some of the required CpGs are completely missing' = !(prop.missing.cutoff == 1 & subtype.model == 'PAMR'));
35+
1536
check <- validate.subtype.model.cpgs(methy.data, prop.missing.cutoff);
16-
if (!check$val.passed) {
17-
print('Error: methy.data has CpGs with high missingness that are required for predicting subtypes. See the returned results for more details.')
18-
return(check);
37+
if (!check$val.passed & prop.missing.cutoff < 1) {
38+
message('Error: methy.data has CpGs with high missingness that are required for predicting subtypes. See the returned $validation results for more details. If you insist on predicting the subtypes despite the high missingness (which will decrease the accuracy of subtype assignment), consider using subtype.model = \'RF\' with prop.missing.cutoff = 1.');
39+
return(list(
40+
subtypes = NULL,
41+
validation = check
42+
));
1943
}
20-
# impute missing values
21-
if (sum(is.na(methy.data)) == 0) {
22-
methy.data.imp <- methy.data;
23-
} else {
24-
print('Starting imputation...');
25-
if (!impute.using.all.cpgs) {
26-
methy.data <- methy.data[,check$required.cpgs, drop = FALSE];
44+
45+
### PAMR
46+
if (subtype.model == 'PAMR') {
47+
# Impute missing values
48+
if (sum(is.na(methy.data)) == 0) {
49+
methy.data.imp <- methy.data;
50+
} else {
51+
message('Starting imputation...');
52+
if (!pamr.impute.using.all.cpgs) {
53+
methy.data <- methy.data[,check$required.cpgs, drop = FALSE];
54+
}
55+
base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data));
56+
methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE);
57+
message('Finished imputation.');
2758
}
28-
base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data));
29-
methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE);
30-
print('Finished imputation.');
59+
data(subtype.model.pamr, envir = environment());
60+
methy.data.imp.sub <- methy.data.imp[,check$required.cpgs];
61+
methy.data.imp.sub <- t(methy.data.imp.sub);
62+
63+
subtypes <- pamr::pamr.predict(
64+
fit = subtype.model.pamr,
65+
newx = methy.data.imp.sub, # CpGs in rows, samples in columns
66+
type = 'class',
67+
threshold = 0
68+
);
69+
stopifnot(length(subtypes) == ncol(methy.data.imp.sub));
70+
subtypes <- data.frame(
71+
subtype = subtypes,
72+
check.names = FALSE
73+
);
74+
rownames(subtypes) <- colnames(methy.data.imp.sub);
3175
}
76+
### RF
77+
if (subtype.model == 'RF') {
78+
methy.data <- data.frame(methy.data, check.names = FALSE);
79+
data(subtype.model.rf, envir = environment());
80+
subtype.cpgs <- colnames(subtype.model.rf$xvar);
81+
82+
# for cpgs in subtype.cpgs that are not present in methy.data column names, add them as a new column of NAs. RF will impute them.
83+
missing.cpgs <- setdiff(subtype.cpgs, colnames(methy.data));
3284

33-
# requireNamespace in order to get predict() S3 methods to work correctly
34-
#requireNamespace('pamr', quietly = TRUE);
35-
data(subtype.model, envir = environment());
36-
methy.data.imp.sub <- methy.data.imp[,check$required.cpgs];
37-
methy.data.imp.sub <- t(methy.data.imp.sub);
38-
subtypes <- pamr::pamr.predict(
39-
fit = subtype.model,
40-
newx = methy.data.imp.sub, # CpGs in rows, samples in columns
41-
type = 'class',
42-
threshold = 0
43-
);
44-
stopifnot(length(subtypes) == ncol(methy.data.imp.sub));
45-
subtypes <- data.frame(
46-
subtype = subtypes,
47-
check.names = FALSE
48-
);
49-
rownames(subtypes) <- colnames(methy.data.imp.sub);
85+
if (length(missing.cpgs) > 0) {
86+
message(sprintf(
87+
'Warning: %d of %d required CpGs are missing from the data. See the $validation outcome for more details. Although random forest imputes missing values, having many CpGs that are missing may decrease accuracy of subtype assignment.',
88+
length(missing.cpgs),
89+
length(subtype.cpgs)
90+
));
91+
for (cpg in missing.cpgs) {
92+
methy.data[,cpg] <- NA;
93+
};
94+
}
95+
stopifnot(all(subtype.cpgs %in% colnames(methy.data)));
96+
methy.data <- methy.data[,subtype.cpgs, drop = FALSE];
97+
stopifnot(all(colnames(methy.data) == subtype.cpgs));
98+
99+
subtypes <- predict(
100+
object = subtype.model.rf,
101+
newdata = methy.data,
102+
na.action = 'na.impute'
103+
);
104+
stopifnot(length(subtypes$class) == nrow(methy.data));
105+
subtypes <- data.frame(
106+
subtype = subtypes$class
107+
);
108+
rownames(subtypes) <- rownames(methy.data);
109+
}
50110

51-
return(subtypes);
111+
return(list(
112+
subtypes = subtypes,
113+
validation = check
114+
));
52115
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' Model for predicting methylation subtype
1+
#' PAMR model for predicting methylation subtype
22
#'
33
#' [pamr](https://cran.r-project.org/web/packages/pamr/index.html) model used for assigning new patients to four prostate cancer DNA methylation subtypes.
4-
'subtype.model'
4+
'subtype.model.pamr'

R/subtype.model.rf.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#' Random Forest model for predicting methylation subtype
2+
#'
3+
#' [randomForestSRC](https://cran.r-project.org/web/packages/randomForestSRC/index.html) model used for assigning new patients to four prostate cancer DNA methylation subtypes. Note [`subtype.model.pamr`] requires all 5,486 subtype-defining CpGs to be measured, whereas `subtype.model.rf` can handle missing CpGs through imputation (although ideally you should have as many of the CpGs as possible).
4+
'subtype.model.rf'

R/validate.subtype.model.cpgs.R

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#' Validate input data for estimate.subtypes()
22
#'
3-
#' Check whether `methy.data` contains all CpGs required by \link{subtype.model} for assigning patients to four prostate cancer DNA methylation subtypes.
3+
#' Check whether `methy.data` contains all CpGs required by \link{subtype.model.pamr} or \link{subtype.model.rf} for assigning patients to four prostate cancer DNA methylation subtypes.
44
#'
55
#' @param methy.data A data.frame with patients as rows (rownames give patient ids) and column names give CpG ids.
6-
#' @param prop.missing.cutoff The maximum proportion of missing values allowed for each required CpG. KNN imputation is used to impute missing values.
6+
#' @param prop.missing.cutoff The maximum proportion of missing values allowed for each required CpG.
77
#' @export
88
#' @return
99
#' * `val.passed` a logical indicating whether the data passed validation
@@ -29,8 +29,8 @@ validate.subtype.model.cpgs <- function(methy.data, prop.missing.cutoff = 0.3) {
2929
methy.data.nomiss <- na.omit(methy.data);
3030
stopifnot('All values of methy.data should be between 0 and 1' = all(methy.data.nomiss >= 0 & methy.data.nomiss <= 1));
3131

32-
data(subtype.model, envir = environment());
33-
required.cpgs <- rownames(subtype.model$centroids);
32+
data(subtype.model.pamr, envir = environment());
33+
required.cpgs <- rownames(subtype.model.pamr$centroids);
3434
missing.cpgs <- setdiff(required.cpgs, colnames(methy.data));
3535
nonmissing.cpgs <- setdiff(required.cpgs, missing.cpgs);
3636
if (length(nonmissing.cpgs) > 0) {
@@ -47,10 +47,17 @@ validate.subtype.model.cpgs <- function(methy.data, prop.missing.cutoff = 0.3) {
4747
}
4848
}
4949
);
50+
51+
# regardless of what the user specifies for prop.missing.cutoff, we should
52+
# print a warning if some CpGs have high missingness.
53+
cpgs.high.miss.warn <- sum(required.cpgs.prop.missing > 0.5);
54+
if (cpgs.high.miss.warn > 0) {
55+
message('Warning: ', cpgs.high.miss.warn, ' out of ', length(required.cpgs) ,' required CpGs have > 50% missing values. Having many CpGs with high missing data may decrease accuracy of subtype assignment.');
56+
}
57+
5058
required.cpgs.with.high.missing <- lapply(required.cpgs.prop.missing, function(x) x[x > prop.missing.cutoff]);
5159
val.passed <- length(unlist(required.cpgs.with.high.missing)) == 0 & length(unlist(missing.cpgs)) == 0;
52-
val.passed;
53-
unlist(required.cpgs.with.high.missing)
60+
5461
return(list(
5562
val.passed = val.passed,
5663
required.cpgs = required.cpgs,

data/subtype.model.pamr.rda

558 KB
Binary file not shown.

data/subtype.model.rda

-557 KB
Binary file not shown.

data/subtype.model.rf.rda

20.5 MB
Binary file not shown.

inst/subtype.model.R

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,19 @@
1+
# there are 2 models for assigning the 4 methylation subtypes to new samples: PAMR and random forest
2+
13
source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R
2-
load(arg$path.subtype.model);
34

4-
subtype.model <- model.for.predicting.subtypes;
5-
subtype.model.required.cpgs <- rownames(subtype.model$centroids);
65

7-
usethis::use_data(subtype.model, overwrite = TRUE, compress = 'xz');
6+
### PAMR
7+
load(arg$path.subtype.model.pamr);
8+
9+
subtype.model.pamr <- model.for.predicting.subtypes;
10+
required.cpgs <- rownames(subtype.model.pamr$centroids);
11+
12+
usethis::use_data(subtype.model.pamr, overwrite = TRUE, compress = 'xz');
813
usethis::use_package('pamr');
14+
15+
### Random Forest
16+
load(arg$path.subtype.model.rf);
17+
subtype.model.rf <- rf;
18+
usethis::use_data(subtype.model.rf, overwrite = TRUE, compress = 'xz');
19+
usethis::use_package('randomForestSRC');

0 commit comments

Comments
 (0)