uclahs-cds
diff --git a/‎DESCRIPTION‎
Lines changed: 4 additions & 3 deletions b/‎DESCRIPTION‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎NEWS.md‎
Lines changed: 14 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎R/estimate.subtypes.R‎
Lines changed: 99 additions & 36 deletions b/‎R/estimate.subtypes.R‎
Lines changed: 99 additions & 36 deletions
diff --git a/‎R/subtype.model.R‎ ‎R/subtype.model.pamr.R‎R/subtype.model.R renamed to R/subtype.model.pamr.R
Lines changed: 2 additions & 2 deletions b/‎R/subtype.model.R‎ ‎R/subtype.model.pamr.R‎R/subtype.model.R renamed to R/subtype.model.pamr.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/subtype.model.rf.R‎
Lines changed: 4 additions & 0 deletions b/‎R/subtype.model.rf.R‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/validate.subtype.model.cpgs.R‎
Lines changed: 13 additions & 6 deletions b/‎R/validate.subtype.model.cpgs.R‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎data/subtype.model.pamr.rda‎
558 KB b/‎data/subtype.model.pamr.rda‎
558 KB
diff --git a/‎data/subtype.model.rda‎
-557 KB b/‎data/subtype.model.rda‎
-557 KB
diff --git a/‎data/subtype.model.rf.rda‎
20.5 MB b/‎data/subtype.model.rf.rda‎
20.5 MB
diff --git a/‎inst/subtype.model.R‎
Lines changed: 15 additions & 4 deletions b/‎inst/subtype.model.R‎
Lines changed: 15 additions & 4 deletions
@@ -1,7 +1,7 @@
 Package: PrCaMethy
 Title: Prostate Cancer Methylation
-Version: 0.2.0
-Date: 2025-05-15
+Version: 1.0.0
+Date: 2025-05-22
 Maintainer: Jaron Arbet <[email protected]>
 Description: Resources for predicting clinical and molecular features using prostate cancer DNA methylation data.
 Authors@R: c(
@@ -15,7 +15,8 @@ Imports:
     pamr,
     progress,
     progressr,
-    randomForest
+    randomForest,
+    randomForestSRC
 Suggests:
     knitr,
     rmarkdown,
 
@@ -1,3 +1,17 @@
+## PrCaMethy 1.0.0 (2025-05-22)
+
+### New Features
+
+- Added a random forest model for assigning the 4 methylation subtypes (`subtype.model.rf`).  Unlike `subtype.model.pamr` which requires all 5,486 subtype-defining CpGs to be measured, `subtype.model.rf` can handle missing CpGs through imputation (although ideally you should have as many of the CpGs as possible).
+
+### Changed
+
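+- Renamed `subtype.model` to `subtype.model.pamr` since the package now has 2 models for assigning the methylation subtypes (`subtype.model.pamr` and `subtype.model.rf`).  `estimate.subtypes()` loads the model internally, so calls to it are unaffected by the rename; code that loads the model object directly (e.g. `data('subtype.model')`) must switch to `subtype.model.pamr`.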
+
+### Breaking changes
+
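+- `estimate.subtypes()` now returns a list with 2 elements: `subtypes` and `validation` where the latter checks the validity of the input methylation data.  Previously it only returned the subtypes data.frame.  This is a minor breaking change.  In addition, the `impute.using.all.cpgs` argument was renamed to `pamr.impute.using.all.cpgs`, and the new `subtype.model` argument (default `'RF'`) is now the second argument, so calls that used the old argument name or passed `prop.missing.cutoff` positionally must be updated.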
+
 ## PrCaMethy 0.2.0 (2025-05-15)
 
 ### New Features
 
@@ -1,52 +1,115 @@
 #' @title Predict methylation subtype
 #' @description Assign patients to four prostate cancer DNA methylation subtypes
 #' @inheritParams validate.subtype.model.cpgs
-#' @param impute.using.all.cpgs TRUE/FALSE indicating whether to impute missing values using all CpGs in `methy.data` or only the CpGs required by \link{subtype.model}.  When TRUE, imputation will be slower and use more memory, but should be more accurate.
+#' @param subtype.model Which subtype model to use ('PAMR' or 'RF' for random forest).  Although slower, we recommend 'RF' for its increased accuracy and intrinsic imputation for missing values.  Further, if some of the required CpGs are completely missing, then you must use 'RF'.
+#' @param pamr.impute.using.all.cpgs If using `subtype.model = 'PAMR'`, should imputation be done using all CpGs in `methy.data` (TRUE) or only the CpGs required by \link{subtype.model.pamr} (FALSE).  When TRUE, imputation will be slower and use more memory, but should be more accurate.
+#' @param seed integer seed used for imputation.
 #' @export
+#' @return
+#' * `subtypes`: data.frame with the estimated subtypes and sample IDs (rownames of `methy.data`)
+#' * `validation`: output from \link{validate.subtype.model.cpgs} to check if `methy.data` contains the required CpGs and whether any CpGs have high missingness.
 #' @examples
-#'data('subtype.model');
-#'
 #'### example CpG data
 #'data('example.data');
 #'
 #'subtypes <- estimate.subtypes(example.data);
-#'head(subtypes);
-estimate.subtypes <- function(methy.data, prop.missing.cutoff = 0.3, impute.using.all.cpgs = TRUE) {
+#'
+#'# estimated subtypes
+#'head(subtypes$subtypes);
+#'
+#'# validation results:
+#'# length(subtypes$validation$required.cpgs)
+#'# length(subtypes$validation$required.cpgs.with.high.missing)
+#'# length(subtypes$validation$missing.cpgs)
+estimate.subtypes <- function(
+    methy.data,
+    subtype.model = 'RF',
+    prop.missing.cutoff = 0.3,
+    pamr.impute.using.all.cpgs = TRUE,
+    seed = 123
+    ) {
+    # Validate arguments before doing anything else.  subtype.model and prop.missing.cutoff
+    # are used in scalar `if` conditions below, so require length-1 values and use `&&`.
+    stopifnot('subtype.model must be RF or PAMR' = is.character(subtype.model) && length(subtype.model) == 1 && subtype.model %in% c('PAMR', 'RF'));
+    stopifnot('prop.missing.cutoff must be between 0 and 1' = is.numeric(prop.missing.cutoff) && length(prop.missing.cutoff) == 1 && !is.na(prop.missing.cutoff) && prop.missing.cutoff >= 0 && prop.missing.cutoff <= 1);
+    stopifnot('PAMR cannot handle CpGs that are 100% missing; consider using RF if some of the required CpGs are completely missing' = !(prop.missing.cutoff == 1 && subtype.model == 'PAMR'));
+
+    # Seed only after the arguments are known to be valid.
+    # NOTE: set.seed() changes the caller's global RNG state.
+    set.seed(seed);
+
     check <- validate.subtype.model.cpgs(methy.data, prop.missing.cutoff);
-    if (!check$val.passed) {
-        print('Error: methy.data has CpGs with high missingness that are required for predicting subtypes.  See the returned results for more details.')
-        return(check);
+    # Failed validation ends the call early (subtypes = NULL) unless the user
+    # explicitly allowed any amount of missingness with prop.missing.cutoff = 1.
+    if (!check$val.passed && prop.missing.cutoff < 1) {
+        message('Error: methy.data has CpGs with high missingness that are required for predicting subtypes.  See the returned $validation results for more details.  If you insist on predicting the subtypes despite the high missingness (which will decrease the accuracy of subtype assignment), consider using subtype.model = \'RF\' with prop.missing.cutoff = 1.');
+        return(list(
+            subtypes = NULL,
+            validation = check
+            ));
         }
-    # impute missing values
-    if (sum(is.na(methy.data)) == 0) {
-        methy.data.imp <- methy.data;
-    } else {
-        print('Starting imputation...');
-        if (!impute.using.all.cpgs) {
-            methy.data <- methy.data[,check$required.cpgs, drop = FALSE];
+
+    ### PAMR
+    if (subtype.model == 'PAMR') {
+        # Impute missing values
+        if (sum(is.na(methy.data)) == 0) {
+            methy.data.imp <- methy.data;
+        } else {
+            message('Starting imputation...');
+            if (!pamr.impute.using.all.cpgs) {
+                methy.data <- methy.data[,check$required.cpgs, drop = FALSE];
+                }
+            base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data));
+            methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE);
+            message('Finished imputation.');
             }
-        base::invisible(utils::capture.output(methy.data.imp <- impute::impute.knn(t(methy.data))$data));
-        methy.data.imp <- data.frame(t(methy.data.imp), check.names = FALSE);
-        print('Finished imputation.');
+        data(subtype.model.pamr, envir = environment());
+        methy.data.imp.sub <- methy.data.imp[,check$required.cpgs];
+        methy.data.imp.sub <- t(methy.data.imp.sub);
+
+        subtypes <- pamr::pamr.predict(
+            fit = subtype.model.pamr,
+            newx = methy.data.imp.sub, # CpGs in rows, samples in columns
+            type = 'class',
+            threshold = 0
+            );
+        stopifnot(length(subtypes) == ncol(methy.data.imp.sub));
+        subtypes <- data.frame(
+            subtype = subtypes,
+            check.names = FALSE
+            );
+        rownames(subtypes) <- colnames(methy.data.imp.sub);
         }
+    ### RF
+    if (subtype.model == 'RF') {
+        methy.data <- data.frame(methy.data, check.names = FALSE);
+        data(subtype.model.rf, envir = environment());
+        subtype.cpgs <- colnames(subtype.model.rf$xvar);
+
+        # for cpgs in subtype.cpgs that are not present in methy.data column names, add them as a new column of NAs.  RF will impute them.
+        missing.cpgs <- setdiff(subtype.cpgs, colnames(methy.data));
 
-    # requireNamespace in order to get predict() S3 methods to work correctly
-    #requireNamespace('pamr', quietly = TRUE);
-    data(subtype.model, envir = environment());
-    methy.data.imp.sub <- methy.data.imp[,check$required.cpgs];
-    methy.data.imp.sub <- t(methy.data.imp.sub);
-    subtypes <- pamr::pamr.predict(
-        fit = subtype.model,
-        newx = methy.data.imp.sub, # CpGs in rows, samples in columns
-        type = 'class',
-        threshold = 0
-        );
-    stopifnot(length(subtypes) == ncol(methy.data.imp.sub));
-    subtypes <- data.frame(
-        subtype = subtypes,
-        check.names = FALSE
-        );
-    rownames(subtypes) <- colnames(methy.data.imp.sub);
+        if (length(missing.cpgs) > 0) {
+            message(sprintf(
+                'Warning: %d of %d required CpGs are missing from the data. See the $validation outcome for more details.  Although random forest imputes missing values, having many CpGs that are missing may decrease accuracy of subtype assignment.',
+                length(missing.cpgs),
+                length(subtype.cpgs)
+                ));
+            # add all absent CpGs in one assignment, as numeric NA columns
+            methy.data[, missing.cpgs] <- NA_real_;
+            }
+        stopifnot(all(subtype.cpgs %in% colnames(methy.data)));
+        # keep only the model's CpGs, in the model's column order
+        methy.data <- methy.data[,subtype.cpgs, drop = FALSE];
+        stopifnot(all(colnames(methy.data) == subtype.cpgs));
+
+        # call the rfsrc predict method explicitly (like pamr::pamr.predict above) so the
+        # prediction does not depend on randomForestSRC's S3 methods being registered
+        subtypes <- randomForestSRC::predict.rfsrc(
+            object = subtype.model.rf,
+            newdata = methy.data,
+            na.action = 'na.impute'
+            );
+        stopifnot(length(subtypes$class) == nrow(methy.data));
+        subtypes <- data.frame(
+            subtype = subtypes$class
+            );
+        rownames(subtypes) <- rownames(methy.data);
+        }
 
-    return(subtypes);
+    return(list(
+        subtypes = subtypes,
+        validation = check
+        ));
     }
@@ -1,4 +1,4 @@
-#' Model for predicting methylation subtype
+#' PAMR model for predicting methylation subtype
 #'
 #' [pamr](https://cran.r-project.org/web/packages/pamr/index.html) model used for assigning new patients to four prostate cancer DNA methylation subtypes.
-'subtype.model'
+'subtype.model.pamr'
@@ -0,0 +1,4 @@
+#' Random Forest model for predicting methylation subtype
+#'
+#' [randomForestSRC](https://cran.r-project.org/web/packages/randomForestSRC/index.html) model used for assigning new patients to four prostate cancer DNA methylation subtypes. Note [`subtype.model.pamr`] requires all 5,486 subtype-defining CpGs to be measured, whereas `subtype.model.rf` can handle missing CpGs through imputation (although ideally you should have as many of the CpGs as possible).
+'subtype.model.rf'
@@ -1,9 +1,9 @@
 #' Validate input data for estimate.subtypes()
 #'
-#' Check whether `methy.data` contains all CpGs required by \link{subtype.model} for assigning patients to four prostate cancer DNA methylation subtypes.
+#' Check whether `methy.data` contains all CpGs required by \link{subtype.model.pamr} or \link{subtype.model.rf} for assigning patients to four prostate cancer DNA methylation subtypes.
 #'
 #' @param methy.data A data.frame with patients as rows (rownames give patient ids) and column names give CpG ids.
-#' @param prop.missing.cutoff The maximum proportion of missing values allowed for each required CpG. KNN imputation is used to impute missing values.
+#' @param prop.missing.cutoff The maximum proportion of missing values allowed for each required CpG.
 #' @export
 #' @return
 #' * `val.passed` a logical indicating whether the data passed validation
@@ -29,8 +29,8 @@ validate.subtype.model.cpgs <- function(methy.data, prop.missing.cutoff = 0.3) {
     methy.data.nomiss <- na.omit(methy.data);
     stopifnot('All values of methy.data should be between 0 and 1' = all(methy.data.nomiss >= 0 & methy.data.nomiss <= 1));
 
-    data(subtype.model, envir = environment());
-    required.cpgs <- rownames(subtype.model$centroids);
+    data(subtype.model.pamr, envir = environment());
+    required.cpgs <- rownames(subtype.model.pamr$centroids);
     missing.cpgs <- setdiff(required.cpgs, colnames(methy.data));
     nonmissing.cpgs <- setdiff(required.cpgs, missing.cpgs);
     if (length(nonmissing.cpgs) > 0) {
@@ -47,10 +47,17 @@ validate.subtype.model.cpgs <- function(methy.data, prop.missing.cutoff = 0.3) {
                 }
             }
         );
+
+    # regardless of what the user specifies for prop.missing.cutoff, we should
+    # print a warning if some CpGs have high missing.
+    cpgs.high.miss.warn <- sum(required.cpgs.prop.missing > 0.5);
+    if (cpgs.high.miss.warn > 0) {
+        message('Warning: ', cpgs.high.miss.warn, ' out of ', length(required.cpgs) ,' required CpGs have > 50% missing values. Having many CpGs with high missing data may decrease accuracy of subtype assignment.');
+        }
+
     required.cpgs.with.high.missing <- lapply(required.cpgs.prop.missing, function(x) x[x > prop.missing.cutoff]);
     val.passed <- length(unlist(required.cpgs.with.high.missing)) == 0 & length(unlist(missing.cpgs)) == 0;
-    val.passed;
-    unlist(required.cpgs.with.high.missing)
+
     return(list(
         val.passed = val.passed,
         required.cpgs = required.cpgs,
 
@@ -1,8 +1,19 @@
+# Build script: saves the 2 fitted models (PAMR and random forest) that assign the 4 methylation subtypes to new samples as package data
+
 source('config.R') # see project PRAD-000101-MethySubtypes/PrCaMethy/config.R
-load(arg$path.subtype.model);
 
-subtype.model <- model.for.predicting.subtypes;
-subtype.model.required.cpgs <- rownames(subtype.model$centroids);
 
-usethis::use_data(subtype.model, overwrite = TRUE, compress = 'xz');
+### PAMR: load the fitted model and save it as `subtype.model.pamr` (presumably the loaded file defines `model.for.predicting.subtypes` -- TODO confirm)
+load(arg$path.subtype.model.pamr);
+
+subtype.model.pamr <- model.for.predicting.subtypes;
+required.cpgs <- rownames(subtype.model.pamr$centroids);
+
+usethis::use_data(subtype.model.pamr, overwrite = TRUE, compress = 'xz');
 usethis::use_package('pamr');
+
+### Random Forest: load the fitted model and save it as `subtype.model.rf` (presumably the loaded file defines `rf` -- TODO confirm)
+load(arg$path.subtype.model.rf);
+subtype.model.rf <- rf;
+usethis::use_data(subtype.model.rf, overwrite = TRUE, compress = 'xz');
+usethis::use_package('randomForestSRC');