uclahs-cds · alkaZeltser · Aug 7, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
diff --git a/R/plot-pgs.R b/R/plot-pgs.R
@@ -590,6 +590,14 @@ create.pgs.boxplot <- function(
                         boxplot.colors <- suppressWarnings(BoutrosLab.plotting.general::default.colours(length(levels(pgs.data[ , phenotype]))));
                         names(boxplot.colors) <- levels(pgs.data[ , phenotype]);
                     }
+
+                # handle stripplot vs boxplot coloring
+                if (add.stripplot) {
+                    box.colors <- 'transparent';
+                } else {
+                    box.colors <- boxplot.colors;
+                }
+
                 # plot boxplot
                 group.yaxis.formatting <- basic.yaxis.formatting;
                 pgs.boxplots.by.phenotype[[paste0(pgs.column,'_',phenotype)]] <- BoutrosLab.plotting.general::create.boxplot(
@@ -606,8 +614,8 @@ create.pgs.boxplot <- function(
                     xaxis.cex = xaxes.cex,
                     yat = group.yaxis.formatting$at,
                     yaxis.lab = group.yaxis.formatting$axis.lab,
-                    points.col = boxplot.colors[pgs.data[ , phenotype]] # color points by phenotype
-                    #col = boxplot.colors
+                    points.col = boxplot.colors[pgs.data[ , phenotype]], # color points by phenotype
+                    col = box.colors # color boxes by phenotype
                     );
                 }
             }

diff --git a/R/run-pgs-statistics.R b/R/run-pgs-statistics.R
@@ -243,7 +243,7 @@ run.pgs.regression <- function(pgs, phenotype.data) {
 #' for binary or continuous phenotypes. For continuous phenotypes, it converts them
 #' to binary based on a specified cutoff threshold. It calculates and returns AUC,
 #' Odds Ratios (OR), and p-values for each PGS. Corresponding ROC curves are plotted automatically.
-#' @param data A data frame containing the PGS, phenotype, and covariate columns.
+#' @param pgs.data A data frame containing the PGS, phenotype, and covariate columns.
 #' @param pgs.columns A character vector specifying the names of the PGS columns
 #'   in \code{data} to be analyzed. All specified columns must be numeric.
 #' @param phenotype.columns A character vector specifying the names of the phenotype columns in \code{data} to be analyzed. If binary phenotypes are specified, they must be factors with two levels (0 and 1).
@@ -320,7 +320,7 @@ run.pgs.regression <- function(pgs, phenotype.data) {
 #'
 #' @export
 analyze.pgs.binary.predictiveness <- function(
-    data,
+    pgs.data,
     pgs.columns,
     phenotype.columns,
     covariate.columns = NULL,
@@ -337,22 +337,22 @@ analyze.pgs.binary.predictiveness <- function(
     ) {
     ## Validate Inputs ##
 
-    if (!is.data.frame(data)) {
-        stop('`data` must be a data frame.');
+    if (!is.data.frame(pgs.data)) {
+        stop('`pgs.data` must be a data frame.');
         }
 
-    if (!all(pgs.columns %in% names(data))) {
+    if (!all(pgs.columns %in% names(pgs.data))) {
         stop('Not all specified `pgs.columns` found in the data frame.');
         }
-    if (!all(sapply(data[, pgs.columns, drop = FALSE], is.numeric))) {
+    if (!all(sapply(pgs.data[, pgs.columns, drop = FALSE], is.numeric))) {
         stop('All `pgs.columns` must be numeric.');
         }
 
-    if (!all(phenotype.columns %in% names(data))) {
+    if (!all(phenotype.columns %in% names(pgs.data))) {
         stop(paste0('Not all specified `phenotype.columns` found in the data frame.'));
         }
 
-    if (!is.null(covariate.columns) && !all(covariate.columns %in% names(data))) {
+    if (!is.null(covariate.columns) && !all(covariate.columns %in% names(pgs.data))) {
         stop('Not all specified `covariate.columns` found in the data frame.');
         }
 
@@ -371,14 +371,14 @@ analyze.pgs.binary.predictiveness <- function(
     # Validate phenotype types consistency
     for (pheno.col in phenotype.columns) {
         if (phenotype.type == 'binary') {
-            if (!is.factor(data[[pheno.col]]) && !all(unique(na.omit(data[[pheno.col]])) %in% c(0, 1))) {
+            if (!is.factor(pgs.data[[pheno.col]]) && !all(unique(na.omit(pgs.data[[pheno.col]])) %in% c(0, 1))) {
                 stop(paste0('Phenotype column \'', pheno.col, '\' is specified as binary but is not a factor or 0/1 numeric. Convert to factor.'));
                 }
-            if (is.factor(data[[pheno.col]]) && nlevels(data[[pheno.col]]) != 2) {
+            if (is.factor(pgs.data[[pheno.col]]) && nlevels(pgs.data[[pheno.col]]) != 2) {
                 stop(paste0('Binary phenotype column \'', pheno.col, '\' must have exactly two levels.'));
                 }
         } else { # phenotype.type == 'continuous'
-            if (!is.numeric(data[[pheno.col]])) {
+            if (!is.numeric(pgs.data[[pheno.col]])) {
                 stop(paste0('Phenotype column \'', pheno.col, '\' is specified as continuous but is not numeric.'));
                 }
             if (is.null(cutoff.threshold)) {
@@ -415,7 +415,7 @@ analyze.pgs.binary.predictiveness <- function(
     for (current.phenotype in phenotype.columns) {
         # message(paste0('Processing phenotype: ', current.phenotype));
 
-        temp.data <- data; # Work on a fresh copy for each phenotype
+        temp.data <- pgs.data; # Work on a fresh copy for each phenotype
 
         pheno.var <- temp.data[[current.phenotype]];
 

diff --git a/README.md b/README.md
@@ -90,7 +90,7 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol
 
 4. Create summary plots.
 
-    ApplyPolygenicScore comes with several plotting functions designed to operate on the results of `apply.polygenic.score`. Display PGS density curves with `create.pgs.density.plot` and PGS percentile ranks with `create.pgs.rank.plot`. If you provided phenotype data in step 3, you can incorporate categorical data into the density plots and categorical and continuous phenotype data into the rank plots, and use `create.pgs.with.continuous.phenotype.plot` to make scatterplots of your PGS against any continuous phenotype data.
+    ApplyPolygenicScore comes with several plotting functions designed to operate on the results of `apply.polygenic.score`. Display PGS density curves with `create.pgs.density.plot`, distributions with `create.pgs.boxplot` and PGS percentile ranks with `create.pgs.rank.plot`. If you provided phenotype data in step 3, you can incorporate categorical data into the density plots and categorical and continuous phenotype data into the rank plots, and use `create.pgs.with.continuous.phenotype.plot` to make scatterplots of your PGS against any continuous phenotype data. For more sophisticated downstream analysis, check how well the PGS predicts binary outcomes using `analyze.pgs.binary.predictiveness`.
 
 For more step-by-step instructions, check out our [vignettes](https://CRAN.R-project.org/package=ApplyPolygenicScore).
 

diff --git a/man/analyze.pgs.binary.predictiveness.Rd b/man/analyze.pgs.binary.predictiveness.Rd
diff --git a/tests/testthat/test-pgs-statistics.R b/tests/testthat/test-pgs-statistics.R
@@ -267,18 +267,18 @@ test_that(
         # Test 1: `data` must be a data frame
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = list(),
+                pgs.data = list(),
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary'
             ),
-            '`data` must be a data frame.'
+            '`pgs.data` must be a data frame.'
         );
 
         # Test 2: Not all specified `pgs.columns` found in the data frame
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = c('PGS.A', 'NonExistentPGS'),
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary'
@@ -289,7 +289,7 @@ test_that(
         # Test 3: All `pgs.columns` must be numeric
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'NonNumericPGS',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary'
@@ -300,7 +300,7 @@ test_that(
         # Test 4: Not all specified `phenotype.columns` found in the data frame
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = c('Pheno.Binary.01', 'NonExistentPheno'),
                 phenotype.type = 'binary'
@@ -311,7 +311,7 @@ test_that(
         # Test 5: `covariate.columns` not found
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 covariate.columns = 'NonExistentCovariate',
@@ -323,7 +323,7 @@ test_that(
         # Test 6: `phenotype.type` is invalid (using match.arg's error)
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'invalid_type'
@@ -339,7 +339,7 @@ test_that(
         # Test 7a: Binary phenotype column is a factor but not 2 levels
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.3Levels',
                 phenotype.type = 'binary'
@@ -351,7 +351,7 @@ test_that(
         # Pheno.Binary.Char.YesNo is character, not 0/1 numeric
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.Char.YesNo',
                 phenotype.type = 'binary'
@@ -362,7 +362,7 @@ test_that(
         # Test 7c: Numeric binary phenotype is automatically converted to factor.
         expect_warning( # Expect no error or warning for valid conversion
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.Numeric.01',
                 phenotype.type = 'binary'
@@ -373,7 +373,7 @@ test_that(
         # Test 7d: Binary phenotype that converts successfully (no error expected, but good for coverage)
         expect_silent( # Expect no error or warning for valid conversion
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary'
@@ -390,7 +390,7 @@ test_that(
         test.data.bad.continuous$Pheno.Continuous.Num <- as.character(test.data.bad.continuous$Pheno.Continuous.Num);
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data.bad.continuous,
+                pgs.data = test.data.bad.continuous,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Continuous.Num',
                 phenotype.type = 'continuous',
@@ -402,7 +402,7 @@ test_that(
         # Test 9: `cutoff.threshold` missing for continuous phenotype
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Continuous.Num',
                 phenotype.type = 'continuous',
@@ -414,7 +414,7 @@ test_that(
         # Test 10: `cutoff.threshold` is a list but missing entry for specific phenotype
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Continuous.Num',
                 phenotype.type = 'continuous',
@@ -426,7 +426,7 @@ test_that(
         # Test 11: `cutoff.threshold` is not numeric or a named list
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Continuous.Num',
                 phenotype.type = 'continuous',
@@ -438,7 +438,7 @@ test_that(
         # Test 12: `cutoff.threshold` is a list but value for specific phenotype is NULL/not numeric
         expect_error(
             analyze.pgs.binary.predictiveness(
-                data = test.data,
+                pgs.data = test.data,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Continuous.Num',
                 phenotype.type = 'continuous',
@@ -465,7 +465,7 @@ test_that(
             }
             expect_error(
                 analyze.pgs.binary.predictiveness(
-                    data = many.pgs.data,
+                    pgs.data = many.pgs.data,
                     pgs.columns = paste0('PGS', 1:13),
                     phenotype.columns = 'Pheno.Binary.01',
                     phenotype.type = 'binary',
@@ -487,7 +487,7 @@ test_that(
         );
         expect_warning(
             results <- analyze.pgs.binary.predictiveness(
-                data = test.data.no.complete.cases,
+                pgs.data = test.data.no.complete.cases,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary',
@@ -506,7 +506,7 @@ test_that(
         );
         expect_warning(
             results <- analyze.pgs.binary.predictiveness(
-                data = test.data.bad.glm,
+                pgs.data = test.data.bad.glm,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary',
@@ -526,7 +526,7 @@ test_that(
             );
         expect_warning(
             results <- analyze.pgs.binary.predictiveness(
-                data = test.data.bad.roc,
+                pgs.data = test.data.bad.roc,
                 pgs.columns = 'PGS.A',
                 phenotype.columns = 'Pheno.Binary.01',
                 phenotype.type = 'binary',
@@ -550,7 +550,7 @@ test_that(
 
         # Scenario 1: Binary phenotypes, with covariates, return plot object
         results.binary <- analyze.pgs.binary.predictiveness(
-            data = test.data, # Using the global test.data
+            pgs.data = test.data, # Using the global test.data
             pgs.columns = c('PGS.A', 'PGS.B'),
             phenotype.columns = c('Pheno.Binary.01', 'Pheno.Binary.TF'),
             covariate.columns = c('Cov.Age', 'Cov.Sex'),
@@ -596,7 +596,7 @@ test_that(
         # Scenario 2: Continuous phenotype, no covariates, save to file (roc.plot should be NULL in return)
         # Use existing test.data
         results.continuous <- analyze.pgs.binary.predictiveness(
-            data = test.data, # Using the global test.data
+            pgs.data = test.data, # Using the global test.data
             pgs.columns = 'PGS.A',
             phenotype.columns = 'Pheno.Continuous.Num',
             covariate.columns = NULL,