diff --git a/R/plot-pgs.R b/R/plot-pgs.R index 362296a..951272f 100644 --- a/R/plot-pgs.R +++ b/R/plot-pgs.R @@ -590,6 +590,14 @@ create.pgs.boxplot <- function( boxplot.colors <- suppressWarnings(BoutrosLab.plotting.general::default.colours(length(levels(pgs.data[ , phenotype])))); names(boxplot.colors) <- levels(pgs.data[ , phenotype]); } + + # handle stripplot vs boxplot coloring + if (add.stripplot) { + box.colors <- 'transparent'; + } else { + box.colors <- boxplot.colors; + } + # plot boxplot group.yaxis.formatting <- basic.yaxis.formatting; pgs.boxplots.by.phenotype[[paste0(pgs.column,'_',phenotype)]] <- BoutrosLab.plotting.general::create.boxplot( @@ -606,8 +614,8 @@ create.pgs.boxplot <- function( xaxis.cex = xaxes.cex, yat = group.yaxis.formatting$at, yaxis.lab = group.yaxis.formatting$axis.lab, - points.col = boxplot.colors[pgs.data[ , phenotype]] # color points by phenotype - #col = boxplot.colors + points.col = boxplot.colors[pgs.data[ , phenotype]], # color points by phenotype + col = box.colors # color boxes by phenotype ); } } diff --git a/R/run-pgs-statistics.R b/R/run-pgs-statistics.R index 5134ec2..dbd8dfb 100644 --- a/R/run-pgs-statistics.R +++ b/R/run-pgs-statistics.R @@ -243,9 +243,9 @@ run.pgs.regression <- function(pgs, phenotype.data) { #' for binary or continuous phenotypes. For continuous phenotypes, it converts them #' to binary based on a specified cutoff threshold. It calculates and returns AUC, #' Odds Ratios (OR), and p-values for each PGS. Corresponding ROC curves are plotted automatically. -#' @param data A data frame containing the PGS, phenotype, and covariate columns. +#' @param pgs.data A data frame containing the PGS, phenotype, and covariate columns. #' @param pgs.columns A character vector specifying the names of the PGS columns -#' in \code{data} to be analyzed. All specified columns must be numeric. +#' in \code{pgs.data} to be analyzed. All specified columns must be numeric. #' @param phenotype.columns A character vector specifying the names of the phenotype columns in \code{data} to be analyzed. If binary phenotypes are specified, they must be factors with two levels (0 and 1). #' @param covariate.columns A character vector specifying the names of covariate columns in \code{data} to be included in the regression model. Default is NULL. #' @param phenotype.type A character string specifying the type of phenotype. Must be either 'continuous' or 'binary'. All provided phenotype columns must match this type. 
@@ -320,7 +320,7 @@ run.pgs.regression <- function(pgs, phenotype.data) { #' #' @export analyze.pgs.binary.predictiveness <- function( - data, + pgs.data, pgs.columns, phenotype.columns, covariate.columns = NULL, @@ -337,22 +337,22 @@ analyze.pgs.binary.predictiveness <- function( ) { ## Validate Inputs ## - if (!is.data.frame(data)) { - stop('`data` must be a data frame.'); + if (!is.data.frame(pgs.data)) { + stop('`pgs.data` must be a data frame.'); } - if (!all(pgs.columns %in% names(data))) { + if (!all(pgs.columns %in% names(pgs.data))) { stop('Not all specified `pgs.columns` found in the data frame.'); } - if (!all(sapply(data[, pgs.columns, drop = FALSE], is.numeric))) { + if (!all(sapply(pgs.data[, pgs.columns, drop = FALSE], is.numeric))) { stop('All `pgs.columns` must be numeric.'); } - if (!all(phenotype.columns %in% names(data))) { + if (!all(phenotype.columns %in% names(pgs.data))) { stop(paste0('Not all specified `phenotype.columns` found in the data frame.')); } - if (!is.null(covariate.columns) && !all(covariate.columns %in% names(data))) { + if (!is.null(covariate.columns) && !all(covariate.columns %in% names(pgs.data))) { stop('Not all specified `covariate.columns` found in the data frame.'); } @@ -371,14 +371,14 @@ analyze.pgs.binary.predictiveness <- function( # Validate phenotype types consistency for (pheno.col in phenotype.columns) { if (phenotype.type == 'binary') { - if (!is.factor(data[[pheno.col]]) && !all(unique(na.omit(data[[pheno.col]])) %in% c(0, 1))) { + if (!is.factor(pgs.data[[pheno.col]]) && !all(unique(na.omit(pgs.data[[pheno.col]])) %in% c(0, 1))) { stop(paste0('Phenotype column \'', pheno.col, '\' is specified as binary but is not a factor or 0/1 numeric. Convert to factor.')); } - if (is.factor(data[[pheno.col]]) && nlevels(data[[pheno.col]]) != 2) { + if (is.factor(pgs.data[[pheno.col]]) && nlevels(pgs.data[[pheno.col]]) != 2) { stop(paste0('Binary phenotype column \'', pheno.col, '\' must have exactly two levels.')); } } else { # phenotype.type == 'continuous' - if (!is.numeric(data[[pheno.col]])) { + if (!is.numeric(pgs.data[[pheno.col]])) { stop(paste0('Phenotype column \'', pheno.col, '\' is specified as continuous but is not numeric.')); } if (is.null(cutoff.threshold)) { @@ -415,7 +415,7 @@ analyze.pgs.binary.predictiveness <- function( for (current.phenotype in phenotype.columns) { # message(paste0('Processing phenotype: ', current.phenotype)); - temp.data <- data; # Work on a fresh copy for each phenotype + temp.data <- pgs.data; # Work on a fresh copy for each phenotype pheno.var <- temp.data[[current.phenotype]]; diff --git a/README.md b/README.md index 6115c4e..c29052d 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ If you wish to apply a PGS to a cohort, we recommend that genotypes for the whol 4. Create summary plots. - ApplyPolygenicScore comes with several plotting functions designed to operate on the results of `apply.polygenic.score`. Display PGS density curves with `create.pgs.density.plot` and PGS percentile ranks with `create.pgs.rank.plot`. If you provided phenotype data in step 3, you can incorporate categorical data into the density plots and categorical and continuous phenotype data into the rank plots, and use `create.pgs.with.continuous.phenotype.plot` to make scatterplots of your PGS against any continuous phenotype data. + ApplyPolygenicScore comes with several plotting functions designed to operate on the results of `apply.polygenic.score`. 
Display PGS density curves with `create.pgs.density.plot`, distributions with `create.pgs.boxplot` and PGS percentile ranks with `create.pgs.rank.plot`. If you provided phenotype data in step 3, you can incorporate categorical data into the density plots and categorical and continuous phenotype data into the rank plots, and use `create.pgs.with.continuous.phenotype.plot` to make scatterplots of your PGS against any continuous phenotype data. For more sophisticated downstream analysis, check how well the PGS predicts binary outcomes using `analyze.pgs.binary.predictiveness`. For more step-by-step instructions, check out our [vignettes](https://CRAN.R-project.org/package=ApplyPolygenicScore). diff --git a/man/analyze.pgs.binary.predictiveness.Rd b/man/analyze.pgs.binary.predictiveness.Rd index 692f348..2d962b8 100644 --- a/man/analyze.pgs.binary.predictiveness.Rd +++ b/man/analyze.pgs.binary.predictiveness.Rd @@ -5,7 +5,7 @@ \title{Analyze PGS Predictiveness for Binary Phenotypes} \usage{ analyze.pgs.binary.predictiveness( - data, + pgs.data, pgs.columns, phenotype.columns, covariate.columns = NULL, @@ -22,7 +22,7 @@ analyze.pgs.binary.predictiveness( ) } \arguments{ -\item{data}{A data frame containing the PGS, phenotype, and covariate columns.} +\item{pgs.data}{A data frame containing the PGS, phenotype, and covariate columns.} \item{pgs.columns}{A character vector specifying the names of the PGS columns in \code{data} to be analyzed. All specified columns must be numeric.} diff --git a/tests/testthat/test-pgs-statistics.R b/tests/testthat/test-pgs-statistics.R index 0cbf2f9..e74095b 100644 --- a/tests/testthat/test-pgs-statistics.R +++ b/tests/testthat/test-pgs-statistics.R @@ -267,18 +267,18 @@ test_that( # Test 1: `data` must be a data frame expect_error( analyze.pgs.binary.predictiveness( - data = list(), + pgs.data = list(), pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary' ), - '`data` must be a data frame.' + '`pgs.data` must be a data frame.' 
); # Test 2: Not all specified `pgs.columns` found in the data frame expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = c('PGS.A', 'NonExistentPGS'), phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary' @@ -289,7 +289,7 @@ test_that( # Test 3: All `pgs.columns` must be numeric expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'NonNumericPGS', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary' @@ -300,7 +300,7 @@ test_that( # Test 4: Not all specified `phenotype.columns` found in the data frame expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = c('Pheno.Binary.01', 'NonExistentPheno'), phenotype.type = 'binary' @@ -311,7 +311,7 @@ test_that( # Test 5: `covariate.columns` not found expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', covariate.columns = 'NonExistentCovariate', @@ -323,7 +323,7 @@ test_that( # Test 6: `phenotype.type` is invalid (using match.arg's error) expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'invalid_type' @@ -339,7 +339,7 @@ test_that( # Test 7a: Binary phenotype column is a factor but not 2 levels expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.3Levels', phenotype.type = 'binary' @@ -351,7 +351,7 @@ test_that( # Pheno.Binary.Char.YesNo is character, not 0/1 numeric expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.Char.YesNo', phenotype.type = 'binary' @@ -362,7 +362,7 @@ test_that( # Test 7c: Numeric binary phenotype is automatically converted to factor. 
expect_warning( # Expect no error or warning for valid conversion analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.Numeric.01', phenotype.type = 'binary' @@ -373,7 +373,7 @@ test_that( # Test 7d: Binary phenotype that converts successfully (no error expected, but good for coverage) expect_silent( # Expect no error or warning for valid conversion analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary' @@ -390,7 +390,7 @@ test_that( test.data.bad.continuous$Pheno.Continuous.Num <- as.character(test.data.bad.continuous$Pheno.Continuous.Num); expect_error( analyze.pgs.binary.predictiveness( - data = test.data.bad.continuous, + pgs.data = test.data.bad.continuous, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', phenotype.type = 'continuous', @@ -402,7 +402,7 @@ test_that( # Test 9: `cutoff.threshold` missing for continuous phenotype expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', phenotype.type = 'continuous', @@ -414,7 +414,7 @@ test_that( # Test 10: `cutoff.threshold` is a list but missing entry for specific phenotype expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', phenotype.type = 'continuous', @@ -426,7 +426,7 @@ test_that( # Test 11: `cutoff.threshold` is not numeric or a named list expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', phenotype.type = 'continuous', @@ -438,7 +438,7 @@ test_that( # Test 12: `cutoff.threshold` is a list but value for specific phenotype is NULL/not numeric expect_error( analyze.pgs.binary.predictiveness( - data = test.data, + pgs.data = test.data, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', phenotype.type = 'continuous', @@ -465,7 +465,7 @@ test_that( } expect_error( analyze.pgs.binary.predictiveness( - data = many.pgs.data, + pgs.data = many.pgs.data, pgs.columns = paste0('PGS', 1:13), phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary', @@ -487,7 +487,7 @@ test_that( ); expect_warning( results <- analyze.pgs.binary.predictiveness( - data = test.data.no.complete.cases, + pgs.data = test.data.no.complete.cases, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary', @@ -506,7 +506,7 @@ test_that( ); expect_warning( results <- analyze.pgs.binary.predictiveness( - data = test.data.bad.glm, + pgs.data = test.data.bad.glm, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary', @@ -526,7 +526,7 @@ test_that( ); expect_warning( results <- analyze.pgs.binary.predictiveness( - data = test.data.bad.roc, + pgs.data = test.data.bad.roc, pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Binary.01', phenotype.type = 'binary', @@ -550,7 +550,7 @@ test_that( # Scenario 1: Binary phenotypes, with covariates, return plot object results.binary <- analyze.pgs.binary.predictiveness( - data = test.data, # Using the global test.data + pgs.data = test.data, # Using the global test.data pgs.columns = c('PGS.A', 'PGS.B'), phenotype.columns = c('Pheno.Binary.01', 'Pheno.Binary.TF'), covariate.columns = c('Cov.Age', 'Cov.Sex'), @@ -596,7 +596,7 @@ 
test_that( # Scenario 2: Continuous phenotype, no covariates, save to file (roc.plot should be NULL in return) # Use existing test.data results.continuous <- analyze.pgs.binary.predictiveness( - data = test.data, # Using the global test.data + pgs.data = test.data, # Using the global test.data pgs.columns = 'PGS.A', phenotype.columns = 'Pheno.Continuous.Num', covariate.columns = NULL, diff --git a/vignettes/UserGuide.Rmd b/vignettes/UserGuide.Rmd index 4ad22f8..496d26e 100644 --- a/vignettes/UserGuide.Rmd +++ b/vignettes/UserGuide.Rmd @@ -9,30 +9,46 @@ vignette: > %\VignetteEncoding{UTF-8} --- +```{r options, include = FALSE} +knitr::opts_chunk$set(comment = NA, purl = FALSE, dpi = 96, fig.width = 0.5, fig.height = 0.7, dev = 'png') +``` + +```{r render-how-to, include = FALSE, eval = FALSE, echo = FALSE} +# Render vignette outside of build_vignettes() via rmarkdown: +# Render the HTML vignette +rmarkdown::render('vignettes/UserGuide.Rmd', output_format = 'rmarkdown::html_vignette') + +# Render the PDF vignette. Needs to have a functional LaTeX installation. +rmarkdown::render('vignettes/UserGuide.Rmd', output_format = 'rmarkdown::pdf_document') +``` + ```{r setup} library(ApplyPolygenicScore) ``` + # Introduction -ApplyPolygenicScore provides utilities for applying a polygenic score to genetic data. +ApplyPolygenicScore provides utilities for applying a polygenic score model to genetic data. This guide contains step-by-step instructions for how best to take advantage of the functions in this package to facilitate this process and transition smoothly into downstream analyses. # Input data -The application of a polygenic score to genetic data requires two basic inputs: +The application of a polygenic score model to genetic data requires two basic inputs: 1. A VCF file containing genotype data for a set of individuals. -2. A polygenic score weight file containing the SNP coordinates and weights for the polygenic score. +2. A polygenic score model weight file containing the SNP coordinates and weights for the polygenic score. Optionally, a third data type, phenotype data, can be provided to several ApplyPolygenicScore functions. VCF file requirements: -- VCF files must be in standard VCF format. -- All multiallelic sites must be merged into one line per variant. + +* VCF files must be in standard VCF format. +* All multiallelic sites must be merged into one line per variant. PGS weight file requirements: + - PGS weight files provided by the PGS Catalog should be imported with the native package function `import.pgs.weight.file` to ensure proper formatting - PGS weight files from other sources must be formatted according to PGS Catalog standards for compatibility with the import function. - PGS weight files from other sources that are imported independently must have the required columns: `CHROM`, `POS`, `effect_allele`, `beta` and optionally `ID`, `other_allele`, and `allelefrequency_effect`. @@ -69,6 +85,25 @@ vcf.data <- import.vcf( str(vcf.data) ``` + +#### Imported VCF data structures + +As we can see, `import.vcf` returns a list with two elements: `split.wide.vcf.matrices` and `combined.long.vcf.df`. +These two objects represent two ways of representing the imported data. + +The "wide" format is the most memory-efficient. +It splits the VCF data into two objects: a table of per-variant data with only essential columns (CHROM, POS, ID, REF, ALT) +and a matrix of genotypes where each row corresponds to a variant and each column corresponds to an individual. 
+This format allows for efficient storage and manipulation of genotype data, especially for large VCF files. +The key limitation of this format is that it does not allow the importation of information (other than genotype) + unique to a sample-variant pair, such as that stored in the `FORMAT` field of the VCF file. + + +The "long" format is optional and will only be returned if `long.format = TRUE` is specified. It allows for the importation of all +information in the VCF file, including the `FORMAT` field, however it requires a lot more memory to store in R. This is because a +row is created for each sample-variant pair, resulting in a data frame with many more rows than the wide format. +For most applications, the wide format is sufficient and preferred for its memory efficiency. + ### Importing polygenic score data Use the `import.pgs.weight.file` function to import PGS weight files formatted according to PGS Catalog specifications. @@ -104,7 +139,7 @@ and must contain a column named `Indiv` corresponding to the individual IDs in t ```{r import-phenotype} # Isolating the individual IDs from the VCF data -vcf.individuals <- unique(vcf.data$combined.long.vcf.df$dat$Indiv) +vcf.individuals <- colnames(vcf.data$split.wide.vcf.matrices$genotyped.alleles) # Simulating phenotype data set.seed(123) @@ -151,9 +186,12 @@ head(pgs.bed.format) ``` Coordinate files must match the coordinate style used in the VCF file you wish to filter. The `convert.pgs.to.bed` function provides options for -formatting chromosome names, as these tend to vary between human genome reference GRCh38 and GRCh37 aligned files. Use `chr.prefix = TRUE` to add 'chr' -to the chromosome name (GRCh38 style) and `chr.prefix = FALSE` to remove 'chr' from the chromosome name (GRCh37 style). Use `numeric.sex.chr = FALSE` to -format the X and Y chromosomes as 'X' and 'Y' respectively, and `numeric.sex.chr = TRUE` to format the X and Y chromosomes as '23' and '24' respectively. +formatting chromosome names, as these can vary between human genome reference files that originate from different databases. + +- Use `chr.prefix = TRUE` to add 'chr' to the chromosome name (UCSC style) +- Use `chr.prefix = FALSE` to remove 'chr' from the chromosome name (Ensembl style) +- Use `numeric.sex.chr = FALSE` to format the X and Y chromosomes as 'X' and 'Y' respectively +- Use `numeric.sex.chr = TRUE` to format the X and Y chromosomes as '23' and '24' respectively The `slop` option imitates `bedtools` nomenclature for adding base pairs to the start and end of a set of coordinates. `slop = 10` adds 10 base pairs to the start and end of each variant coordinate. @@ -214,6 +252,16 @@ configured to just run the validation step to check your inputs first. ```{r check-pgs-columns} +# By default, apply.polygenic.score expects the vcf data to be in wide format. +apply.polygenic.score( + vcf.data = vcf.data$split.wide.vcf.matrices, + vcf.long.format = FALSE, + pgs.weight.data = pgs.data$pgs.weight.data, + phenotype.data = phenotype.data, + validate.inputs.only = TRUE + ) + +# If you have imported a VCF file in long format, you can specify that with the vcf.long.format argument. apply.polygenic.score( vcf.data = vcf.data$combined.long.vcf.df$dat, vcf.long.format = TRUE, @@ -235,8 +283,7 @@ to handle missing genotypes. 
```{r apply-pgs} pgs.results <- apply.polygenic.score( - vcf.data = vcf.data$combined.long.vcf.df$dat, - vcf.long.format = TRUE, + vcf.data = vcf.data$split.wide.vcf.matrices, pgs.weight.data = pgs.data$pgs.weight.data, correct.strand.flips = FALSE, # no strand flip check to avoid warnings missing.genotype.method = 'none' @@ -250,12 +297,21 @@ The warning indicates that some variants from the PGS weight data were not found apply any weights to these variants, which effectively assumes their dosage as homozygous reference for all individuals. Checkout our discussion on [missing genotype data](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions/17) for more details on how missing variants come to be. + Next, we see the output of the function. The output is a list with two elements: `pgs.output`, and `regression.output`. `regression.output` is empty because we did not provide the optional `phenotype.data`, more on that later. `pgs.output` is a data frame with the individual IDs from the VCF and the calculated polygenic score in the `PGS` column for each individual. As we will see later, each missing genotype method creates a uniquely named column of PGS values. + + Next, the `percentile`, `decile`, and `quartile` columns report the respective percentile information for each individual's PGS among the distribution of the entire cohort in the VCF data. These values look a bit strange in this example because our [example VCF](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions/6) contains only two individuals which have identical genotypes. In a real-world scenario, these values would be more informative. + + The next two columns report the number and percentage of missing PGS genotypes for each individual. This number should be equal to or greater than the number reported in the warning message from `combine.vcf.with.pgs`. + +The final column reports the number of non-missing alleles for each individual. This value is equivalent to the number of non-missing variants multiplied by 2 (2 alleles per genotype in a diploid genome). +It is used to normalize the PGS values for missing genotypes when the `normalize` missing genotype method is used. + ### `combine.vcf.with.pgs` A quick aside about this function. While it is internal to `apply.polygenic.score`, it is also available for use on its own. The output includes a list of the variants that were not found in the VCF data, @@ -264,7 +320,7 @@ which may be useful for troubleshooting missing genotype data. ```{r merge-vcf-with-pgs} merged.vcf.pgs.data <- combine.vcf.with.pgs( - vcf.data = vcf.data$combined.long.vcf.df$dat, + vcf.data = vcf.data$split.wide.vcf.matrices$vcf.fixed, pgs.weight.data = pgs.data$pgs.weight.data ) @@ -279,6 +335,7 @@ By convention, the `other_allele` (non-effect allele) in the PGS weight data sho Checkout our discussion on [allele matching](https://github.com/uclahs-cds/package-ApplyPolygenicScore/discussions/68) for more details on the various ways in which this matching may fail. `apply.polygenic.score` can perform an allele match check (internally using `assess.pgs.vcf.allele.match`). This functionality can be customized with the following arguments: + - `correct.strand.flips` to automatically correct mismatches caused by strand flips by flipping the PGS weight data alleles. - `remove.ambiguous.allele.matches` to remove variants with mismatched alleles that cannot be corrected by flipping. 
The variant will be treated as a cohort-wide missing variant and handled according to missing variant rules described below. - `max.strand.flips` to specify the number of unambiguous strand flips that needs to be present for removal of ambiguous strand flips to occur (convenient for cases where you have reason to believe that all ambiguous strand flips are actually effect size flips) @@ -288,8 +345,7 @@ Checkout our discussion on [allele matching](https://github.com/uclahs-cds/packa # Most strict allele match handling strict.allele.match.result <- apply.polygenic.score( - vcf.data = vcf.data$combined.long.vcf.df$dat, - vcf.long.format = TRUE, + vcf.data = vcf.data$split.wide.vcf.matrices, pgs.weight.data = pgs.data$pgs.weight.data, missing.genotype.method = 'none', correct.strand.flips = TRUE, @@ -299,8 +355,7 @@ strict.allele.match.result <- apply.polygenic.score( # Less strict allele match handling less.strict.allele.match.result <- apply.polygenic.score( - vcf.data = vcf.data$combined.long.vcf.df$dat, - vcf.long.format = TRUE, + vcf.data = vcf.data$split.wide.vcf.matrices, pgs.weight.data = pgs.data$pgs.weight.data, missing.genotype.method = 'none', correct.strand.flips = TRUE, @@ -327,8 +382,7 @@ Briefly: ```{r missing-genotype-methods} all.missing.methods.pgs.results <- apply.polygenic.score( - vcf.data = vcf.data$combined.long.vcf.df$dat, - vcf.long.format = TRUE, + vcf.data = vcf.data$split.wide.vcf.matrices, pgs.weight.data = pgs.data$pgs.weight.data, correct.strand.flips = FALSE, # no strand flip check to avoid warnings missing.genotype.method = c('none', 'normalize', 'mean.dosage') @@ -363,8 +417,7 @@ Curious to see how your PGS values distribute among quintiles? No problem. Speci ```{r custom-percentiles} custom.percentiles.pgs.results <- apply.polygenic.score( - vcf.data = vcf.data$combined.long.vcf.df$dat, - vcf.long.format = TRUE, + vcf.data = vcf.data$split.wide.vcf.matrices, pgs.weight.data = pgs.data$pgs.weight.data, correct.strand.flips = FALSE, # no strand flip check to avoid warnings n.percentiles = 5 @@ -442,8 +495,8 @@ If you wish to use one of the other PGSs for analysis, the source PGS column for ApplyPolygenicScore provides four plotting functions designed to operate on the output of `apply.polygenic.score`: 1. `create.pgs.density.plot` -2. `create.pgs.with.continuous.phenotype.plot` -3. `create.pgs.boxplot` +2. `create.pgs.boxplot` +3. `create.pgs.with.continuous.phenotype.plot` 4. `create.pgs.rank.plot` These visualizations are intended to provide quick quality assessments of PGS in a cohort. 
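+
+Each missing genotype method adds its own uniquely named score column to `pgs.output`, so before plotting it can help to list the available PGS columns and decide which one to pass to the plotting functions. A quick sketch is shown below; the exact column names depend on the missing genotype methods you selected.
+
+```{r list-pgs-columns, eval = FALSE}
+# List the score columns produced by the chosen missing genotype methods.
+grep('PGS', colnames(all.missing.methods.pgs.results$pgs.output), value = TRUE);
+```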
@@ -489,6 +542,18 @@ phenotype.density.filename <- ApplyPolygenicScore:::generate.filename( extension = 'png' ); +basic.boxplot.filename <- ApplyPolygenicScore:::generate.filename( + project.stem = 'vignette-example-basic', + file.core = 'pgs-boxplot', + extension = 'png' + ); + +phenotype.boxplot.filename <- ApplyPolygenicScore:::generate.filename( + project.stem = 'vignette-example-phenotype', + file.core = 'pgs-boxplot', + extension = 'png' + ); + correlation.filename <- ApplyPolygenicScore:::generate.filename( project.stem = 'vignette-example-correlation', file.core = 'pgs-scatter', @@ -507,6 +572,18 @@ phenotype.rank.filename <- ApplyPolygenicScore:::generate.filename( extension = 'png' ); +roc.filename <- ApplyPolygenicScore:::generate.filename( + project.stem = 'vignette-example-basic', + file.core = 'pgs-roc-curves', + extension = 'png' + ); + +roc.continuous.filename <- ApplyPolygenicScore:::generate.filename( + project.stem = 'vignette-example-continuous', + file.core = 'pgs-roc-curves', + extension = 'png' + ); + ``` ```{r pgs-density, eval = TRUE} @@ -515,7 +592,9 @@ create.pgs.density.plot( pgs.data = pgs.results.with.phenotype.analysis$pgs.output, output.dir = temp.dir, filename.prefix = 'vignette-example-basic', - file.extension = 'png' + file.extension = 'png', + width = 5, + height = 5 ) ``` @@ -527,6 +606,7 @@ knitr::include_graphics(file.path(temp.dir, basic.density.filename)); ### Add phenotypes If provided with phenotype column names, this function will plot individual PGS density curves for each category in a categorical phenotype. + - Non-categorical phenotypes are ignored. - Colors are automatically assigned up to 12 categories. - If more than 12 categories are present, all categories are plotted in black. @@ -542,7 +622,10 @@ create.pgs.density.plot( filename.prefix = 'vignette-example-phenotype', file.extension = 'png', tidy.titles = TRUE, - phenotype.columns = c('binary.factor.phenotype', 'categorical.phenotype', 'continuous.phenotype') + phenotype.columns = c('binary.factor.phenotype', 'categorical.phenotype', 'continuous.phenotype'), + width = 5, + height = 7, + yaxes.cex = 1.2 ) ``` @@ -551,10 +634,64 @@ create.pgs.density.plot( knitr::include_graphics(file.path(temp.dir, phenotype.density.filename)); ``` +## PGS Boxplot +`create.pgs.boxplot` plots a boxplot of the PGS distribution in the cohort, optionally grouped by phenotype categories. +By default, individual scores are plotted as points over the boxplot in a stripplot style, and this option can be disabled with the `add.stripplot = FALSE` argument. + +### Basic plot + +```{r pgs-boxplot, eval = TRUE} + +create.pgs.boxplot( + pgs.data = pgs.results.with.phenotype.analysis$pgs.output, + output.dir = temp.dir, + filename.prefix = 'vignette-example-basic', + file.extension = 'png', + width = 4, + height = 4 + ) + +``` + +```{r out.width = '50%', echo = FALSE} +knitr::include_graphics(file.path(temp.dir, basic.boxplot.filename)); +``` + +### Add phenotypes + +If provided with phenotype column names, this function will plot individual boxplots for each category in a categorical phenotype. + +- Non-categorical phenotypes are ignored. +- Colors are automatically assigned up to 12 categories. +- If more than 12 categories are present, all categories are plotted in black. +- When multiple PGS columns are present and/or multiple phenotype columns are specified, each combination is plotted in a separate panel. 
+- Panels are arranged in a grid with unique PGS inputs on the x-axis and unique phenotype inputs on the y-axis. + +```{r pgs-boxplot-phenotype, eval = TRUE} + +create.pgs.boxplot( + pgs.data = pgs.results.with.phenotype.analysis$pgs.output, + output.dir = temp.dir, + filename.prefix = 'vignette-example-phenotype', + file.extension = 'png', + tidy.titles = TRUE, + add.stripplot = FALSE, + width = 5, + height = 11, + alpha = 0.9, + phenotype.columns = c('binary.factor.phenotype', 'categorical.phenotype', 'continuous.phenotype'), + titles.cex = 1.2 + ) + +``` + +```{r out.width = '50%', echo = FALSE} +knitr::include_graphics(file.path(temp.dir, phenotype.boxplot.filename)); +``` ## PGS Correlation `create.pgs.with.continuous.phenotype.plot` plots a scatterplot between the PGS and a continuous phenotype, and reports a correlation. -Non-continuous phenotypes are ignored. This function can only be used if phenotype data is provided. +Non-continuous phenotypes are ignored. This function can only be used if phenotype data are provided. ### Basic plot @@ -566,16 +703,18 @@ create.pgs.with.continuous.phenotype.plot( filename.prefix = 'vignette-example-correlation', file.extension = 'png', tidy.titles = TRUE, - phenotype.columns = c('continuous.phenotype', 'categorical.phenotype') + phenotype.columns = c('continuous.phenotype', 'categorical.phenotype'), + width = 5, + height = 5 ) ``` -```{r out.width = '70%', echo = FALSE} +```{r out.width = '50%', echo = FALSE} knitr::include_graphics(file.path(temp.dir, correlation.filename)); ``` -Unsurprisingly, our randomly generated phenotype data is not correlated with our fake PGS. +Unsurprisingly, our randomly generated phenotype data are not correlated with our fake PGS. - When multiple PGS columns are present and/or multiple phenotype columns are specified, each combination is plotted in a separate panel. - Panels are arranged in a grid with unique PGS inputs on the x-axis and unique phenotype inputs on the y-axis. @@ -601,7 +740,9 @@ create.pgs.rank.plot( pgs.data = pgs.results.with.phenotype.analysis$pgs.output, output.dir = temp.dir, filename.prefix = 'vignette-example-basic', - file.extension = 'png' + file.extension = 'png', + width = 5, + height = 6 ) ``` @@ -619,7 +760,9 @@ create.pgs.rank.plot( output.dir = temp.dir, filename.prefix = 'vignette-example-phenotype', file.extension = 'png', - phenotype.columns = c('binary.factor.phenotype', 'binary.phenotype', 'categorical.phenotype', 'continuous.phenotype') + phenotype.columns = c('binary.factor.phenotype', 'binary.phenotype', 'categorical.phenotype', 'continuous.phenotype'), + width = 6, + height = 8 ) ``` @@ -638,3 +781,57 @@ This function comes with a handful of additional arguments to control the appear - Use `missing.genotype.style` to toggle between displaying counts and percentages. - Use `categorical.palette` to specify a custom color palette for categorical phenotype bars. - Use `binary.pallette` to specify a custom color palette for binary and continuous phenotype bars. + +## Case-control analysis + +Another feature of ApplyPolygenicScore is the ability to perform a case-control style analysis to assess the accuracy of a PGS in predicting a binary phenotype. +The `analyze.pgs.binary.predictiveness` function takes the output of `apply.polygenic.score`, fits a logistic regression to predict the specified phenotype, plots +a receiver-operator curve, and returns the corresponding area under the curve (AUC) value. 
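+
+Conceptually, for each PGS column this amounts to fitting a logistic regression of the binary phenotype on the PGS and computing the area under the resulting ROC curve. A minimal sketch of that idea is shown below, using the pROC package purely for illustration; the function itself also handles covariates, multiple phenotypes, and plotting, so the details differ.
+
+```{r roc-concept, eval = FALSE}
+# Sketch only: assumes a two-level binary.phenotype column and a single PGS column.
+fit <- glm(
+    binary.phenotype ~ PGS.with.replaced.missing,
+    data = pgs.results.with.phenotype.analysis$pgs.output,
+    family = binomial
+    );
+
+# ROC curve and AUC from the fitted probabilities (illustrative, via pROC).
+roc.curve <- pROC::roc(
+    response = fit$model$binary.phenotype,
+    predictor = fitted(fit)
+    );
+pROC::auc(roc.curve);
+```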
+
+### Basic plot
+
+```{r pgs-case-control, eval = TRUE}
+pgs.case.control.analysis <- analyze.pgs.binary.predictiveness(
+    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
+    pgs.columns = 'PGS.with.replaced.missing',
+    phenotype.columns = 'binary.phenotype',
+    output.dir = temp.dir,
+    filename.prefix = 'vignette-example-basic',
+    file.extension = 'png',
+    width = 5,
+    height = 5,
+    titles.cex = 1.2
+    );
+```
+
+```{r out.width = '50%', echo = FALSE}
+knitr::include_graphics(file.path(temp.dir, roc.filename));
+```
+
+### Binarize continuous phenotypes
+If you wish to assess how well a PGS predicts a continuous phenotype,
+you can binarize that phenotype by specifying a cutoff threshold that splits the continuous values into two categories.
+
+```{r pgs-case-control-continuous, eval = TRUE, results = 'hide'}
+pgs.case.control.analysis.continuous <- analyze.pgs.binary.predictiveness(
+    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
+    pgs.columns = 'PGS.with.replaced.missing',
+    phenotype.columns = 'continuous.phenotype',
+    phenotype.type = 'continuous',
+    cutoff.threshold = 0, # binarize at 0
+    output.dir = temp.dir,
+    filename.prefix = 'vignette-example-continuous',
+    file.extension = 'png',
+    width = 5,
+    height = 5,
+    titles.cex = 1.2
+    );
+```
+
+```{r out.width = '50%', echo = FALSE}
+knitr::include_graphics(file.path(temp.dir, roc.continuous.filename));
+```
+
+- Provide multiple PGS columns to plot multiple ROC curves in the same plot.
+- Provide multiple phenotype columns to plot a separate panel for each phenotype.
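+
+For example, assuming your `pgs.output` data frame still contains the phenotype columns used above, a call along these lines produces a separate ROC panel per binary phenotype. This is a sketch only; add further PGS column names to overlay multiple curves in each panel, and the filename prefix is an arbitrary choice.
+
+```{r pgs-case-control-multi, eval = FALSE}
+analyze.pgs.binary.predictiveness(
+    pgs.data = pgs.results.with.phenotype.analysis$pgs.output,
+    pgs.columns = 'PGS.with.replaced.missing', # add more PGS columns here for extra ROC curves
+    phenotype.columns = c('binary.factor.phenotype', 'binary.phenotype'),
+    phenotype.type = 'binary',
+    output.dir = temp.dir,
+    filename.prefix = 'vignette-example-multi', # arbitrary prefix for the output file name
+    file.extension = 'png'
+    );
+```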