Skip to content

Commit c3d48af

Browse files
authored
Merge pull request #67 from uclahs-cds/nzeltser-strand-flip-check
add strand flip check
2 parents a11ad47 + e591ae3 commit c3d48af

20 files changed

+1336
-22
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,13 @@ importFrom(
3636

3737
export(
3838
apply.polygenic.score,
39+
assess.pgs.vcf.allele.match,
3940
combine.pgs.bed,
4041
combine.vcf.with.pgs,
4142
convert.alleles.to.pgs.dosage,
4243
convert.allele.frequency.to.dosage,
4344
convert.pgs.to.bed,
45+
flip.DNA.allele,
4446
format.chromosome.notation,
4547
get.pgs.percentiles,
4648
import.pgs.weight.file,

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Added handling of overlapping deletion allele notation
33
* Added secondary PGS/VCF variant matching method using rsID after first attempt with genomic coordinates
44
* Added checks for rsID as an optional column in input PGS weight files
5+
* Added functionality to assess allele matches and correct strand flips
56

67
# ApplyPolygenicScore 2.0.0 (2024-07-31)
78

R/apply-pgs.R

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ validate.vcf.input <- function(vcf.data) {
2121

2222
}
2323

24-
validate.pgs.data.input <- function(pgs.weight.data, use.external.effect.allele.frequency) {
24+
validate.pgs.data.input <- function(pgs.weight.data, use.external.effect.allele.frequency, correct.strand.flips, remove.ambiguous.allele.matches, remove.mismatched.indels) {
2525
if (!is.data.frame(pgs.weight.data)) {
2626
stop('pgs.weight.data must be a data.frame');
2727
}
@@ -32,6 +32,13 @@ validate.pgs.data.input <- function(pgs.weight.data, use.external.effect.allele.
3232
stop('pgs.weight.data must contain columns named CHROM, POS, effect_allele, and beta');
3333
}
3434

35+
# additional required columns if strand flip correction is enabled
36+
if (correct.strand.flips || remove.ambiguous.allele.matches || remove.mismatched.indels) {
37+
if (!('other_allele' %in% colnames(pgs.weight.data))) {
38+
stop('pgs.weight.data must contain a column named other_allele if correct.strand.flips, remove.ambiguous.allele.matches, or remove.mismatched.indels is TRUE');
39+
}
40+
}
41+
3542
if (use.external.effect.allele.frequency) {
3643
required.eaf.column <- 'allelefrequency_effect';
3744
if (!(required.eaf.column %in% colnames(pgs.weight.data))) {
@@ -89,6 +96,12 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
8996
#' Phenotype variables are automatically classified as continuous, binary, or neither based on data type and number of unique values. The calculated PGS is associated
9097
#' with each phenotype variable using linear or logistic regression for continuous or binary phenotypes, respectively. See \code{run.pgs.regression} for more details.
9198
#' If no phenotype.analysis.columns are provided, no regression analysis is performed.
99+
#' @param correct.strand.flips A logical indicating whether to check PGS weight data/VCF genotype data matches for strand flips and correct them. Default is \code{TRUE}.
100+
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
101+
#' @param remove.ambiguous.allele.matches A logical indicating whether to remove PGS variants with ambiguous allele matches between PGS weight data and VCF genotype data. Default is \code{FALSE}.
102+
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
103+
#' @param remove.mismatched.indels A logical indicating whether to remove indel variants that are mismatched between PGS weight data and VCF genotype data. Default is \code{FALSE}.
104+
#' The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.
92105
#' @param output.dir A character string indicating the directory to write output files. Separate files are written for per-sample pgs results and optional regression results.
93106
#' Files are tab-separate .txt files. Default is NULL in which case no files are written.
94107
#' @param file.prefix A character string to prepend to the output file names. Default is \code{NULL}.
@@ -101,6 +114,7 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
101114
#' @param validate.inputs.only A logical indicating whether to only perform input data validation checks without running PGS application.
102115
#' If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.
103116
#' @return A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided.
117+
#'
104118
#' \strong{Output Structure}
105119
#'
106120
#' The outputed list contains the following elements:
@@ -143,7 +157,7 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
143157
#'
144158
#' \strong{Missing Genotype Handling}
145159
#'
146-
#' VCF genotype data are matched to PGS data by chromosome, position, and effect allele. If a SNP cannot be matched by genomic coordinate,
160+
#' VCF genotype data are matched to PGS data by chromosome and position. If a SNP cannot be matched by genomic coordinate,
147161
#' an attempt is made to match by rsID (if available). If a SNP from the PGS weight data is not found in the VCF data after these two matching attempts,
148162
#' it is considered a cohort-wide missing variant.
149163
#'
@@ -171,6 +185,27 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
171185
#' It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted.
172186
#' VCF data can be formatted to merged format using external tools for VCF file manipulation.
173187
#'
188+
#' \strong{Allele Mismatch Handling}
189+
#' Variants from the PGS weight data are merged with records in the VCF data by genetic coordinate.
190+
#' After the merge is complete, there may be cases where the VCF reference (REF) and alternative (ALT) alleles do not match their conventional counterparts in the
191+
#' PGS weight data (other allele and effect allele, respectively).
192+
#' This is usually caused by a strand flip: the variant in question was called against opposite DNA reference strands in the PGS training data and the VCF data.
193+
#' Strand flips can be detected and corrected by flipping the affected allele to its reverse complement.
194+
#' \code{apply.polygenic.score} uses \code{assess.pgs.vcf.allele.match} to assess allele concordance, and is controlled through the following arguments:
195+
#'
196+
#' \itemize{
197+
#' \item \code{correct.strand.flips}: When \code{TRUE}, detected strand flips are corrected by flipping the affected value in the \code{effect_allele} column prior to dosage calling.
198+
#' \item \code{remove.ambiguous.allele.matches}: Corresponds to the \code{return.ambiguous.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, non-INDEL allele
199+
#' mismatches that cannot be resolved (due to palindromic alleles or causes other than strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing
200+
#' prior to dosage calling and missing genotype handling. The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
201+
#' \item \code{remove.mismatched.indels}: Corresponds to the \code{return.indels.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, INDEL allele mismatches
202+
#' (which cannot be assessed for strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing prior to dosage calling and missing genotype handling.
203+
#' The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
204+
#' }
205+
#'
206+
#' Note that an allele match assessment requires the presence of both the \code{other_allele} and \code{effect_allele} in the PGS weight data.
207+
#' The \code{other_allele} column is not required by the PGS Catalog, and so is not always available.
208+
#'
174209
#' @examples
175210
#' # Example VCF
176211
#' vcf.path <- system.file(
@@ -205,6 +240,15 @@ validate.phenotype.data.input <- function(phenotype.data, phenotype.analysis.col
205240
#' use.external.effect.allele.frequency = TRUE
206241
#' );
207242
#'
243+
#' # Specify allele mismatch handling
244+
#' pgs.data <- apply.polygenic.score(
245+
#' vcf.data = vcf.import$dat,
246+
#' pgs.weight.data = pgs.import$pgs.weight.data,
247+
#' correct.strand.flips = TRUE,
248+
#' remove.ambiguous.allele.matches = TRUE,
249+
#' remove.mismatched.indels = FALSE
250+
#' );
251+
#'
208252
#' # Provide phenotype data for basic correlation analysis
209253
#' phenotype.data <- data.frame(
210254
#' Indiv = unique(vcf.import$dat$Indiv),
@@ -234,6 +278,9 @@ apply.polygenic.score <- function(
234278
pgs.weight.data,
235279
phenotype.data = NULL,
236280
phenotype.analysis.columns = NULL,
281+
correct.strand.flips = TRUE,
282+
remove.ambiguous.allele.matches = FALSE,
283+
remove.mismatched.indels = FALSE,
237284
output.dir = NULL,
238285
file.prefix = NULL,
239286
missing.genotype.method = 'mean.dosage',
@@ -246,7 +293,13 @@ apply.polygenic.score <- function(
246293
### Start Input Validation ###
247294

248295
validate.vcf.input(vcf.data = vcf.data);
249-
validate.pgs.data.input(pgs.weight.data = pgs.weight.data, use.external.effect.allele.frequency = use.external.effect.allele.frequency);
296+
validate.pgs.data.input(
297+
pgs.weight.data = pgs.weight.data,
298+
use.external.effect.allele.frequency = use.external.effect.allele.frequency,
299+
correct.strand.flips = correct.strand.flips,
300+
remove.ambiguous.allele.matches = remove.ambiguous.allele.matches,
301+
remove.mismatched.indels = remove.mismatched.indels
302+
);
250303
validate.phenotype.data.input(phenotype.data = phenotype.data, phenotype.analysis.columns = phenotype.analysis.columns, vcf.data = vcf.data);
251304

252305
if (validate.inputs.only) {
@@ -290,6 +343,19 @@ apply.polygenic.score <- function(
290343
# free up some memory
291344
rm(vcf.data);
292345

346+
### Start Allele Match Check ###
347+
if (remove.ambiguous.allele.matches || correct.strand.flips) {
348+
match.assessment <- ApplyPolygenicScore::assess.pgs.vcf.allele.match(
349+
vcf.ref.allele = merged.vcf.with.pgs.data$REF,
350+
vcf.alt.allele = merged.vcf.with.pgs.data$ALT,
351+
pgs.ref.allele = merged.vcf.with.pgs.data$other_allele,
352+
pgs.effect.allele = merged.vcf.with.pgs.data$effect_allele,
353+
return.ambiguous.as.missing = remove.ambiguous.allele.matches,
354+
return.indels.as.missing = remove.mismatched.indels
355+
);
356+
merged.vcf.with.pgs.data$effect_allele <- match.assessment$new.pgs.effect.allele;
357+
}
358+
293359
# calculate dosage
294360
merged.vcf.with.pgs.data$dosage <- convert.alleles.to.pgs.dosage(
295361
called.alleles = merged.vcf.with.pgs.data$gt_GT_alleles,

0 commit comments

Comments
 (0)