diff --git a/DESCRIPTION b/DESCRIPTION index 7fcd526..85ef4ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,8 @@ Version: 3.0.2 Authors@R: c( person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'), person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')), - person('Rachel', 'Dang', role = 'ctb')) + person('Rachel', 'Dang', role = 'ctb'), + person('Raag', 'Agrawal', role = 'ctb')) Description: Simple and transparent parsing of genotype/dosage data from an input Variant Call Format (VCF) file, matching of genotype coordinates to the component Single Nucleotide Polymorphisms (SNPs) @@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data in accordance with the additive weighted sum of dosages model. Methods are designed in reference to best practices described by Collister, Liu, and Clifton (2022) . -Depends: +Depends: R (>= 4.2.0) Imports: vcfR, @@ -23,7 +24,7 @@ Imports: reshape2, BoutrosLab.plotting.general, lattice -Suggests: +Suggests: knitr, rmarkdown, testthat (>= 3.0.0) diff --git a/NEWS.md b/NEWS.md index 181cf44..4812d8b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # Unreleased +# ApplyPolygenicScore unreleased +## Changed +* Fixed regression of combine.vcf.with.pgs() function that prevented it from handling multiple rsIDs on the same line. +* added new contributor + # ApplyPolygenicScore 3.0.2 ## Changed diff --git a/R/combine-vcf-with-pgs.R b/R/combine-vcf-with-pgs.R index 2ccf4fd..939609e 100644 --- a/R/combine-vcf-with-pgs.R +++ b/R/combine-vcf-with-pgs.R @@ -105,23 +105,34 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) { missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data)); rm(missing.snp.merged.data); - # Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;) - # most efficient way to do this is to use the data.table package + # Expand the VCF$ID column to a row-per-rsID format. + # Some variants have multiple rsIDs in the ID column separated by semicolons. + # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row. + # we create a new data frame with the expanded rsID data if (any(grepl(';', vcf.data$ID))) { - data.table::setDT(vcf.data); - split.rsid.vcf.data <- merge( - x = vcf.data, - # split only entries with multiple rsIDs, save in new column, and merge back with the original data - y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS')) - ][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))], - by = c('CHROM', 'POS', 'Indiv'), - all = TRUE + split.rows <- strsplit( + x = as.character(vcf.data$ID), + split = ';', + fixed = TRUE ); - # replace entries with multiple rsIDs with the new, split, rsID - split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL]; - } else { + + # remove duplicate IDs + split.rows <- lapply(split.rows, function(x) unique(x)); + + row.indices <- rep( + x = seq_len(nrow(vcf.data)), + times = lengths(split.rows) + ); + + split.rsid.vcf.data <- vcf.data[row.indices, ]; + + split.rsid.vcf.data$ID.vcf.unsplit <- split.rsid.vcf.data$ID; # save original rsID names for final output + split.rsid.vcf.data$ID <- unlist(split.rows); + + } else { + vcf.data$ID.vcf.unsplit <- vcf.data$ID; # save an ID.vcf.unsplit column for consistency split.rsid.vcf.data <- vcf.data; - } + } # merge missing SNP data on split rsID merged.vcf.with.missing.pgs.data <- merge( @@ -155,7 +166,8 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) { # add columns to match original merge merged.vcf.with.missing.pgs.data$ID.pgs <- merged.vcf.with.missing.pgs.data$ID; - merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID; + merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID.vcf.unsplit; + merged.vcf.with.missing.pgs.data$ID.vcf.unsplit <- NULL; merged.vcf.with.missing.pgs.data$merge.strategy <- 'rsID'; # subset columns to match original merge diff --git a/tests/testthat/test-vcf-pgs-merge.R b/tests/testthat/test-vcf-pgs-merge.R index 038bacb..2cc92af 100644 --- a/tests/testthat/test-vcf-pgs-merge.R +++ b/tests/testthat/test-vcf-pgs-merge.R @@ -284,13 +284,22 @@ test_that( ); test.vcf.data.missing.locus.matching.rsid <- data.frame( CHROM = c('chr1', 'chr3', 'chr2', 'chr4'), - POS = c(1, 3, 3, 6), + POS = c(1, 3, 3, 6), #rs2/chr2:3 is missing by POS but present by rsID, #rs5/chr4:6 is missing by rsID AND by POS ID = c('rs1', 'rs3', 'rs2', 'rs5'), REF = c('A', 'T', 'C', 'G'), ALT = c('T', 'A', 'G', 'C'), Indiv = c('sample1', 'sample2', 'sample3', 'sample4'), gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C') ); + test.vcf.data.missing.locus.matching.rsid.with.semicolons <- data.frame( + CHROM = c('chr1', 'chr3', 'chr2', 'chr4'), + POS = c(1, 3, 3, 6), + ID = c('rs1;rs1', 'rs3', 'rs2;rsB', 'rs5;rsC'), + REF = c('A', 'T', 'C', 'G'), + ALT = c('T', 'A', 'G', 'C'), + Indiv = c('sample1', 'sample2', 'sample3', 'sample4'), + gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C') + ); test.pgs.weight.data <- data.frame( CHROM = c('chr1', 'chr3', 'chr2', 'chr4'), POS = c(1, 3, 2, 4), @@ -316,6 +325,14 @@ test_that( 'PGS is missing 1 SNPs from VCF' ); + expect_warning( + combine.vcf.with.pgs( + vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons, + pgs.weight.data = test.pgs.weight.data + ), + 'PGS is missing 1 SNPs from VCF' + ); + test.combine.vcf.with.pgs.no.missing <- combine.vcf.with.pgs( vcf.data = test.vcf.data.no.missing, pgs.weight.data = test.pgs.weight.data @@ -331,6 +348,11 @@ test_that( pgs.weight.data = test.pgs.weight.data ); + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons <- combine.vcf.with.pgs( + vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons, + pgs.weight.data = test.pgs.weight.data + ); + # check that combine.vcf.with.pgs returns the correct number of rows expect_equal( nrow(test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data), @@ -344,6 +366,10 @@ test_that( nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data), 4 ); + expect_equal( + nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data), + 4 + ); expect_equal( nrow(test.combine.vcf.with.pgs.missing$missing.snp.data), @@ -357,6 +383,10 @@ test_that( nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data), 1 ); + expect_equal( + nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data), + 1 + ); # check that combine.vcf.with.pgs returns the correct number of columns expect_equal( @@ -371,6 +401,10 @@ test_that( ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data), 11 ); + expect_equal( + ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data), + 11 + ); # check that combine.vcf.with.pgs returns the correct columns expect_true( @@ -382,6 +416,9 @@ test_that( expect_true( all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data)) ); + expect_true( + all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data)) + ); # check that combine.vcf.with.pgs returns the correct values expect_equal( @@ -396,7 +433,10 @@ test_that( test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$CHROM, c('chr1', 'chr3', 'chr2', 'chr4') ); - + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$CHROM, + c('chr1', 'chr3', 'chr2', 'chr4') + ); expect_equal( test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$POS, @@ -410,6 +450,10 @@ test_that( test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$POS, c(1, 3, 3, 4) ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$POS, + c(1, 3, 3, 4) + ); expect_equal( test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs, @@ -423,7 +467,44 @@ test_that( test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.pgs, c('rs1', 'rs3', 'rs2', 'rs4') ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.pgs, + c('rs1', 'rs3', 'rs2', 'rs4') + ); + expect_equal( + test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs, + test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.vcf + ); + expect_equal( + test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$ID.vcf, + c('rs1', 'rs2', 'rs3', NA) + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.vcf, + c('rs1', 'rs3', 'rs2', NA) + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.vcf, + c('rs1;rs1', 'rs3', 'rs2;rsB', NA) + ); + + expect_equal( + test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$merge.strategy, + rep('genomic coordinate', 4) + ); + expect_equal( + test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$merge.strategy, + c(rep('genomic coordinate', 3), 'rsID') + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$merge.strategy, + c('genomic coordinate', 'genomic coordinate', 'rsID','rsID') + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$merge.strategy, + c('genomic coordinate', 'genomic coordinate', 'rsID','rsID') + ); # check that combine.vcf.with.pgs returns the correct values for missing SNPs expect_equal( @@ -512,6 +593,45 @@ test_that( is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data$gt_GT_alleles) ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.pgs, + 'chr4' + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.vcf) + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.pgs, + 4 + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.vcf) + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ID, + 'rs4' + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$effect_allele, + 'G' + ); + expect_equal( + test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$beta, + 4 + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$REF) + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ALT) + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$Indiv) + ); + expect_true( + is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$gt_GT_alleles) + ); + } );