Merge pull request #75 from uclahs-cds/ragrawal-fix-semicolon

alkaZeltser · web-flow · commit 07b8d71a357a · 2025-03-31T17:52:28.000-07:00
Ragrawal fix semicolon
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,8 @@ Version: 3.0.2
 Authors@R: c(
     person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'),
     person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
-    person('Rachel', 'Dang', role = 'ctb'))
+    person('Rachel', 'Dang', role = 'ctb'),
+    person('Raag', 'Agrawal', role = 'ctb'))
 Description: Simple and transparent parsing of genotype/dosage data
     from an input Variant Call Format (VCF) file, matching of genotype
     coordinates to the component Single Nucleotide Polymorphisms (SNPs)
@@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data
     in accordance with the additive weighted sum of dosages model. Methods
     are designed in reference to best practices described by
     Collister, Liu, and Clifton (2022) <doi:10.3389/fgene.2022.818574>.
-Depends: 
+Depends:
     R (>= 4.2.0)
 Imports:
     vcfR,
@@ -23,7 +24,7 @@ Imports:
     reshape2,
     BoutrosLab.plotting.general,
     lattice
-Suggests: 
+Suggests:
     knitr,
     rmarkdown,
     testthat (>= 3.0.0)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # Unreleased
 
+# ApplyPolygenicScore unreleased
+## Changed
+* Fixed regression of combine.vcf.with.pgs() function that prevented it from handling multiple rsIDs on the same line.
+* added new contributor
+
 # ApplyPolygenicScore 3.0.2
 
 ## Changed
diff --git a/R/combine-vcf-with-pgs.R b/R/combine-vcf-with-pgs.R
@@ -105,23 +105,34 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
         missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
         rm(missing.snp.merged.data);
 
-        # Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
-        # most efficient way to do this is to use the data.table package
+        # Expand the VCF$ID column to a row-per-rsID format.
+        # Some variants have multiple rsIDs in the ID column separated by semicolons.
+        # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
+        # we create a new data frame with the expanded rsID data
         if (any(grepl(';', vcf.data$ID))) {
-            data.table::setDT(vcf.data);
-            split.rsid.vcf.data <- merge(
-                x = vcf.data,
-                # split only entries with multiple rsIDs, save in new column, and merge back with the original data
-                y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS'))
-                    ][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))],
-                by = c('CHROM', 'POS', 'Indiv'),
-                all = TRUE
+            split.rows <- strsplit(
+                x           = as.character(vcf.data$ID),
+                split       = ';',
+                fixed       = TRUE
                 );
-            # replace entries with multiple rsIDs with the new, split, rsID
-            split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
-            } else {
+
+            # remove duplicate IDs
+            split.rows <- lapply(split.rows, function(x) unique(x));
+
+            row.indices <- rep(
+                x           = seq_len(nrow(vcf.data)),
+                times       = lengths(split.rows)
+                );
+
+            split.rsid.vcf.data <- vcf.data[row.indices, ];
+
+            split.rsid.vcf.data$ID.vcf.unsplit <- split.rsid.vcf.data$ID; # save original rsID names for final output
+            split.rsid.vcf.data$ID <- unlist(split.rows);
+
+        } else {
+            vcf.data$ID.vcf.unsplit <- vcf.data$ID; # save an ID.vcf.unsplit column for consistency
             split.rsid.vcf.data <- vcf.data;
-            }
+        }
 
         # merge missing SNP data on split rsID
         merged.vcf.with.missing.pgs.data <- merge(
@@ -155,7 +166,8 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
 
         # add columns to match original merge
         merged.vcf.with.missing.pgs.data$ID.pgs <- merged.vcf.with.missing.pgs.data$ID;
-        merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID;
+        merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID.vcf.unsplit;
+        merged.vcf.with.missing.pgs.data$ID.vcf.unsplit <- NULL;
         merged.vcf.with.missing.pgs.data$merge.strategy <- 'rsID';
 
         # subset columns to match original merge
diff --git a/tests/testthat/test-vcf-pgs-merge.R b/tests/testthat/test-vcf-pgs-merge.R
@@ -284,13 +284,22 @@ test_that(
             );
         test.vcf.data.missing.locus.matching.rsid <- data.frame(
             CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
-            POS = c(1, 3, 3, 6),
+            POS = c(1, 3, 3, 6), #rs2/chr2:3 is missing by POS but present by rsID, #rs5/chr4:6 is missing by rsID AND by POS
             ID = c('rs1', 'rs3', 'rs2', 'rs5'),
             REF = c('A', 'T', 'C', 'G'),
             ALT = c('T', 'A', 'G', 'C'),
             Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
             gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
             );
+        test.vcf.data.missing.locus.matching.rsid.with.semicolons <- data.frame(
+            CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
+            POS = c(1, 3, 3, 6),
+            ID = c('rs1;rs1', 'rs3', 'rs2;rsB', 'rs5;rsC'),
+            REF = c('A', 'T', 'C', 'G'),
+            ALT = c('T', 'A', 'G', 'C'),
+            Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
+            gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
+            );
         test.pgs.weight.data <- data.frame(
             CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
             POS = c(1, 3, 2, 4),
@@ -316,6 +325,14 @@ test_that(
             'PGS is missing 1 SNPs from VCF'
             );
 
+        expect_warning(
+            combine.vcf.with.pgs(
+                vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
+                pgs.weight.data = test.pgs.weight.data
+                ),
+            'PGS is missing 1 SNPs from VCF'
+            );
+
         test.combine.vcf.with.pgs.no.missing <- combine.vcf.with.pgs(
             vcf.data = test.vcf.data.no.missing,
             pgs.weight.data = test.pgs.weight.data
@@ -331,6 +348,11 @@ test_that(
             pgs.weight.data = test.pgs.weight.data
             );
 
+        test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons <- combine.vcf.with.pgs(
+            vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
+            pgs.weight.data = test.pgs.weight.data
+            );
+
         # check that combine.vcf.with.pgs returns the correct number of rows
         expect_equal(
             nrow(test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data),
@@ -344,6 +366,10 @@ test_that(
             nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
             4
             );
+        expect_equal(
+            nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
+            4
+            );
 
         expect_equal(
             nrow(test.combine.vcf.with.pgs.missing$missing.snp.data),
@@ -357,6 +383,10 @@ test_that(
             nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data),
             1
             );
+        expect_equal(
+            nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data),
+            1
+            );
 
         # check that combine.vcf.with.pgs returns the correct number of columns
         expect_equal(
@@ -371,6 +401,10 @@ test_that(
             ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
             11
             );
+        expect_equal(
+            ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
+            11
+            );
 
         # check that combine.vcf.with.pgs returns the correct columns
         expect_true(
@@ -382,6 +416,9 @@ test_that(
         expect_true(
             all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data))
             );
+        expect_true(
+            all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data))
+            );
 
         # check that combine.vcf.with.pgs returns the correct values
         expect_equal(
@@ -396,7 +433,10 @@ test_that(
             test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$CHROM,
             c('chr1', 'chr3', 'chr2', 'chr4')
             );
-
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$CHROM,
+            c('chr1', 'chr3', 'chr2', 'chr4')
+            );
 
         expect_equal(
             test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$POS,
@@ -410,6 +450,10 @@ test_that(
             test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$POS,
             c(1, 3, 3, 4)
             );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$POS,
+            c(1, 3, 3, 4)
+            );
 
         expect_equal(
             test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
@@ -423,7 +467,44 @@ test_that(
             test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.pgs,
             c('rs1', 'rs3', 'rs2', 'rs4')
             );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.pgs,
+            c('rs1', 'rs3', 'rs2', 'rs4')
+            );
 
+        expect_equal(
+            test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
+            test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.vcf
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$ID.vcf,
+            c('rs1', 'rs2', 'rs3', NA)
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.vcf,
+            c('rs1', 'rs3', 'rs2', NA)
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.vcf,
+            c('rs1;rs1', 'rs3', 'rs2;rsB', NA)
+            );
+
+        expect_equal(
+            test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$merge.strategy,
+            rep('genomic coordinate', 4)
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$merge.strategy,
+            c(rep('genomic coordinate', 3), 'rsID')
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$merge.strategy,
+            c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$merge.strategy,
+            c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
+            );
 
         # check that combine.vcf.with.pgs returns the correct values for missing SNPs
         expect_equal(
@@ -512,6 +593,45 @@ test_that(
             is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data$gt_GT_alleles)
             );
 
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.pgs,
+            'chr4'
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.vcf)
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.pgs,
+            4
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.vcf)
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ID,
+            'rs4'
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$effect_allele,
+            'G'
+            );
+        expect_equal(
+            test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$beta,
+            4
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$REF)
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ALT)
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$Indiv)
+            );
+        expect_true(
+            is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$gt_GT_alleles)
+            );
+
         }
     );