uclahs-cds · alkaZeltser · Apr 1, 2025 · Mar 22, 2025 · Mar 22, 2025 · Mar 22, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,8 @@ Version: 3.0.2
 Authors@R: c(
     person('Paul', 'Boutros', role = 'cre', email = '[email protected]'),
     person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
-    person('Rachel', 'Dang', role = 'ctb'))
+    person('Rachel', 'Dang', role = 'ctb'),
+    person('Raag', 'Agrawal', role = 'ctb'))
 Description: Simple and transparent parsing of genotype/dosage data
     from an input Variant Call Format (VCF) file, matching of genotype
     coordinates to the component Single Nucleotide Polymorphisms (SNPs)
@@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data
     in accordance with the additive weighted sum of dosages model. Methods
     are designed in reference to best practices described by
     Collister, Liu, and Clifton (2022) <doi:10.3389/fgene.2022.818574>.
-Depends: 
+Depends:
     R (>= 4.2.0)
 Imports:
     vcfR,
@@ -23,7 +24,7 @@ Imports:
     reshape2,
     BoutrosLab.plotting.general,
     lattice
-Suggests: 
+Suggests:
     knitr,
     rmarkdown,
     testthat (>= 3.0.0)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # Unreleased
 
+# ApplyPolygenicScore unreleased
+## Changed
+* Fixed a regression in the `combine.vcf.with.pgs()` function that prevented it from handling VCF records with multiple semicolon-separated rsIDs in the ID field.
+* Added Raag Agrawal as a new contributor.
+
 # ApplyPolygenicScore 3.0.2
 
 ## Changed

diff --git a/R/combine-vcf-with-pgs.R b/R/combine-vcf-with-pgs.R
@@ -105,23 +105,30 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
         missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
         rm(missing.snp.merged.data);
 
-        # Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
-        # most efficient way to do this is to use the data.table package
-        if (any(grepl(';', vcf.data$ID))) {
-            data.table::setDT(vcf.data);
-            split.rsid.vcf.data <- merge(
-                x = vcf.data,
-                # split only entries with multiple rsIDs, save in new column, and merge back with the original data
-                y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS'))
-                    ][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))],
-                by = c('CHROM', 'POS', 'Indiv'),
-                all = TRUE
-                );
-            # replace entries with multiple rsIDs with the new, split, rsID
-            split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
-            } else {
-            split.rsid.vcf.data <- vcf.data;
-            }
+    # Expand the VCF$ID column to a row-per-rsID format.
+    # Some variants have multiple rsIDs in the ID column separated by semicolons.
+    # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
+    # The expanded rows are stored in a new data frame (split.rsid.vcf.data); vcf.data itself is left unmodified.
+    if (any(grepl(';', vcf.data$ID))) {
+        split.rows <- strsplit(
+            x           = as.character(vcf.data$ID),
+            split       = ';',
+            fixed       = TRUE
+            );
+
+        row.indices <- rep(
+            x           = seq_len(nrow(vcf.data)),
+            times       = lengths(split.rows)
+            );
+
+        expanded.vcf <- vcf.data[row.indices, ];
+
+        expanded.vcf$ID <- unlist(split.rows);
+
+        split.rsid.vcf.data <- expanded.vcf;
+    } else {
+        split.rsid.vcf.data <- vcf.data;
+    }
 
         # merge missing SNP data on split rsID
         merged.vcf.with.missing.pgs.data <- merge(