uclahs-cds · alkaZeltser · Apr 1, 2025 · Mar 22, 2025 · Mar 22, 2025 · Mar 22, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,8 @@ Version: 3.0.2
 Authors@R: c(
     person('Paul', 'Boutros', role = 'cre', email = '[email protected]'),
     person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
-    person('Rachel', 'Dang', role = 'ctb'))
+    person('Rachel', 'Dang', role = 'ctb'),
+    person('Raag', 'Agrawal', role = 'ctb'))
 Description: Simple and transparent parsing of genotype/dosage data
     from an input Variant Call Format (VCF) file, matching of genotype
     coordinates to the component Single Nucleotide Polymorphisms (SNPs)
@@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data
     in accordance with the additive weighted sum of dosages model. Methods
     are designed in reference to best practices described by
     Collister, Liu, and Clifton (2022) <doi:10.3389/fgene.2022.818574>.
-Depends: 
+Depends:
     R (>= 4.2.0)
 Imports:
     vcfR,
@@ -23,7 +24,7 @@ Imports:
     reshape2,
     BoutrosLab.plotting.general,
     lattice
-Suggests: 
+Suggests:
     knitr,
     rmarkdown,
     testthat (>= 3.0.0)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # Unreleased
 
+# ApplyPolygenicScore unreleased
+## Changed
+* Fixed a regression in `combine.vcf.with.pgs()` that prevented it from handling multiple rsIDs on the same line.
+* Added new contributor Raag Agrawal.
+
 # ApplyPolygenicScore 3.0.2
 
 ## Changed

diff --git a/R/combine-vcf-with-pgs.R b/R/combine-vcf-with-pgs.R
@@ -105,23 +105,19 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
         missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
         rm(missing.snp.merged.data);
 
-        # Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
-        # most efficient way to do this is to use the data.table package
+        # Split VCF$ID column into separate rows for each rsID (multiple rsIDs separated by ;)
         if (any(grepl(';', vcf.data$ID))) {
-            data.table::setDT(vcf.data);
-            split.rsid.vcf.data <- merge(
-                x = vcf.data,
-                # split only entries with multiple rsIDs, save in new column, and merge back with the original data
-                y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS'))
-                    ][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))],
-                by = c('CHROM', 'POS', 'Indiv'),
-                all = TRUE
+            split.rows <- strsplit(
+                as.character(vcf.data$ID),
+                ';',
+                fixed = TRUE
                 );
-            # replace entries with multiple rsIDs with the new, split, rsID
-            split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
-            } else {
-            split.rsid.vcf.data <- vcf.data;
-            }
+            expanded.vcf <- vcf.data[rep(seq_len(nrow(vcf.data)), lengths(split.rows)), ]
+            expanded.vcf$ID <- unlist(split.rows)
+            split.rsid.vcf.data <- expanded.vcf
+        } else {
+            split.rsid.vcf.data <- vcf.data
+        }
 
         # merge missing SNP data on split rsID
         merged.vcf.with.missing.pgs.data <- merge(