Write out named arguments in the rsID-splitting strsplit() and rep() calls

raagagrawal · raagagrawal · commit 9b95a6d90f29 · 2025-03-23T12:48:04.000-07:00
diff --git a/R/combine-vcf-with-pgs.R b/R/combine-vcf-with-pgs.R
@@ -105,19 +105,30 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
         missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
         rm(missing.snp.merged.data);
 
-        # Split VCF$ID column into separate rows for each rsID (multiple rsIDs separated by ;)
-        if (any(grepl(';', vcf.data$ID))) {
-            split.rows <- strsplit(
-                as.character(vcf.data$ID),
-                ';',
-                fixed = TRUE
-                );
-            expanded.vcf <- vcf.data[rep(seq_len(nrow(vcf.data)), lengths(split.rows)), ]
-            expanded.vcf$ID <- unlist(split.rows)
-            split.rsid.vcf.data <- expanded.vcf
-        } else {
-            split.rsid.vcf.data <- vcf.data
-        }
+    # Expand the VCF$ID column to a row-per-rsID format.
+    # Some variants have multiple rsIDs in the ID column, separated by semicolons.
+    # We detect such cases with grepl, split the IDs, and repeat each row once per rsID
+    # so that every rsID has its own row in a new, expanded data frame.
+    # If no ID contains a semicolon, the VCF data is used unchanged.
+    if (any(grepl(';', vcf.data$ID))) {
+        split.rows <- strsplit(
+            x           = as.character(vcf.data$ID),
+            split       = ';',
+            fixed       = TRUE
+            );
+
+        row.indices <- rep(
+            x           = seq_len(nrow(vcf.data)),
+            times       = lengths(split.rows)
+            );
+
+        expanded.vcf <- vcf.data[row.indices, ];
+
+        expanded.vcf$ID <- unlist(split.rows);
+
+        split.rsid.vcf.data <- expanded.vcf;
+    } else {
+        split.rsid.vcf.data <- vcf.data;
+    }
 
         # merge missing SNP data on split rsID
         merged.vcf.with.missing.pgs.data <- merge(