@@ -105,19 +105,30 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
105105 missing.snp.pgs.weight.data <- subset(missing.snp.merged.data , select = colnames(pgs.weight.data ));
106106 rm(missing.snp.merged.data );
107107
108- # Split VCF$ID column into separate rows for each rsID (multiple rsIDs separated by ;)
109- if (any(grepl(' ;' , vcf.data $ ID ))) {
110- split.rows <- strsplit(
111- as.character(vcf.data $ ID ),
112- ' ;' ,
113- fixed = TRUE
114- );
115- expanded.vcf <- vcf.data [rep(seq_len(nrow(vcf.data )), lengths(split.rows )), ]
116- expanded.vcf $ ID <- unlist(split.rows )
117- split.rsid.vcf.data <- expanded.vcf
118- } else {
119- split.rsid.vcf.data <- vcf.data
120- }
108+ # Expand the VCF$ID column to a row-per-rsID format.
109+ # Some variants have multiple rsIDs in the ID column separated by semicolons.
110+ # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
111+ # The result is stored in a new data frame, split.rsid.vcf.data, holding the expanded rsID data.
112+ if (any(grepl(' ;' , vcf.data $ ID ))) {
113+ split.rows <- strsplit(
114+ x = as.character(vcf.data $ ID ),
115+ split = ' ;' ,
116+ fixed = TRUE
117+ );
118+
119+ row.indices <- rep(
120+ x = seq_len(nrow(vcf.data )),
121+ times = lengths(split.rows )
122+ );
123+
124+ expanded.vcf <- vcf.data [row.indices , ];
125+
126+ expanded.vcf $ ID <- unlist(split.rows );
127+
128+ split.rsid.vcf.data <- expanded.vcf ;
129+ } else {
130+ split.rsid.vcf.data <- vcf.data ;
131+ }
121132
122133 # merge missing SNP data on split rsID
123134 merged.vcf.with.missing.pgs.data <- merge(
0 commit comments