@@ -105,30 +105,31 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
105105 missing.snp.pgs.weight.data <- subset(missing.snp.merged.data , select = colnames(pgs.weight.data ));
106106 rm(missing.snp.merged.data );
107107
108- # Expand the VCF$ID column to a row-per-rsID format.
109- # Some variants have multiple rsIDs in the ID column separated by semicolons.
110- # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
111- # we create a new data frame with the expanded rsID data
112- if (any(grepl(' ;' , vcf.data $ ID ))) {
113- split.rows <- strsplit(
114- x = as.character(vcf.data $ ID ),
115- split = ' ;' ,
116- fixed = TRUE
117- );
118-
119- row.indices <- rep(
120- x = seq_len(nrow(vcf.data )),
121- times = lengths(split.rows )
122- );
123-
124- expanded.vcf <- vcf.data [row.indices , ];
108+ # Expand the VCF$ID column to a row-per-rsID format.
109+ # Some variants have multiple rsIDs in the ID column separated by semicolons.
110+ # We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
111+ # The result is stored in split.rsid.vcf.data, with the original (unsplit) ID string kept in the ID.vcf.unsplit column.
112+ if (any(grepl(' ;' , vcf.data $ ID ))) {
113+ split.rows <- strsplit(
114+ x = as.character(vcf.data $ ID ),
115+ split = ' ;' ,
116+ fixed = TRUE
117+ );
118+
119+ row.indices <- rep(
120+ x = seq_len(nrow(vcf.data )),
121+ times = lengths(split.rows )
122+ );
123+
124+ split.rsid.vcf.data <- vcf.data [row.indices , ];
125+
126+ split.rsid.vcf.data $ ID.vcf.unsplit <- split.rsid.vcf.data $ ID ; # save original rsID names for final output
127+ split.rsid.vcf.data $ ID <- unlist(split.rows );
125128
126- expanded.vcf $ ID <- unlist(split.rows );
127-
128- split.rsid.vcf.data <- expanded.vcf ;
129- } else {
130- split.rsid.vcf.data <- vcf.data ;
131- }
129+ } else {
130+ vcf.data $ ID.vcf.unsplit <- vcf.data $ ID ; # save an ID.vcf.unsplit column for consistency
131+ split.rsid.vcf.data <- vcf.data ;
132+ }
132133
133134 # merge missing SNP data on split rsID
134135 merged.vcf.with.missing.pgs.data <- merge(
@@ -162,7 +163,8 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
162163
163164 # add columns to match original merge
164165 merged.vcf.with.missing.pgs.data $ ID.pgs <- merged.vcf.with.missing.pgs.data $ ID ;
165- merged.vcf.with.missing.pgs.data $ ID.vcf <- merged.vcf.with.missing.pgs.data $ ID ;
166+ merged.vcf.with.missing.pgs.data $ ID.vcf <- merged.vcf.with.missing.pgs.data $ ID.vcf.unsplit ;
167+ merged.vcf.with.missing.pgs.data $ ID.vcf.unsplit <- NULL ;
166168 merged.vcf.with.missing.pgs.data $ merge.strategy <- ' rsID' ;
167169
168170 # subset columns to match original merge
0 commit comments