Skip to content

Commit 07b8d71

Browse files
authored
Merge pull request #75 from uclahs-cds/ragrawal-fix-semicolon
Ragrawal fix semicolon
2 parents 1922324 + 7aff72b commit 07b8d71

File tree

4 files changed

+158
-20
lines changed

4 files changed

+158
-20
lines changed

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ Version: 3.0.2
55
Authors@R: c(
66
person('Paul', 'Boutros', role = 'cre', email = '[email protected]'),
77
person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
8-
person('Rachel', 'Dang', role = 'ctb'))
8+
person('Rachel', 'Dang', role = 'ctb'),
9+
person('Raag', 'Agrawal', role = 'ctb'))
910
Description: Simple and transparent parsing of genotype/dosage data
1011
from an input Variant Call Format (VCF) file, matching of genotype
1112
coordinates to the component Single Nucleotide Polymorphisms (SNPs)
@@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data
1415
in accordance with the additive weighted sum of dosages model. Methods
1516
are designed in reference to best practices described by
1617
Collister, Liu, and Clifton (2022) <doi:10.3389/fgene.2022.818574>.
17-
Depends:
18+
Depends:
1819
R (>= 4.2.0)
1920
Imports:
2021
vcfR,
@@ -23,7 +24,7 @@ Imports:
2324
reshape2,
2425
BoutrosLab.plotting.general,
2526
lattice
26-
Suggests:
27+
Suggests:
2728
knitr,
2829
rmarkdown,
2930
testthat (>= 3.0.0)

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# Unreleased
22

3+
# ApplyPolygenicScore unreleased
4+
## Changed
5+
* Fixed regression of combine.vcf.with.pgs() function that prevented it from handling multiple rsIDs on the same line.
6+
* added new contributor
7+
38
# ApplyPolygenicScore 3.0.2
49

510
## Changed

R/combine-vcf-with-pgs.R

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -105,23 +105,34 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
105105
missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
106106
rm(missing.snp.merged.data);
107107

108-
# Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
109-
# most efficient way to do this is to use the data.table package
108+
# Expand the VCF$ID column to a row-per-rsID format.
109+
# Some variants have multiple rsIDs in the ID column separated by semicolons.
110+
# We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
111+
# we create a new data frame with the expanded rsID data
110112
if (any(grepl(';', vcf.data$ID))) {
111-
data.table::setDT(vcf.data);
112-
split.rsid.vcf.data <- merge(
113-
x = vcf.data,
114-
# split only entries with multiple rsIDs, save in new column, and merge back with the original data
115-
y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS'))
116-
][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))],
117-
by = c('CHROM', 'POS', 'Indiv'),
118-
all = TRUE
113+
split.rows <- strsplit(
114+
x = as.character(vcf.data$ID),
115+
split = ';',
116+
fixed = TRUE
119117
);
120-
# replace entries with multiple rsIDs with the new, split, rsID
121-
split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
122-
} else {
118+
119+
# remove duplicate IDs
120+
split.rows <- lapply(split.rows, function(x) unique(x));
121+
122+
row.indices <- rep(
123+
x = seq_len(nrow(vcf.data)),
124+
times = lengths(split.rows)
125+
);
126+
127+
split.rsid.vcf.data <- vcf.data[row.indices, ];
128+
129+
split.rsid.vcf.data$ID.vcf.unsplit <- split.rsid.vcf.data$ID; # save original rsID names for final output
130+
split.rsid.vcf.data$ID <- unlist(split.rows);
131+
132+
} else {
133+
vcf.data$ID.vcf.unsplit <- vcf.data$ID; # save an ID.vcf.unsplit column for consistency
123134
split.rsid.vcf.data <- vcf.data;
124-
}
135+
}
125136

126137
# merge missing SNP data on split rsID
127138
merged.vcf.with.missing.pgs.data <- merge(
@@ -155,7 +166,8 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
155166

156167
# add columns to match original merge
157168
merged.vcf.with.missing.pgs.data$ID.pgs <- merged.vcf.with.missing.pgs.data$ID;
158-
merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID;
169+
merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID.vcf.unsplit;
170+
merged.vcf.with.missing.pgs.data$ID.vcf.unsplit <- NULL;
159171
merged.vcf.with.missing.pgs.data$merge.strategy <- 'rsID';
160172

161173
# subset columns to match original merge

tests/testthat/test-vcf-pgs-merge.R

Lines changed: 122 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,13 +284,22 @@ test_that(
284284
);
285285
test.vcf.data.missing.locus.matching.rsid <- data.frame(
286286
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
287-
POS = c(1, 3, 3, 6),
287+
POS = c(1, 3, 3, 6), #rs2/chr2:3 is missing by POS but present by rsID, #rs5/chr4:6 is missing by rsID AND by POS
288288
ID = c('rs1', 'rs3', 'rs2', 'rs5'),
289289
REF = c('A', 'T', 'C', 'G'),
290290
ALT = c('T', 'A', 'G', 'C'),
291291
Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
292292
gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
293293
);
294+
test.vcf.data.missing.locus.matching.rsid.with.semicolons <- data.frame(
295+
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
296+
POS = c(1, 3, 3, 6),
297+
ID = c('rs1;rs1', 'rs3', 'rs2;rsB', 'rs5;rsC'),
298+
REF = c('A', 'T', 'C', 'G'),
299+
ALT = c('T', 'A', 'G', 'C'),
300+
Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
301+
gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
302+
);
294303
test.pgs.weight.data <- data.frame(
295304
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
296305
POS = c(1, 3, 2, 4),
@@ -316,6 +325,14 @@ test_that(
316325
'PGS is missing 1 SNPs from VCF'
317326
);
318327

328+
expect_warning(
329+
combine.vcf.with.pgs(
330+
vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
331+
pgs.weight.data = test.pgs.weight.data
332+
),
333+
'PGS is missing 1 SNPs from VCF'
334+
);
335+
319336
test.combine.vcf.with.pgs.no.missing <- combine.vcf.with.pgs(
320337
vcf.data = test.vcf.data.no.missing,
321338
pgs.weight.data = test.pgs.weight.data
@@ -331,6 +348,11 @@ test_that(
331348
pgs.weight.data = test.pgs.weight.data
332349
);
333350

351+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons <- combine.vcf.with.pgs(
352+
vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
353+
pgs.weight.data = test.pgs.weight.data
354+
);
355+
334356
# check that combine.vcf.with.pgs returns the correct number of rows
335357
expect_equal(
336358
nrow(test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data),
@@ -344,6 +366,10 @@ test_that(
344366
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
345367
4
346368
);
369+
expect_equal(
370+
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
371+
4
372+
);
347373

348374
expect_equal(
349375
nrow(test.combine.vcf.with.pgs.missing$missing.snp.data),
@@ -357,6 +383,10 @@ test_that(
357383
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data),
358384
1
359385
);
386+
expect_equal(
387+
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data),
388+
1
389+
);
360390

361391
# check that combine.vcf.with.pgs returns the correct number of columns
362392
expect_equal(
@@ -371,6 +401,10 @@ test_that(
371401
ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
372402
11
373403
);
404+
expect_equal(
405+
ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
406+
11
407+
);
374408

375409
# check that combine.vcf.with.pgs returns the correct columns
376410
expect_true(
@@ -382,6 +416,9 @@ test_that(
382416
expect_true(
383417
all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data))
384418
);
419+
expect_true(
420+
all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data))
421+
);
385422

386423
# check that combine.vcf.with.pgs returns the correct values
387424
expect_equal(
@@ -396,7 +433,10 @@ test_that(
396433
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$CHROM,
397434
c('chr1', 'chr3', 'chr2', 'chr4')
398435
);
399-
436+
expect_equal(
437+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$CHROM,
438+
c('chr1', 'chr3', 'chr2', 'chr4')
439+
);
400440

401441
expect_equal(
402442
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$POS,
@@ -410,6 +450,10 @@ test_that(
410450
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$POS,
411451
c(1, 3, 3, 4)
412452
);
453+
expect_equal(
454+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$POS,
455+
c(1, 3, 3, 4)
456+
);
413457

414458
expect_equal(
415459
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
@@ -423,7 +467,44 @@ test_that(
423467
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.pgs,
424468
c('rs1', 'rs3', 'rs2', 'rs4')
425469
);
470+
expect_equal(
471+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.pgs,
472+
c('rs1', 'rs3', 'rs2', 'rs4')
473+
);
426474

475+
expect_equal(
476+
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
477+
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.vcf
478+
);
479+
expect_equal(
480+
test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$ID.vcf,
481+
c('rs1', 'rs2', 'rs3', NA)
482+
);
483+
expect_equal(
484+
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.vcf,
485+
c('rs1', 'rs3', 'rs2', NA)
486+
);
487+
expect_equal(
488+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.vcf,
489+
c('rs1;rs1', 'rs3', 'rs2;rsB', NA)
490+
);
491+
492+
expect_equal(
493+
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$merge.strategy,
494+
rep('genomic coordinate', 4)
495+
);
496+
expect_equal(
497+
test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$merge.strategy,
498+
c(rep('genomic coordinate', 3), 'rsID')
499+
);
500+
expect_equal(
501+
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$merge.strategy,
502+
c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
503+
);
504+
expect_equal(
505+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$merge.strategy,
506+
c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
507+
);
427508

428509
# check that combine.vcf.with.pgs returns the correct values for missing SNPs
429510
expect_equal(
@@ -512,6 +593,45 @@ test_that(
512593
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data$gt_GT_alleles)
513594
);
514595

596+
expect_equal(
597+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.pgs,
598+
'chr4'
599+
);
600+
expect_true(
601+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.vcf)
602+
);
603+
expect_equal(
604+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.pgs,
605+
4
606+
);
607+
expect_true(
608+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.vcf)
609+
);
610+
expect_equal(
611+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ID,
612+
'rs4'
613+
);
614+
expect_equal(
615+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$effect_allele,
616+
'G'
617+
);
618+
expect_equal(
619+
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$beta,
620+
4
621+
);
622+
expect_true(
623+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$REF)
624+
);
625+
expect_true(
626+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ALT)
627+
);
628+
expect_true(
629+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$Indiv)
630+
);
631+
expect_true(
632+
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$gt_GT_alleles)
633+
);
634+
515635
}
516636
);
517637

0 commit comments

Comments
 (0)