Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ Version: 3.0.2
Authors@R: c(
person('Paul', 'Boutros', role = 'cre', email = 'PBoutros@mednet.ucla.edu'),
person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
person('Rachel', 'Dang', role = 'ctb'))
person('Rachel', 'Dang', role = 'ctb'),
person('Raag', 'Agrawal', role = 'ctb'))
Description: Simple and transparent parsing of genotype/dosage data
from an input Variant Call Format (VCF) file, matching of genotype
coordinates to the component Single Nucleotide Polymorphisms (SNPs)
Expand All @@ -14,7 +15,7 @@ Description: Simple and transparent parsing of genotype/dosage data
in accordance with the additive weighted sum of dosages model. Methods
are designed in reference to best practices described by
Collister, Liu, and Clifton (2022) <doi:10.3389/fgene.2022.818574>.
Depends:
Depends:
R (>= 4.2.0)
Imports:
vcfR,
Expand All @@ -23,7 +24,7 @@ Imports:
reshape2,
BoutrosLab.plotting.general,
lattice
Suggests:
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Unreleased

# ApplyPolygenicScore unreleased
## Changed
* Fixed regression of combine.vcf.with.pgs() function that prevented it from handling multiple rsIDs on the same line.
* added new contributor

# ApplyPolygenicScore 3.0.2

## Changed
Expand Down
42 changes: 27 additions & 15 deletions R/combine-vcf-with-pgs.R
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great improvement on clarity. I noticed that the original iteration uses the Indiv column, is that column not necessary or not always available?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we replicate all columns in the dataframe so we don't explicitely need to reference Indiv.

Original file line number Diff line number Diff line change
Expand Up @@ -105,23 +105,34 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {
missing.snp.pgs.weight.data <- subset(missing.snp.merged.data, select = colnames(pgs.weight.data));
rm(missing.snp.merged.data);

# Split VCF$ID column into separate rows for each rsID (multiple rsIDs are separated by ;)
# most efficient way to do this is to use the data.table package
# Expand the VCF$ID column to a row-per-rsID format.
# Some variants have multiple rsIDs in the ID column separated by semicolons.
# We detect such cases using grepl, split them, and expand the data so that each rsID has its own row.
# we create a new data frame with the expanded rsID data
if (any(grepl(';', vcf.data$ID))) {
data.table::setDT(vcf.data);
split.rsid.vcf.data <- merge(
x = vcf.data,
# split only entries with multiple rsIDs, save in new column, and merge back with the original data
y = vcf.data[grepl(';', get('ID')), unique(unlist(strsplit(as.character(get('ID')), ';', fixed = TRUE))), by = .(get('Indiv'), get('CHROM'), get('POS'))
][,.(new.ID = get('V1'), get('Indiv'), get('CHROM'), get('POS'))],
by = c('CHROM', 'POS', 'Indiv'),
all = TRUE
split.rows <- strsplit(
x = as.character(vcf.data$ID),
split = ';',
fixed = TRUE
);
# replace entries with multiple rsIDs with the new, split, rsID
split.rsid.vcf.data <- split.rsid.vcf.data[!is.na(new.ID), ID := new.ID][, new.ID := NULL];
} else {

# remove duplicate IDs
split.rows <- lapply(split.rows, function(x) unique(x));

row.indices <- rep(
x = seq_len(nrow(vcf.data)),
times = lengths(split.rows)
);

split.rsid.vcf.data <- vcf.data[row.indices, ];

split.rsid.vcf.data$ID.vcf.unsplit <- split.rsid.vcf.data$ID; # save original rsID names for final output
split.rsid.vcf.data$ID <- unlist(split.rows);

} else {
vcf.data$ID.vcf.unsplit <- vcf.data$ID; # save an ID.vcf.unsplit column for consistency
split.rsid.vcf.data <- vcf.data;
}
}

# merge missing SNP data on split rsID
merged.vcf.with.missing.pgs.data <- merge(
Expand Down Expand Up @@ -155,7 +166,8 @@ combine.vcf.with.pgs <- function(vcf.data, pgs.weight.data) {

# add columns to match original merge
merged.vcf.with.missing.pgs.data$ID.pgs <- merged.vcf.with.missing.pgs.data$ID;
merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID;
merged.vcf.with.missing.pgs.data$ID.vcf <- merged.vcf.with.missing.pgs.data$ID.vcf.unsplit;
merged.vcf.with.missing.pgs.data$ID.vcf.unsplit <- NULL;
merged.vcf.with.missing.pgs.data$merge.strategy <- 'rsID';

# subset columns to match original merge
Expand Down
124 changes: 122 additions & 2 deletions tests/testthat/test-vcf-pgs-merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -284,13 +284,22 @@ test_that(
);
test.vcf.data.missing.locus.matching.rsid <- data.frame(
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
POS = c(1, 3, 3, 6),
POS = c(1, 3, 3, 6), #rs2/chr2:3 is missing by POS but present by rsID, #rs5/chr4:6 is missing by rsID AND by POS
ID = c('rs1', 'rs3', 'rs2', 'rs5'),
REF = c('A', 'T', 'C', 'G'),
ALT = c('T', 'A', 'G', 'C'),
Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
);
test.vcf.data.missing.locus.matching.rsid.with.semicolons <- data.frame(
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
POS = c(1, 3, 3, 6),
ID = c('rs1;rs1', 'rs3', 'rs2;rsB', 'rs5;rsC'),
REF = c('A', 'T', 'C', 'G'),
ALT = c('T', 'A', 'G', 'C'),
Indiv = c('sample1', 'sample2', 'sample3', 'sample4'),
gt_GT_alleles = c('A/T', 'T/A', 'C/G', 'G/C')
);
test.pgs.weight.data <- data.frame(
CHROM = c('chr1', 'chr3', 'chr2', 'chr4'),
POS = c(1, 3, 2, 4),
Expand All @@ -316,6 +325,14 @@ test_that(
'PGS is missing 1 SNPs from VCF'
);

expect_warning(
combine.vcf.with.pgs(
vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
pgs.weight.data = test.pgs.weight.data
),
'PGS is missing 1 SNPs from VCF'
);

test.combine.vcf.with.pgs.no.missing <- combine.vcf.with.pgs(
vcf.data = test.vcf.data.no.missing,
pgs.weight.data = test.pgs.weight.data
Expand All @@ -331,6 +348,11 @@ test_that(
pgs.weight.data = test.pgs.weight.data
);

test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons <- combine.vcf.with.pgs(
vcf.data = test.vcf.data.missing.locus.matching.rsid.with.semicolons,
pgs.weight.data = test.pgs.weight.data
);

# check that combine.vcf.with.pgs returns the correct number of rows
expect_equal(
nrow(test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data),
Expand All @@ -344,6 +366,10 @@ test_that(
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
4
);
expect_equal(
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
4
);

expect_equal(
nrow(test.combine.vcf.with.pgs.missing$missing.snp.data),
Expand All @@ -357,6 +383,10 @@ test_that(
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data),
1
);
expect_equal(
nrow(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data),
1
);

# check that combine.vcf.with.pgs returns the correct number of columns
expect_equal(
Expand All @@ -371,6 +401,10 @@ test_that(
ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data),
11
);
expect_equal(
ncol(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data),
11
);

# check that combine.vcf.with.pgs returns the correct columns
expect_true(
Expand All @@ -382,6 +416,9 @@ test_that(
expect_true(
all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data))
);
expect_true(
all(c('CHROM', 'POS', 'REF', 'ALT', 'Indiv', 'gt_GT_alleles', 'effect_allele', 'beta', 'ID.pgs', 'ID.vcf', 'merge.strategy') %in% colnames(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data))
);

# check that combine.vcf.with.pgs returns the correct values
expect_equal(
Expand All @@ -396,7 +433,10 @@ test_that(
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$CHROM,
c('chr1', 'chr3', 'chr2', 'chr4')
);

expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$CHROM,
c('chr1', 'chr3', 'chr2', 'chr4')
);

expect_equal(
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$POS,
Expand All @@ -410,6 +450,10 @@ test_that(
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$POS,
c(1, 3, 3, 4)
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$POS,
c(1, 3, 3, 4)
);

expect_equal(
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
Expand All @@ -423,7 +467,44 @@ test_that(
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.pgs,
c('rs1', 'rs3', 'rs2', 'rs4')
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.pgs,
c('rs1', 'rs3', 'rs2', 'rs4')
);

expect_equal(
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.pgs,
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$ID.vcf
);
expect_equal(
test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$ID.vcf,
c('rs1', 'rs2', 'rs3', NA)
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$ID.vcf,
c('rs1', 'rs3', 'rs2', NA)
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$ID.vcf,
c('rs1;rs1', 'rs3', 'rs2;rsB', NA)
);

expect_equal(
test.combine.vcf.with.pgs.no.missing$merged.vcf.with.pgs.data$merge.strategy,
rep('genomic coordinate', 4)
);
expect_equal(
test.combine.vcf.with.pgs.missing$merged.vcf.with.pgs.data$merge.strategy,
c(rep('genomic coordinate', 3), 'rsID')
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid$merged.vcf.with.pgs.data$merge.strategy,
c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$merged.vcf.with.pgs.data$merge.strategy,
c('genomic coordinate', 'genomic coordinate', 'rsID','rsID')
);

# check that combine.vcf.with.pgs returns the correct values for missing SNPs
expect_equal(
Expand Down Expand Up @@ -512,6 +593,45 @@ test_that(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid$missing.snp.data$gt_GT_alleles)
);

expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.pgs,
'chr4'
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$CHROM.vcf)
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.pgs,
4
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$POS.vcf)
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ID,
'rs4'
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$effect_allele,
'G'
);
expect_equal(
test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$beta,
4
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$REF)
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$ALT)
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$Indiv)
);
expect_true(
is.na(test.combine.vcf.with.pgs.missing.locus.matching.rsid.with.semicolons$missing.snp.data$gt_GT_alleles)
);

}
);

Expand Down