Skip to content

Commit d595990

Browse files
authored
Merge pull request #86 from uclahs-cds/nzeltser-optimize-main-functions
optimize main functions
2 parents 3c680bb + 8b66e5f commit d595990

23 files changed

+1895
-642
lines changed

DESCRIPTION

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: ApplyPolygenicScore
22
Type: Package
33
Title: Utilities for the Application of a Polygenic Score to a VCF
4-
Version: 3.1.0
4+
Version: 4.0.0
55
Authors@R: c(
66
person('Paul', 'Boutros', role = 'cre', email = '[email protected]'),
77
person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
@@ -21,7 +21,6 @@ Imports:
2121
vcfR,
2222
pROC,
2323
data.table,
24-
reshape2,
2524
BoutrosLab.plotting.general,
2625
lattice
2726
Suggests:

NAMESPACE

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,5 @@
11
import('vcfR');
2-
importFrom(
3-
'data.table',
4-
'tstrsplit',
5-
'setDT',
6-
':='
7-
);
8-
import('reshape2');
2+
import('data.table');
93
import('BoutrosLab.plotting.general');
104
importFrom(
115
'lattice',

NEWS.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22

33
# ApplyPolygenicScore unreleased
44

5+
# ApplyPolygenicScore 4.0.0
6+
7+
## Changed
8+
* Refactored all main functions for gains in RAM efficiency and runtime
9+
* Introduced a breaking change to the output of `import.vcf`. The outputed list object has a different naming scheme and different data formats. Previous data formats are still supported by setting `long.format` to `TRUE`, however the naming scheme is still changed.
10+
* Introduced a breaking change to `apply.polygenic.score`. The expected default `vcf.data` input format has changed. The previous input format is still supported by setting `vcf.long.format` to `TRUE` from the default `FALSE`.
11+
12+
## Added
13+
* Added support for more efficient storage and manipulation of imported VCF data. The default output of `import.vcf` now returns VCF data in a split format. A `data.table` object contains VCF data from fixed fields (CHROM, POS, ID, REF, ALT). A `matrix` object contains sample-specific genotypes in allele-format in a sample (columns) by variant (rows) matrix.
14+
515
# ApplyPolygenicScore 3.1.0
616

717
## Changed

R/apply-pgs.R

Lines changed: 262 additions & 168 deletions
Large diffs are not rendered by default.

R/assess-strand-flip.R

Lines changed: 169 additions & 93 deletions
Large diffs are not rendered by default.

R/calculate-dosage.R

Lines changed: 90 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Handling CRAN warnings for data.table syntax:
2+
if (getRversion() >= '2.15.1') utils::globalVariables(c('dosage'));
3+
14
#' @title Convert alleles to dosage
25
#' @description Convert genotype calls in the form of witten out alleles (e.g. 'A/T') to dosages (0, 1, 2) based on provided risk alleles from a PGS.
36
#' @param called.alleles A vector of genotypes in allelic notation separated by a slash or pipe.
@@ -9,10 +12,34 @@
912
#' convert.alleles.to.pgs.dosage(called.alleles, risk.alleles);
1013
#' @export
1114
convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
12-
# check that risk.alleles is the same length as called.alleles
13-
if (length(called.alleles) != length(risk.alleles)) {
14-
stop('called.alleles and risk.alleles must be the same length.');
15+
16+
# Check input class and convert to a matrix for consistent processing
17+
is.vector.input <- is.vector(called.alleles);
18+
if (is.vector.input) {
19+
# Fast-fail for all missing genotypes
20+
if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
21+
return(rep(NA, length(called.alleles)));
22+
}
23+
called.alleles.matrix <- matrix(called.alleles, ncol = 1);
24+
} else if (is.matrix(called.alleles)) {
25+
# Fast-fail for all missing genotypes
26+
if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
27+
return(matrix(NA, nrow = nrow(called.alleles), ncol = ncol(called.alleles), dimnames = dimnames(called.alleles)));
1528
}
29+
called.alleles.matrix <- called.alleles;
30+
} else {
31+
stop("Unrecognized 'called.alleles' format. Must be a vector or a matrix.");
32+
}
33+
34+
# Check that called.alleles.matrix has rows corresponding to risk.alleles
35+
if (nrow(called.alleles.matrix) != length(risk.alleles)) {
36+
stop('Number of rows in called.alleles must equal length of risk.alleles.');
37+
}
38+
39+
# # check that risk.alleles is the same length as called.alleles
40+
# if (length(called.alleles) != length(risk.alleles)) {
41+
# stop('called.alleles and risk.alleles must be the same length.');
42+
# }
1643

1744
# check for missing risk alleles and warn
1845
if (any(is.na(risk.alleles))) {
@@ -24,68 +51,74 @@ convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
2451
stop('unrecognized risk.allele format, must be capitalized letters.');
2552
}
2653

27-
# handle totally missing genotypes
28-
# if the entire vector is NA or the entire vector is '.', return NA
29-
if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
30-
split.alleles <- data.frame(called.alleles, called.alleles);
31-
} else {
32-
# check that called.alleles is a vector of genotypes in allelic notation or '.' separated by a slash or pipe
33-
# "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
34-
allowed.pattern <- '^((([A-Z]+|\\.|\\*)[/\\|]([A-Z]+|\\.|\\*))|\\.|[A-Z]+)$' # '|' are special chars in regular expressions
35-
passing.alleles <- grepl(allowed.pattern, called.alleles);
36-
passing.alleles[is.na(called.alleles)] <- TRUE; # NA allowed
37-
if (!all(passing.alleles)) {
38-
stop('unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.');
39-
}
40-
# replace hemizygous genotypes with a placeholder for easier splitting
41-
# index for non-NA alleles that are missing allele separators:
42-
no.sep.index <- (!grepl('/|\\|', called.alleles) & !is.na(called.alleles) & called.alleles != '.');
43-
called.alleles[no.sep.index] <- paste0(called.alleles[no.sep.index], '/-');
44-
split.alleles <- data.table::tstrsplit(called.alleles, split = c('/|\\|'), keep = c(1,2)); # '|' are special chars in regular expressions
45-
}
46-
names(split.alleles) <- c('called.allele.a', 'called.allele.b');
47-
48-
# replace 'NA' with '.' for easier comparisons
49-
missing.label <- '.';
50-
split.alleles <- lapply(
51-
X = split.alleles,
52-
FUN = function(x) {
53-
x[is.na(x)] <- missing.label;
54-
return(x);
54+
# Vectorized validation and handling of called alleles
55+
# "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
56+
allowed.pattern <- '^((([A-Z]+|\\.|\\*)[/\\|]([A-Z]+|\\.|\\*))|\\.|[A-Z]+)$';
57+
passing.alleles <- grepl(allowed.pattern, called.alleles);
58+
passing.alleles[is.na(called.alleles)] <- TRUE;
59+
if (!all(passing.alleles)) {
60+
stop('unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.');
5561
}
62+
63+
# Replace hemizygous genotypes with a placeholder for consistent splitting
64+
no.sep.index <- (!grepl('/|\\|', called.alleles) & !is.na(called.alleles) & called.alleles != '.');
65+
called.alleles[no.sep.index] <- paste0(called.alleles[no.sep.index], '/-');
66+
called.alleles.matrix <- matrix(called.alleles, nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
67+
68+
# Split the entire matrix of alleles into two matrices, one for each allele
69+
alleles.split <- data.table::tstrsplit(as.vector(called.alleles.matrix), split = '/|\\|', fixed = FALSE);
70+
alleles.a <- matrix(alleles.split[[1]], nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
71+
alleles.b <- matrix(alleles.split[[2]], nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
72+
73+
# Replicate risk.alleles across columns for vectorized comparison
74+
risk.alleles.matrix <- matrix(
75+
rep(risk.alleles, times = ncol(called.alleles.matrix)),
76+
nrow = nrow(called.alleles.matrix)
5677
);
5778

58-
dosage <- rep(NA, length(called.alleles));
59-
for (i in 1:length(called.alleles)) {
60-
if (is.na(risk.alleles[i])) {
61-
dosage[i] <- NA; # if the risk allele is missing, return NA, no dosage can be calculated
62-
} else if ((split.alleles$called.allele.a[i] == missing.label) & (split.alleles$called.allele.b[i] == missing.label)) {
63-
dosage[i] <- NA; # if both allelles are missing, no genotype was called, return NA
64-
} else if (split.alleles$called.allele.a[i] == missing.label | split.alleles$called.allele.b[i] == missing.label) {
65-
dosage[i] <- NA; # if one of the alleles is marked as missing but the other is not, this is an unrecognized format
66-
warning('one of two alleles is marked as missing at index ', i, ', this is an unrecognized format, returning NA for dosage.');
67-
} else if (split.alleles$called.allele.a[i] == risk.alleles[i] & split.alleles$called.allele.b[i] == risk.alleles[i]) {
68-
dosage[i] <- 2; # if both alleles are the risk allele, the genotype is homozygous for the effect allele and the dosage is 2.
69-
} else if (split.alleles$called.allele.a[i] == risk.alleles[i] | split.alleles$called.allele.b[i] == risk.alleles[i]) {
70-
dosage[i] <- 1; # if only one of the alleles is the risk allele, the genotype is heterozygous and the dosage is 1.
71-
} else {
72-
dosage[i] <- 0; # if neither allele is the risk allele, the genotype is homozygous for the non-effect allele and the dosage is 0.
73-
}
74-
}
75-
return(dosage);
79+
# Compute dosage (0, 1, 2)
80+
# Initialize dosage matrix with zeros
81+
dosage.matrix <- matrix(0L, nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
82+
# Add 1 to dosage for each instance of the risk allele
83+
dosage.matrix <- dosage.matrix + (alleles.a == risk.alleles.matrix);
84+
dosage.matrix <- dosage.matrix + (alleles.b == risk.alleles.matrix);
85+
86+
# Handle special cases
87+
# Check for missing alleles ('NA' or '.') for both NA assignment and warning
88+
is.missing.a <- is.na(alleles.a) | (alleles.a == '.');
89+
is.missing.b <- is.na(alleles.b) | (alleles.b == '.');
90+
91+
# Case where one allele is marked as missing and the other is not (e.g. `./A`)
92+
# This should return NA and issue a warning
93+
is.one.missing <- (is.missing.a & !is.missing.b) | (!is.missing.a & is.missing.b);
94+
if (any(is.one.missing)) {
95+
warning('some genotypes contain a missing allele, returning NA for corresponding dosage.');
96+
}
97+
98+
# Apply the final NA mask
99+
na.mask <- is.missing.a | is.missing.b | is.na(called.alleles.matrix) | is.na(risk.alleles.matrix);
100+
dosage.matrix[na.mask] <- NA;
101+
102+
# Restore the matrix dimensions and dimnames
103+
dimnames(dosage.matrix) <- dimnames(called.alleles.matrix);
104+
105+
# If the original input was a vector, convert the output back to a vector
106+
if (is.vector.input) {
107+
return(as.vector(dosage.matrix));
108+
} else {
109+
return(dosage.matrix);
110+
}
111+
76112
}
77113

78114
# The function for calculating a dosage value intended to replace missing genotypes.
79115
calculate.missing.genotype.dosage <- function(dosage.matrix) {
80116
# calculate the mean dosage for each variant
81-
mean.dosage <- apply(
82-
X = dosage.matrix,
83-
MARGIN = 1,
84-
FUN = function(x) {
85-
# simple mean
86-
mean(x, na.rm = TRUE)
87-
}
88-
);
117+
mean.dosage <- rowMeans(x = dosage.matrix, na.rm = TRUE);
118+
119+
# replace NaN (from all NA rows) with NA
120+
mean.dosage[is.nan(mean.dosage)] <- NA;
121+
89122
return(mean.dosage);
90123
}
91124

0 commit comments

Comments
 (0)