1+ # Handling CRAN warnings for data.table syntax:
2+ if (getRversion() > = ' 2.15.1' ) utils :: globalVariables(c(' dosage' ));
3+
14# ' @title Convert alleles to dosage
25# ' @description Convert genotype calls in the form of witten out alleles (e.g. 'A/T') to dosages (0, 1, 2) based on provided risk alleles from a PGS.
36# ' @param called.alleles A vector of genotypes in allelic notation separated by a slash or pipe.
912# ' convert.alleles.to.pgs.dosage(called.alleles, risk.alleles);
1013# ' @export
1114convert.alleles.to.pgs.dosage <- function (called.alleles , risk.alleles ) {
12- # check that risk.alleles is the same length as called.alleles
13- if (length(called.alleles ) != length(risk.alleles )) {
14- stop(' called.alleles and risk.alleles must be the same length.' );
15+
16+ # Check input class and convert to a matrix for consistent processing
17+ is.vector.input <- is.vector(called.alleles );
18+ if (is.vector.input ) {
19+ # Fast-fail for all missing genotypes
20+ if (all(is.na(called.alleles )) | all(called.alleles == ' .' )) {
21+ return (rep(NA , length(called.alleles )));
22+ }
23+ called.alleles.matrix <- matrix (called.alleles , ncol = 1 );
24+ } else if (is.matrix(called.alleles )) {
25+ # Fast-fail for all missing genotypes
26+ if (all(is.na(called.alleles )) | all(called.alleles == ' .' )) {
27+ return (matrix (NA , nrow = nrow(called.alleles ), ncol = ncol(called.alleles ), dimnames = dimnames(called.alleles )));
1528 }
29+ called.alleles.matrix <- called.alleles ;
30+ } else {
31+ stop(" Unrecognized 'called.alleles' format. Must be a vector or a matrix." );
32+ }
33+
34+ # Check that called.alleles.matrix has rows corresponding to risk.alleles
35+ if (nrow(called.alleles.matrix ) != length(risk.alleles )) {
36+ stop(' Number of rows in called.alleles must equal length of risk.alleles.' );
37+ }
38+
39+ # # check that risk.alleles is the same length as called.alleles
40+ # if (length(called.alleles) != length(risk.alleles)) {
41+ # stop('called.alleles and risk.alleles must be the same length.');
42+ # }
1643
1744 # check for missing risk alleles and warn
1845 if (any(is.na(risk.alleles ))) {
@@ -24,68 +51,74 @@ convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
2451 stop(' unrecognized risk.allele format, must be capitalized letters.' );
2552 }
2653
27- # handle totally missing genotypes
28- # if the entire vector is NA or the entire vector is '.', return NA
29- if (all(is.na(called.alleles )) | all(called.alleles == ' .' )) {
30- split.alleles <- data.frame (called.alleles , called.alleles );
31- } else {
32- # check that called.alleles is a vector of genotypes in allelic notation or '.' separated by a slash or pipe
33- # "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
34- allowed.pattern <- ' ^((([A-Z]+|\\ .|\\ *)[/\\ |]([A-Z]+|\\ .|\\ *))|\\ .|[A-Z]+)$' # '|' are special chars in regular expressions
35- passing.alleles <- grepl(allowed.pattern , called.alleles );
36- passing.alleles [is.na(called.alleles )] <- TRUE ; # NA allowed
37- if (! all(passing.alleles )) {
38- stop(' unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.' );
39- }
40- # replace hemizygous genotypes with a placeholder for easier splitting
41- # index for non-NA alleles that are missing allele separators:
42- no.sep.index <- (! grepl(' /|\\ |' , called.alleles ) & ! is.na(called.alleles ) & called.alleles != ' .' );
43- called.alleles [no.sep.index ] <- paste0(called.alleles [no.sep.index ], ' /-' );
44- split.alleles <- data.table :: tstrsplit(called.alleles , split = c(' /|\\ |' ), keep = c(1 ,2 )); # '|' are special chars in regular expressions
45- }
46- names(split.alleles ) <- c(' called.allele.a' , ' called.allele.b' );
47-
48- # replace 'NA' with '.' for easier comparisons
49- missing.label <- ' .' ;
50- split.alleles <- lapply(
51- X = split.alleles ,
52- FUN = function (x ) {
53- x [is.na(x )] <- missing.label ;
54- return (x );
54+ # Vectorized validation and handling of called alleles
55+ # "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
56+ allowed.pattern <- ' ^((([A-Z]+|\\ .|\\ *)[/\\ |]([A-Z]+|\\ .|\\ *))|\\ .|[A-Z]+)$' ;
57+ passing.alleles <- grepl(allowed.pattern , called.alleles );
58+ passing.alleles [is.na(called.alleles )] <- TRUE ;
59+ if (! all(passing.alleles )) {
60+ stop(' unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.' );
5561 }
62+
63+ # Replace hemizygous genotypes with a placeholder for consistent splitting
64+ no.sep.index <- (! grepl(' /|\\ |' , called.alleles ) & ! is.na(called.alleles ) & called.alleles != ' .' );
65+ called.alleles [no.sep.index ] <- paste0(called.alleles [no.sep.index ], ' /-' );
66+ called.alleles.matrix <- matrix (called.alleles , nrow = nrow(called.alleles.matrix ), ncol = ncol(called.alleles.matrix ));
67+
68+ # Split the entire matrix of alleles into two matrices, one for each allele
69+ alleles.split <- data.table :: tstrsplit(as.vector(called.alleles.matrix ), split = ' /|\\ |' , fixed = FALSE );
70+ alleles.a <- matrix (alleles.split [[1 ]], nrow = nrow(called.alleles.matrix ), ncol = ncol(called.alleles.matrix ));
71+ alleles.b <- matrix (alleles.split [[2 ]], nrow = nrow(called.alleles.matrix ), ncol = ncol(called.alleles.matrix ));
72+
73+ # Replicate risk.alleles across columns for vectorized comparison
74+ risk.alleles.matrix <- matrix (
75+ rep(risk.alleles , times = ncol(called.alleles.matrix )),
76+ nrow = nrow(called.alleles.matrix )
5677 );
5778
58- dosage <- rep(NA , length(called.alleles ));
59- for (i in 1 : length(called.alleles )) {
60- if (is.na(risk.alleles [i ])) {
61- dosage [i ] <- NA ; # if the risk allele is missing, return NA, no dosage can be calculated
62- } else if ((split.alleles $ called.allele.a [i ] == missing.label ) & (split.alleles $ called.allele.b [i ] == missing.label )) {
63- dosage [i ] <- NA ; # if both allelles are missing, no genotype was called, return NA
64- } else if (split.alleles $ called.allele.a [i ] == missing.label | split.alleles $ called.allele.b [i ] == missing.label ) {
65- dosage [i ] <- NA ; # if one of the alleles is marked as missing but the other is not, this is an unrecognized format
66- warning(' one of two alleles is marked as missing at index ' , i , ' , this is an unrecognized format, returning NA for dosage.' );
67- } else if (split.alleles $ called.allele.a [i ] == risk.alleles [i ] & split.alleles $ called.allele.b [i ] == risk.alleles [i ]) {
68- dosage [i ] <- 2 ; # if both alleles are the risk allele, the genotype is homozygous for the effect allele and the dosage is 2.
69- } else if (split.alleles $ called.allele.a [i ] == risk.alleles [i ] | split.alleles $ called.allele.b [i ] == risk.alleles [i ]) {
70- dosage [i ] <- 1 ; # if only one of the alleles is the risk allele, the genotype is heterozygous and the dosage is 1.
71- } else {
72- dosage [i ] <- 0 ; # if neither allele is the risk allele, the genotype is homozygous for the non-effect allele and the dosage is 0.
73- }
74- }
75- return (dosage );
79+ # Compute dosage (0, 1, 2)
80+ # Initialize dosage matrix with zeros
81+ dosage.matrix <- matrix (0L , nrow = nrow(called.alleles.matrix ), ncol = ncol(called.alleles.matrix ));
82+ # Add 1 to dosage for each instance of the risk allele
83+ dosage.matrix <- dosage.matrix + (alleles.a == risk.alleles.matrix );
84+ dosage.matrix <- dosage.matrix + (alleles.b == risk.alleles.matrix );
85+
86+ # Handle special cases
87+ # Check for missing alleles ('NA' or '.') for both NA assignment and warning
88+ is.missing.a <- is.na(alleles.a ) | (alleles.a == ' .' );
89+ is.missing.b <- is.na(alleles.b ) | (alleles.b == ' .' );
90+
91+ # Case where one allele is marked as missing and the other is not (e.g. `./A`)
92+ # This should return NA and issue a warning
93+ is.one.missing <- (is.missing.a & ! is.missing.b ) | (! is.missing.a & is.missing.b );
94+ if (any(is.one.missing )) {
95+ warning(' some genotypes contain a missing allele, returning NA for corresponding dosage.' );
96+ }
97+
98+ # Apply the final NA mask
99+ na.mask <- is.missing.a | is.missing.b | is.na(called.alleles.matrix ) | is.na(risk.alleles.matrix );
100+ dosage.matrix [na.mask ] <- NA ;
101+
102+ # Restore the matrix dimensions and dimnames
103+ dimnames(dosage.matrix ) <- dimnames(called.alleles.matrix );
104+
105+ # If the original input was a vector, convert the output back to a vector
106+ if (is.vector.input ) {
107+ return (as.vector(dosage.matrix ));
108+ } else {
109+ return (dosage.matrix );
110+ }
111+
76112 }
77113
78114# The function for calculating a dosage value intended to replace missing genotypes.
79115calculate.missing.genotype.dosage <- function (dosage.matrix ) {
80116 # calculate the mean dosage for each variant
81- mean.dosage <- apply(
82- X = dosage.matrix ,
83- MARGIN = 1 ,
84- FUN = function (x ) {
85- # simple mean
86- mean(x , na.rm = TRUE )
87- }
88- );
117+ mean.dosage <- rowMeans(x = dosage.matrix , na.rm = TRUE );
118+
119+ # replace NaN (from all NA rows) with NA
120+ mean.dosage [is.nan(mean.dosage )] <- NA ;
121+
89122 return (mean.dosage );
90123 }
91124
0 commit comments