uclahs-cds
diff --git a/‎DESCRIPTION‎
Lines changed: 1 addition & 2 deletions b/‎DESCRIPTION‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 7 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 7 deletions
diff --git a/‎NEWS.md‎
Lines changed: 10 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎R/apply-pgs.R‎
Lines changed: 262 additions & 168 deletions b/‎R/apply-pgs.R‎
Lines changed: 262 additions & 168 deletions
diff --git a/‎R/assess-strand-flip.R‎
Lines changed: 169 additions & 93 deletions b/‎R/assess-strand-flip.R‎
Lines changed: 169 additions & 93 deletions
diff --git a/‎R/calculate-dosage.R‎
Lines changed: 90 additions & 57 deletions b/‎R/calculate-dosage.R‎
Lines changed: 90 additions & 57 deletions
@@ -1,7 +1,7 @@
 Package: ApplyPolygenicScore
 Type: Package
 Title: Utilities for the Application of a Polygenic Score to a VCF
-Version: 3.1.0
+Version: 4.0.0
 Authors@R: c(
     person('Paul', 'Boutros', role = 'cre', email = '[email protected]'),
     person('Nicole', 'Zeltser', role = 'aut', comment = c(ORCID = '0000-0001-7246-2771')),
@@ -21,7 +21,6 @@ Imports:
     vcfR,
     pROC,
     data.table,
-    reshape2,
     BoutrosLab.plotting.general,
     lattice
 Suggests:
 
@@ -1,11 +1,5 @@
 import('vcfR');
-importFrom(
-    'data.table',
-    'tstrsplit',
-    'setDT',
-    ':='
-    );
-import('reshape2');
+import('data.table');
 import('BoutrosLab.plotting.general');
 importFrom(
     'lattice',
 
@@ -2,6 +2,16 @@
 
 # ApplyPolygenicScore unreleased
 
+# ApplyPolygenicScore 4.0.0
+
+## Changed
+* Refactored all main functions for gains in RAM efficiency and runtime
+* Introduced a breaking change to the output of `import.vcf`. The output list object has a different naming scheme and different data formats. The previous data formats are still supported by setting `long.format` to `TRUE`; however, the naming scheme has still changed.
+* Introduced a breaking change to `apply.polygenic.score`. The expected default `vcf.data` input format has changed. The previous input format is still supported by setting `vcf.long.format` to `TRUE` from the default `FALSE`.
+
+## Added
+* Added support for more efficient storage and manipulation of imported VCF data. By default, `import.vcf` now returns VCF data in a split format. A `data.table` object contains VCF data from fixed fields (CHROM, POS, ID, REF, ALT). A `matrix` object contains sample-specific genotypes in allele format, with variants as rows and samples as columns.
+
 # ApplyPolygenicScore 3.1.0
 
 ## Changed
 
@@ -1,3 +1,6 @@
+# Handling CRAN warnings for data.table syntax:
+if (getRversion() >= '2.15.1') utils::globalVariables(c('dosage'));
+
 #' @title Convert alleles to dosage
 #' @description Convert genotype calls in the form of written-out alleles (e.g. 'A/T') to dosages (0, 1, 2) based on provided risk alleles from a PGS.
 #' @param called.alleles A vector of genotypes in allelic notation separated by a slash or pipe.
@@ -9,10 +12,34 @@
 #' convert.alleles.to.pgs.dosage(called.alleles, risk.alleles);
 #' @export
 convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
-    # check that risk.alleles is the same length as called.alleles
-    if (length(called.alleles) != length(risk.alleles)) {
-        stop('called.alleles and risk.alleles must be the same length.');
+
+    # Check input class and convert to a matrix for consistent processing
+    is.vector.input <- is.vector(called.alleles);
+    if (is.vector.input) {
+        # Fast-fail for all missing genotypes
+        if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
+            return(rep(NA, length(called.alleles)));
+        }
+        called.alleles.matrix <- matrix(called.alleles, ncol = 1);
+    } else if (is.matrix(called.alleles)) {
+        # Fast-fail for all missing genotypes
+        if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
+            return(matrix(NA, nrow = nrow(called.alleles), ncol = ncol(called.alleles), dimnames = dimnames(called.alleles)));
         }
+        called.alleles.matrix <- called.alleles;
+    } else {
+        stop("Unrecognized 'called.alleles' format. Must be a vector or a matrix.");
+    }
+
+    # Check that called.alleles.matrix has rows corresponding to risk.alleles
+    if (nrow(called.alleles.matrix) != length(risk.alleles)) {
+        stop('Number of rows in called.alleles must equal length of risk.alleles.');
+    }
+
+    # # check that risk.alleles is the same length as called.alleles
+    # if (length(called.alleles) != length(risk.alleles)) {
+    #     stop('called.alleles and risk.alleles must be the same length.');
+    #     }
 
     # check for missing risk alleles and warn
     if (any(is.na(risk.alleles))) {
@@ -24,68 +51,74 @@ convert.alleles.to.pgs.dosage <- function(called.alleles, risk.alleles) {
         stop('unrecognized risk.allele format, must be capitalized letters.');
         }
 
-    # handle totally missing genotypes
-    # if the entire vector is NA or the entire vector is '.', return NA
-    if (all(is.na(called.alleles)) | all(called.alleles == '.')) {
-        split.alleles <- data.frame(called.alleles, called.alleles);
-        } else {
-            # check that called.alleles is a vector of genotypes in allelic notation or '.' separated by a slash or pipe
-            # "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
-            allowed.pattern <- '^((([A-Z]+|\\.|\\*)[/\\|]([A-Z]+|\\.|\\*))|\\.|[A-Z]+)$' # '|' are special chars in regular expressions
-            passing.alleles <- grepl(allowed.pattern, called.alleles);
-            passing.alleles[is.na(called.alleles)] <- TRUE; # NA allowed
-            if (!all(passing.alleles)) {
-                stop('unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.');
-                }
-            # replace hemizygous genotypes with a placeholder for easier splitting
-            # index for non-NA alleles that are missing allele separators:
-            no.sep.index <- (!grepl('/|\\|', called.alleles) & !is.na(called.alleles) & called.alleles != '.');
-            called.alleles[no.sep.index] <- paste0(called.alleles[no.sep.index], '/-');
-            split.alleles <- data.table::tstrsplit(called.alleles, split = c('/|\\|'), keep = c(1,2)); # '|' are special chars in regular expressions
-            }
-    names(split.alleles) <- c('called.allele.a', 'called.allele.b');
-
-    # replace 'NA' with '.' for easier comparisons
-    missing.label <- '.';
-    split.alleles <- lapply(
-        X = split.alleles,
-        FUN = function(x) {
-        x[is.na(x)] <- missing.label;
-        return(x);
+    # Vectorized validation and handling of called alleles
+    # "*" characters represent overlapping deletions from an upstream indel and are accepted VCF format
+    allowed.pattern <- '^((([A-Z]+|\\.|\\*)[/\\|]([A-Z]+|\\.|\\*))|\\.|[A-Z]+)$';
+    passing.alleles <- grepl(allowed.pattern, called.alleles);
+    passing.alleles[is.na(called.alleles)] <- TRUE;
+    if (!all(passing.alleles)) {
+        stop('unrecognized called.alleles format, must be capitalized letters, "." or "*" separated by a slash or pipe.');
         }
+
+    # Replace hemizygous genotypes with a placeholder for consistent splitting
+    no.sep.index <- (!grepl('/|\\|', called.alleles) & !is.na(called.alleles) & called.alleles != '.');
+    called.alleles[no.sep.index] <- paste0(called.alleles[no.sep.index], '/-');
+    called.alleles.matrix <- matrix(called.alleles, nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
+
+    # Split the entire matrix of alleles into two matrices, one for each allele
+    alleles.split <- data.table::tstrsplit(as.vector(called.alleles.matrix), split = '/|\\|', fixed = FALSE);
+    alleles.a <- matrix(alleles.split[[1]], nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
+    alleles.b <- matrix(alleles.split[[2]], nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
+
+    # Replicate risk.alleles across columns for vectorized comparison
+    risk.alleles.matrix <- matrix(
+        rep(risk.alleles, times = ncol(called.alleles.matrix)),
+        nrow = nrow(called.alleles.matrix)
         );
 
-    dosage <- rep(NA, length(called.alleles));
-    for (i in 1:length(called.alleles)) {
-        if (is.na(risk.alleles[i])) {
-            dosage[i] <- NA; # if the risk allele is missing, return NA, no dosage can be calculated
-            } else if ((split.alleles$called.allele.a[i] == missing.label) & (split.alleles$called.allele.b[i] == missing.label)) {
-            dosage[i] <- NA; # if both allelles are missing, no genotype was called, return NA
-            } else if (split.alleles$called.allele.a[i] == missing.label | split.alleles$called.allele.b[i] == missing.label) {
-                dosage[i] <- NA; # if one of the alleles is marked as missing but the other is not, this is an unrecognized format
-                warning('one of two alleles is marked as missing at index ', i, ', this is an unrecognized format, returning NA for dosage.');
-            } else if (split.alleles$called.allele.a[i] == risk.alleles[i] & split.alleles$called.allele.b[i] == risk.alleles[i]) {
-                dosage[i] <- 2; # if both alleles are the risk allele, the genotype is homozygous for the effect allele and the dosage is 2.
-            } else if (split.alleles$called.allele.a[i] == risk.alleles[i] | split.alleles$called.allele.b[i] == risk.alleles[i]) {
-                dosage[i] <- 1; # if only one of the alleles is the risk allele, the genotype is heterozygous and the dosage is 1.
-            } else {
-                dosage[i] <- 0; # if neither allele is the risk allele, the genotype is homozygous for the non-effect allele and the dosage is 0.
-            }
-        }
-    return(dosage);
+    # Compute dosage (0, 1, 2)
+    # Initialize dosage matrix with zeros
+    dosage.matrix <- matrix(0L, nrow = nrow(called.alleles.matrix), ncol = ncol(called.alleles.matrix));
+    # Add 1 to dosage for each instance of the risk allele
+    dosage.matrix <- dosage.matrix + (alleles.a == risk.alleles.matrix);
+    dosage.matrix <- dosage.matrix + (alleles.b == risk.alleles.matrix);
+
+    # Handle special cases
+    # Check for missing alleles ('NA' or '.') for both NA assignment and warning
+    is.missing.a <- is.na(alleles.a) | (alleles.a == '.');
+    is.missing.b <- is.na(alleles.b) | (alleles.b == '.');
+
+    # Case where one allele is marked as missing and the other is not (e.g. `./A`)
+    # This should return NA and issue a warning
+    is.one.missing <- (is.missing.a & !is.missing.b) | (!is.missing.a & is.missing.b);
+    if (any(is.one.missing)) {
+        warning('some genotypes contain a missing allele, returning NA for corresponding dosage.');
+    }
+
+    # Apply the final NA mask
+    na.mask <- is.missing.a | is.missing.b | is.na(called.alleles.matrix) | is.na(risk.alleles.matrix);
+    dosage.matrix[na.mask] <- NA;
+
+    # Restore the matrix dimensions and dimnames
+    dimnames(dosage.matrix) <- dimnames(called.alleles.matrix);
+
+    # If the original input was a vector, convert the output back to a vector
+    if (is.vector.input) {
+        return(as.vector(dosage.matrix));
+    } else {
+        return(dosage.matrix);
+    }
+
     }
 
 # Calculate the dosage value intended to replace missing genotypes.
 # Takes a dosage matrix and returns one mean dosage per row, computed across
 # the columns of that row with NA entries ignored.
 # A row whose entries are all NA yields NA (not NaN) in the result.
 # NOTE(review): presumably rows are variants and columns are samples - confirm against callers.
 calculate.missing.genotype.dosage <- function(dosage.matrix) {
     # mean dosage for each variant (one value per row), ignoring NA entries
-    mean.dosage <- apply(
-        X = dosage.matrix,
-        MARGIN = 1,
-        FUN = function(x) {
-            # simple mean
-            mean(x, na.rm = TRUE)
-            }
-        );
+    mean.dosage <- rowMeans(x = dosage.matrix, na.rm = TRUE);
+
+    # replace NaN (from all NA rows) with NA
+    mean.dosage[is.nan(mean.dosage)] <- NA;
+
     return(mean.dosage);
     }