From 40e0d973661a3436054b96b0002d436e7e873c05 Mon Sep 17 00:00:00 2001 From: Stefano Mangiola Date: Fri, 1 Dec 2023 14:58:32 +1100 Subject: [PATCH] improve CHECKs --- .Rbuildignore | 1 + NAMESPACE | 4 +- R/data.R | 15 ++++++ R/methods.R | 66 ++++++++++++------------- R/utilities.R | 37 +++++++++++--- man/plot_summary.Rd | 1 - man/sccomp_boxplot.Rd | 1 - man/sccomp_estimate.Rd | 53 +++++++++++--------- man/sccomp_predict.Rd | 1 - man/sccomp_remove_unwanted_variation.Rd | 1 - man/sccomp_replicate.Rd | 1 - man/sccomp_test.Rd | 1 - man/simulate_data.Rd | 1 - 13 files changed, 111 insertions(+), 72 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 144821d1..259375bd 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,4 @@ model_glm_dirichlet_multinomial_imputation.rds #inst/stan/glm_multi_beta_binomial_generate_date ^doc$ ^Meta$ + diff --git a/NAMESPACE b/NAMESPACE index a87a9a4b..cab63bd7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,8 +41,8 @@ importFrom(dplyr,any_of) importFrom(dplyr,cummean) importFrom(dplyr,distinct_at) importFrom(dplyr,enquo) -importFrom(dplyr,last_col) importFrom(dplyr,left_join) +importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,with_groups) importFrom(forcats,fct_relevel) @@ -65,7 +65,6 @@ importFrom(purrr,map2) importFrom(purrr,map2_dfc) importFrom(purrr,map2_lgl) importFrom(purrr,map_dbl) -importFrom(purrr,map_dfc) importFrom(purrr,map_dfr) importFrom(purrr,map_int) importFrom(purrr,map_lgl) @@ -120,5 +119,6 @@ importFrom(tidyr,spread) importFrom(tidyr,unite) importFrom(tidyr,unnest) importFrom(tidyselect,eval_select) +importFrom(tidyselect,last_col) importFrom(utils,data) useDynLib(sccomp, .registration = TRUE) diff --git a/R/data.R b/R/data.R index 696a62a7..24e8a22b 100755 --- a/R/data.R +++ b/R/data.R @@ -108,3 +108,18 @@ #' #' "glm_multi_beta" + +#' multipanel_theme +#' +#' @description +#' this object includes the ggplot, theme +#' +#' @importFrom utils data +#' +#' @keywords internal +#' +#' @format A ggplot theme +#' @usage data(multipanel_theme) +#' +#' +"multipanel_theme" \ No newline at end of file diff --git a/R/methods.R b/R/methods.R index 291c2f4a..1189093a 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,8 +1,14 @@ -#' sccomp_estimate main +#' Main Function for SCCOMP Estimate #' -#' @description The function for linear modelling takes as input a table of cell counts with three columns containing a cell-group identifier, sample identifier, integer count and the factors (continuous or discrete). The user can define a linear model with an input R formula, where the first factor is the factor of interest. Alternatively, sccomp accepts single-cell data containers (Seurat, SingleCellExperiment44, cell metadata or group-size). In this case, sccomp derives the count data from cell metadata. +#' @description +#' The `sccomp_estimate` function performs linear modeling on a table of cell counts, +#' which includes a cell-group identifier, sample identifier, integer count, and factors +#' (continuous or discrete). The user can define a linear model with an input R formula, +#' where the first factor is the factor of interest. Alternatively, `sccomp` accepts +#' single-cell data containers (e.g., Seurat, SingleCellExperiment, cell metadata, or +#' group-size) and derives the count data from cell metadata. #' #' @import dplyr #' @importFrom magrittr %$% @@ -13,29 +19,30 @@ #' @importFrom SingleCellExperiment colData #' @importFrom parallel detectCores #' -#' @param .data A tibble including a cell_group name column | sample name column | read counts column (optional depending on the input class) | factor columns. -#' @param formula_composition A formula. The formula describing the model for differential abundance, for example ~treatment. -#' @param formula_variability A formula. The formula describing the model for differential variability, for example ~treatment. In most cases, if differentially variability is of interest, the formula should only include the factor of interest as a large anount of data is needed to define variability depending to each factors. -#' @param .sample A column name as symbol. The sample identifier -#' @param .cell_group A column name as symbol. The cell_group identifier -#' @param .count A column name as symbol. The cell_group abundance (read count). Used only for data frame count output. The variable in this column should be of class integer. -#' -#' @param prior_overdispersion_mean_association A list of the form list(intercept = c(5, 2), slope = c(0, 0.6), standard_deviation = c(20, 40)). Where for intercept and slope parameters, we specify mean and standard deviation, while for standard deviation, we specify shape and rate. This is used to incorporate prior knowledge about the mean/variability association of cell-type proportions. -#' @param bimodal_mean_variability_association A boolean. Whether to model the mean-variability as bimodal, as often needed in the case of single-cell RNA sequencing data, and not usually for CyTOF and microbiome data. The plot summary_plot()$credible_intervals_2D can be used to assess whether the bimodality should be modelled. -#' @param enable_loo A boolean. Enable model comparison by the R package LOO. This is helpful when you want to compare the fit between two models, for example, analogously to ANOVA, between a one factor model versus a interceot-only model. -#' -#' @param percent_false_positive A real between 0 and 100 non included. This used to identify outliers with a specific false positive rate. -#' @param exclude_priors A boolean. Whether to run a prior-free model, for benchmarking purposes. -#' @param use_data A booelan. Whether to sun the model data free. This can be used for prior predictive check. -#' @param max_sampling_iterations An integer. This limit the maximum number of iterations in case a large dataset is used, for limiting the computation time. -#' @param pass_fit A boolean. Whether to pass the Stan fit as attribute in the output. Because the Stan fit can be very large, setting this to FALSE can be used to lower the memory imprint to save the output. -#' @param approximate_posterior_inference A boolean. Whether the inference of the joint posterior distribution should be approximated with variational Bayes. It confers execution time advantage. -#' @param .sample_cell_group_pairs_to_exclude A column name that includes a boolean variable for the sample/cell-group pairs to be ignored in the fit. This argument is for pro-users. -#' @param verbose A boolean. Prints progression. -#' @param noise_model A character string. The two noise models available are multi_beta_binomial (default) and dirichlet_multinomial. -#' @param cores An integer. How many cored to be used with parallel calculations. -#' @param mcmc_seed An integer. Used for Markov-chain Monte Carlo reproducibility. By default a random number is sampled from 1 to 999999. This itself can be controlled by set.seed() -#' +#' @param .data A tibble including cell_group name column, sample name column, +#' read counts column (optional depending on the input class), and factor columns. +#' @param formula_composition A formula describing the model for differential abundance. +#' @param formula_variability A formula describing the model for differential variability. +#' @param .sample A column name as symbol for the sample identifier. +#' @param .cell_group A column name as symbol for the cell_group identifier. +#' @param .count A column name as symbol for the cell_group abundance (read count). +#' @param cores Number of cores to use for parallel calculations. +#' @param bimodal_mean_variability_association Boolean for modeling mean-variability as bimodal. +#' @param prior_mean List with prior knowledge about mean distribution, they are the intercept and coefficient. +#' @param prior_overdispersion_mean_association List with prior knowledge about mean/variability association. +#' @param percent_false_positive Real number between 0 and 100 for outlier identification. +#' @param approximate_posterior_inference Boolean for using variational Bayes for posterior inference. +#' @param enable_loo Boolean to enable model comparison using the LOO package. +#' @param exclude_priors Boolean to run a prior-free model. +#' @param use_data Boolean to run the model data-free. +#' @param max_sampling_iterations Integer to limit maximum iterations for large datasets. +#' @param pass_fit Boolean to include the Stan fit as attribute in the output. +#' @param .sample_cell_group_pairs_to_exclude Column name with boolean for sample/cell-group pairs exclusion. +#' @param verbose Boolean to print progression. +#' @param noise_model Character string for the noise model (e.g., 'multi_beta_binomial'). +#' @param mcmc_seed Integer for MCMC reproducibility. +#' +#' #' @return A nested tibble `tbl`, with the following columns #' \itemize{ #' \item cell_group - column including the cell groups being tested @@ -74,7 +81,6 @@ #' sample, #' cell_group, #' count, -#' check_outliers = FALSE, #' cores = 1 #' ) #' @@ -743,7 +749,6 @@ sccomp_remove_outliers.sccomp_tbl = function(.estimate, #' sccomp_estimate( #' counts_obj , #' ~ 0 + type, ~1, sample, cell_group, count, -#' check_outliers = FALSE, #' cores = 1 #' ) |> #' @@ -774,7 +779,6 @@ sccomp_test.data.frame = function(.data, .sample = .data |> attr(".sample") .cell_group = .data |> attr(".cell_group") .count = .data |> attr(".count") - check_outliers = .data |> attr("check_outliers") model_input = .data |> attr("model_input") truncation_df2 = .data |> attr("truncation_df2") @@ -900,7 +904,6 @@ sccomp_test.data.frame = function(.data, #' counts_obj , #' ~ type, ~1, sample, cell_group, count, #' approximate_posterior_inference = "all", -#' check_outliers = FALSE, #' cores = 1 #' ) |> #' @@ -982,7 +985,6 @@ sccomp_replicate.data.frame = function(fit, #' counts_obj , #' ~ type, ~1, sample, cell_group, count, #' approximate_posterior_inference = "all", -#' check_outliers = FALSE, #' cores = 1 #' ) |> #' @@ -1085,7 +1087,6 @@ sccomp_predict.data.frame = function(fit, #' counts_obj , #' ~ type, ~1, sample, cell_group, count, #' approximate_posterior_inference = "all", -#' check_outliers = FALSE, #' cores = 1 #' ) #' @@ -1215,7 +1216,6 @@ sccomp_remove_unwanted_variation.data.frame = function(.data, #' counts_obj , #' ~ type, ~1, sample, cell_group, count, #' approximate_posterior_inference = "all", -#' check_outliers = FALSE, #' cores = 1 #' ) #' @@ -1350,7 +1350,6 @@ simulate_data.data.frame = function(.data, #' sccomp_estimate( #' counts_obj , #' ~ type, ~1, sample, cell_group, count, -#' check_outliers = FALSE, #' cores = 1 #' ) |> #' sccomp_test() @@ -1417,7 +1416,6 @@ plot_boxplot( #' counts_obj , #' ~ type, ~1, sample, cell_group, count, #' approximate_posterior_inference = "all", -#' check_outliers = FALSE, #' cores = 1 #' ) #' diff --git a/R/utilities.R b/R/utilities.R index b3e86c69..f04ac1fa 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -623,6 +623,7 @@ alpha_to_CI = function(fitted, censoring_iteration = 1, false_positive_rate, fac #' @importFrom glue glue #' @importFrom magrittr subtract +#' @noRd get_random_intercept_design2 = function(.data_, .sample, formula_composition ){ .sample = enquo(.sample) @@ -685,6 +686,7 @@ get_random_intercept_design2 = function(.data_, .sample, formula_composition ){ #' @importFrom glue glue #' @importFrom magrittr subtract +#' @noRd get_random_intercept_design = function(.data_, .sample, random_intercept_elements ){ .sample = enquo(.sample) @@ -792,6 +794,7 @@ get_random_intercept_design = function(.data_, .sample, random_intercept_element } #' @importFrom glue glue +#' @noRd get_design_matrix = function(.data_spread, formula, .sample){ .sample = enquo(.sample) @@ -2016,13 +2019,33 @@ contrasts_to_enquos = function(contrasts){ contrasts |> enquo() |> quo_names() |> syms() %>% do.call(enquos_from_list_of_symbols, .) } -#' @importFrom purrr map_dfc -#' @importFrom tibble add_column -#' @importFrom dplyr last_col +#' Mutate Data Frame Based on Expression List +#' +#' @description +#' `mutate_from_expr_list` takes a data frame and a list of formula expressions, +#' and mutates the data frame based on these expressions. It allows for ignoring +#' errors during the mutation process. +#' +#' @param x A data frame to be mutated. +#' @param formula_expr A named list of formula expressions used for mutation. +#' @param ignore_errors Logical flag indicating whether to ignore errors during mutation. +#' +#' @return A mutated data frame with added or modified columns based on `formula_expr`. +#' +#' @details +#' The function performs various checks and transformations on the formula expressions, +#' ensuring that the specified transformations are valid and can be applied to the data frame. +#' It supports advanced features like handling special characters in column names and intelligent +#' parsing of formulas. +#' #' @importFrom purrr map2_dfc +#' @importFrom tibble add_column +#' @importFrom tidyselect last_col +#' @importFrom dplyr mutate #' @importFrom stringr str_subset -#' +#' #' @noRd +#' mutate_from_expr_list = function(x, formula_expr, ignore_errors = TRUE){ if(formula_expr |> names() |> is.null()) @@ -2599,6 +2622,7 @@ add_formula_columns = function(.data, .original_data, .sample, formula_composit #' cleaned_texts <- str_remove_all_ignoring_if_inside_backquotes(texts, "\\(") #' print(cleaned_texts) #' +#' @noRd str_remove_all_ignoring_if_inside_backquotes <- function(text_vector, regex) { # Nested function to handle regex removal for a single string remove_regex_chars <- function(text, regex) { @@ -2662,6 +2686,7 @@ str_remove_all_ignoring_if_inside_backquotes <- function(text_vector, regex) { #' split_texts <- split_regex_chars_from_vector(texts, ",") #' print(split_texts) #' +#' @noRd str_split_ignoring_if_inside_backquotes <- function(text_vector, regex) { # Nested function to handle regex split for a single string split_regex_chars <- function(text, regex) { @@ -2720,7 +2745,7 @@ str_split_ignoring_if_inside_backquotes <- function(text_vector, regex) { #' contains_only_valid_chars_for_column(c("valid_column", "invalid column", "valid123", #' "123startWithNumber", "_startWithUnderscore")) #' -#' @export +#' @noRd contains_only_valid_chars_for_column <- function(column_names) { # Function to check a single column name check_validity <- function(column_name) { @@ -2755,7 +2780,7 @@ contains_only_valid_chars_for_column <- function(column_names) { #' @examples #' str_remove_brackets_from_formula_intelligently(c("This is a test (with + brackets)", "`a (test) inside` backticks", "(another test)")) #' -#' @export +#' @noRd str_remove_brackets_from_formula_intelligently <- function(text) { # Function to remove brackets from a single string remove_brackets_single <- function(s) { diff --git a/man/plot_summary.Rd b/man/plot_summary.Rd index d07fd4eb..2e633937 100644 --- a/man/plot_summary.Rd +++ b/man/plot_summary.Rd @@ -26,7 +26,6 @@ estimate = counts_obj , ~ type, ~1, sample, cell_group, count, approximate_posterior_inference = "all", - check_outliers = FALSE, cores = 1 ) diff --git a/man/sccomp_boxplot.Rd b/man/sccomp_boxplot.Rd index c8fd0ece..feb02f09 100644 --- a/man/sccomp_boxplot.Rd +++ b/man/sccomp_boxplot.Rd @@ -27,7 +27,6 @@ estimate = sccomp_estimate( counts_obj , ~ type, ~1, sample, cell_group, count, - check_outliers = FALSE, cores = 1 ) |> sccomp_test() diff --git a/man/sccomp_estimate.Rd b/man/sccomp_estimate.Rd index 6b691264..037763ca 100644 --- a/man/sccomp_estimate.Rd +++ b/man/sccomp_estimate.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/methods.R \name{sccomp_estimate} \alias{sccomp_estimate} -\title{sccomp_estimate main} +\title{Main Function for SCCOMP Estimate} \usage{ sccomp_estimate( .data, @@ -30,45 +30,48 @@ sccomp_estimate( ) } \arguments{ -\item{.data}{A tibble including a cell_group name column | sample name column | read counts column (optional depending on the input class) | factor columns.} +\item{.data}{A tibble including cell_group name column, sample name column, +read counts column (optional depending on the input class), and factor columns.} -\item{formula_composition}{A formula. The formula describing the model for differential abundance, for example ~treatment.} +\item{formula_composition}{A formula describing the model for differential abundance.} -\item{formula_variability}{A formula. The formula describing the model for differential variability, for example ~treatment. In most cases, if differentially variability is of interest, the formula should only include the factor of interest as a large anount of data is needed to define variability depending to each factors.} +\item{formula_variability}{A formula describing the model for differential variability.} -\item{.sample}{A column name as symbol. The sample identifier} +\item{.sample}{A column name as symbol for the sample identifier.} -\item{.cell_group}{A column name as symbol. The cell_group identifier} +\item{.cell_group}{A column name as symbol for the cell_group identifier.} -\item{.count}{A column name as symbol. The cell_group abundance (read count). Used only for data frame count output. The variable in this column should be of class integer.} +\item{.count}{A column name as symbol for the cell_group abundance (read count).} -\item{cores}{An integer. How many cored to be used with parallel calculations.} +\item{cores}{Number of cores to use for parallel calculations.} -\item{bimodal_mean_variability_association}{A boolean. Whether to model the mean-variability as bimodal, as often needed in the case of single-cell RNA sequencing data, and not usually for CyTOF and microbiome data. The plot summary_plot()$credible_intervals_2D can be used to assess whether the bimodality should be modelled.} +\item{bimodal_mean_variability_association}{Boolean for modeling mean-variability as bimodal.} -\item{percent_false_positive}{A real between 0 and 100 non included. This used to identify outliers with a specific false positive rate.} +\item{percent_false_positive}{Real number between 0 and 100 for outlier identification.} -\item{approximate_posterior_inference}{A boolean. Whether the inference of the joint posterior distribution should be approximated with variational Bayes. It confers execution time advantage.} +\item{approximate_posterior_inference}{Boolean for using variational Bayes for posterior inference.} -\item{prior_overdispersion_mean_association}{A list of the form list(intercept = c(5, 2), slope = c(0, 0.6), standard_deviation = c(20, 40)). Where for intercept and slope parameters, we specify mean and standard deviation, while for standard deviation, we specify shape and rate. This is used to incorporate prior knowledge about the mean/variability association of cell-type proportions.} +\item{prior_mean}{List with prior knowledge about mean distribution, they are the intercept and coefficient.} -\item{.sample_cell_group_pairs_to_exclude}{A column name that includes a boolean variable for the sample/cell-group pairs to be ignored in the fit. This argument is for pro-users.} +\item{prior_overdispersion_mean_association}{List with prior knowledge about mean/variability association.} -\item{verbose}{A boolean. Prints progression.} +\item{.sample_cell_group_pairs_to_exclude}{Column name with boolean for sample/cell-group pairs exclusion.} -\item{enable_loo}{A boolean. Enable model comparison by the R package LOO. This is helpful when you want to compare the fit between two models, for example, analogously to ANOVA, between a one factor model versus a interceot-only model.} +\item{verbose}{Boolean to print progression.} -\item{noise_model}{A character string. The two noise models available are multi_beta_binomial (default) and dirichlet_multinomial.} +\item{enable_loo}{Boolean to enable model comparison using the LOO package.} -\item{exclude_priors}{A boolean. Whether to run a prior-free model, for benchmarking purposes.} +\item{noise_model}{Character string for the noise model (e.g., 'multi_beta_binomial').} -\item{use_data}{A booelan. Whether to sun the model data free. This can be used for prior predictive check.} +\item{exclude_priors}{Boolean to run a prior-free model.} -\item{mcmc_seed}{An integer. Used for Markov-chain Monte Carlo reproducibility. By default a random number is sampled from 1 to 999999. This itself can be controlled by set.seed()} +\item{use_data}{Boolean to run the model data-free.} -\item{max_sampling_iterations}{An integer. This limit the maximum number of iterations in case a large dataset is used, for limiting the computation time.} +\item{mcmc_seed}{Integer for MCMC reproducibility.} -\item{pass_fit}{A boolean. Whether to pass the Stan fit as attribute in the output. Because the Stan fit can be very large, setting this to FALSE can be used to lower the memory imprint to save the output.} +\item{max_sampling_iterations}{Integer to limit maximum iterations for large datasets.} + +\item{pass_fit}{Boolean to include the Stan fit as attribute in the output.} } \value{ A nested tibble \code{tbl}, with the following columns @@ -98,7 +101,12 @@ A nested tibble \code{tbl}, with the following columns } } \description{ -The function for linear modelling takes as input a table of cell counts with three columns containing a cell-group identifier, sample identifier, integer count and the factors (continuous or discrete). The user can define a linear model with an input R formula, where the first factor is the factor of interest. Alternatively, sccomp accepts single-cell data containers (Seurat, SingleCellExperiment44, cell metadata or group-size). In this case, sccomp derives the count data from cell metadata. +The \code{sccomp_estimate} function performs linear modeling on a table of cell counts, +which includes a cell-group identifier, sample identifier, integer count, and factors +(continuous or discrete). The user can define a linear model with an input R formula, +where the first factor is the factor of interest. Alternatively, \code{sccomp} accepts +single-cell data containers (e.g., Seurat, SingleCellExperiment, cell metadata, or +group-size) and derives the count data from cell metadata. } \examples{ @@ -112,7 +120,6 @@ estimate = sample, cell_group, count, - check_outliers = FALSE, cores = 1 ) diff --git a/man/sccomp_predict.Rd b/man/sccomp_predict.Rd index 4cfdb8cf..66a9801f 100644 --- a/man/sccomp_predict.Rd +++ b/man/sccomp_predict.Rd @@ -38,7 +38,6 @@ if(.Platform$OS.type == "unix") counts_obj , ~ type, ~1, sample, cell_group, count, approximate_posterior_inference = "all", - check_outliers = FALSE, cores = 1 ) |> diff --git a/man/sccomp_remove_unwanted_variation.Rd b/man/sccomp_remove_unwanted_variation.Rd index 5cb9feda..5d7e01cb 100644 --- a/man/sccomp_remove_unwanted_variation.Rd +++ b/man/sccomp_remove_unwanted_variation.Rd @@ -31,7 +31,6 @@ data("counts_obj") counts_obj , ~ type, ~1, sample, cell_group, count, approximate_posterior_inference = "all", - check_outliers = FALSE, cores = 1 ) diff --git a/man/sccomp_replicate.Rd b/man/sccomp_replicate.Rd index 4a600966..9ddd7200 100644 --- a/man/sccomp_replicate.Rd +++ b/man/sccomp_replicate.Rd @@ -38,7 +38,6 @@ if(.Platform$OS.type == "unix") counts_obj , ~ type, ~1, sample, cell_group, count, approximate_posterior_inference = "all", - check_outliers = FALSE, cores = 1 ) |> diff --git a/man/sccomp_test.Rd b/man/sccomp_test.Rd index d3998042..eed01fa1 100644 --- a/man/sccomp_test.Rd +++ b/man/sccomp_test.Rd @@ -37,7 +37,6 @@ data("counts_obj") sccomp_estimate( counts_obj , ~ 0 + type, ~1, sample, cell_group, count, - check_outliers = FALSE, cores = 1 ) |> diff --git a/man/simulate_data.Rd b/man/simulate_data.Rd index f9af8d64..330167c0 100644 --- a/man/simulate_data.Rd +++ b/man/simulate_data.Rd @@ -54,7 +54,6 @@ estimate = counts_obj , ~ type, ~1, sample, cell_group, count, approximate_posterior_inference = "all", - check_outliers = FALSE, cores = 1 )