Skip to content

Commit

Permalink
improve CHECKs
Browse files Browse the repository at this point in the history
  • Loading branch information
stemangiola committed Dec 1, 2023
1 parent f434b45 commit 40e0d97
Show file tree
Hide file tree
Showing 13 changed files with 111 additions and 72 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ model_glm_dirichlet_multinomial_imputation.rds
#inst/stan/glm_multi_beta_binomial_generate_date
^doc$
^Meta$

4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ importFrom(dplyr,any_of)
importFrom(dplyr,cummean)
importFrom(dplyr,distinct_at)
importFrom(dplyr,enquo)
importFrom(dplyr,last_col)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
importFrom(dplyr,with_groups)
importFrom(forcats,fct_relevel)
Expand All @@ -65,7 +65,6 @@ importFrom(purrr,map2)
importFrom(purrr,map2_dfc)
importFrom(purrr,map2_lgl)
importFrom(purrr,map_dbl)
importFrom(purrr,map_dfc)
importFrom(purrr,map_dfr)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
Expand Down Expand Up @@ -120,5 +119,6 @@ importFrom(tidyr,spread)
importFrom(tidyr,unite)
importFrom(tidyr,unnest)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,last_col)
importFrom(utils,data)
useDynLib(sccomp, .registration = TRUE)
15 changes: 15 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,18 @@
#'
#'
"glm_multi_beta"

#' multipanel_theme
#'
#' @description
#' this object includes the ggplot, theme
#'
#' @importFrom utils data
#'
#' @keywords internal
#'
#' @format A ggplot theme
#' @usage data(multipanel_theme)
#'
#'
"multipanel_theme"
66 changes: 32 additions & 34 deletions R/methods.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@


#' sccomp_estimate main
#' Main Function for SCCOMP Estimate
#'
#' @description The function for linear modelling takes as input a table of cell counts with three columns containing a cell-group identifier, sample identifier, integer count and the factors (continuous or discrete). The user can define a linear model with an input R formula, where the first factor is the factor of interest. Alternatively, sccomp accepts single-cell data containers (Seurat, SingleCellExperiment44, cell metadata or group-size). In this case, sccomp derives the count data from cell metadata.
#' @description
#' The `sccomp_estimate` function performs linear modeling on a table of cell counts,
#' which includes a cell-group identifier, sample identifier, integer count, and factors
#' (continuous or discrete). The user can define a linear model with an input R formula,
#' where the first factor is the factor of interest. Alternatively, `sccomp` accepts
#' single-cell data containers (e.g., Seurat, SingleCellExperiment, cell metadata, or
#' group-size) and derives the count data from cell metadata.
#'
#' @import dplyr
#' @importFrom magrittr %$%
Expand All @@ -13,29 +19,30 @@
#' @importFrom SingleCellExperiment colData
#' @importFrom parallel detectCores
#'
#' @param .data A tibble including a cell_group name column | sample name column | read counts column (optional depending on the input class) | factor columns.
#' @param formula_composition A formula. The formula describing the model for differential abundance, for example ~treatment.
#' @param formula_variability A formula. The formula describing the model for differential variability, for example ~treatment. In most cases, if differentially variability is of interest, the formula should only include the factor of interest as a large anount of data is needed to define variability depending to each factors.
#' @param .sample A column name as symbol. The sample identifier
#' @param .cell_group A column name as symbol. The cell_group identifier
#' @param .count A column name as symbol. The cell_group abundance (read count). Used only for data frame count output. The variable in this column should be of class integer.
#'
#' @param prior_overdispersion_mean_association A list of the form list(intercept = c(5, 2), slope = c(0, 0.6), standard_deviation = c(20, 40)). Where for intercept and slope parameters, we specify mean and standard deviation, while for standard deviation, we specify shape and rate. This is used to incorporate prior knowledge about the mean/variability association of cell-type proportions.
#' @param bimodal_mean_variability_association A boolean. Whether to model the mean-variability as bimodal, as often needed in the case of single-cell RNA sequencing data, and not usually for CyTOF and microbiome data. The plot summary_plot()$credible_intervals_2D can be used to assess whether the bimodality should be modelled.
#' @param enable_loo A boolean. Enable model comparison by the R package LOO. This is helpful when you want to compare the fit between two models, for example, analogously to ANOVA, between a one factor model versus a interceot-only model.
#'
#' @param percent_false_positive A real between 0 and 100 non included. This used to identify outliers with a specific false positive rate.
#' @param exclude_priors A boolean. Whether to run a prior-free model, for benchmarking purposes.
#' @param use_data A booelan. Whether to sun the model data free. This can be used for prior predictive check.
#' @param max_sampling_iterations An integer. This limit the maximum number of iterations in case a large dataset is used, for limiting the computation time.
#' @param pass_fit A boolean. Whether to pass the Stan fit as attribute in the output. Because the Stan fit can be very large, setting this to FALSE can be used to lower the memory imprint to save the output.
#' @param approximate_posterior_inference A boolean. Whether the inference of the joint posterior distribution should be approximated with variational Bayes. It confers execution time advantage.
#' @param .sample_cell_group_pairs_to_exclude A column name that includes a boolean variable for the sample/cell-group pairs to be ignored in the fit. This argument is for pro-users.
#' @param verbose A boolean. Prints progression.
#' @param noise_model A character string. The two noise models available are multi_beta_binomial (default) and dirichlet_multinomial.
#' @param cores An integer. How many cored to be used with parallel calculations.
#' @param mcmc_seed An integer. Used for Markov-chain Monte Carlo reproducibility. By default a random number is sampled from 1 to 999999. This itself can be controlled by set.seed()
#'
#' @param .data A tibble including cell_group name column, sample name column,
#' read counts column (optional depending on the input class), and factor columns.
#' @param formula_composition A formula describing the model for differential abundance.
#' @param formula_variability A formula describing the model for differential variability.
#' @param .sample A column name as symbol for the sample identifier.
#' @param .cell_group A column name as symbol for the cell_group identifier.
#' @param .count A column name as symbol for the cell_group abundance (read count).
#' @param cores Number of cores to use for parallel calculations.
#' @param bimodal_mean_variability_association Boolean for modeling mean-variability as bimodal.
#' @param prior_mean List with prior knowledge about mean distribution, they are the intercept and coefficient.
#' @param prior_overdispersion_mean_association List with prior knowledge about mean/variability association.
#' @param percent_false_positive Real number between 0 and 100 for outlier identification.
#' @param approximate_posterior_inference Boolean for using variational Bayes for posterior inference.
#' @param enable_loo Boolean to enable model comparison using the LOO package.
#' @param exclude_priors Boolean to run a prior-free model.
#' @param use_data Boolean to run the model data-free.
#' @param max_sampling_iterations Integer to limit maximum iterations for large datasets.
#' @param pass_fit Boolean to include the Stan fit as attribute in the output.
#' @param .sample_cell_group_pairs_to_exclude Column name with boolean for sample/cell-group pairs exclusion.
#' @param verbose Boolean to print progression.
#' @param noise_model Character string for the noise model (e.g., 'multi_beta_binomial').
#' @param mcmc_seed Integer for MCMC reproducibility.
#'
#'
#' @return A nested tibble `tbl`, with the following columns
#' \itemize{
#' \item cell_group - column including the cell groups being tested
Expand Down Expand Up @@ -74,7 +81,6 @@
#' sample,
#' cell_group,
#' count,
#' check_outliers = FALSE,
#' cores = 1
#' )
#'
Expand Down Expand Up @@ -743,7 +749,6 @@ sccomp_remove_outliers.sccomp_tbl = function(.estimate,
#' sccomp_estimate(
#' counts_obj ,
#' ~ 0 + type, ~1, sample, cell_group, count,
#' check_outliers = FALSE,
#' cores = 1
#' ) |>
#'
Expand Down Expand Up @@ -774,7 +779,6 @@ sccomp_test.data.frame = function(.data,
.sample = .data |> attr(".sample")
.cell_group = .data |> attr(".cell_group")
.count = .data |> attr(".count")
check_outliers = .data |> attr("check_outliers")
model_input = .data |> attr("model_input")
truncation_df2 = .data |> attr("truncation_df2")

Expand Down Expand Up @@ -900,7 +904,6 @@ sccomp_test.data.frame = function(.data,
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' approximate_posterior_inference = "all",
#' check_outliers = FALSE,
#' cores = 1
#' ) |>
#'
Expand Down Expand Up @@ -982,7 +985,6 @@ sccomp_replicate.data.frame = function(fit,
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' approximate_posterior_inference = "all",
#' check_outliers = FALSE,
#' cores = 1
#' ) |>
#'
Expand Down Expand Up @@ -1085,7 +1087,6 @@ sccomp_predict.data.frame = function(fit,
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' approximate_posterior_inference = "all",
#' check_outliers = FALSE,
#' cores = 1
#' )
#'
Expand Down Expand Up @@ -1215,7 +1216,6 @@ sccomp_remove_unwanted_variation.data.frame = function(.data,
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' approximate_posterior_inference = "all",
#' check_outliers = FALSE,
#' cores = 1
#' )
#'
Expand Down Expand Up @@ -1350,7 +1350,6 @@ simulate_data.data.frame = function(.data,
#' sccomp_estimate(
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' check_outliers = FALSE,
#' cores = 1
#' ) |>
#' sccomp_test()
Expand Down Expand Up @@ -1417,7 +1416,6 @@ plot_boxplot(
#' counts_obj ,
#' ~ type, ~1, sample, cell_group, count,
#' approximate_posterior_inference = "all",
#' check_outliers = FALSE,
#' cores = 1
#' )
#'
Expand Down
37 changes: 31 additions & 6 deletions R/utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,7 @@ alpha_to_CI = function(fitted, censoring_iteration = 1, false_positive_rate, fac

#' @importFrom glue glue
#' @importFrom magrittr subtract
#' @noRd
get_random_intercept_design2 = function(.data_, .sample, formula_composition ){

.sample = enquo(.sample)
Expand Down Expand Up @@ -685,6 +686,7 @@ get_random_intercept_design2 = function(.data_, .sample, formula_composition ){

#' @importFrom glue glue
#' @importFrom magrittr subtract
#' @noRd
get_random_intercept_design = function(.data_, .sample, random_intercept_elements ){

.sample = enquo(.sample)
Expand Down Expand Up @@ -792,6 +794,7 @@ get_random_intercept_design = function(.data_, .sample, random_intercept_element
}

#' @importFrom glue glue
#' @noRd
get_design_matrix = function(.data_spread, formula, .sample){

.sample = enquo(.sample)
Expand Down Expand Up @@ -2016,13 +2019,33 @@ contrasts_to_enquos = function(contrasts){
contrasts |> enquo() |> quo_names() |> syms() %>% do.call(enquos_from_list_of_symbols, .)
}

#' @importFrom purrr map_dfc
#' @importFrom tibble add_column
#' @importFrom dplyr last_col
#' Mutate Data Frame Based on Expression List
#'
#' @description
#' `mutate_from_expr_list` takes a data frame and a list of formula expressions,
#' and mutates the data frame based on these expressions. It allows for ignoring
#' errors during the mutation process.
#'
#' @param x A data frame to be mutated.
#' @param formula_expr A named list of formula expressions used for mutation.
#' @param ignore_errors Logical flag indicating whether to ignore errors during mutation.
#'
#' @return A mutated data frame with added or modified columns based on `formula_expr`.
#'
#' @details
#' The function performs various checks and transformations on the formula expressions,
#' ensuring that the specified transformations are valid and can be applied to the data frame.
#' It supports advanced features like handling special characters in column names and intelligent
#' parsing of formulas.
#'
#' @importFrom purrr map2_dfc
#' @importFrom tibble add_column
#' @importFrom tidyselect last_col
#' @importFrom dplyr mutate
#' @importFrom stringr str_subset
#'
#'
#' @noRd
#'
mutate_from_expr_list = function(x, formula_expr, ignore_errors = TRUE){

if(formula_expr |> names() |> is.null())
Expand Down Expand Up @@ -2599,6 +2622,7 @@ add_formula_columns = function(.data, .original_data, .sample, formula_composit
#' cleaned_texts <- str_remove_all_ignoring_if_inside_backquotes(texts, "\\(")
#' print(cleaned_texts)
#'
#' @noRd
str_remove_all_ignoring_if_inside_backquotes <- function(text_vector, regex) {
# Nested function to handle regex removal for a single string
remove_regex_chars <- function(text, regex) {
Expand Down Expand Up @@ -2662,6 +2686,7 @@ str_remove_all_ignoring_if_inside_backquotes <- function(text_vector, regex) {
#' split_texts <- split_regex_chars_from_vector(texts, ",")
#' print(split_texts)
#'
#' @noRd
str_split_ignoring_if_inside_backquotes <- function(text_vector, regex) {
# Nested function to handle regex split for a single string
split_regex_chars <- function(text, regex) {
Expand Down Expand Up @@ -2720,7 +2745,7 @@ str_split_ignoring_if_inside_backquotes <- function(text_vector, regex) {
#' contains_only_valid_chars_for_column(c("valid_column", "invalid column", "valid123",
#' "123startWithNumber", "_startWithUnderscore"))
#'
#' @export
#' @noRd
contains_only_valid_chars_for_column <- function(column_names) {
# Function to check a single column name
check_validity <- function(column_name) {
Expand Down Expand Up @@ -2755,7 +2780,7 @@ contains_only_valid_chars_for_column <- function(column_names) {
#' @examples
#' str_remove_brackets_from_formula_intelligently(c("This is a test (with + brackets)", "`a (test) inside` backticks", "(another test)"))
#'
#' @export
#' @noRd
str_remove_brackets_from_formula_intelligently <- function(text) {
# Function to remove brackets from a single string
remove_brackets_single <- function(s) {
Expand Down
1 change: 0 additions & 1 deletion man/plot_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/sccomp_boxplot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 40e0d97

Please sign in to comment.