diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 0f3966e..eb2591f 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -41,10 +41,10 @@ jobs: library(lintr) style_rules <- list( brace_linter(), todo_comment_linter(), equals_na_linter(), - function_left_parentheses_linter(), no_tab_linter(), + function_left_parentheses_linter(), whitespace_linter(), absolute_path_linter(), nonportable_path_linter(), pipe_continuation_linter(), semicolon_linter(), - single_quotes_linter(), trailing_blank_lines_linter(), trailing_whitespace_linter(), + quotes_linter(), trailing_blank_lines_linter(), trailing_whitespace_linter(), undesirable_function_linter(), undesirable_operator_linter() ) # TODO: expand style rules as package matures lint_package(linters = style_rules) diff --git a/NAMESPACE b/NAMESPACE index 6e27645..73f2986 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ S3method(coef,gmjmcmc) S3method(coef,gmjmcmc_merged) S3method(coef,mjmcmc) S3method(coef,mjmcmc_parallel) +S3method(fitted,fbms_predict) S3method(get.best.model,gmjmcmc) S3method(get.best.model,gmjmcmc_merged) S3method(get.best.model,mjmcmc) @@ -50,7 +51,6 @@ export(erf) export(exp_dbl) export(fbms) export(fbms.mlik.master) -export(fitted.fbms_predict) export(gaussian.loglik) export(gelu) export(gen.params.gmjmcmc) diff --git a/R/summary.R b/R/summary.R index a947243..d9d8e3a 100644 --- a/R/summary.R +++ b/R/summary.R @@ -1,7 +1,7 @@ #' Function to Print a Quick Summary of the Results #' #' @param object The results to use -#' @param pop The population to print for, defaults to last +#' @param pop The population to print for, defaults to "best", other options are "last" and "all" #' @param tol The tolerance to use as a threshold when reporting the results. #' @param labels Should the covariates be named, or just referred to as their place in the data.frame. #' @param effects Quantiles for posterior modes of the effects across models to be reported, if either effects are NULL or if labels are NULL, no effects are reported. diff --git a/R_script/appendix_script.R b/R_script/appendix_script.R new file mode 100644 index 0000000..37ce72d --- /dev/null +++ b/R_script/appendix_script.R @@ -0,0 +1,213 @@ + +#################################################################### +# Install INLA and RTMB packages (consult with IT if installation fails, +# which may occasionally happen for INLA as it is not on CRAN). +################################################################### + + + +if (!requireNamespace("RTMB", quietly = TRUE)) { + message("Trying to install optional package RTMB...") + try(install.packages("RTMB"), silent = TRUE) +} + +if (!requireNamespace("INLA", quietly = TRUE)) { + message("Trying to install optional package INLA...") + + # Try to load the installer (only if previously installed) + if (!requireNamespace("INLA", quietly = TRUE)) { + tryCatch( + { + install.packages("INLA",repos=c(getOption("repos"),INLA="https://inla.r-inla-download.org/R/stable"), dep=TRUE) + }, + error = function(e) { + message("INLA could not be installed; continuing without it.") + } + ) + } +} + + +################################################################ +################################################################ +# +# EXAMPLE 2: MIXED MODELS WITH FRACTIONAL POLYNOMIALS +# +# Section 4 of the article +# +################################################################ +################################################################ + + +library(FBMS) + + + +############################################################### +# 2.0 Load Zambia data (requires cAIC4) +############################################################### +if (!requireNamespace("cAIC4", quietly = TRUE)) { + stop("Optional package 'cAIC4' is required for Example 2. Please install it.") +} + +data(Zambia, package = "cAIC4") +df <- as.data.frame(sapply(Zambia[1:5],scale)) + + +transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2", + "p0p0","p0p05","p0p1","p0p2","p0p3", + "p0p05","p0pm05","p0pm1","p0pm2") + + +probs <- gen.probs.gmjmcmc(transforms) +probs$gen <- c(1/3,1/3,0,1/3) # Modifications and interactions! + +params <- gen.params.gmjmcmc(ncol(df) - 1) +params$feat$D <- 1 # Set depth of features to 1 (still allows for interactions) +params$feat$pop.max = 10 + + + +############################################################### +# 2.1 Define custom log-likelihoods for INLA, RTMB +############################################################### + +# --------------------------------------------------------------- +# INLA version (only used if INLA is properly installed) +mixed.model.loglik.inla <- function (y, x, model, complex, mlpost_params) +{ + if(sum(model)>1) + { + data1 = data.frame(y, as.matrix(x[,model]), mlpost_params$dr) + formula1 = as.formula(paste0(names(data1)[1],"~",paste0(names(data1)[3:(dim(data1)[2]-1)],collapse = "+"),"+ f(mlpost_params.dr,model = \"iid\")")) + } else + { + data1 = data.frame(y, mlpost_params$dr) + formula1 = as.formula(paste0(names(data1)[1],"~","1 + f(mlpost_params.dr,model = \"iid\")")) + } + + #to make sure inla is not stuck + inla.setOption(inla.timeout=30) + inla.setOption(num.threads=mlpost_params$INLA.num.threads) + + mod<-NULL + #importance with error handling for unstable libraries that one does not trust 100% + tryCatch({ + mod <- inla(family = "gaussian",silent = 1L,safe = F, data = data1,formula = formula1) + }, error = function(e) { + + # Handle the error by setting result to NULL + mod <- NULL + + # You can also print a message or log the error if needed + cat("An error occurred:", conditionMessage(e), "\n") + }) + + # logarithm of model prior + if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r + lp <- log_prior(mlpost_params, complex) + + if(length(mod)<3||length(mod$mlik[1])==0) { + return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) + } else { + mloglik <- mod$mlik[1] + return(list(crit = mloglik + lp, coefs = mod$summary.fixed$mode)) + } +} + +# --------------------------------------------------------------- +# RTMB version (only used if RTMB is properly installed) +mixed.model.loglik.rtmb <- function (y, x, model, complex, mlpost_params) +{ + z = model.matrix(y~mlpost_params$dr) #Design matrix for random effect + + msize = sum(model) + #Set up and estimate model + dat = list(y = y, xm = x[,model], z = z) + par = list(logsd_eps = 0, + logsd_dr = 0, + beta = rep(0,msize), + u = rep(0,mlpost_params$nr_dr)) + + nll = function(par){ + getAll(par,dat) + sd_eps = exp(logsd_eps) + sd_dr = exp(logsd_dr) + + nll = 0 + #-log likelihood random effect + nll = nll - sum(dnorm(u, 0, sd_dr, log = TRUE)) + mu = as.vector(as.matrix(xm)%*%beta) + z%*%u + nll <- nll - sum(dnorm(y, mu, sd_eps, log = TRUE)) + + return(nll) + } + obj <- MakeADFun(nll , par, random = "u", silent = T ) + opt <- nlminb ( obj$par , obj$fn , obj$gr, control = list(iter.max = 10)) + + # logarithm of model prior + if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r + lp <- log_prior(mlpost_params, complex) + + mloglik <- -opt$objective - 0.5*log(dim(x)[1])*msize + return(list(crit = mloglik + lp, coefs = opt$par[-(1:2)])) +} + + +############################################################### +# 2.2 Small demonstration run for runtime comparisons +############################################################### + +set.seed(3052024) + +library(tictoc) + + +time.inla <- -1 +if (requireNamespace("INLA", quietly = TRUE)) { + library(INLA) + library(cAIC4) + + data(Zambia, package = "cAIC4") + df <- as.data.frame(sapply(Zambia[1:5],scale)) + + tic() + result1b <- fbms( + formula = z ~ 1+., data = df, + transforms = transforms, + method = "gmjmcmc", P = 3, N = 30, + family = "custom", + loglik.pi = mixed.model.loglik.inla, + model_prior = list(r = 1/nrow(df)), + extra_params = list(dr = droplevels(Zambia$dr), + INLA.num.threads = 4) + ) + time.inla <- toc() +} + +time.rtmb <- -1 +if (requireNamespace("RTMB", quietly = TRUE)) { + library(RTMB) + + data(Zambia, package = "cAIC4") + df <- as.data.frame(sapply(Zambia[1:5],scale)) + + + tic() + result1c <- fbms( + formula = z ~ 1+., data = df, + transforms = transforms, + method = "gmjmcmc", P = 3, N = 30, + family = "custom", + loglik.pi = mixed.model.loglik.rtmb, + model_prior = list(r = 1/nrow(df)), + extra_params = list( + dr = droplevels(Zambia$dr), + nr_dr = sum(table(Zambia$dr) > 0) + ) + ) + time.rtmb <- toc() +} + +cat(c(time.inla$callback_msg, time.rtmb$callback_msg)) + diff --git a/R_script/reproducible_script.R b/R_script/reproducible_script.R new file mode 100644 index 0000000..4037edc --- /dev/null +++ b/R_script/reproducible_script.R @@ -0,0 +1,394 @@ +############################################################### +# FBMS Reproducibility Script (JSS Submission) +# ------------------------------------------------------------- +# This script reproduces examples in: +# +# FBMS: Flexible Bayesian Model Selection and Model Averaging +# +# It installs the correct package versions and runs the two +# main examples used in the article. +# +# The script uses minimal, readable checks suitable for JSS: +# - Mandatory packages are installed if missing +# - Optional packages are installed if possible; otherwise skipped +# - FBMS is always installed from a dedicated GitHub branch "jss_v2" +############################################################### + + + +############################################################### +# 0. Helper: Install a mandatory package (stop if fails) +############################################################### + +install_mandatory <- function(pkg) { + if (!requireNamespace(pkg, quietly = TRUE)) { + message("Installing mandatory package: ", pkg) + tryCatch( + install.packages(pkg), + error = function(e) { + stop("Failed to install mandatory package ", pkg, call. = FALSE) + } + ) + } +} + +############################################################### +# 1. Install mandatory packages +############################################################### + +mandatory_pkgs <- c("devtools", "parallel", "tictoc", "lme4","cAIC4") + +for (p in mandatory_pkgs) install_mandatory(p) + +library(devtools) + + +############################################################### +# 2. Install FBMS (always from GitHub to enforce correct version) +############################################################### + +message("Installing FBMS from GitHub (branch softwareX)...") +install_github("jonlachmann/FBMS@jsoftwareX", + force = TRUE, build_vignettes = FALSE) + +library(FBMS) +library(tictoc) + +################################################################ +################################################################ +# +# EXAMPLE 1: EXOPLANET DATA +# +# Section 3 of the article +# +################################################################ +################################################################ + +library(FBMS) +data(exoplanet) + +train.indx <- 1:500 +df.train = exoplanet[train.indx, ] +df.test = exoplanet[-train.indx, ] + +to3 <- function(x) x^3 +transforms <- c("sigmoid", "sin_deg", "exp_dbl", "p0", "troot", "to3") + + +############################################################### +# Example 1.1 — Default single-thread GMJMCMC (Section 3) +############################################################### +set.seed(123) + +result.default <- fbms( + formula = semimajoraxis ~ 1 + ., + data = df.train, + method = "gmjmcmc", + transforms = transforms +) + + +############################################################### +# Example 1.2 — Alternative priors +############################################################### + +set.seed(234) +result.BIC <- fbms( + formula = semimajoraxis ~ 1 + ., + data = df.train, + method = "gmjmcmc", + transforms = transforms, + beta_prior = list(type = "Jeffreys-BIC", Var = "unknown") +) + +set.seed(345) +result.EB <- fbms( + formula = semimajoraxis ~ 1 + ., + data = df.train, + method = "gmjmcmc", + transforms = transforms, + beta_prior = list(type = "EB-global", a = 1) +) + + +############################################################### +# Example 1.3 — Longer single-thread run +############################################################### +set.seed(123) + +result.P50 <- fbms( + data = df.train, + method = "gmjmcmc", + transforms = transforms, + P = 50, N = 1000, N.final = 5000 +) + + +############################################################### +# Example 1.4 — Parallel GMJMCMC +############################################################### +set.seed(123) + +result.parallel <- fbms( + data = df.train, + method = "gmjmcmc.parallel", + transforms = transforms, + runs = 40, + cores = parallel::detectCores() - 1, + P = 25 +) + + +############################################################### +# Example 1.5 — Summaries and plotting +############################################################### + +summary(result.default) +summary(result.default, pop = "all", labels = paste0("x",1:length(df.train[,-1]))) + + +summary(result.P50) +summary(result.P50, pop = "best", labels = paste0("x",1:length(df.train[,-1]))) +summary(result.P50, pop = "last", labels = paste0("x",1:length(df.train[,-1]))) +summary(result.P50, pop = "last", tol = 0.01, labels = paste0("x",1:length(df.train[,-1]))) +summary(result.P50, pop = "all") + +summary(result.parallel) +library(tictoc) +tic() +summary(result.parallel, tol = 0.01, pop = "all",data = df.train) +toc() + + + +plot(result.default) +plot(result.P50) +plot(result.parallel) + + + +############################################################### +# Example 1.6 — Prediction +############################################################### +preds <- predict(result.default, df.test[,-1]) +str(aggr(preds)) + +rmse.default <- sqrt(mean((predmean(preds) - df.test$semimajoraxis)^2)) +plot(predmean(preds), df.test$semimajoraxis) + + +############################### + +preds.P50 <- predict(result.P50, df.test[,-1]) +rmse.P50 <- sqrt(mean((predmean(preds.P50) - df.test$semimajoraxis)^2)) +plot(predmean(preds.P50), df.test$semimajoraxis) + + +############################### + + +preds.multi <- predict(result.parallel , df.test[,-1], link = function(x) x) +rmse.parallel <- sqrt(mean((predmean(preds.multi) - df.test$semimajoraxis)^2)) +plot(predmean(preds.multi), df.test$semimajoraxis) + + +round(c(rmse.default, rmse.P50, rmse.parallel),2) + + +############################################################### +# Example 1.7 — Best model & MPM +############################################################### + +best.default <- get.best.model(result.default) +mpm.default <- get.mpm.model(result.default, + y = df.train$semimajoraxis, + x = df.train[,-1]) + +sqrt(mean((predict(best.default, df.test[,-1]) - + df.test$semimajoraxis)^2)) + +sqrt(mean((predict(mpm.default, df.test[,-1]) - + df.test$semimajoraxis)^2)) + +################################################################ +################################################################ +# +# EXAMPLE 2: MIXED MODELS WITH FRACTIONAL POLYNOMIALS +# +# Section 4 of the article +# +################################################################ +################################################################ + +rm(list = ls()) +library(FBMS) + + +############################################################### +# 2.0 Load Zambia data (requires cAIC4) +############################################################### +if (!requireNamespace("lme4", quietly = TRUE)) { + stop("Optional package 'lme4' is required for Example 2. Please install it.") +} +if (!requireNamespace("tictoc", quietly = TRUE)) { + stop("Optional package 'tictoc' is required for Example 2. Please install it.") +} +if (!requireNamespace("cAIC4", quietly = TRUE)) { + stop("Optional package 'cAIC4' is required for Example 2. Please install it.") +} + +library(tictoc) +library(lme4) + + +data(Zambia, package = "cAIC4") +df <- as.data.frame(sapply(Zambia[1:5],scale)) + + +transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2", + "p0p0","p0p05","p0p1","p0p2","p0p3", + "p0p05","p0pm05","p0pm1","p0pm2") + + +probs <- gen.probs.gmjmcmc(transforms) +probs$gen <- c(1/3,1/3,0,1/3) # Modifications and interactions! + +params <- gen.params.gmjmcmc(ncol(df) - 1) +params$feat$D <- 1 # Set depth of features to 1 (still allows for interactions) +params$feat$pop.max <- 10 + + + +############################################################### +# 2.1 Define custom log-likelihoods for lme4 +############################################################### + +# lme4 version +mixed.model.loglik.lme4 <- function (y, x, model, complex, mlpost_params) +{ + + # logarithm of marginal likelihood (Laplace approximation) + if (sum(model) > 1) { + x.model = x[,model] + data <- data.frame(y, x = x.model[,-1], dr = mlpost_params$dr) + + mm <- lmer(as.formula(paste0("y ~ 1 +",paste0(names(data)[2:(dim(data)[2]-1)],collapse = "+"), "+ (1 | dr)")), data = data, REML = FALSE) + } else{ #model without fixed effects + data <- data.frame(y, dr = mlpost_params$dr) + mm <- lmer(as.formula(paste0("y ~ 1 + (1 | dr)")), data = data, REML = FALSE) + } + + mloglik <- as.numeric(logLik(mm)) - 0.5*log(length(y)) * (dim(data)[2] - 2) #Laplace approximation + + # logarithm of model prior + if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r + lp <- log_prior(mlpost_params, complex) + + + return(list(crit = mloglik + lp, coefs = fixef(mm))) +} + +############################################################### +# 2.2 Small demonstration run for runtime comparisons +############################################################### + + +set.seed(03052024) + +tic() +result1a <- fbms(formula = z ~ 1+., data = df, transforms = transforms, + method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, + family = "custom", loglik.pi = mixed.model.loglik.lme4, + model_prior = list(r = 1/dim(df)[1]), + extra_params = list(dr = droplevels(Zambia$dr))) +time.lme4 <- toc() + + +cat(c(time.lme4$callback_msg)) + +############################################################### +# 2.3 Serious analysis with lme4 (Section 4). Runs within time +# constraints of JSS on Apple M1 Max 32 GB, but can be slower +# on older machines. Please, set run.long.mixed = FALSE, if the +# example exceeds reasonable time. +############################################################### + +# Specify if to run long chains under mixed effect models. +# Default is false as these chains an run longer than 20 minutes +# depending on the machines used. +run.long.mixed <- TRUE + +if(run.long.mixed) +{ + probs <- gen.probs.gmjmcmc(transforms) + params <- gen.params.gmjmcmc(ncol(df) - 1) + params$feat$D <- 1 + params$feat$pop.max <- 10 + + + # No nonlinear features + result2a <- fbms( + formula = z ~ 1+., data = df, + N = 5000, + method = "mjmcmc.parallel", + runs = 40, + cores = parallel::detectCores() - 1, + family = "custom", + loglik.pi = mixed.model.loglik.lme4, + model_prior = list(r = 1/nrow(df)), + extra_params = list(dr = droplevels(Zambia$dr)) + ) + + summary(result2a, labels = names(df)[-1]) + plot(result2a) + + + # Fractional polynomials + result2b <- fbms( + formula = z ~ 1+., data = df, + transforms = transforms, probs = probs, params = params, + P = 25, N = 100, + method = "gmjmcmc.parallel", + runs = 40, + cores = parallel::detectCores() - 1, + family = "custom", + loglik.pi = mixed.model.loglik.lme4, + model_prior = list(r = 1/nrow(df)), + extra_params = list(dr = droplevels(Zambia$dr)) + ) + + summary(result2b, tol = 0.05, labels = names(df)[-1]) + + + # Non-linear projections + transforms.sigmoid <- c("sigmoid") + probs.sigmoid <- gen.probs.gmjmcmc(transforms.sigmoid) + probs.sigmoid$gen <- c(0, 0, 0.5, 0.5) + + params.sigmoid <- gen.params.gmjmcmc(ncol(df) - 1) + params.sigmoid$feat$pop.max <- 10 + + result2c <- fbms( + formula = z ~ 1+., data = df, + transforms = transforms.sigmoid, + probs = probs.sigmoid, + params = params.sigmoid, + P = 25, N = 100, + method = "gmjmcmc.parallel", + runs = 40, + cores = parallel::detectCores() - 1, + family = "custom", + loglik.pi = mixed.model.loglik.lme4, + model_prior = list(r = 1/nrow(df)), + extra_params = list(dr = droplevels(Zambia$dr)) + ) + + summary(result2c, tol = 0.05, labels = names(df)[-1]) + + + # Comparison + summary(result2a, labels = names(df)[-1]) + summary(result2b, labels = names(df)[-1]) + summary(result2c, labels = names(df)[-1]) +} + diff --git a/man/fitted.fbms_predict.Rd b/man/fitted.fbms_predict.Rd index dac0ade..a0e78c1 100644 --- a/man/fitted.fbms_predict.Rd +++ b/man/fitted.fbms_predict.Rd @@ -4,7 +4,7 @@ \alias{fitted.fbms_predict} \title{Access Fitted Values} \usage{ -fitted.fbms_predict(object, ...) +\method{fitted}{fbms_predict}(object, ...) } \arguments{ \item{object}{Object of class "fbms_predict".} diff --git a/man/summary.gmjmcmc.Rd b/man/summary.gmjmcmc.Rd index 7a84158..e7a7d65 100644 --- a/man/summary.gmjmcmc.Rd +++ b/man/summary.gmjmcmc.Rd @@ -18,7 +18,7 @@ \arguments{ \item{object}{The results to use} -\item{pop}{The population to print for, defaults to last} +\item{pop}{The population to print for, defaults to "best", other options are "last" and "all"} \item{tol}{The tolerance to use as a threshold when reporting the results.} diff --git a/tests_current/Ex10_Sec6_3.R b/tests_current/Ex10_Sec6_3.R deleted file mode 100644 index 433b8db..0000000 --- a/tests_current/Ex10_Sec6_3.R +++ /dev/null @@ -1,102 +0,0 @@ -####################################################### -# -# Example 10 (Section 6.3): Epil data set from the INLA package -# -# Mixed Effect Poisson Model with Fractional Polynomials, using only fbms -# -# This is the valid version for the JSS Paper -# -####################################################### - -library(FBMS) -library(INLA) -library(tictoc) - -data <- INLA::Epil -data <- data[,-c(5,6)] - -df <- data[1:5] -df$V2 <- rep(c(0,1,0,0),59) -df$V3 <- rep(c(0,0,1,0),59) -df$V4 <- rep(c(0,0,0,1),59) - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,1,0,1) # Only modifications! - -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$D <- 2 # Set depth of features to 2 (allow for interactions) -params$feat$keep.min <- 0.2 -params$greedy$steps <- 2 -params$greedy$tries <- 1 -params$sa$t.min <- 0.1 -params$sa$dt <- 10 - -# function to estimate log posterior -poisson.loglik.inla <- function (y, x, model, complex, mlpost_params) -{ - if(sum(model)>1) - { - data1 <- data.frame(y, as.matrix(x[,model]), mlpost_params$PID) - formula1 <- as.formula(paste0(names(data1)[1],"~",paste0(names(data1)[3:(dim(data1)[2]-1)],collapse = "+"),"+ f(mlpost_params.PID,model = \"iid\")")) - } else - { - data1 <- data.frame(y, mlpost_params$PID) - formula1 <- as.formula(paste0(names(data1)[1],"~","1 + f(mlpost_params.PID,model = \"iid\")")) - } - - #to make sure inla is not stuck - inla.setOption(inla.timeout=30) - inla.setOption(num.threads=mlpost_params$INLA.num.threads) - - mod<-NULL - - #error handling for unstable libraries that might crash - tryCatch({ - mod <- inla(family = "poisson",silent = 1L,safe = F,data = data1,formula = formula1) - }, error = function(e) { - # Handle the error by setting result to NULL - mod <- NULL - # Print a message or log the error if needed - cat("An error occurred:", conditionMessage(e), "\n") - }) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - if(length(mod)<3||length(mod$mlik[1])==0) { - return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) - } else { - mloglik <- mod$mlik[1] - return(list(crit = mloglik + lp, coefs = mod$summary.fixed$mode)) - } -} - -set.seed(03052024) -#specify indices for a random effect - -result <- fbms(formula = y ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc", probs = probs, params = params, P=25, N = 100, - family = "custom", loglik.pi = poisson.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(PID = data$Ind, INLA.num.threads = 1)) - -plot(result) -summary(result) - - -set.seed(23052024) - -tic() -# Number of threads used by INLA set to 1 to avoid conflicts between two layers of parallelization -result2 <- fbms(formula = y ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = poisson.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(PID = data$Ind, INLA.num.threads = 1)) -time.inla <- toc() - -plot(result2) -summary(result2, labels = names(df)[-1], tol = 0.01) \ No newline at end of file diff --git a/tests_current/Ex11_Sec6_4.R b/tests_current/Ex11_Sec6_4.R deleted file mode 100644 index 92b10fb..0000000 --- a/tests_current/Ex11_Sec6_4.R +++ /dev/null @@ -1,185 +0,0 @@ -####################################################### -# -# Example 11 (Section 6.4): -# -# Subsampling, using only fbms -# -# Heart Disease Health Indicators Dataset” -# -# This is the valid version for the JSS Paper -# -####################################################### - -library(tictoc) -library(FBMS) -#library(devtools) -#devtools::install_github("jonlachmann/irls.sgd", force=T, build_vignettes=F) -library(irls.sgd) -#Kaggle API -library(RKaggle) - -# Download latest version -df <- RKaggle::get_dataset("alexteboul/heart-disease-health-indicators-dataset") - -summary(df) -dim(df) - - - -#number of observations and covariates in the data - -n <- dim(df)[1] -p <- dim(df)[2] - 1 - -params <- gen.params.gmjmcmc(p) -transforms <- c("sigmoid","pm1","p0","p05","p2","p3") - -r = 0.01 # Parameter for the model prior - -logistic.posterior.bic.irlssgd <- function (y, x, model, complex, mlpost_params) -{ - if (!is.null(mlpost_params$crit)) { - mod <- glm.sgd(x[,model], y, binomial(), - sgd.ctrl = list(start=mlpost_params$coefs, subs=mlpost_params$subs, - maxit=10, alpha=0.00008, decay=0.99, histfreq=10)) - mod$deviance <- get_deviance(mod$coefficients, x[,model], y, binomial()) - mod$rank <- length(mod$coefficients) - } else { - mod <- irls.sgd(as.matrix(x[,model]), y, binomial(), - irls.control=list(subs=mlpost_params$subs, maxit=20, tol=1e-7, - cooling = c(1,0.9,0.75), expl = c(3,1.5,1)), - sgd.control=list(subs=mlpost_params$subs, maxit=250, alpha=0.001, - decay=0.99, histfreq=10)) - } - - # logarithm of marginal likelihood - mloglik <- -mod$deviance / 2 - 0.5 * log(length(y)) * (mod$rank - 1) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - crit <- mloglik + lp - - if (!is.null(mlpost_params$crit) && mlpost_params$crit > crit) { - return(list(crit = mlpost_params$crit, coefs = mlpost_params$coefs)) - } - - return(list(crit = crit, coefs = mod$coefficients)) -} - - - -############################ -# -# Testing runtime -# -############################ - -set.seed(100001) -tic() -result1 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 2, - transforms = transforms, params = params, method = "gmjmcmc", - family = "custom", loglik.pi = logistic.posterior.bic.irlssgd, - model_prior = list(r = r, subs = 0.01), sub = T) -time1 <- toc() - -set.seed(100002) -# regular analysis -tic() -result2 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 2, - transforms = transforms, params = params, method = "gmjmcmc", - family = "binomial", beta_prior = list(type = "Jeffreys-BIC"), - model_prior = list(r = r)) -time2 <- toc() - -c(time1, time2) - -############################ -# -# More serious analysis -# -############################ - - -# with subsampling - -set.seed(100003) - -tic() -result_parallel_1 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 3, - transforms = transforms, params = params, - method = "gmjmcmc.parallel", runs = 10, cores = 10, - family = "custom", loglik.pi = logistic.posterior.bic.irlssgd, - model_prior = list(r = r, subs = 0.01), sub = T) -time3 <- toc() - -summary(result_parallel_1) - -# without subsampling - - -set.seed(100004) - -tic() -result_parallel_2 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 3, - transforms = transforms, params = params, family = "binomial", - method = "gmjmcmc.parallel", runs = 10, cores = 10, - model_prior = list(r = r), - beta_prior = list(type = "Jeffreys-BIC")) -time4 <- toc() - -summary(result_parallel_2) - -filename = paste0("Ex11_Results_",r,"_4.RData") -save.image(filename) - -############################ -# -# Final analysis -# -############################ - -# with subsampling - -set.seed(100005) -tic() -result_parallel_long_1 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 10, - transforms = transforms, params = params, N = 500, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = logistic.posterior.bic.irlssgd, - model_prior = list(r = r, subs = 0.01), sub = T) - -time5 <- toc() -summary(result_parallel_long_1) - -filename = paste0("Ex11_Results_",r,"_5.RData") -save.image(filename) - -# regular analysis - - -set.seed(100006) - -tic() -result_parallel_long_2 <- fbms(formula = HeartDiseaseorAttack ~ 1 + ., data = df, P = 10, - transforms = transforms, params = params, family = "binomial", - method = "gmjmcmc.parallel", runs = 40, cores = 40, N = 500, - model_prior = list(r = r), - beta_prior = list(type = "Jeffreys-BIC")) -time6 <- toc() - - -summary(result_parallel_long_2) - - -############################################################################ - -C = cor(df, use = "everything", - method = "spearman") - -corrplot::corrplot(C) - -apply((abs(C - diag(diag(C)))), 2, max) - -filename = paste0("Ex11_Results_",r,".RData") -save.image(filename) \ No newline at end of file diff --git a/tests_current/Ex12_Sec6_5.R b/tests_current/Ex12_Sec6_5.R deleted file mode 100644 index c159f0d..0000000 --- a/tests_current/Ex12_Sec6_5.R +++ /dev/null @@ -1,322 +0,0 @@ -####################################################### -# -# Example 13 (Section 6.5): -# -# Cox Regression (using only fbms) -# -# This is the valid version for the JSS Paper -# -####################################################### - -#install.packages("FBMS") -library(FBMS) -library(pec) #for the computation of cindex - -#install.packages("survival") -library(survival) - - -# Download data -download.file('https://www.uniklinik-freiburg.de/fileadmin/mediapool/08_institute/biometrie-statistik/Dateien/Studium_und_Lehre/Lehrbuecher/Multivariable_Model-building/gbsg_br_ca.zip', - 'gbsg_br_ca.zip') -df1 <- read.csv(unz('gbsg_br_ca.zip', - 'gbsg_br_ca/gbsg_br_ca.csv'), - header = TRUE) -file.remove("gbsg_br_ca.zip") - -# Prepare data -df <- df1[, c(13, 14, 2:4, 6:8, 10:12)] -names(df) = c("time","cens",names(df)[3:ncol(df)]) - -# Split into training and test set -set.seed(123) -train <- c(sample((1:nrow(df))[df$cens == 1], sum(df$cens)*2/3), # split separately events - sample((1:nrow(df))[df$cens == 0], sum(!df$cens)*2/3)) # and censored observations - -df.train <- df[train,] -df.test <- df[-train,] - -# time will be used as an extra parameter in the custom function -time <- df.train$time - - -params <- gen.params.gmjmcmc(ncol(df.train) - 2) -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,1,0,0) - - -# specify the custom function to estimate log posterior for cox -surv.pseudo.loglik = function(y, x, model, complex, mlpost_params){ - - data <- data.frame(time = mlpost_params$time, cens = y, as.matrix(x[,model]))[,-3] # Removing intercept - if(dim(data)[2]==2) - { - return(list(crit=-.Machine$double.xmax, coefs=rep(0,1))) - } else { - formula1 <- as.formula(paste0("Surv(time,cens)","~ 1 + .")) - - out <- coxph(formula1, data = data) - - # logarithm of marginal likelihood - mloglik <- (out$loglik[2] - out$loglik[1]) - log(length(y)) * (dim(data)[2] - 2)/2 - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - # Compute criterion and consider special cases like multicollinearity - - crit <- mloglik + lp - if(sum(is.na(out$coefficients))>0) #Get rid of models with collinearities (with more than two features) - crit <- -.Machine$double.xmax - - return(list(crit = crit, coefs = c(0,out$coefficients))) - } -} - - - -####################################################### -# -# Analysis with 4 different modeling strategies -# -# - - -# 1) Single chain analysis (just to illustrate how it works) -set.seed(121) -result1 <- fbms(formula = cens ~ 1 + .,data = df.train[,-1], params = params, P = 5, - transforms = transforms, method = "gmjmcmc", - family = "custom", loglik.pi = surv.pseudo.loglik, - model_prior = list(r = 0.5), - extra_params = list(time = time)) - - -summary(result1,labels = names(df.train)[-(1:2)]) - -# 2) Parallel version only linear terms - - -set.seed(122) -result2 <- fbms(formula = cens ~ 1 + .,data = df.train[,-1], params = params, - method = "mjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = surv.pseudo.loglik, - model_prior = list(r = 0.5), extra_params = list(time = time)) - -summary(result2,tol = 0.01,labels = names(df.train)[-(1:2)],effects = c(0.025,0.5,0.975)) - - - -# 3) Parallel version only fractional polynomials - -set.seed(123) -probs$gen <- c(0,1,0,1) -params$feat$D <- 1 - -result3 <- fbms(formula = cens ~ 1 + .,data = df.train[,-1], params = params, probs = probs, P = 10, - transforms = transforms, method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = surv.pseudo.loglik, - model_prior = list(r = 0.5), extra_params = list(time = time)) - - -summary(result3,tol = 0.01, effects = c(0.025,0.5,0.975)) - - - -# 4) Parallel version using all types of non-linear features -set.seed(124) -probs$gen <- c(1,1,1,1) -params$feat$D <- 5 -result4 <- fbms(formula = cens ~ 1 + .,data = df.train[,-1], params = params, probs = probs,P = 20, - transforms = transforms, method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = surv.pseudo.loglik, - model_prior = list(r = 0.5), extra_params = list(time = time)) - - -summary(result4,tol = 0.01) - - - - - - -################################################ -# -# Prediction and C index using model averaging -# -################################################ - - -linpreds1.train <- predict(result1,df.train[,-(1:2)], link = function(x) x) -linpreds1 <- predict(result1,df.test[,-(1:2)], link = function(x) x) - -linpreds2.train <- predict(result2,df.train[,-(1:2)], link = function(x) x) -linpreds2 <- predict(result2,df.test[,-(1:2)], link = function(x) x) - -linpreds3.train <- predict(result3,df.train[,-(1:2)], link = function(x) x) -linpreds3 <- predict(result3,df.test[,-(1:2)], link = function(x) x) - -linpreds4.train <- predict(result4,df.train[,-(1:2)], link = function(x) x) -linpreds4 <- predict(result4,df.test[,-(1:2)], link = function(x) x) - - - -df.train$average.lin.pred1 <- linpreds1.train$aggr$mean -df.train$average.lin.pred2 <- linpreds2.train$aggr$mean -df.train$average.lin.pred3 <- linpreds3.train$aggr$mean -df.train$average.lin.pred4 <- linpreds4.train$aggr$mean - -df.test$average.lin.pred1 <- linpreds1$aggr$mean -df.test$average.lin.pred2 <- linpreds2$aggr$mean -df.test$average.lin.pred3 <- linpreds3$aggr$mean -df.test$average.lin.pred4 <- linpreds4$aggr$mean - - - -# Compute cindex using package pec - -mod1 <- coxph(Surv(time, cens) ~ average.lin.pred1, data = as.data.frame(df.train), x = TRUE) -cindex1 <- cindex(mod1, mod1$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod2 <- coxph(Surv(time, cens) ~ average.lin.pred2, data = as.data.frame(df.train), x = TRUE) -cindex2 <- cindex(mod2, mod2$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod3 <- coxph(Surv(time, cens) ~ average.lin.pred3, data = as.data.frame(df.train), x = TRUE) -cindex3 <- cindex(mod3, mod3$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod4 <- coxph(Surv(time, cens) ~ average.lin.pred4, data = as.data.frame(df.train), x = TRUE) -cindex4 <- cindex(mod4, mod4$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - - -#Full model without nonlinearities (for the sake of comparison) -mod5 <- coxph(Surv(time, cens) ~ 1+., data = as.data.frame(df.train[,1:11]),x = T) -cindex5 <- cindex(mod5, mod5$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -#Model without predictors (for the sake of comparison) -mod6 <- coxph(Surv(time, cens) ~ 1, data = as.data.frame(df.train[,1:11]),x = T) -cindex6 <- cindex(mod6, mod6$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -all.cindices = round(unlist(c(cindex1, cindex2, cindex3, cindex4, cindex5, cindex6)),3) -names(all.cindices) = c("Model 1", "Model 2", "Model 3", "Model 4", "Full Linear Model", "Null Model") - -# Clean the train and test data for the next type of predictions - -df.train <- df[train,] -df.test <- df[-train,] - -############################################## -# -# Prediction and C index using best model -# -############################################## - - -linpreds.train.best <- predict(get.best.model(result1),df.train[,-(1:2)], link = function(x) x) -linpreds.best <- predict(get.best.model(result1),df.test[,-(1:2)], link = function(x) x) - - -linpreds2.train.best <- predict(get.best.model(result2),df.train[,-(1:2)], link = function(x) x) -linpreds2.best <- predict(get.best.model(result2),df.test[,-(1:2)], link = function(x) x) - - -linpreds3.train.best <- predict(get.best.model(result3),df.train[,-(1:2)], link = function(x) x) -linpreds3.best <- predict(get.best.model(result3),df.test[,-(1:2)], link = function(x) x) - - -linpreds4.train.best <- predict(get.best.model(result4),df.train[,-(1:2)], link = function(x) x) -linpreds4.best <- predict(get.best.model(result4),df.test[,-(1:2)], link = function(x) x) - - -df.train$best.lin.pred1 <- linpreds.train.best -df.train$best.lin.pred2 <- linpreds2.train.best -df.train$best.lin.pred3 <- linpreds3.train.best -df.train$best.lin.pred4 <- linpreds4.train.best - -df.test$best.lin.pred1 <- linpreds.best -df.test$best.lin.pred2 <- linpreds2.best -df.test$best.lin.pred3 <- linpreds3.best -df.test$best.lin.pred4 <- linpreds4.best - -mod1 <- coxph(Surv(time, cens) ~ best.lin.pred1, data = as.data.frame(df.train), x = TRUE) -cindex1 <- cindex(mod1, mod1$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod2 <- coxph(Surv(time, cens) ~ best.lin.pred2, data = as.data.frame(df.train), x = TRUE) -cindex2 <- cindex(mod2, mod2$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod3 <- coxph(Surv(time, cens) ~ best.lin.pred3, data = as.data.frame(df.train), x = TRUE) -cindex3 <- cindex(mod3, mod3$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod4 <- coxph(Surv(time, cens) ~ best.lin.pred4, data = as.data.frame(df.train), x = TRUE) -cindex4 <- cindex(mod4, mod4$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -all.cindices <- rbind(all.cindices, round(unlist(c(cindex1, cindex2, cindex3, cindex4, cindex5, cindex6)),3)) - -# Clean the train and test data for the next type of predictions - -df.train <- df[train,] -df.test <- df[-train,] - -############################################## -# -# Prediction and C index using mpm model -# -############################################## - - -linpreds.train.mpm <- predict(get.mpm.model(result1, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)), - df.train[,-(1:2)], link = function(x) x) - -linpreds.mpm <- predict(get.mpm.model(result1, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)),df.test[,-(1:2)], link = function(x) x) - - -linpreds2.train.mpm <- predict(get.mpm.model(result2, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)), - df.train[,-(1:2)], link = function(x) x) - -linpreds2.mpm <- predict(get.mpm.model(result2, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)),df.test[,-(1:2)], link = function(x) x) - -linpreds3.train.mpm <- predict(get.mpm.model(result3, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)), - df.train[,-(1:2)], link = function(x) x) -linpreds3.mpm <- predict(get.mpm.model(result3, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)),df.test[,-(1:2)], link = function(x) x) - - -linpreds4.train.mpm <- predict(get.mpm.model(result4, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)), - df.train[,-(1:2)], link = function(x) x) -linpreds4.mpm <- predict(get.mpm.model(result4, y = df.train$cens, x = df.train[, -c(1,2)],family = "custom", - loglik.pi = surv.pseudo.loglik,params = list(r = 0.5, time = time)),df.test[,-(1:2)], link = function(x) x) - - -df.train$mpm.lin.pred1 <- linpreds.train.mpm -df.train$mpm.lin.pred2 <- linpreds2.train.mpm -df.train$mpm.lin.pred3 <- linpreds3.train.mpm -df.train$mpm.lin.pred4 <- linpreds4.train.mpm - -df.test$mpm.lin.pred1 <- linpreds.mpm -df.test$mpm.lin.pred2 <- linpreds2.mpm -df.test$mpm.lin.pred3 <- linpreds3.mpm -df.test$mpm.lin.pred4 <- linpreds4.mpm - -mod1 <- coxph(Surv(time, cens) ~ mpm.lin.pred1, data = as.data.frame(df.train), x = TRUE) -cindex1 <- cindex(mod1, mod1$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod2 <- coxph(Surv(time, cens) ~ mpm.lin.pred2, data = as.data.frame(df.train), x = TRUE) -cindex2 <- cindex(mod2, mod2$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod3 <- coxph(Surv(time, cens) ~ mpm.lin.pred3, data = as.data.frame(df.train), x = TRUE) -cindex3 <- cindex(mod3, mod3$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - -mod4 <- coxph(Surv(time, cens) ~ mpm.lin.pred4, data = as.data.frame(df.train), x = TRUE) -cindex4 <- cindex(mod4, mod4$formula, data = as.data.frame(df.test), cens.model = 'cox')$AppCindex - - -all.cindices <- rbind(all.cindices, round(unlist(c(cindex1, cindex2, cindex3, cindex4, cindex5, cindex6)),3)) -rownames(all.cindices) = c("Model Averaging", "Best Model", "MPM") - -print(all.cindices) diff --git a/tests_current/Ex1_Sec3.R b/tests_current/Ex1_Sec3.R deleted file mode 100644 index c3c0cf1..0000000 --- a/tests_current/Ex1_Sec3.R +++ /dev/null @@ -1,213 +0,0 @@ -################################################# -# -# Example 1: -# -# Kepler Example with the most recent database update, only using fbms function -# -# This is the valid version for the JSS paper -# -################################################## - - -#install.packages("FBMS") -library(FBMS) - -data(exoplanet) - -train.indx <- 1:500 -df.train = exoplanet[train.indx, ] -df.test = exoplanet[-train.indx, ] - - -to3 <- function(x) x^3 -transforms <- c("sigmoid","sin_deg","exp_dbl","p0","troot","to3") - - -#################################################### -# -# single thread analysis (default values, Section 3.1) -# -#################################################### - - -set.seed(123) - -result.default <- fbms(formula = semimajoraxis ~ 1 + . , data = df.train, method = "gmjmcmc", transforms = transforms) - -get.best.model(result.default) - -#################################################### -# -# single thread analysis (more iterations, Section 3.2) -# -#################################################### - - -set.seed(123) - -result.P50 <- fbms(data = df.train, method = "gmjmcmc", transforms = transforms, - P = 50, N = 1000, N.final = 5000) - - -# coef -# add posterior modes best model sentence -coef(result_parallel) - -#for predictions rename mean to postmean same for postquantile - - - -#################################################### -# -# multiple thread analysis (Section 3.3) -# -#################################################### - -set.seed(123) - -result_parallel <- fbms(data = df.train, method = "gmjmcmc.parallel", transforms = transforms, - runs = 40, cores = parallel::detectCores()-1, P = 25) - - -#################################################### -# -# Inspection of Results (Section 3.4) -# -#################################################### - -###################### -# summary - -summary(result.default) -summary(result.default, pop = "all", labels = paste0("x",1:length(df.train[,-1]))) - - -summary(result.P50) -summary(result.P50, pop = "best", labels = paste0("x",1:length(df.train[,-1]))) -summary(result.P50, pop = "last", labels = paste0("x",1:length(df.train[,-1]))) -summary(result.P50, pop = "last", tol = 0.01, labels = paste0("x",1:length(df.train[,-1]))) - - -summary(result_parallel) -library(tictoc) -tic() -summary(result_parallel, tol = 0.01, pop = "all") -toc() - - - - -###################### -# plot - -pdf("result.pdf") -plot(result.default) -dev.off() - -plot(result.default) - - - -pdf("result.P50.pdf") -plot(result.P50) -dev.off() - -plot(result.P50) - - - -pdf("result_parallel.pdf") -plot(result_parallel) -dev.off() - -plot(result_parallel) - - -###################### -# Prediction - - -#preds <- predict(result.default, df.test[,-1], link = function(x) x) -preds <- predict(result.default, df.test[,-1]) -rmse.default <- sqrt(mean((predmean(preds) - df.test$semimajoraxis)^2)) - -pdf("prediction.pdf") -plot(predmean(preds), df.test$semimajoraxis) -dev.off() - -plot(predmean(preds), df.test$semimajoraxis) - - - - - - -############################### - - -#preds.P50 = predict(result.P50, df.test[,-1], link = function(x) x) -preds.P50 = predict(result.P50, df.test[,-1]) -rmse.P50 <- sqrt(mean((predmean(preds.P50) - df.test$semimajoraxis)^2)) - -pdf("prediction.P50.pdf") -plot(predmean(preds.P50), df.test$semimajoraxis) -dev.off() - -plot(predmean(preds.P50), df.test$semimajoraxis) - - - -############################### - - -preds.multi <- predict(result_parallel , df.test[,-1], link = function(x) x) -rmse.parallel <- sqrt(mean((predmean(preds.multi) - df.test$semimajoraxis)^2)) - -pdf("pred_parallel.pdf") -plot(predmean(preds.multi), df.test$semimajoraxis) -dev.off() - - -round(c(rmse.default, rmse.P50, rmse.parallel),2) - - -############################### - - -#Prediction based on the best model () or the MPM (Median Probability Model) - -get.best.model(result = result.default) -preds.best <- predict(get.best.model(result.default), df.test[, -1]) -sqrt(mean((preds.best - df.test$semimajoraxis)^2)) - -get.mpm.model(result = result.default, y = df.train$semimajoraxis, x = df.train[, -1]) -preds.mpm <- predict(get.mpm.model(result.default, y = df.train$semimajoraxis, x = df.train[, -1]), df.test[, -1]) -sqrt(mean((preds.mpm - df.test$semimajoraxis)^2)) - - -#################################################### -# -# Diagnostic plots (Section 3.5) -# -#################################################### - - -pdf("diagn_default.pdf") -diagn_plot(result.default, ylim = c(600,1500), FUN = max) -dev.off() -diagn_plot(result.default, ylim = c(600,1500), FUN = max) - - -pdf("diagn_long.pdf") -diagn_plot(result.P50, ylim = c(600,1500), FUN = max) -dev.off() -diagn_plot(result.P50, ylim = c(600,1500), FUN = max) - - -pdf("diagn_par.pdf") -diagn_plot(result_parallel, ylim = c(600,1500),FUN = max) -dev.off() - -diagn_plot(result_parallel, ylim = c(600,1500),FUN = max) - - diff --git a/tests_current/Ex2_Sec4_1.R b/tests_current/Ex2_Sec4_1.R deleted file mode 100644 index afc16b7..0000000 --- a/tests_current/Ex2_Sec4_1.R +++ /dev/null @@ -1,91 +0,0 @@ -####################################################### -# -# Example 2 (Section 4.1): -# -# Simulated data without any nonlinearities, only using fbms function -# -# This is the valid version for the JSS Paper -# -####################################################### - -#install.packages("FBMS") - -library(mvtnorm) -library(FBMS) - - - -n <- 100 # sample size -p <- 20 # number of covariates -p.vec <- 1:p - - -k <- 5 #size of the data generating model - - -correct.model <- 1:k -beta.k <- (1:5)/5 # Coefficents of the correct submodel - -beta <- c(rep(0, p)) -beta[correct.model] <- beta.k - -set.seed(123) - -x <- rmvnorm(n, rep(0, p)) -y <- x %*% beta + rnorm(n) -X <- as.matrix(x) - -y<-scale(y) -X<-scale(X)/sqrt(n) - - -df <- as.data.frame(cbind(y, X)) -colnames(df) <- c("Y", paste0("X", seq_len(ncol(df) - 1))) - -correct.model -beta.k - -######################################################## -# -# Models with non-linear effects (gmjmcmc) -# -# - -to3 <- function(x) x^3 -transforms <- c("sigmoid","sin_deg","exp_dbl","p0","troot","to3") - -set.seed(1) - result <- fbms(data = df, method = "gmjmcmc", transforms = transforms) - summary(result) - plot(result) - - -set.seed(2) - result2 <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - N = 1000, P = 40) - summary(result2, tol = 0.1) - plot(result) - - - -######################################################## -# -# Model which includes no non-linear effects (mjmcmc) -# -# - - # The default value of N = 1000 works relatively well here. - set.seed(1) - result.lindef <- fbms(data = df) - summary(result.lindef) - plot(result.lindef) - - # Check that this is actually the default - set.seed(1) - result.lin <- fbms(data = df, N = 1000) - summary(result.lin) - plot(result.lin) - - - - diff --git a/tests_current/Ex3_Sec4_2.R b/tests_current/Ex3_Sec4_2.R deleted file mode 100644 index f2f0b4f..0000000 --- a/tests_current/Ex3_Sec4_2.R +++ /dev/null @@ -1,84 +0,0 @@ -####################################################### -# -# Example 3 (Section 4.2): -# -# Simulated data with interactions, using only fbms -# -# This is the valid version for the JSS Paper -# -####################################################### - -library(mvtnorm) -library(FBMS) - -n <- 100 # sample size -p <- 20 # number of covariates - -# Model: -# X1: Pure Main effect -# X2 : X3: Pure interaction effect -# X4 * X5: Main effects plus interaction effect - - -set.seed(1003) - -x = rmvnorm(n, rep(0, p)) -X <- as.matrix(x) -X <- scale(X)/sqrt(n) - -y <- (1.2 * x[,1] + 1.5 * x[,2]* x[,3] - x[,4] + 1.1*x[,5] - 1.3 * x[,4]*x[,5])+ rnorm(n) -y<-scale(y) - -df <- data.frame(y = y, X) - - -transforms <- c("") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,0,0,1) #Include interactions and mutations - -#################################################### -# -# single thread analysis (two different runs) -# -#################################################### - -set.seed(123) -result <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - probs = probs) -summary(result) - - -set.seed(123) -result2 <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - N = 1000, probs = probs, P=40) -summary(result2, tol = 0.01) - - -#################################################### -# -# multiple thread analysis -# -#################################################### - - -set.seed(123) - - result_parallel <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - runs = 40, cores = 40, - probs = probs, P=25) - -summary(result_parallel, tol = 0.01) - - - -# Using longer more iterations of MJMCMC chains does not change results substantially -set.seed(123) - -result_parallel2 <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - runs = 40, cores = 10, N=1000, N.final=2000, - probs = probs, P=25) -summary(result_parallel2, tol = 0.01) - -#summary(result_parallel2, pop = "all", tol = 0.01) - - diff --git a/tests_current/Ex4_Sec4_3.R b/tests_current/Ex4_Sec4_3.R deleted file mode 100644 index 2ffdcb7..0000000 --- a/tests_current/Ex4_Sec4_3.R +++ /dev/null @@ -1,97 +0,0 @@ -####################################################### -# -# Example 4 (Section 4.3): -# -# Fractional Polynomials: Depths is set to 1, using only fbms -# -# This is the valid version for the JSS Paper -# -####################################################### - - -library(FBMS) - - -url <- "https://www.uniklinik-freiburg.de/fileadmin/mediapool/08_institute/biometrie-statistik/Dateien/Studium_und_Lehre/Lehrbuecher/Multivariable_Model-building/ART.zip" -temp_dir <- tempfile() -download.file(url, tf <- tempfile(fileext = ".zip"), mode = "wb") -unzip(tf, exdir = temp_dir) - -df <- read.csv(file.path(temp_dir, "ART/art", "art.csv"))[,c(16,1:3,5:8,10:14)] - -summary(df) - - -#number of observations in the data - -n = dim(df)[1] - -#number of covariates - -p = dim(df)[2] - 1 - - -set.seed(040590) - - -mu = 0.1 + p05(df$x1) + df$x1 + pm05(df$x3) + p0pm05(df$x3) + df$x4a + pm1(df$x5) + p0(df$x6) + df$x8 + df$x10 -df$y = rnorm(n =n, mean = mu,sd = 1) - - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(0,1,0,1) # Only modifications! -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$D <- 1 # Set depth of features to 1 - - -#################################################### -# -# single thread analysis -# -#################################################### - -set.seed(123) -result <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - probs = probs, params = params) -summary(result) - - - -#################################################### -# -# multiple thread analysis -# -#################################################### - -set.seed(101) -result_parallel <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - probs = probs, params = params, P=25,runs = 40, cores = 40) -summary(result_parallel, tol = 0.05) - -diagn_plot(result_parallel, FUN = median) - - - - -set.seed(102) - result_parallel2 <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - probs = probs, params = params, P=25, N=1000, N.final=2000, - runs = 40, cores = 40,) - -summary(result_parallel2, tol = 0.05) - -diagn_plot(result_parallel2,FUN = median) - - -########################## - -# Using Jeffreys-BIC prior - -set.seed(103) -result_parallel3 <- fbms(data = df, method = "gmjmcmc.parallel", beta_prior = list(type = "Jeffreys-BIC"), transforms = transforms, - probs = probs, params = params, P=25, N=1000, N.final=2000, - runs = 40, cores = 40,) - -summary(result_parallel3, tol = 0.05) - diff --git a/tests_current/Ex5_Sec4_3.R b/tests_current/Ex5_Sec4_3.R deleted file mode 100644 index 36942d3..0000000 --- a/tests_current/Ex5_Sec4_3.R +++ /dev/null @@ -1,97 +0,0 @@ -####################################################### -# -# Example 5 (Section 4.3): -# -# Fractional Polynomials: Depths is set to 1, using only fbms -# -# This is the valid version for the JSS Paper -# -####################################################### - - -library(FBMS) - -url <- "https://www.uniklinik-freiburg.de/fileadmin/mediapool/08_institute/biometrie-statistik/Dateien/Studium_und_Lehre/Lehrbuecher/Multivariable_Model-building/ART.zip" -temp_dir <- tempfile() -download.file(url, tf <- tempfile(fileext = ".zip"), mode = "wb") -unzip(tf, exdir = temp_dir) - -df <- read.csv(file.path(temp_dir, "ART/art", "art.csv"))[,c(16,1:3,5:8,10:14)] - -summary(df) - - -#number of observations in the data - -n = dim(df)[1] - -#number of covariates - -p = dim(df)[2] - 1 - - -set.seed(040590) - - -mu = 0.1 + p05(df$x1) + df$x1 + pm05(df$x3) + p0pm05(df$x3) + df$x4a + pm1(df$x5) + p0(df$x6) + df$x8 + df$x10 -df$y = rnorm(n =n, mean = mu,sd = 1) - - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(0,1,0,1) # Only modifications! -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$D <- 1 # Set depth of features to 1 - - -#################################################### -# -# single thread analysis -# -#################################################### - -set.seed(123) -result <- fbms(data = df, method = "gmjmcmc", transforms = transforms, - probs = probs, params = params) -summary(result) - - - -#################################################### -# -# multiple thread analysis -# -#################################################### - -set.seed(101) -result_parallel <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - probs = probs, params = params, P=25,runs = 40, cores = 40) -summary(result_parallel, tol = 0.01) - -diagn_plot(result_parallel, FUN = median) - - - - -set.seed(102) - result_parallel2 <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, - probs = probs, params = params, P=25, N=1000, N.final=2000, - runs = 40, cores = 40,) - -summary(result_parallel2, tol = 0.01) - -diagn_plot(result_parallel2,FUN = median) - - -# Very large number of mjmcmc iterations (not needed for paper) -set.seed(104) - -if (use.fbms) { - result_parallel3 <- fbms(data = df, method = "gmjmcmc.parallel", transforms = transforms, beta_prior = list(type = "Jeffreys-BIC"), - probs = probs, params = params, P=50, runs = 40, cores = 40, N.init=2000, N.final=4000) -} else { - result_parallel3 = gmjmcmc.parallel(runs = 40, cores = 40, x = df[, -1], y = df[, 1], transforms = transforms, mlpost_params = list(family = "gaussian", beta_prior = list(type = "Jeffreys-BIC")), - probs = probs, params = params, P=50, N.init=2000, N.final=4000) -} -#summary(result_parallel3, labels = names(df[-1])) -summary(result_parallel3, labels = names(df[-1]), tol = 0.01) \ No newline at end of file diff --git a/tests_current/Ex6_Sec5_1.R b/tests_current/Ex6_Sec5_1.R deleted file mode 100644 index 1781f63..0000000 --- a/tests_current/Ex6_Sec5_1.R +++ /dev/null @@ -1,118 +0,0 @@ -####################################################### -# -# Example 6 (Section 5.1): Sanger data again -# -# High dimensional analysis without nonlinearities, using only FBMS -# -# This is the valid version for the JSS Paper -# -####################################################### - -library(FBMS) -library(xtable) -library(tictoc) -run.parallel <- TRUE # Flag to control whether to run gmjmcmc in parallel or just load results - - -data(SangerData2) -df <- SangerData2 -# Rename columns for clarity: response is "y", predictors "x1", "x2", ..., "xp" -colnames(df) = c("y",paste0("x",1:(ncol(df)-1))) -n = dim(df)[1]; p=dim(df)[2]-1 - -#Use only linear terms and mutations -transforms = c("") -probs = gen.probs.gmjmcmc(transforms) -probs$gen = c(0,0,0,1) - - -# Select candidate features for the first MJMCMC round by correlation with response -c.vec = unlist(mclapply(2:ncol(df), function(x)abs(cor(df[,1],df[,x])))) -ids = sort(order(c.vec,decreasing=TRUE)[1:50]) - -# Generate default parameters for GMJMCMC for p-1 predictors -params = gen.params.gmjmcmc(p) -# Restrict feature pre-filtering to top 50 predictors selected by correlation -params$feat$prel.filter <- ids - -params$feat$pop.max <- 50 # Maximum population size for the GMJMCMC search - -#################################################### -# -# Three independent runs of gmjmcmc.parallel -# -#################################################### - - -if (run.parallel) { - set.seed(123) - result_parallel1 = fbms(data=df, transforms=transforms, - beta_prior = list(type="g-prior", g=max(n,p^2)), - probs=probs,params=params, - method="gmjmcmc.parallel", - P=50,N=1000,runs=10,cores=10) - save(result_parallel1,file="Ex6_parallel1_orig.RData") - - set.seed(1234) - result_parallel2=fbms(data=df, transforms=transforms, - beta_prior = list(type="g-prior", g=max(n,p^2)), - probs=probs,params=params, - method="gmjmcmc.parallel", - P=50,N=1000,runs=10,cores=10) - save(result_parallel2,file="Ex6_parallel2_orig.RData") - - set.seed(123456) - result_parallel3=fbms(data=df, transforms=transforms, - beta_prior = list(type="g-prior", g=max(n,p^2)), - probs=probs,params=params, - method="gmjmcmc.parallel", - P=50,N=1000,runs=10,cores=10) - save(result_parallel3,file="Ex6_parallel3_orig.RData") - -} else { - - # If not running gmjmcmc.parallel again, load previously saved results - load("Ex6_parallel1.RData") - load("Ex6_parallel2.RData") - load("Ex6_parallel3.RData") - -} - - -# Summarize results from each of the three parallel runs with tolerance of 0.01 - -res1 = summary(result_parallel1,tol=0.01) -res1$marg.probs = round(res1$marg.probs,3) -res2 = summary(result_parallel2,tol=0.01) -res2$marg.probs = round(res2$marg.probs,3) -res3 = summary(result_parallel3,tol=0.01) -res3$marg.probs = round(res3$marg.probs,3) - -# Combine unique feature names found in all three runs -names.best = unique(c(res1$feats.strings,res2$feats.strings,res3$feats.strings)) - -# Find maximum number of rows across summaries to equalize sizes for cbind -m = max(nrow(res1),nrow(res2),nrow(res3)) -# Pad shorter summaries with empty rows to make them all length m -while(nrow(res1) 1) { - x.model = x[,model] - data <- data.frame(y, x = x.model[,-1], dr = mlpost_params$dr) - - mm <- lmer(as.formula(paste0("y ~ 1 +",paste0(names(data)[2:(dim(data)[2]-1)],collapse = "+"), "+ (1 | dr)")), data = data, REML = FALSE) - } else{ #model without fixed effects - data <- data.frame(y, dr = mlpost_params$dr) - mm <- lmer(as.formula(paste0("y ~ 1 + (1 | dr)")), data = data, REML = FALSE) - } - - mloglik <- as.numeric(logLik(mm)) - 0.5*log(length(y)) * (dim(data)[2] - 2) #Laplace approximation for beta prior - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - - return(list(crit = mloglik + lp, coefs = fixef(mm))) -} - - -# function to estimate log posterior with INLA - -mixed.model.loglik.inla <- function (y, x, model, complex, mlpost_params) -{ - if(sum(model)>1) - { - data1 = data.frame(y, as.matrix(x[,model]), mlpost_params$dr) - formula1 = as.formula(paste0(names(data1)[1],"~",paste0(names(data1)[3:(dim(data1)[2]-1)],collapse = "+"),"+ f(mlpost_params.dr,model = \"iid\")")) - } else - { - data1 = data.frame(y, mlpost_params$dr) - formula1 = as.formula(paste0(names(data1)[1],"~","1 + f(mlpost_params.dr,model = \"iid\")")) - } - - #to make sure inla is not stuck - inla.setOption(inla.timeout=30) - inla.setOption(num.threads=mlpost_params$INLA.num.threads) - - mod<-NULL - #importance with error handling for unstable libraries that one does not trust 100% - tryCatch({ - mod <- inla(family = "gaussian",silent = 1L,safe = F, data = data1,formula = formula1) - }, error = function(e) { - - # Handle the error by setting result to NULL - mod <- NULL - - # You can also print a message or log the error if needed - cat("An error occurred:", conditionMessage(e), "\n") - }) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - if(length(mod)<3||length(mod$mlik[1])==0) { - return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) - } else { - mloglik <- mod$mlik[1] - return(list(crit = mloglik + lp, coefs = mod$summary.fixed$mode)) - } -} - - -# function to estimate log posterior with RTMB - -mixed.model.loglik.rtmb <- function (y, x, model, complex, mlpost_params) -{ - z = model.matrix(y~mlpost_params$dr) #Design matrix for random effect - - msize = sum(model) - #Set up and estimate model - dat = list(y = y, xm = x[,model], z = z) - par = list(logsd_eps = 0, - logsd_dr = 0, - beta = rep(0,msize), - u = rep(0,mlpost_params$nr_dr)) - - nll = function(par){ - getAll(par,dat) - sd_eps = exp(logsd_eps) - sd_dr = exp(logsd_dr) - - nll = 0 - #-log likelihood random effect - nll = nll - sum(dnorm(u, 0, sd_dr, log = TRUE)) - mu = as.vector(as.matrix(xm)%*%beta) + z%*%u - nll <- nll - sum(dnorm(y, mu, sd_eps, log = TRUE)) - - return(nll) - } - obj <- MakeADFun(nll , par, random = "u", silent = T ) - opt <- nlminb ( obj$par , obj$fn , obj$gr, control = list(iter.max = 10)) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - mloglik <- -opt$objective - 0.5*log(dim(x)[1])*msize - return(list(crit = mloglik + lp, coefs = opt$par[-(1:2)])) -} - - - -###################### -# -# Compare runtime -# - -set.seed(03052024) - -tic() -result1a <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) -time.lme4 = toc() - - -tic() -result1b <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), - INLA.num.threads = 10)) -time.inla = toc() - -tic() -result1c <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.rtmb, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), - nr_dr = sum((table(Zambia$dr))>0))) -time.rtmb = toc() -plot(result1c) -summary(result1c, labels = names(df)[-1]) - -c(time.lme4$callback_msg, time.inla$callback_msg, time.rtmb$callback_msg) - -###################### -# -# Analysis with lme4 -# -# - - -# Only this one is actually reported in the manuscript -set.seed(20062024) -params$feat$pop.max = 10 - -result2a <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2a,tol = 0.05,labels=names(df)[-1]) - - -set.seed(21062024) - -result2b <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 120, cores = 40, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2b, labels = names(df)[-1]) - -summary(result2b, labels = names(df)[-1], pop = "all") -summary(result2b, labels = names(df)[-1], pop = "last") - -plot(result2b) - - -set.seed(03072024) - -result2c <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 200, cores = 40, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2c, labels = names(df)[-1]) -summary(result2c, labels = names(df)[-1], pop = "last") -summary(result2c, labels = names(df)[-1], pop = "all") -summary(result2c, labels = names(df)[-1], pop = "best") - - -summary(result2a, labels = names(df)[-1]) -summary(result2b, labels = names(df)[-1]) -summary(result2c, labels = names(df)[-1]) - - -###################### -# -# Analysis with INLA (Not used for manuscript, very long runtime) -# -# - -set.seed(22052024) - -# Number of threads used by INLA set to 1 -result2aI <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 40, cores = 40, - family = "custom", loglik.pi = mixed.model.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), INLA.num.threads = 1)) - -plot(result2aI) -summary(result2aI, labels = names(df)[-1]) - - - -params$feat$check.col = F - -set.seed(20062024) -# Number of threads used by INLA set to 1 -result2bI <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 120, cores = 40, - family = "custom", loglik.pi = mixed.model.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), INLA.num.threads = 1)) - -plot(result2bI) -summary(result2bI, labels = names(df)[-1]) \ No newline at end of file diff --git a/tests_current/JSS_Script.R b/tests_current/JSS_Script.R deleted file mode 100644 index 6340913..0000000 --- a/tests_current/JSS_Script.R +++ /dev/null @@ -1,513 +0,0 @@ -################################################# -# -# Example 1 (Section 3): -# -# Kepler Example with the most recent database update -# -# Basic introduction of the FBMS package -# -################################################## - - -#install.packages("FBMS") -#install.packages("devtools") - -library(devtools) -install_github("jonlachmann/FBMS", force=T, build_vignettes=F) - -library(FBMS) - -data(exoplanet) - -train.indx <- 1:500 -df.train = exoplanet[train.indx, ] -df.test = exoplanet[-train.indx, ] - - -to3 <- function(x) x^3 -transforms <- c("sigmoid","sin_deg","exp_dbl","p0","troot","to3") - - -#################################################### -# -# single thread analysis (default values, Section 3.1) -# -#################################################### - - -set.seed(123) - -result.default <- fbms(formula = semimajoraxis ~ 1 + . , data = df.train, - method = "gmjmcmc", transforms = transforms) - - -#################################################### -# -# Choosing a different prior (more iterations, Section 3.3) -# -#################################################### - - -set.seed(234) - -result.BIC <- fbms(formula = semimajoraxis ~ 1 + . , data = df.train, - method = "gmjmcmc", transforms = transforms, - beta_prior = list(type = "Jeffreys-BIC", Var = "unknown")) - - -set.seed(345) - -result.EB <- fbms(formula = semimajoraxis ~ 1 + . , data = df.train, - method = "gmjmcmc", transforms = transforms, - beta_prior = list(type = "EB-global", a = 1)) - -#################################################### -# -# single thread analysis (more iterations, Section 3.4) -# -#################################################### - - -set.seed(123) - -result.P50 <- fbms(data = df.train, method = "gmjmcmc", transforms = transforms, - P = 50, N = 1000, N.final = 5000) - - -#################################################### -# -# multiple thread analysis (Section 3.5) -# -#################################################### - -set.seed(123) - -result_parallel <- fbms(data = df.train, method = "gmjmcmc.parallel", transforms = transforms, - runs = 40, cores = parallel::detectCores()-1, P = 25) - - -#################################################### -# -# Inspection of Results (Section 3.6) -# -#################################################### - -###################### -# summary - -summary(result.default) -summary(result.default, pop = "all", labels = paste0("x",1:length(df.train[,-1]))) - - -summary(result.P50) -summary(result.P50, pop = "best", labels = paste0("x",1:length(df.train[,-1]))) -summary(result.P50, pop = "last", labels = paste0("x",1:length(df.train[,-1]))) -summary(result.P50, pop = "last", tol = 0.01, labels = paste0("x",1:length(df.train[,-1]))) -summary(result.P50, pop = "all") - -summary(result_parallel) -library(tictoc) -tic() -summary(result_parallel, tol = 0.01, pop = "all",data = df.train) -toc() - - - - -###################### -# plot - -pdf("result.pdf") -plot(result.default) -dev.off() - -plot(result.default) - - - -pdf("result.P50.pdf") -plot(result.P50) -dev.off() - -plot(result.P50) - - - -pdf("result_parallel.pdf") -plot(result_parallel) -dev.off() - -plot(result_parallel) - - -#################################################### -# -# Prediction (Section 3.7) -# -#################################################### - - -#preds <- predict(result.default, df.test[,-1], link = function(x) x) -preds <- predict(result.default, df.test[,-1]) - -str(aggr(preds)) - - - -rmse.default <- sqrt(mean((predmean(preds) - df.test$semimajoraxis)^2)) - -pdf("prediction.pdf") -plot(predmean(preds), df.test$semimajoraxis) -dev.off() - -plot(predmean(preds), df.test$semimajoraxis) - - - - - - -############################### - - -#preds.P50 = predict(result.P50, df.test[,-1], link = function(x) x) -preds.P50 = predict(result.P50, df.test[,-1]) -rmse.P50 <- sqrt(mean((predmean(preds.P50) - df.test$semimajoraxis)^2)) - -pdf("prediction.P50.pdf") -plot(predmean(preds.P50), df.test$semimajoraxis) -dev.off() - -plot(predmean(preds.P50), df.test$semimajoraxis) - - - -############################### - - -preds.multi <- predict(result_parallel , df.test[,-1], link = function(x) x) -rmse.parallel <- sqrt(mean((predmean(preds.multi) - df.test$semimajoraxis)^2)) - -pdf("pred_parallel.pdf") -plot(predmean(preds.multi), df.test$semimajoraxis) -dev.off() - - -round(c(rmse.default, rmse.P50, rmse.parallel),2) - - -############################### - - -#Prediction based on the best model () or the MPM (Median Probability Model) - -get.best.model(result = result.default) -preds.best <- predict(get.best.model(result.default), df.test[, -1]) -sqrt(mean((preds.best - df.test$semimajoraxis)^2)) - -get.mpm.model(result = result.default, y = df.train$semimajoraxis, x = df.train[, -1]) -preds.mpm <- predict(get.mpm.model(result.default, y = df.train$semimajoraxis, x = df.train[, -1]), df.test[, -1]) -sqrt(mean((preds.mpm - df.test$semimajoraxis)^2)) - - - -get.best.model(result = result_parallel) -preds.best_parallel <- predict(get.best.model(result_parallel), df.test[, -1]) -sqrt(mean((preds.best_parallel - df.test$semimajoraxis)^2)) - - - - -# Coefficients of the best model - -coef(result.default) -coef(result.P50) -coef(result_parallel) - - -#################################################### -# -# Diagnostic plots (Section 3.8) -# -#################################################### - - -pdf("diagn_default.pdf") -diagn_plot(result.default, ylim = c(600,1500), FUN = max) -dev.off() -diagn_plot(result.default, ylim = c(600,1500), FUN = max) - - -pdf("diagn_long.pdf") -diagn_plot(result.P50, ylim = c(600,1500), FUN = max) -dev.off() -diagn_plot(result.P50, ylim = c(600,1500), FUN = max) - - -pdf("diagn_par.pdf") -diagn_plot(result_parallel, ylim = c(600,1500),FUN = max) -dev.off() - -diagn_plot(result_parallel, ylim = c(600,1500),FUN = max) - - - -####################################################### -# -# Example 2 (Section 4): -# -# Zambia data set from the cAIC4 package -# -# Linear Mixed Model with Fractional Polynomials and other non-linear features -# -# Custom function to compute Marginal Likelihood with lme4, INLA and RTMB -# -####################################################### - -rm(list = ls()) - -library(FBMS) - -#install.packages("INLA",repos=c(getOption("repos"),INLA="https://inla.r-inla-download.org/R/stable"), dep=TRUE) -#options(repos=c( inlabruorg = "https://inlabru-org.r-universe.dev", INLA = "https://inla.r-inla-download.org/R/testing", CRAN = "https://cran.rstudio.com") ) -#install.packages("fmesher") - -library(INLA) - -library(tictoc) -library(lme4) -library(RTMB) - -#install.packages("cAIC4") -library(cAIC4) - -data(Zambia, package = "cAIC4") -df <- as.data.frame(sapply(Zambia[1:5],scale)) - - -transforms <- c("p0","p2","p3","p05","pm05","pm1","pm2","p0p0","p0p05","p0p1","p0p2","p0p3","p0p05","p0pm05","p0pm1","p0pm2") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,1,0,1) # Modifications and interactions! - -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$D <- 1 # Set depth of features to 1 (still allows for interactions) -params$feat$pop.max = 10 - - -# function to estimate log posterior with lme4 - -mixed.model.loglik.lme4 <- function (y, x, model, complex, mlpost_params) -{ - - # logarithm of marginal likelihood (Laplace approximation) - if (sum(model) > 1) { - x.model = x[,model] - data <- data.frame(y, x = x.model[,-1], dr = mlpost_params$dr) - - mm <- lmer(as.formula(paste0("y ~ 1 +",paste0(names(data)[2:(dim(data)[2]-1)],collapse = "+"), "+ (1 | dr)")), data = data, REML = FALSE) - } else{ #model without fixed effects - data <- data.frame(y, dr = mlpost_params$dr) - mm <- lmer(as.formula(paste0("y ~ 1 + (1 | dr)")), data = data, REML = FALSE) - } - - mloglik <- as.numeric(logLik(mm)) - 0.5*log(length(y)) * (dim(data)[2] - 2) #Laplace approximation for beta prior - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - - return(list(crit = mloglik + lp, coefs = fixef(mm))) -} - - -# function to estimate log posterior with INLA - -mixed.model.loglik.inla <- function (y, x, model, complex, mlpost_params) -{ - if(sum(model)>1) - { - data1 = data.frame(y, as.matrix(x[,model]), mlpost_params$dr) - formula1 = as.formula(paste0(names(data1)[1],"~",paste0(names(data1)[3:(dim(data1)[2]-1)],collapse = "+"),"+ f(mlpost_params.dr,model = \"iid\")")) - } else - { - data1 = data.frame(y, mlpost_params$dr) - formula1 = as.formula(paste0(names(data1)[1],"~","1 + f(mlpost_params.dr,model = \"iid\")")) - } - - #to make sure inla is not stuck - inla.setOption(inla.timeout=30) - inla.setOption(num.threads=mlpost_params$INLA.num.threads) - - mod<-NULL - #importance with error handling for unstable libraries that one does not trust 100% - tryCatch({ - mod <- inla(family = "gaussian",silent = 1L,safe = F, data = data1,formula = formula1) - }, error = function(e) { - - # Handle the error by setting result to NULL - mod <- NULL - - # You can also print a message or log the error if needed - cat("An error occurred:", conditionMessage(e), "\n") - }) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - if(length(mod)<3||length(mod$mlik[1])==0) { - return(list(crit = -10000 + lp,coefs = rep(0,dim(data1)[2]-2))) - } else { - mloglik <- mod$mlik[1] - return(list(crit = mloglik + lp, coefs = mod$summary.fixed$mode)) - } -} - - -# function to estimate log posterior with RTMB - -mixed.model.loglik.rtmb <- function (y, x, model, complex, mlpost_params) -{ - z = model.matrix(y~mlpost_params$dr) #Design matrix for random effect - - msize = sum(model) - #Set up and estimate model - dat = list(y = y, xm = x[,model], z = z) - par = list(logsd_eps = 0, - logsd_dr = 0, - beta = rep(0,msize), - u = rep(0,mlpost_params$nr_dr)) - - nll = function(par){ - getAll(par,dat) - sd_eps = exp(logsd_eps) - sd_dr = exp(logsd_dr) - - nll = 0 - #-log likelihood random effect - nll = nll - sum(dnorm(u, 0, sd_dr, log = TRUE)) - mu = as.vector(as.matrix(xm)%*%beta) + z%*%u - nll <- nll - sum(dnorm(y, mu, sd_eps, log = TRUE)) - - return(nll) - } - obj <- MakeADFun(nll , par, random = "u", silent = T ) - opt <- nlminb ( obj$par , obj$fn , obj$gr, control = list(iter.max = 10)) - - # logarithm of model prior - if (length(mlpost_params$r) == 0) mlpost_params$r <- 1/dim(x)[1] # default value or parameter r - lp <- log_prior(mlpost_params, complex) - - mloglik <- -opt$objective - 0.5*log(dim(x)[1])*msize - return(list(crit = mloglik + lp, coefs = opt$par[-(1:2)])) -} - - - -###################### -# -# Compare runtime -# - -set.seed(03052024) - -tic() -result1a <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) -time.lme4 = toc() - - -tic() -result1b <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.inla, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), - INLA.num.threads = 10)) -time.inla = toc() - -tic() -result1c <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - method = "gmjmcmc",probs = probs, params = params, P=3, N = 30, - family = "custom", loglik.pi = mixed.model.loglik.rtmb, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr), - nr_dr = sum((table(Zambia$dr))>0))) -time.rtmb = toc() - - -c(time.lme4$callback_msg, time.inla$callback_msg, time.rtmb$callback_msg) - - - -####################################################### -# -# Serious analysis with lme4 -# -# - - -# Analysis without non-linear features - - -result2a <- fbms(formula = z ~ 1+., data = df, N = 5000, - method = "mjmcmc.parallel", runs = 40, cores = parallel::detectCores()-1, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2a, labels = names(df)[-1]) - -plot(result2a) - - - -# Analysis with fractional polynomials -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(1,1,0,1) # Modifications and interactions! - -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$D <- 1 # Set depth of features to 1 (still allows for interactions) -params$feat$pop.max = 10 - -result2b <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 40, cores = parallel::detectCores()-1, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2b,tol = 0.05,labels=names(df)[-1]) - - - -# Analysis with non-linear projections -transforms <- c("sigmoid") -probs <- gen.probs.gmjmcmc(transforms) -probs$gen <- c(0,0,1,1) - -params <- gen.params.gmjmcmc(ncol(df) - 1) -params$feat$pop.max = 10 - - -result2c <- fbms(formula = z ~ 1+., data = df, transforms = transforms, - probs = probs, params = params, P=25, N = 100, - method = "gmjmcmc.parallel", runs = 40, cores = parallel::detectCores()-1, - family = "custom", loglik.pi = mixed.model.loglik.lme4, - model_prior = list(r = 1/dim(df)[1]), - extra_params = list(dr = droplevels(Zambia$dr))) - -summary(result2c,tol = 0.05,labels=names(df)[-1]) - - -# Comparison of results - -summary(result2a, labels = names(df)[-1]) -summary(result2b, labels = names(df)[-1]) -summary(result2c, labels = names(df)[-1]) - -