--- title: "BayesianLasso" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{BayesianLasso} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, message=FALSE, warning=FALSE} library(BayesianLasso) helpers_path <- system.file("helpers", package = "BayesianLasso") helper_files <- list.files(helpers_path, pattern = "\\.R$", full.names = TRUE) invisible(lapply(helper_files, source)) ``` ## Introduction The \pkg{BayesianLasso} package provides efficient sampling algorithms for Bayesian Lasso regression, modified Hans and PC samplers. The modified Hans sampler is based on a newly defined Lasso distribution. This vignette introduces the main functionality of the package and demonstrates how to apply the samplers to real and simulated data. ```{r} effective_sample_size <- function(samples) { if (!requireNamespace("posterior", quietly = TRUE)) { message( "Package 'posterior' is required to compute effective sample size (ESS).\n", "Install it with install.packages('posterior') to enable ESS diagnostics." ) return(NA_real_) } as.numeric(posterior::ess_bulk(as.numeric(samples))) } ``` ## Dataset selection Select a dataset name for dataset_name. ```{r dataset_name} datasets_ngtp <- c("simulated", "diabetes2", "Kakadu2", "Crime") datasets_pgtn <- c("cookie", "eyedata") dataset_name <- datasets_ngtp[1] ``` ## Simulated Example ```{r} if(dataset_name == datasets_ngtp[1]){ # Simulate data set.seed(123) Ns <- 2000 ns <- 100 ps <- 10 X <- matrix(rnorm(ns * ps), nrow = ns) beta <- c(rep(2, 3), rep(0, ps - 3)) y <- X %*% beta + rnorm(ns) vtime_val_Hans = c() results_Hans <- NULL # Run the modified Hans sampler for(i in 1:5){ time_val <- system.time({ res_Hans <- Modified_Hans_Gibbs( X = X, y = y, beta_init= rep(1,10), a1=2, b1=1, u1=2, v1=1, nsamples=Ns, lambda_init=1, sigma2_init=1, verbose=100, tune_lambda2 = TRUE, rao_blackwellization = FALSE) })[3] vtime_val_Hans[i] <- time_val # Initialize accumulators after first run if (is.null(results_Hans)) { results_Hans <- list( mBeta = res_Hans$mBeta, vsigma2 = res_Hans$vsigma2, vlambda2 = res_Hans$vlambda2 ) } else { results_Hans$mBeta <- results_Hans$mBeta + res_Hans$mBeta results_Hans$vsigma2 <- results_Hans$vsigma2 + res_Hans$vsigma2 results_Hans$vlambda2 <- results_Hans$vlambda2 + res_Hans$vlambda2 } } # Take averages mBeta = (results_Hans$mBeta)/5 vsigma2 = (results_Hans$vsigma2)/5 vlambda2 = (results_Hans$vlambda2)/5 time_val_Hans = mean(vtime_val_Hans) vESS <- c() for(j in 1:ps) { vESS[j] <- effective_sample_size(results_Hans$mBeta[200:Ns,j]) } Ef_Hans = median(vESS)/time_val_Hans ESS_sigma2_Hans = effective_sample_size(as.vector(results_Hans$vsigma2[200:Ns])) Ef_sigma2_Hans = ESS_sigma2_Hans/time_val_Hans ESS_lambda2_Hans = effective_sample_size(as.vector(results_Hans$vlambda2[200:Ns])) Ef_lambda2_Hans = ESS_lambda2_Hans/time_val_Hans stat_vec_Hans = c( 100*median(vESS)/Ns, Ef_Hans, 100*ESS_sigma2_Hans/Ns, Ef_sigma2_Hans, 100*ESS_lambda2_Hans/Ns, Ef_lambda2_Hans, time_val_Hans) name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time") names(stat_vec_Hans) = name_vals # ======================== PC sampler ======================================= # Run the modified PC sampler vtime_val_PC = c() results_PC <- NULL for(i in 1:5){ time_val <- system.time({ res_PC <- Modified_PC_Gibbs( X = X, y = y, a1=2, b1=1, u1=2, v1=1, nsamples=Ns, lambda_init=1, sigma2_init=1, verbose=100) })[3] vtime_val_PC[i] <- time_val # Initialize accumulators after first run if (is.null(results_PC)) { results_PC <- list( mBeta = res_PC$mBeta, vsigma2 = res_PC$vsigma2, vlambda2 = res_PC$vlambda2 ) } else { results_PC$mBeta <- results_PC$mBeta + res_PC$mBeta results_PC$vsigma2 <- results_PC$vsigma2 + res_PC$vsigma2 results_PC$vlambda2 <- results_PC$vlambda2 + res_PC$vlambda2 } } # Take averages mBeta = (results_PC$mBeta)/5 vsigma2 = (results_PC$vsigma2)/5 vlambda2 = (results_PC$vlambda2)/5 time_val_PC = mean(vtime_val_PC) # colMeans(results_PC$mBeta) vESS <- c() for(j in 1:ps) { vESS[j] <- effective_sample_size(results_PC$mBeta[200:Ns,j]) } Ef_PC = median(vESS)/time_val_PC ESS_sigma2_PC = effective_sample_size(as.vector(results_PC$vsigma2[200:Ns])) Ef_sigma2_PC = ESS_sigma2_PC/time_val_PC ESS_lambda2_PC = effective_sample_size(as.vector(results_PC$vlambda2[200:Ns])) Ef_lambda2_PC = ESS_lambda2_PC/time_val_PC stat_vec_PC = c( 100*median(vESS)/Ns, Ef_PC, 100*ESS_sigma2_PC/Ns, Ef_sigma2_PC, 100*ESS_lambda2_PC/Ns, Ef_lambda2_PC, time_val_PC) name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time") names(stat_vec_PC) = name_vals } ``` ## Data Sources This vignette demonstrates the usage of the `BayesianLasso` package on several well-known datasets. To avoid redistribution issues and keep the package lightweight, none of these datasets are included directly in the package. Instead, we provide guidance on how to access them. - **Diabetes**: Available in the \CRANpkg{lars} package and loaded using `data(diabetes, package = "lars")`. - **Kakadu**: Available in the \CRANpkg{Ecdat} package and loaded using `data(Kakadu, package = "Ecdat")`. - **Communities and Crime**: This dataset is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/183/communities+and+crime). Due to licensing and redistribution considerations, the *Communities and Crime* dataset is not included in this package. Users can download the dataset manually by visiting the repository and following the preprocessing steps outlined in this vignette. ## Reproducing Table 1: Performance Comparison This section demonstrates how to reproduce the performance comparison table from the manuscript, comparing the mixing percentages, sampling efficiencies, and runtimes for the Diabetes, Kakadu, and Communities and Crime datasets. ```{r generate-table-1} # Load libraries library(BayesianLasso) # Example: dataset_name <- "diabetes2" if (dataset_name == datasets_ngtp[2]) { if (!requireNamespace("lars", quietly = TRUE)) { message("Package 'lars' is required for the diabetes example in this vignette.") }else{ data("diabetes", package = "lars", envir = environment()) y <- diabetes$y x <- diabetes$x inds <- seq_len(ncol(x)) # Normalizing and scaling the dataset by function normalize() norm <- normalize(y, x, scale = TRUE) x <- norm$mX x <- model.matrix(~ .^2, data = data.frame(x = x))[ , -1] y <- norm$vy } } if (dataset_name == datasets_ngtp[3]) { if (!requireNamespace("Ecdat", quietly = TRUE)) { message("Package 'Ecdat' is required for the Kakadu example in this vignette.") }else{ data("Kakadu", package = "Ecdat", envir = environment()) # Get y vector and X matrix y <- as.vector(Kakadu$income) x <- Kakadu[, c(1:20, 22)] x <- model.matrix(~ .^2, data = x)[ , -1] } } # Make sure Crime.csv (or comData.Rdata) is downloaded manually before running this if (dataset_name == datasets_ngtp[4]) { rdata_path <- system.file("extdata", "comData.Rdata", package = "BayesianLasso") if (nzchar(rdata_path) && file.exists(rdata_path)) { load(rdata_path) }else { stop("File 'comData.Rdata' not found in package extdata.") } # ---- Remove rows with NA while keeping X and Y aligned ---- datXY <- na.omit(cbind(as.data.frame(X), as.data.frame(Y))) X2 <- as.matrix(datXY[, colnames(X), drop = FALSE]) Y2 <- as.matrix(datXY[, colnames(Y), drop = FALSE]) # ---- Drop unwanted columns safely ---- drop_cols <- c("ownHousQrange", "rentUpperQ") X2 <- X2[, !colnames(X2) %in% drop_cols, drop = FALSE] # ---- Define regression inputs ---- x <- X2 y <- as.vector(Y2[, "murders"]) varnames <- colnames(x) inds <- seq_len(ncol(x)) } # Set prior hyperparameter constants a1 = 1.0E-2 # Prior shape for sigma2 b1 = 1.0E-2 # Prior scale for sigma2 u1 = 1.0E-2 # Prior shape for lambda2 v1 = 1.0E-2 # Prior scale for lambda2 # Initial values for lambda2 and sigma2 lambda2_init = 10 lambda_init = sqrt(lambda2_init) sigma2_init = 1 # Number of samples to run the MCMC nburn = 1000 nsamples = 5000 inds_use = (nburn + 1):nsamples N = length(inds_use) if(dataset_name != datasets_ngtp[1]){ # To store elapsed time and results of the PC sampler vtime_val_PC = c() results_PC <- NULL # to be initialized after the first run # Running the modified PC sampler 5 times and taking the average of the results across runs. for (i in 1:5) { time_val <- system.time({ res_PC <- Modified_PC_Gibbs( X = x, y = y, a1, b1, u1, v1, nsamples, lambda_init = lambda_init, sigma2_init = sigma2_init, verbose = 1000 ) })[3] vtime_val_PC[i] <- time_val # Initialize accumulators after first run if (is.null(results_PC)) { results_PC <- list( mBeta = res_PC$mBeta, vsigma2 = res_PC$vsigma2, vlambda2 = res_PC$vlambda2 ) } else { results_PC$mBeta <- results_PC$mBeta + res_PC$mBeta results_PC$vsigma2 <- results_PC$vsigma2 + res_PC$vsigma2 results_PC$vlambda2 <- results_PC$vlambda2 + res_PC$vlambda2 } } # Take averages mBeta = (results_PC$mBeta)/5 vsigma2 = (results_PC$vsigma2)/5 vlambda2 = (results_PC$vlambda2)/5 time_val_PC = mean(vtime_val_PC) # Compute effective sample sizes vESS <- c() p <- ncol(x) for(j in 1:p) { vESS[j] <- effective_sample_size(mBeta[inds_use,j]) } Ef_PC = median(vESS)/time_val_PC ESS_sigma2_PC = effective_sample_size(as.vector(vsigma2[inds_use])) Ef_sigma2_PC = ESS_sigma2_PC/time_val_PC ESS_lambda2_PC = effective_sample_size(as.vector(vlambda2[inds_use])) Ef_lambda2_PC = ESS_lambda2_PC/time_val_PC stat_vec_PC = c( 100*median(vESS)/N, Ef_PC, 100*ESS_sigma2_PC/N, Ef_sigma2_PC, 100*ESS_lambda2_PC/N, Ef_lambda2_PC, time_val_PC) name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time") names(stat_vec_PC) = name_vals # To store elapsed time and results of the Hans sampler vtime_val_Hans = c() results_Hans <- NULL # to be initialized after the first run # Running the modified Hans sampler 5 times and taking the average of the results across runs. for (i in 1:5) { time_val <- system.time({ res_Hans <- Modified_Hans_Gibbs( X = x, y = y, beta_init = as.vector(colMeans(mBeta)), a1, b1, u1, v1, nsamples, lambda_init = lambda_init, sigma2_init = sigma2_init, verbose = 1000, tune_lambda2 = TRUE, rao_blackwellization = FALSE ) })[3] vtime_val_Hans[i] <- time_val # Initialize accumulators after first run if (is.null(results_Hans)) { results_Hans <- list( mBeta = res_Hans$mBeta, vsigma2 = res_Hans$vsigma2, vlambda2 = res_Hans$vlambda2 ) } else { results_Hans$mBeta <- results_Hans$mBeta + res_Hans$mBeta results_Hans$vsigma2 <- results_Hans$vsigma2 + res_Hans$vsigma2 results_Hans$vlambda2 <- results_Hans$vlambda2 + res_Hans$vlambda2 } } # Take averages mBeta = (results_Hans$mBeta)/5 vsigma2 = (results_Hans$vsigma2)/5 vlambda2 = (results_Hans$vlambda2)/5 time_val_Hans = mean(vtime_val_Hans) # Compute effective sample sizes vESS <- c() p <- ncol(x) for(j in 1:p) { vESS[j] <- effective_sample_size(mBeta[inds_use,j]) } Ef_Hans = median(vESS)/time_val_Hans ESS_sigma2_Hans = effective_sample_size(as.vector(vsigma2[inds_use])) Ef_sigma2_Hans = ESS_sigma2_Hans/time_val_Hans ESS_lambda2_Hans = effective_sample_size(as.vector(vlambda2[inds_use])) Ef_lambda2_Hans = ESS_lambda2_Hans/time_val_Hans stat_vec_Hans = c( 100*median(vESS)/N, Ef_Hans, 100*ESS_sigma2_Hans/N, Ef_sigma2_Hans, 100*ESS_lambda2_Hans/N, Ef_lambda2_Hans, time_val_Hans) name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time") names(stat_vec_Hans) = name_vals if(dataset_name != datasets_ngtp[3]){ res_monomvn <- NULL if (requireNamespace("monomvn", quietly = TRUE)) { res_monomvn = benchmark_blasso_monomvn(y, x, lambda_init, sigma2_init, a1, b1, u1, v1, nburn, nsamples, trials=5, beta_inds=NA) stat_vec_monomvn <- colMeans(res_monomvn$mStat) }else { message("Skipping monomvn benchmark: package 'monomvn' is not installed.") } } # ======================= Run bayeslm ============================ ## Note that package bayeslm is not on CRAN anymore res_bayeslm <- NULL if (requireNamespace("bayeslm", quietly = TRUE)) { res_bayeslm = benchmark_blasso_bayeslm(y, x, lambda_init, sigma2_init, a1, b1, u1, v1, nburn, nsamples, trials=5, beta_inds=NA) stat_vec_bayeslm <- colMeans(res_bayeslm$mStat) }else { message("Skipping bayeslm benchmark: package 'bayeslm' is not installed.") } # ======================= Run rstan ============================ res_rstan <- NULL if (requireNamespace("rstan", quietly = TRUE)) { res_rstan = benchmark_blasso_rstan(y, x, lambda_init, sigma2_init, a1, b1, u1, v1, nburn, nsamples, trials=5, beta_inds=NA) stat_vec_rstan <- colMeans(res_rstan$mStat) }else { message("Skipping rstan benchmark: package 'rstan' is not installed.") } # ======================= Run bayesreg ============================ res_bayesreg <- NULL if (requireNamespace("bayesreg", quietly = TRUE)) { res_bayesreg = benchmark_blasso_bayesreg(y, x, lambda_init, sigma2_init, a1, b1, u1, v1, nburn, nsamples, trials=5, beta_inds=NA) stat_vec_bayesreg <- colMeans(res_bayesreg$mStat) }else { message("Skipping bayesreg benchmark: package 'bayesreg' is not installed.") } } ``` ## Convergence diagnostics Assessing convergence of the MCMC chains is an important step in Bayesian analysis. The package provides helper functions such as `mcmc_stats()` and `mcmc_diagnostics()` to summarize mixing behavior and visualize the behavior of the chains. The function `mcmc_stats()` computes numerical summaries of the MCMC output, while `mcmc_diagnostics()` produces basic diagnostic plots such as trace and autocorrelation plots for selected parameters. These tools allow users to assess mixing and stability of the chains in practice. Posterior draws can also be passed to packages such as `coda`, `posterior`, or `bayesplot` for additional diagnostics.