---
title: "BayesianLasso"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{BayesianLasso}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```
```{r setup, message=FALSE, warning=FALSE}
library(BayesianLasso)
helpers_path <- system.file("helpers", package = "BayesianLasso")
helper_files <- list.files(helpers_path, pattern = "\\.R$", full.names = TRUE)
invisible(lapply(helper_files, source))
```
## Introduction
The \pkg{BayesianLasso} package provides efficient sampling algorithms for Bayesian Lasso regression, modified Hans and PC samplers. The modified Hans sampler is based on a newly defined Lasso distribution.
This vignette introduces the main functionality of the package and demonstrates how to apply the samplers to real and simulated data.
```{r}
effective_sample_size <- function(samples) {
if (!requireNamespace("posterior", quietly = TRUE)) {
message(
"Package 'posterior' is required to compute effective sample size (ESS).\n",
"Install it with install.packages('posterior') to enable ESS diagnostics."
)
return(NA_real_)
}
as.numeric(posterior::ess_bulk(as.numeric(samples)))
}
```
## Dataset selection
Select a dataset name for dataset_name.
```{r dataset_name}
datasets_ngtp <- c("simulated", "diabetes2", "Kakadu2", "Crime")
datasets_pgtn <- c("cookie", "eyedata")
dataset_name <- datasets_ngtp[1]
```
## Simulated Example
```{r}
if(dataset_name == datasets_ngtp[1]){
# Simulate data
set.seed(123)
Ns <- 2000
ns <- 100
ps <- 10
X <- matrix(rnorm(ns * ps), nrow = ns)
beta <- c(rep(2, 3), rep(0, ps - 3))
y <- X %*% beta + rnorm(ns)
vtime_val_Hans = c()
results_Hans <- NULL
# Run the modified Hans sampler
for(i in 1:5){
time_val <- system.time({
res_Hans <- Modified_Hans_Gibbs(
X = X, y = y, beta_init= rep(1,10), a1=2, b1=1, u1=2, v1=1,
nsamples=Ns, lambda_init=1, sigma2_init=1, verbose=100, tune_lambda2 = TRUE,
rao_blackwellization = FALSE)
})[3]
vtime_val_Hans[i] <- time_val
# Initialize accumulators after first run
if (is.null(results_Hans)) {
results_Hans <- list(
mBeta = res_Hans$mBeta,
vsigma2 = res_Hans$vsigma2,
vlambda2 = res_Hans$vlambda2
)
} else {
results_Hans$mBeta <- results_Hans$mBeta + res_Hans$mBeta
results_Hans$vsigma2 <- results_Hans$vsigma2 + res_Hans$vsigma2
results_Hans$vlambda2 <- results_Hans$vlambda2 + res_Hans$vlambda2
}
}
# Take averages
mBeta = (results_Hans$mBeta)/5
vsigma2 = (results_Hans$vsigma2)/5
vlambda2 = (results_Hans$vlambda2)/5
time_val_Hans = mean(vtime_val_Hans)
vESS <- c()
for(j in 1:ps) {
vESS[j] <- effective_sample_size(results_Hans$mBeta[200:Ns,j])
}
Ef_Hans = median(vESS)/time_val_Hans
ESS_sigma2_Hans = effective_sample_size(as.vector(results_Hans$vsigma2[200:Ns]))
Ef_sigma2_Hans = ESS_sigma2_Hans/time_val_Hans
ESS_lambda2_Hans = effective_sample_size(as.vector(results_Hans$vlambda2[200:Ns]))
Ef_lambda2_Hans = ESS_lambda2_Hans/time_val_Hans
stat_vec_Hans = c(
100*median(vESS)/Ns,
Ef_Hans,
100*ESS_sigma2_Hans/Ns,
Ef_sigma2_Hans,
100*ESS_lambda2_Hans/Ns,
Ef_lambda2_Hans,
time_val_Hans)
name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time")
names(stat_vec_Hans) = name_vals
# ======================== PC sampler =======================================
# Run the modified PC sampler
vtime_val_PC = c()
results_PC <- NULL
for(i in 1:5){
time_val <- system.time({
res_PC <- Modified_PC_Gibbs(
X = X, y = y, a1=2, b1=1, u1=2, v1=1,
nsamples=Ns, lambda_init=1, sigma2_init=1, verbose=100)
})[3]
vtime_val_PC[i] <- time_val
# Initialize accumulators after first run
if (is.null(results_PC)) {
results_PC <- list(
mBeta = res_PC$mBeta,
vsigma2 = res_PC$vsigma2,
vlambda2 = res_PC$vlambda2
)
} else {
results_PC$mBeta <- results_PC$mBeta + res_PC$mBeta
results_PC$vsigma2 <- results_PC$vsigma2 + res_PC$vsigma2
results_PC$vlambda2 <- results_PC$vlambda2 + res_PC$vlambda2
}
}
# Take averages
mBeta = (results_PC$mBeta)/5
vsigma2 = (results_PC$vsigma2)/5
vlambda2 = (results_PC$vlambda2)/5
time_val_PC = mean(vtime_val_PC)
# colMeans(results_PC$mBeta)
vESS <- c()
for(j in 1:ps) {
vESS[j] <- effective_sample_size(results_PC$mBeta[200:Ns,j])
}
Ef_PC = median(vESS)/time_val_PC
ESS_sigma2_PC = effective_sample_size(as.vector(results_PC$vsigma2[200:Ns]))
Ef_sigma2_PC = ESS_sigma2_PC/time_val_PC
ESS_lambda2_PC = effective_sample_size(as.vector(results_PC$vlambda2[200:Ns]))
Ef_lambda2_PC = ESS_lambda2_PC/time_val_PC
stat_vec_PC = c(
100*median(vESS)/Ns,
Ef_PC,
100*ESS_sigma2_PC/Ns,
Ef_sigma2_PC,
100*ESS_lambda2_PC/Ns,
Ef_lambda2_PC,
time_val_PC)
name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time")
names(stat_vec_PC) = name_vals
}
```
## Data Sources
This vignette demonstrates the usage of the `BayesianLasso` package on several well-known datasets. To avoid redistribution issues and keep the package lightweight, none of these datasets are included directly in the package. Instead, we provide guidance on how to access them.
- **Diabetes**: Available in the \CRANpkg{lars} package and loaded using `data(diabetes, package = "lars")`.
- **Kakadu**: Available in the \CRANpkg{Ecdat} package and loaded using `data(Kakadu, package = "Ecdat")`.
- **Communities and Crime**: This dataset is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/183/communities+and+crime).
Due to licensing and redistribution considerations, the *Communities and Crime* dataset is not included in this package. Users can download the dataset manually by visiting the repository and following the preprocessing steps outlined in this vignette.
## Reproducing Table 1: Performance Comparison
This section demonstrates how to reproduce the performance comparison table from the manuscript, comparing the mixing percentages, sampling efficiencies, and runtimes for the Diabetes, Kakadu, and Communities and Crime datasets.
```{r generate-table-1}
# Load libraries
library(BayesianLasso)
# Example: dataset_name <- "diabetes2"
if (dataset_name == datasets_ngtp[2]) {
if (!requireNamespace("lars", quietly = TRUE)) {
message("Package 'lars' is required for the diabetes example in this vignette.")
}else{
data("diabetes", package = "lars", envir = environment())
y <- diabetes$y
x <- diabetes$x
inds <- seq_len(ncol(x))
# Normalizing and scaling the dataset by function normalize()
norm <- normalize(y, x, scale = TRUE)
x <- norm$mX
x <- model.matrix(~ .^2, data = data.frame(x = x))[ , -1]
y <- norm$vy
}
}
if (dataset_name == datasets_ngtp[3]) {
if (!requireNamespace("Ecdat", quietly = TRUE)) {
message("Package 'Ecdat' is required for the Kakadu example in this vignette.")
}else{
data("Kakadu", package = "Ecdat", envir = environment())
# Get y vector and X matrix
y <- as.vector(Kakadu$income)
x <- Kakadu[, c(1:20, 22)]
x <- model.matrix(~ .^2, data = x)[ , -1]
}
}
# Make sure Crime.csv (or comData.Rdata) is downloaded manually before running this
if (dataset_name == datasets_ngtp[4])
{
rdata_path <- system.file("extdata", "comData.Rdata", package = "BayesianLasso")
if (nzchar(rdata_path) && file.exists(rdata_path)) { load(rdata_path)
}else { stop("File 'comData.Rdata' not found in package extdata.")
}
# ---- Remove rows with NA while keeping X and Y aligned ----
datXY <- na.omit(cbind(as.data.frame(X), as.data.frame(Y)))
X2 <- as.matrix(datXY[, colnames(X), drop = FALSE])
Y2 <- as.matrix(datXY[, colnames(Y), drop = FALSE])
# ---- Drop unwanted columns safely ----
drop_cols <- c("ownHousQrange", "rentUpperQ")
X2 <- X2[, !colnames(X2) %in% drop_cols, drop = FALSE]
# ---- Define regression inputs ----
x <- X2
y <- as.vector(Y2[, "murders"])
varnames <- colnames(x)
inds <- seq_len(ncol(x))
}
# Set prior hyperparameter constants
a1 = 1.0E-2 # Prior shape for sigma2
b1 = 1.0E-2 # Prior scale for sigma2
u1 = 1.0E-2 # Prior shape for lambda2
v1 = 1.0E-2 # Prior scale for lambda2
# Initial values for lambda2 and sigma2
lambda2_init = 10
lambda_init = sqrt(lambda2_init)
sigma2_init = 1
# Number of samples to run the MCMC
nburn = 1000
nsamples = 5000
inds_use = (nburn + 1):nsamples
N = length(inds_use)
if(dataset_name != datasets_ngtp[1]){
# To store elapsed time and results of the PC sampler
vtime_val_PC = c()
results_PC <- NULL # to be initialized after the first run
# Running the modified PC sampler 5 times and taking the average of the results across runs.
for (i in 1:5) {
time_val <- system.time({
res_PC <- Modified_PC_Gibbs(
X = x, y = y, a1, b1, u1, v1,
nsamples,
lambda_init = lambda_init,
sigma2_init = sigma2_init,
verbose = 1000
)
})[3]
vtime_val_PC[i] <- time_val
# Initialize accumulators after first run
if (is.null(results_PC)) {
results_PC <- list(
mBeta = res_PC$mBeta,
vsigma2 = res_PC$vsigma2,
vlambda2 = res_PC$vlambda2
)
} else {
results_PC$mBeta <- results_PC$mBeta + res_PC$mBeta
results_PC$vsigma2 <- results_PC$vsigma2 + res_PC$vsigma2
results_PC$vlambda2 <- results_PC$vlambda2 + res_PC$vlambda2
}
}
# Take averages
mBeta = (results_PC$mBeta)/5
vsigma2 = (results_PC$vsigma2)/5
vlambda2 = (results_PC$vlambda2)/5
time_val_PC = mean(vtime_val_PC)
# Compute effective sample sizes
vESS <- c()
p <- ncol(x)
for(j in 1:p) {
vESS[j] <- effective_sample_size(mBeta[inds_use,j])
}
Ef_PC = median(vESS)/time_val_PC
ESS_sigma2_PC = effective_sample_size(as.vector(vsigma2[inds_use]))
Ef_sigma2_PC = ESS_sigma2_PC/time_val_PC
ESS_lambda2_PC = effective_sample_size(as.vector(vlambda2[inds_use]))
Ef_lambda2_PC = ESS_lambda2_PC/time_val_PC
stat_vec_PC = c(
100*median(vESS)/N,
Ef_PC,
100*ESS_sigma2_PC/N,
Ef_sigma2_PC,
100*ESS_lambda2_PC/N,
Ef_lambda2_PC,
time_val_PC)
name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time")
names(stat_vec_PC) = name_vals
# To store elapsed time and results of the Hans sampler
vtime_val_Hans = c()
results_Hans <- NULL # to be initialized after the first run
# Running the modified Hans sampler 5 times and taking the average of the results across runs.
for (i in 1:5) {
time_val <- system.time({
res_Hans <- Modified_Hans_Gibbs(
X = x, y = y, beta_init = as.vector(colMeans(mBeta)),
a1, b1, u1, v1,
nsamples,
lambda_init = lambda_init,
sigma2_init = sigma2_init,
verbose = 1000, tune_lambda2 = TRUE, rao_blackwellization = FALSE
)
})[3]
vtime_val_Hans[i] <- time_val
# Initialize accumulators after first run
if (is.null(results_Hans)) {
results_Hans <- list(
mBeta = res_Hans$mBeta,
vsigma2 = res_Hans$vsigma2,
vlambda2 = res_Hans$vlambda2
)
} else {
results_Hans$mBeta <- results_Hans$mBeta + res_Hans$mBeta
results_Hans$vsigma2 <- results_Hans$vsigma2 + res_Hans$vsigma2
results_Hans$vlambda2 <- results_Hans$vlambda2 + res_Hans$vlambda2
}
}
# Take averages
mBeta = (results_Hans$mBeta)/5
vsigma2 = (results_Hans$vsigma2)/5
vlambda2 = (results_Hans$vlambda2)/5
time_val_Hans = mean(vtime_val_Hans)
# Compute effective sample sizes
vESS <- c()
p <- ncol(x)
for(j in 1:p) {
vESS[j] <- effective_sample_size(mBeta[inds_use,j])
}
Ef_Hans = median(vESS)/time_val_Hans
ESS_sigma2_Hans = effective_sample_size(as.vector(vsigma2[inds_use]))
Ef_sigma2_Hans = ESS_sigma2_Hans/time_val_Hans
ESS_lambda2_Hans = effective_sample_size(as.vector(vlambda2[inds_use]))
Ef_lambda2_Hans = ESS_lambda2_Hans/time_val_Hans
stat_vec_Hans = c(
100*median(vESS)/N,
Ef_Hans,
100*ESS_sigma2_Hans/N,
Ef_sigma2_Hans,
100*ESS_lambda2_Hans/N,
Ef_lambda2_Hans,
time_val_Hans)
name_vals = c("mix_beta", "eff_beta", "mix_sigma2", "eff_sigma2", "mix_lambda2", "eff_lambda2", "time")
names(stat_vec_Hans) = name_vals
if(dataset_name != datasets_ngtp[3]){
res_monomvn <- NULL
if (requireNamespace("monomvn", quietly = TRUE)) {
res_monomvn = benchmark_blasso_monomvn(y, x,
lambda_init, sigma2_init, a1, b1, u1, v1,
nburn, nsamples, trials=5,
beta_inds=NA)
stat_vec_monomvn <- colMeans(res_monomvn$mStat)
}else {
message("Skipping monomvn benchmark: package 'monomvn' is not installed.")
}
}
# ======================= Run bayeslm ============================
## Note that package bayeslm is not on CRAN anymore
res_bayeslm <- NULL
if (requireNamespace("bayeslm", quietly = TRUE)) {
res_bayeslm = benchmark_blasso_bayeslm(y, x,
lambda_init, sigma2_init, a1, b1, u1, v1,
nburn, nsamples, trials=5,
beta_inds=NA)
stat_vec_bayeslm <- colMeans(res_bayeslm$mStat)
}else {
message("Skipping bayeslm benchmark: package 'bayeslm' is not installed.")
}
# ======================= Run rstan ============================
res_rstan <- NULL
if (requireNamespace("rstan", quietly = TRUE)) {
res_rstan = benchmark_blasso_rstan(y, x,
lambda_init, sigma2_init, a1, b1, u1, v1,
nburn, nsamples, trials=5,
beta_inds=NA)
stat_vec_rstan <- colMeans(res_rstan$mStat)
}else {
message("Skipping rstan benchmark: package 'rstan' is not installed.")
}
# ======================= Run bayesreg ============================
res_bayesreg <- NULL
if (requireNamespace("bayesreg", quietly = TRUE)) {
res_bayesreg = benchmark_blasso_bayesreg(y, x,
lambda_init, sigma2_init, a1, b1, u1, v1,
nburn, nsamples, trials=5,
beta_inds=NA)
stat_vec_bayesreg <- colMeans(res_bayesreg$mStat)
}else {
message("Skipping bayesreg benchmark: package 'bayesreg' is not installed.")
}
}
```
## Convergence diagnostics
Assessing convergence of the MCMC chains is an important step in Bayesian analysis.
The package provides helper functions such as `mcmc_stats()` and `mcmc_diagnostics()`
to summarize mixing behavior and visualize the behavior of the chains. The function
`mcmc_stats()` computes numerical summaries of the MCMC output, while
`mcmc_diagnostics()` produces basic diagnostic plots such as trace and autocorrelation
plots for selected parameters. These tools allow users to assess mixing and stability
of the chains in practice. Posterior draws can also be passed to packages such as
`coda`, `posterior`, or `bayesplot` for additional diagnostics.