suppressPackageStartupMessages(library(dplyr))
library(ggplot2)
library(tidyr)
# library(cmdstanr)
suppressPackageStartupMessages(library(rstan))
rstan_options("auto_write" = TRUE)
options(mc.cores = parallel::detectCores())
library(tidybayes)
# mite data
data(mite, package = "vegan")
data(mite.env, package = "vegan")
## ALSO: the spatial data
data(mite.xy, package = "vegan")
Gaussian Processes in Stan
Smooth lines in fancy colours.
- Let’s appreciate together the power of online community resources
- Gaussian Processes are families of smooth functions we learn from data
- When used for prediction, a GP is both a “prior” and a “likelihood”
Background reading
Gaussian processes are very common, and there are lots of resources on the topic:
- The Stan manual has a chapter on it
- The Stan team gives lots of example models on Github which I adapted for this example.
- Michael Betancourt has an extremely detailed, very rigorous tutorial on GPs
- Here’s a complete, worked analysis of human birthdays by world-class statisticians (Gelman, Vehtari, Simpson, et al)
- GPs are related to GAMs and can be represented by a collection of basis functions. This is approximate but much much faster. See this excellent tutorial by Aki Vehtari, and the corresponding paper (citation in the blog post).
- this blog applies GPs to spatial count data
- Here is a very long and wonderfully detailed post describing a GP approach to occupancy modelling
- Another blog on Gaussian Processes, Hidden Markov Models and more, very clear explanation.
Reorganizing the mite data
Let’s begin by (once again!) loading and reorganizing the mite data. This time we’ll also use mite.xy, which gives the coordinates of each one of the 70 samples.
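Before doing anything else, it can help to glance at the coordinates themselves (a quick check, not part of the original code):

# each of the 70 samples has an x and a y coordinate
head(mite.xy)
dim(mite.xy)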
Loading models and data
# combine data and environment
mite_data_long <- bind_cols(mite.env, mite) |>
  mutate(plot_id = 1:length(WatrCont)) |>
  pivot_longer(Brachy:Trimalc2, names_to = "spp", values_to = "abd")

mite_data_long_transformed <- mite_data_long |>
  mutate(presabs = as.numeric(abd > 0),
         # center predictors
         water = (WatrCont - mean(WatrCont)) / 100
  )

# pick a species that has about 50/50 chance
mite_data_long_transformed |>
  group_by(spp) |>
  summarize(freq = mean(presabs)) |>
  filter(freq > .4 & freq < .6)
# A tibble: 10 × 2
spp freq
<chr> <dbl>
1 Ceratoz3 0.443
2 FSET 0.429
3 HMIN 0.486
4 MEGR 0.543
5 NCOR 0.5
6 Oppiminu 0.429
7 Oribatl1 0.429
8 PWIL 0.486
9 TVEL 0.557
10 Trhypch1 0.457
## how about: PWIL
Let’s choose just one species as an example. I’ve chosen one where the relationship with water is rather strong, and for which presence and absence are roughly balanced. This is just to make the example clear.
pwil_data <- mite_data_long_transformed |>
  filter(spp == "PWIL")

pwil_data |>
  ggplot(aes(x = water, y = presabs)) + geom_point() +
  stat_smooth(method = glm, method.args = list(family = "binomial")) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
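The smooth in that figure is just an ordinary logistic regression, so if you want the slope of the water effect on the logit scale you can fit it directly (a quick sketch, not part of the original workflow):

# plain GLM of presence/absence on the transformed water variable,
# matching the stat_smooth() call above
coef(summary(glm(presabs ~ water, family = binomial, data = pwil_data)))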
# add the spatial coordinates:
pwil_spatial <- bind_cols(pwil_data, mite.xy)

pwil_spatial |>
  ggplot(aes(x = x, y = y, fill = as.factor(presabs))) +
  geom_point(size = 3, pch = 21, stroke = 1) +
  scale_fill_brewer(type = "qual", palette = "Dark2") +
  theme_minimal() +
  coord_fixed() +
  labs(fill = "Pres/Abs")
We’ll look at two possibilities in turn:
- A nonlinear function of one variable
- A smooth function of distance
Smooth function of one variable
Write the model
\[ \begin{align} y_i &\sim \mathsf{Bernoulli}(p_i)\\ \mathsf{logit}(p_i) &= a + f_i\\ f_i &\sim \mathsf{Multivariate\ Normal}(0, K(x | \theta)) \\ K(x | \alpha, \rho, \sigma)_{i, j} &= \alpha^2 \exp \left( - \dfrac{1}{2 \rho^2} \sum_{d=1}^D (x_{i,d} - x_{j,d})^2 \right) + \delta_{i, j} \sigma^2, \end{align} \]
That’s the general notation for D dimensions. In our case we’re looking at something much simpler.
\[ \begin{align} y_i &\sim \mathsf{Bernoulli}(p_i)\\ \mathsf{logit}(p_i) &= a + f_i\\ f_i &\sim \mathsf{Multivariate\ Normal}(0, K(x | \theta)) \\ K(x | \alpha, \rho, \sigma)_{i, j} &= \alpha^2 e^{ \frac{-(\text{water}_i - \text{water}_j)^2}{2 \rho^2}} + \delta_{i, j} \sigma^2 \\ \rho &\sim \mathsf{Inverse\ Gamma}(5, 14) \\ \alpha &\sim \mathsf{Normal}(0, .8) \\ a &\sim \mathsf{Normal}(0, .2) \\ \end{align} \]
Here’s an interpretation of the parameters of this model:
- \(\alpha^2\) is the maximum covariance between two points
- \(\rho\) tells us how quickly that covariance goes down as two samples become more different in their water amount
- \(\delta_{i, j} \sigma^2\) adds the variances along the diagonal
See the explanation of this function in the Stan User’s guide
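To build some intuition for these parameters, you can evaluate the kernel by hand in R and check what the \(\rho\) prior implies (a small sketch, not part of the original code; the helper function name is mine):

# exponentiated quadratic kernel for a 1D input:
# alpha = marginal SD, rho = length scale, sigma = extra diagonal noise
exp_quad_cov <- function(x, alpha, rho, sigma = 0) {
  d2 <- outer(x, x, function(a, b) (a - b)^2)
  alpha^2 * exp(-d2 / (2 * rho^2)) + diag(sigma^2, length(x))
}
# covariance decays as two water values get further apart
round(exp_quad_cov(c(-1, 0, 0.5, 3), alpha = 1, rho = 1), 3)

# what length scales does the Inverse Gamma(5, 14) prior on rho allow?
# (an Inverse Gamma(a, b) draw is 1 / Gamma(shape = a, rate = b))
quantile(1 / rgamma(1e4, shape = 5, rate = 14), probs = c(.05, .5, .95))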
Simulate to understand it
Here is the Stan code that replicates the mathematical model above.
gp_example_sim <- stan_model(file = "topics/04_gp/gp_example_sim.stan")

gp_example_sim
S4 class stanmodel 'anon_model' coded as follows:

// Fit the hyperparameters of a latent-variable Gaussian process with an
// exponentiated quadratic kernel and a Bernoulli likelihood
// This code is from https://github.com/stan-dev/example-models/blob/master/misc/gaussian-process/gp-fit-logit.stan
data {
  int<lower=1> N;
  array[N] real x;
}
transformed data {
  real delta = 1e-9;
}
parameters {
  real<lower=0> rho;
  real<lower=0> alpha;
  real a;
  vector[N] eta;
  vector[N] y;
}
transformed parameters {
  vector[N] f;
  {
    matrix[N, N] L_K;
    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
    // diagonal elements
    for (n in 1 : N) {
      K[n, n] = K[n, n] + delta;
    }
    L_K = cholesky_decompose(K);
    f = L_K * eta;
  }
}
model {
  rho ~ inv_gamma(5, 14);
  alpha ~ normal(0, .8);
  a ~ normal(0, .2);
  eta ~ std_normal();
}
gp_example_sim_samples <- sampling(
  gp_example_sim,
  data = list(
    N = 20,
    x = seq(from = -3, to = 5, length.out = 20)),
  refresh = 200, chains = 1
)
SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 2.7e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.27 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 10.013 seconds (Warm-up)
Chain 1: 12.298 seconds (Sampling)
Chain 1: 22.311 seconds (Total)
Chain 1:
Warning: There were 23 divergent transitions after warmup. See
https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
to find out why this is a problem and how to eliminate them.
Warning: There were 898 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
Warning: Examine the pairs() plot to diagnose sampling problems
Warning: The largest R-hat is 2.12, indicating chains have not mixed.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#r-hat
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#bulk-ess
Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#tail-ess
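If you wanted to chase these warnings down, rstan's sampling() takes a control argument; raising adapt_delta and max_treedepth and running longer chains is the usual first response (a sketch only: it will be slower, and some warnings may persist since the model above also declares an unused vector[N] y with no prior):

gp_example_sim_samples <- sampling(
  gp_example_sim,
  data = list(
    N = 20,
    x = seq(from = -3, to = 5, length.out = 20)),
  chains = 1, refresh = 200,
  iter = 4000,
  control = list(adapt_delta = 0.95, max_treedepth = 12)
)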
x_value_df <- tibble::enframe(x = seq(from = -3, to = 5, length.out = 20),
                              name = "i", value = "water")

gp_example_sim_samples |>
  tidybayes::spread_draws(f[i], a, ndraws = 45) |>
  left_join(x_value_df) |>
  ggplot(aes(x = water, y = plogis(f + a), group = .draw)) +
  geom_line() +
  coord_cartesian(ylim = c(0, 1))
Joining with `by = join_by(i)`
Express that model in code
With a working simulation, we can now adapt the model to handle real data.
gp_example_pred <- stan_model(
  file = "topics/04_gp/gp_example_pred.stan")

gp_example_pred
S4 class stanmodel 'anon_model' coded as follows:

// Fit the hyperparameters of a latent-variable Gaussian process with an
// exponentiated quadratic kernel and a Bernoulli likelihood
// This code is from https://github.com/stan-dev/example-models/blob/master/misc/gaussian-process/gp-fit-logit.stan
data {
  int<lower=1> Nobs;
  int<lower=1> N;
  array[N] real x;
  array[Nobs] int<lower=0, upper=1> z;
}
transformed data {
  real delta = 1e-9;
}
parameters {
  real<lower=0> rho;
  real<lower=0> alpha;
  real a;
  vector[N] eta;
}
transformed parameters {
  vector[N] f;
  {
    matrix[N, N] L_K;
    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
    // diagonal elements
    for (n in 1 : N) {
      K[n, n] = K[n, n] + delta;
    }
    L_K = cholesky_decompose(K);
    f = L_K * eta;
  }
}
model {
  rho ~ inv_gamma(5, 14);
  alpha ~ normal(0, .8);
  a ~ normal(0, .2);
  eta ~ std_normal();
  z ~ bernoulli_logit(a + f[1:Nobs]);
}
We need to generate data for making predictions! I’ll create a new vector of values called new_x that covers the range of the water variable in our dataset.
# sample N values on the range of x
new_x <- seq(from = -3, to = 5, length.out = 15)

gp_data_list <- list(N = length(pwil_spatial$presabs) + length(new_x),
                     Nobs = length(pwil_spatial$presabs),
                     x = c(pwil_spatial$water, new_x),
                     z = pwil_spatial$presabs)

# put them on the dataframe
gp_example_pwil_samp <- sampling(
  gp_example_pred,
  data = gp_data_list,
  chains = 2, refresh = 1000)

# gp_example_samp$save_object("topics/04_gp/gp_example_samp_pwil.rds")
Note that cmdstanr models have a method called $save_object(), which lets you save the model outputs into an .rds object. Also note that these notes have been converted to rstan, so this tip doesn’t help that much anymore ;)
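With rstan, plain base-R saveRDS() does the same job on the stanfit object, and the commented read_rds() call below reads it back in (a minimal sketch):

# save the fitted object so the sampler doesn't have to be re-run
saveRDS(gp_example_pwil_samp, "topics/04_gp/gp_example_samp_pwil.rds")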
# gp_example_samp_pwil <- read_rds(
#   "topics/04_gp/gp_example_samp_pwil.rds")

water_prediction_points <- gp_example_pwil_samp |>
  tidybayes::gather_rvars(f[rownum]) |>
  slice(-(1:length(pwil_spatial$presabs)))

water_prediction_points |>
  mutate(water = new_x,
         presabs = posterior::rfun(plogis)(.value)) |>
  ggplot(aes(x = water, dist = presabs)) +
  tidybayes::stat_lineribbon() +
  # scale_fill_viridis_d(option = "rocket") +
  scale_fill_brewer(palette = "Reds", direction = -1) +
  geom_jitter(aes(x = water, y = presabs),
              inherit.aes = FALSE,
              height = .01, width = 0,
              data = pwil_spatial)
We can also pull out some specific functions. What I want you to see here is that there are MANY curvy lines that are consistent with this model.
some_predicted_lines <- gp_example_pwil_samp |>
  # take just some draws
  tidybayes::spread_draws(a, f[rownum], ndraws = 63) |>
  # remove the rows that match observed data,
  # and look only at the points for predictions.
  filter(rownum > length(pwil_spatial$presabs)) |>
  # convert to probability
  mutate(prob = plogis(f + a),
         rownum = rownum - 70) |>
  ## need a dataframe that says which "rownum" from
  ## above goes with which value of water from the
  ## new_x vector I made:
  left_join(tibble::enframe(new_x,
                            name = "rownum",
                            value = "water"))
Joining with `by = join_by(rownum)`
some_predicted_lines |>
  ggplot(aes(x = water, y = prob, group = .draw)) +
  geom_line(alpha = 0.7) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 1))
Spatial predictions
To make a prediction of a function on one X variable, we needed a sequence of points to predict along.
To make spatial predictions, we need a grid of points to predict along.
grid_points <- modelr::data_grid(mite.xy,
                                 x = modelr::seq_range(x, by = .5),
                                 y = modelr::seq_range(y, by = .5))

grid_points |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  coord_fixed()
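Before fitting over this grid, it is worth counting the points: the GP covariance matrix will have one row per observation plus one per grid cell, and the Cholesky decomposition scales with the cube of that size (a quick check, not in the original code):

nrow(grid_points)                                  # prediction points
length(pwil_spatial$presabs) + nrow(grid_points)   # total size of K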
Other than a change in the data {} block, the Stan code is unchanged!
Prior predictive simulations
gp_example_2D_prior <- stan_model(
  file = "topics/04_gp/gp_example_2D_prior.stan")
gp_example_2D_prior
S4 class stanmodel 'anon_model' coded as follows:

// Fit the hyperparameters of a latent-variable Gaussian process with an
// exponentiated quadratic kernel and a Bernoulli likelihood
// This code is from https://github.com/stan-dev/example-models/blob/master/misc/gaussian-process/gp-fit-logit.stan
data {
  // int<lower=1> Nobs;
  int<lower=1> N;
  array[N] vector[2] x;
  real rho_a;
  real rho_b;
  // array[Nobs] int<lower=0, upper=1> z;
}
transformed data {
  real delta = 1e-9;
}
parameters {
  real<lower=0> rho;
  real<lower=0> alpha;
  real a;
  vector[N] eta;
}
transformed parameters {
  vector[N] f;
  {
    matrix[N, N] L_K;
    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
    // diagonal elements
    for (n in 1 : N) {
      K[n, n] = K[n, n] + delta;
    }
    L_K = cholesky_decompose(K);
    f = L_K * eta;
  }
}
model {
  rho ~ inv_gamma(rho_a, rho_b);
  alpha ~ normal(0, .8);
  a ~ normal(0, .2);
  eta ~ std_normal();
  // z ~ bernoulli_logit(a + f[1:Nobs]);
}
gp_example_2D_prior_samp <- sampling(
  gp_example_2D_prior,
  data = list(N = nrow(grid_points),
              x = grid_points,
              rho_a = 5, rho_b = 14),
  chains = 2, refresh = 1000)
visualize the prior
## extract the predictions
gp_example_2D_prior_samp |>
  tidybayes::spread_draws(f[rownum], a, ndraws = 6) |>
  mutate(rownum = as.character(rownum)) |>
  left_join(grid_points |>
              tibble::rownames_to_column("rownum"), by = join_by(rownum)) |>
  mutate(presabs = plogis(f + a)) |>
  ggplot(aes(x = x, y = y, fill = presabs)) +
  geom_tile() +
  scale_fill_viridis_c(option = "rocket") +
  coord_fixed() +
  theme_minimal() +
  labs(fill = "Pr(y=1)") +
  facet_wrap(~.draw) +
  NULL
The model below, over 70 points, is the slowest model we’ve seen so far and takes about 1 minute on my (Andrew’s) laptop.
gp_example_pred_2D <- stan_model(
  file = "topics/04_gp/gp_example_pred_2D.stan")
gp_example_pred_2D
S4 class stanmodel 'anon_model' coded as follows:

// Fit the hyperparameters of a latent-variable Gaussian process with an
// exponentiated quadratic kernel and a Bernoulli likelihood
// This code is from https://github.com/stan-dev/example-models/blob/master/misc/gaussian-process/gp-fit-logit.stan
data {
  int<lower=1> Nobs;
  int<lower=1> N;
  array[N] vector[2] x;
  array[Nobs] int<lower=0, upper=1> z;
}
transformed data {
  real delta = 1e-9;
}
parameters {
  real<lower=0> rho;
  real<lower=0> alpha;
  real a;
  vector[N] eta;
}
transformed parameters {
  vector[N] f;
  {
    matrix[N, N] L_K;
    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
    // diagonal elements
    for (n in 1 : N) {
      K[n, n] = K[n, n] + delta;
    }
    L_K = cholesky_decompose(K);
    f = L_K * eta;
  }
}
model {
  rho ~ inv_gamma(5, 14);
  alpha ~ normal(0, .8);
  a ~ normal(0, .2);
  eta ~ std_normal();
  z ~ bernoulli_logit(a + f[1:Nobs]);
}
plot the effect in space:
## sample the model
gp_example_2D_samp <- sampling(
  gp_example_pred_2D,
  data = list(N = length(pwil_spatial$presabs) + nrow(grid_points),
              Nobs = length(pwil_spatial$presabs),
              x = bind_rows(pwil_spatial[c("x", "y")], grid_points),
              z = pwil_spatial$presabs),
  chains = 2, refresh = 200)

# gp_example_2D_samp$save_object("topics/04_gp/gp_example_2D_samp_pwil.rds")
# gp_example_2D_samp_pwil <- read_rds("topics/04_gp/gp_example_2D_samp_pwil.rds")

## extract the predictions
gp_example_2D_samp |>
  tidybayes::spread_rvars(f[rownum], a) |>
  slice(-(1:length(pwil_spatial$presabs))) |>
  bind_cols(grid_points) |>
  mutate(presabs = posterior::rfun(plogis)(f + a),
         pa_median = median(presabs)) |>
  ggplot(aes(x = x, y = y, fill = pa_median)) +
  geom_tile() +
  geom_point(aes(x = x,
                 y = y,
                 fill = presabs),
             inherit.aes = FALSE,
             data = pwil_spatial,
             pch = 21,
             size = 2.5,
             stroke = .3,
             colour = "lightblue") +
  scale_fill_viridis_c(option = "rocket") +
  coord_fixed() +
  theme_minimal() +
  labs(fill = "Pr(y=1)") +
  NULL

# ggsave("topics/04_gp/pwil_spatial.png")
Extensions:
- Add water to the model. Does the spatial effect disappear, increase, or stay about the same?
- Next step: try to model the water curve for more than one species. Would it be possible to make the species rho parameters hierarchical?