Fitting an intercept-only model

Where is the variation

Resources

  • tidybayes is an incredible tool, and the vignette is a great read for visualization approaches (even if you aren’t using rvars)
  • the posterior package is the best place to learn about how to manipulate Stan posterior distributions.

\[ \begin{align} \text{Abundance}_i &\sim \text{Poisson}(\lambda_i) \\ \log{\lambda_i} &\sim \mu + \beta_{\text{sample}[i]} + \beta_{\text{species[i]}} + \beta_i\\ \mu &\sim \text{Normal}(3, 1)\\ \beta_{\text{sample}} &\sim \text{Normal}(0, \sigma_{\text{samp}})\\ \beta_{\text{species}} &\sim \text{Normal}(0, \sigma_{\text{species}})\\ \beta_i &\sim \text{Normal}(0, \sigma_{\text{obs}}) \\ \sigma_{\text{samp}} &\sim \text{Exponential}(3)\\ \sigma_{\text{species}} &\sim \text{Exponential}(3)\\ \sigma_{\text{obs}} &\sim \text{Exponential}(3) \end{align} \]

library(tidyverse)
library(cmdstanr)
This is cmdstanr version 0.7.1
- CmdStanR documentation and vignettes: mc-stan.org/cmdstanr
- CmdStan path: /home/andrew/software/cmdstan
- CmdStan version: 2.34.1
library(tidybayes)

data("mite", package = "vegan")

spp_names <- colnames(mite)
spp_names <- setNames(1:ncol(mite), colnames(mite))


mite_long <- mite |> 
  mutate(site_id = seq_len(nrow(mite))) |> 
  tidyr::pivot_longer(-site_id,
                      names_to = "spp",
                      values_to = "abd") |> 
  dplyr::mutate(spp_id = spp_names[spp])
library(cmdstanr)

spp_site_obs_intercepts <- cmdstan_model("topics/intercept_only/spp_site_obs_intercepts.stan", 
                                  pedantic = TRUE)
spp_site_obs_intercepts
data{
  int N;
  int N_spp;
  array[N] int<lower=1,upper=N_spp> spp_id;
  int N_sites;
  array[N] int<lower=1,upper=N_sites> site_id;
  array[N] int abd;
}
parameters{
  vector[N_spp] spp_effects;
  vector[N_sites] site_effects;
  vector[N] obs_effects;
  real mu;
  real<lower=0> sigma_spp;
  real<lower=0> sigma_sites;
  real<lower=0> sigma_obs;
}
model {
  abd ~ poisson_log(mu + spp_effects[spp_id] + site_effects[site_id] + obs_effects);
  spp_effects ~ normal(0, sigma_spp);
  site_effects ~ normal(0, sigma_sites);
  obs_effects ~ normal(0, sigma_obs);
  mu ~ normal(3, 1);
  sigma_spp ~ exponential(3);
  sigma_sites ~ exponential(3);
  sigma_obs ~ exponential(3);
}

Now we can sample this model.

Warning: irresponsible statistics

I’m sampling only 2 chains below, for illustration purposes only! use more chains in your research.

spp_site_obs_intercepts_samp <- spp_site_obs_intercepts$sample(
  data = list(
    N = nrow(mite_long),
    N_spp = max(mite_long$spp_id),
    spp_id = mite_long$spp_id,
    N_sites = max(mite_long$site_id),
    site_id = mite_long$site_id,
    abd = mite_long$abd
  ),chains = 2, parallel_chains = 2)

spp_site_obs_intercepts_samp$save_object("topics/intercept_only/spp_site_obs_intercepts.rds")
spp_site_obs_intercepts_samp <- read_rds("topics/intercept_only/spp_site_obs_intercepts.rds")

Exploring the model output

spp_site_obs_intercepts_samp
       variable     mean   median    sd   mad       q5      q95 rhat ess_bulk
 lp__           17755.93 17756.60 56.81 55.89 17661.36 17845.10 1.01      236
 spp_effects[1]     1.98     1.98  0.31  0.32     1.46     2.47 1.02      150
 spp_effects[2]    -0.45    -0.44  0.35  0.35    -1.04     0.09 1.01      228
 spp_effects[3]     2.16     2.15  0.32  0.32     1.63     2.68 1.01       99
 spp_effects[4]    -0.69    -0.69  0.35  0.36    -1.26    -0.11 1.01      162
 spp_effects[5]    -1.99    -2.00  0.41  0.41    -2.66    -1.32 1.00      410
 spp_effects[6]    -1.95    -1.94  0.42  0.42    -2.67    -1.26 1.01      247
 spp_effects[7]     0.25     0.24  0.32  0.31    -0.25     0.79 1.01      162
 spp_effects[8]    -2.36    -2.37  0.46  0.45    -3.10    -1.62 1.01      353
 spp_effects[9]    -0.66    -0.65  0.34  0.34    -1.25    -0.11 1.01      222
 ess_tail
      516
      342
      469
      217
      506
      702
      786
      383
     1208
      866

 # showing 10 of 2560 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

The sampler was just fine! Note that we have just estimated many more parameters than observations.

Plot the standard deviations

sigma_post <- spp_site_obs_intercepts_samp$draws(
  variables = c("sigma_spp", "sigma_sites", "sigma_obs"))

sigma_post
# A draws_array: 1000 iterations, 2 chains, and 3 variables
, , variable = sigma_spp

         chain
iteration   1   2
        1 1.8 1.7
        2 1.3 1.4
        3 1.4 1.6
        4 1.3 1.5
        5 1.5 1.7

, , variable = sigma_sites

         chain
iteration    1    2
        1 0.73 0.67
        2 0.58 0.63
        3 0.50 0.55
        4 0.56 0.70
        5 0.61 0.60

, , variable = sigma_obs

         chain
iteration   1   2
        1 1.5 1.5
        2 1.5 1.5
        3 1.5 1.5
        4 1.5 1.5
        5 1.5 1.6

# ... with 995 more iterations

The default output is a draws array

sigma_post_df <- spp_site_obs_intercepts_samp$draws(
  variables = c("sigma_spp", "sigma_sites", "sigma_obs"),
  format = "data.frame")

sigma_post_df
# A draws_df: 1000 iterations, 2 chains, and 3 variables
   sigma_spp sigma_sites sigma_obs
1        1.8        0.73       1.5
2        1.3        0.58       1.5
3        1.4        0.50       1.5
4        1.3        0.56       1.5
5        1.5        0.61       1.5
6        1.3        0.51       1.5
7        1.6        0.48       1.5
8        1.4        0.58       1.6
9        1.5        0.61       1.6
10       1.4        0.69       1.5
# ... with 1990 more draws
# ... hidden reserved variables {'.chain', '.iteration', '.draw'}

We can also get a huge data frame. If you are comfortable manipulating data frames, you can use all your regular techniques here. Here I use tidyverse tools to reshape and plot the posterior distribution of the three \(\sigma\) variables:

sigma_post_df |> 
  pivot_longer(starts_with("sigma"), 
               names_to = "sigma", 
               values_to = "value") |> 
  ggplot(aes(x = value)) + 
  geom_histogram() + 
  facet_wrap(~sigma)
Warning: Dropping 'draws_df' class as required metadata was removed.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Caution

Parameters from a posterior are NOT independent. If you want to combine parameters in any way (to calculate an average for example) you have to do it WITHIN each posterior sample. Work on ROWS of the draws data frame above

Calculate the posterior distribution of average abundance for each species

To do this we need to extract the average \(mu\) and add it to the species effects, \(\beta_{\text{species}}\)

spp_avg_effects_df <- spp_site_obs_intercepts_samp$draws(
  variables = c("mu", "spp_effects"),
  format = "data.frame")


spp_avg_effects_df |> 
  select(mu, starts_with("spp_effects")) |> 
  mutate(row_id = seq_along(mu)) |> 
  pivot_longer(-c("mu", "row_id"), 
               names_to = "parname", 
               values_to = "spp_effect") |> 
  mutate(spp_avg = mu + spp_effect) |> 
  ggplot(aes(x = spp_avg, group = parname)) + 
  geom_density()
Warning: Dropping 'draws_df' class as required metadata was removed.

with rvars

We can do the same operation even more quickly by using some of the tools from tidybayes

spp_mu_rvars <- spp_site_obs_intercepts_samp |> 
  tidybayes::spread_rvars(mu, spp_effects[spp_id])
spp_mu_rvars |> 
  mutate(spp_avg = mu + spp_effects) |> 
  ggplot(aes(dist = spp_avg, group = spp_id)) + 
  tidybayes::stat_slab(col = "black") + 
  coord_flip() + 
  theme_minimal()