A new version 0.5-0 of my R
package bayesImageS is now available on CRAN. To accompany it is a revision to my paper with Kerrie and Tony, “Scalable Bayesian inference for the inverse temperature of a hidden Potts model.” (Moores, Pettitt & Mengersen, arXiv:1503.08066v2). This paper introduces the parametric functional approximate Bayesian (PFAB) algorithm (the ‘p’ is silent…), which is a form of Bayesian indirect likelihood (BIL).
PFAB splits computation into 3 stages:
1. Precomputation: simulate the sufficient statistic \(S(z)\) from the prior, using the Swendsen-Wang algorithm, for a fixed set of values of \(\beta\).
2. Fit a parametric surrogate model to the simulated values.
3. Image segmentation, using the surrogate in place of the intractable likelihood of \(\beta\).
For Stage 1, I used 2000 iterations of Swendsen-Wang (SW) for each of 72 values of \(\beta\), but this is really overkill for most applications. I chose 72 values because I happened to have a 36-core, hyperthreaded CPU available. Here I'll just be running everything on my laptop (an i7 CPU with 4 hyperthreaded cores), so 28 values should be plenty. The idea is to place the values more densely close to the critical temperature, where the variance (and hence the gradient of the score function) is greatest.
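For reference, the critical inverse temperature of the \(k\)-state Potts model on a regular 2D lattice is
\[
\beta_{crit} = \log\left(1 + \sqrt{k}\right),
\]
which for \(k = 6\) labels gives \(\beta_{crit} \approx 1.24\). This is the value computed as bcrit in the code below.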
For our precomputation step, we need to know the number of pixels \(n\) and the number of labels \(k\) that we will use for pixel classification. We'll be using the Lake of Menteith dataset from Bayesian Essentials with R (Marin & Robert, 2014):
library(bayess)
data("Menteith")
iter <- 800           # number of Swendsen-Wang iterations per value of beta
burn <- iter/4 + 1    # discard the first 200 iterations as burn-in
n <- prod(dim(Menteith))   # number of pixels
k <- 6                     # number of unique labels
image(as.matrix(Menteith), asp=1, xaxt='n', yaxt='n', col=gray(0:255/255))
Lake of Menteith, Scotland
The precomputation step depends only on the image dimensions and the number of labels, not on the observed pixel values, and it is usually the most expensive part. For \(100 \times 100\) pixels it should only take around 20 seconds:
library(bayesImageS)
bcrit <- log(1 + sqrt(k))    # critical inverse temperature for k labels
# values of beta, placed more densely on either side of bcrit
beta <- sort(c(seq(0,1,by=0.1), seq(1.05,1.15,by=0.05), bcrit-0.05, bcrit-0.02, bcrit+0.02,
               seq(1.3,1.4,by=0.05), seq(1.5,2,by=0.1), 2.5, 3))
mask <- matrix(1, nrow=sqrt(n), ncol=sqrt(n))
neigh <- getNeighbors(mask, c(2,2,0,0))   # first-order (4 nearest) neighbourhood
block <- getBlocks(mask, 2)               # chequerboard blocks for parallel updates
edges <- getEdges(mask, c(2,2,0,0))
maxS <- nrow(edges)            # maximum value of the sufficient statistic S(z)
E0 <- maxS/k                   # E[S(z)] when beta = 0
V0 <- maxS*(1/k)*(1 - 1/k)     # Var[S(z)] when beta = 0
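E0 and V0 are the mean and variance of the sufficient statistic \(S(z)\) at \(\beta = 0\), where the labels are independent and uniformly distributed over the \(k\) states. Each of the maxS edges then matches with probability \(1/k\), and the edge indicators are uncorrelated, hence
\[
E_0 = \frac{\mathrm{maxS}}{k}, \qquad V_0 = \mathrm{maxS}\,\frac{1}{k}\left(1 - \frac{1}{k}\right).
\]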
This step is embarrassingly parallel, using all available CPU cores:
library(doParallel)
cores <- min(detectCores(), length(beta))
print(paste("Parallel computation using",cores,"CPU cores:",
iter,"iterations for",length(beta),"values of beta."))
cl <- makeCluster(cores)
print(cl)
clusterSetRNGStream(cl)
registerDoParallel(cl)
Simulate from the prior to verify the critical value of \(\beta\):
tm <- system.time(simSW <- list(matu = foreach(i=1:length(beta),
    .packages=c("bayesImageS"), .combine='cbind') %dopar% {
  res <- swNoData(beta[i], k, neigh, block, iter)
  res$sum   # trace of the sufficient statistic S(z) for beta[i]
}))
stopCluster(cl)
## user system elapsed
## 0.185 0.151 19.384
This shows the piecewise linear approximation that we used in our first paper (Moores et al., STCO 2015):
plot(x=beta, y=colMeans(simSW$matu), main="",
     xlab=expression(beta), ylab=expression(S(z)))
lrcst <- approxfun(beta, colMeans(simSW$matu))   # piecewise linear interpolation
curve(lrcst, 0, max(beta), add=TRUE, col="blue")
abline(v=bcrit, col="red", lty=3)
abline(h=maxS, col=2, lty=2)
points(0, E0, col=2, pch=2)
Piecewise linear approximation
Instead, for Stage 2 we will use Stan to fit a parametric integral curve:
functions {
  vector ft(vector t, real tC, real e0, real ecrit, real v0, real vmaxLo, real vmaxHi, real phi1, real phi2) {
    vector[num_elements(t)] mu;
    real sqrtBcritPhi = sqrt(tC)*phi1;
    for (i in 1:num_elements(t)) {
      if (t[i] <= tC) {
        real sqrtBdiffPhi = sqrt(tC - t[i])*phi1;
        mu[i] = e0 + t[i]*v0 - ((2*(vmaxLo-v0))/(phi1^2))*((sqrtBcritPhi + 1)/exp(sqrtBcritPhi) - (sqrtBdiffPhi + 1)/exp(sqrtBdiffPhi));
      } else {
        real sqrtBdiff = sqrt(t[i] - tC);
        mu[i] = ecrit - ((2*vmaxHi)/phi2)*(sqrtBdiff/exp(phi2*sqrtBdiff) + (exp(-phi2*sqrtBdiff) - 1)/phi2);
      }
    }
    return mu;
  }
  vector dfdt(vector t, real tC, real v0, real vmaxLo, real vmaxHi, real r1, real r2) {
    vector[num_elements(t)] dmu;
    for (i in 1:num_elements(t)) {
      if (t[i] <= tC) {
        dmu[i] = v0 + (vmaxLo-v0)*exp(-r1*sqrt(tC - t[i]));
      } else {
        dmu[i] = vmaxHi*exp(-r2*sqrt(t[i] - tC));
      }
    }
    return dmu;
  }
}
data {
  int<lower = 1> M;
  int<lower = 1> N;
  real<lower = 1> maxY;
  real<lower = 1> Vlim;
  real<lower = 0> e0;
  real<lower = 0> v0;
  real tcrit;
  matrix<lower=0, upper=maxY>[M,N] y;
  vector[M] t;
}
parameters {
  real<lower = 0> a;
  real<lower = 0> b;
  real<lower = e0, upper=maxY> ecrit;
  real<lower = 0, upper=Vlim> vmaxLo;
  real<lower = 0, upper=Vlim> vmaxHi;
}
transformed parameters {
  vector[M] curr_mu;
  vector[M] curr_var;
  curr_mu = ft(t, tcrit, e0, ecrit, v0, vmaxLo, vmaxHi, a, b);
  curr_var = dfdt(t, tcrit, v0, vmaxLo, vmaxHi, a, b);
}
model {
  for (i in 1:M) {
    y[i,] ~ normal(curr_mu[i], sqrt(curr_var[i]));
  }
}
For comparison, see a previous blog post where I fitted a simple logistic curve using Stan.
library(rstan)
options(mc.cores = min(4, parallel::detectCores()))
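# Compile the Stan program above into the 'PFAB' model object used by
# sampling() below (assuming the Stan code has been saved as "PFAB.stan").
PFAB <- stan_model(file = "PFAB.stan")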
dat <- list(M=length(beta), N=iter-burn+1, maxY=maxS, e0=E0, v0=V0,
Vlim=2*maxS*log(maxS)/pi, tcrit=bcrit, y=t(simSW$matu[burn:iter,]), t=beta)
tm2 <- system.time(fit <- sampling(PFAB, data = dat, verbose=TRUE, iter=5000,
control = list(adapt_delta = 0.9, max_treedepth=20)))
## user system elapsed
## 6.032 7.794 2140.408
library(knitr)
fitSumm <- summary(fit, pars=c("a","b","ecrit","vmaxLo","vmaxHi"))$summary
kable(fitSumm[, c("mean","se_mean","sd","2.5%","97.5%","n_eff","Rhat")],
      caption="Posterior summary of the surrogate model parameters", digits=3)
|        |       mean | se_mean |      sd |       2.5% |      97.5% |    n_eff |  Rhat |
|:-------|-----------:|--------:|--------:|-----------:|-----------:|---------:|------:|
| a      |      4.546 |   0.000 |   0.005 |      4.535 |      4.556 | 3018.587 | 1.001 |
| b      |      6.674 |   0.000 |   0.002 |      6.671 |      6.678 | 5517.964 | 1.000 |
| ecrit  |  14210.513 |   0.042 |   2.996 |  14204.672 |  14216.457 | 5209.181 | 1.000 |
| vmaxLo |  58862.196 |   3.464 | 189.405 |  58489.777 |  59232.045 | 2989.964 | 1.001 |
| vmaxHi | 124677.101 |   0.599 |  30.050 | 124595.047 | 124706.591 | 2519.701 | 1.001 |
Roughly 36 minutes to fit the surrogate model makes this the most expensive step, but only because the first step was so fast. For much larger images (a megapixel or more), it will be the other way around.
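The point estimates used below could also be read straight off the fitSumm matrix from the summary above, rather than typed in by hand (a small sketch; est is just an illustrative name):
est <- fitSumm[, "mean"]   # posterior means of a, b, ecrit, vmaxLo and vmaxHi
round(est, 3)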
# R implementation of the integral curve f(t), matching the Stan function ft above
ft <- function(t, tC, e0, ecrit, v0, vmax1, vmax2, phi1, phi2) {
  sqrtBcritPhi <- sqrt(tC)*phi1
  fval <- numeric(length(t))
  for (i in 1:length(t)) {
    if (t[i] <= tC) {
      sqrtBdiffPhi <- sqrt(tC - t[i])*phi1
      fval[i] <- e0 + t[i]*v0 - ((2*(vmax1-v0))/(phi1^2))*((sqrtBcritPhi + 1)/exp(sqrtBcritPhi) - (sqrtBdiffPhi + 1)/exp(sqrtBdiffPhi))
    } else {
      sqrtBdiff <- sqrt(t[i] - tC)
      fval[i] <- ecrit - ((2*vmax2)/phi2)*(sqrtBdiff/exp(phi2*sqrtBdiff) + (exp(-phi2*sqrtBdiff) - 1)/phi2)
    }
  }
  return(fval)
}
plot(range(beta), range(simSW$matu), type='n', xlab=expression(beta), ylab=expression(S(z)))
idx <- sample(burn:iter, size=20)   # 20 post-burn-in iterations
abline(v=bcrit, col="red", lty=3)
abline(h=maxS, col=2, lty=2)
points(rep(beta, each=20), simSW$matu[idx,], pch=20)
lines(beta, ft(beta, bcrit, E0, 14237, V0, 59019, 124668, 4.556, 6.691),
      col=4, lwd=2)
To really see how well this approximation fits the true model, we need to look at the residuals:
residMx <- matrix(nrow=iter-burn+1, ncol=length(beta))
for (b in 1:length(beta)) {
residMx[,b] <- simSW$matu[burn:iter,b] - ft(beta[b], bcrit, E0, 14237, V0, 59019, 124668, 4.556, 6.691)
}
# derivative of the integral curve, used as the approximate variance of S(z)
dfdt <- function(t, tC, V0, Vmax1, Vmax2, r1, r2) {
  ifelse(t < tC,
         V0 + (Vmax1-V0)*exp(-r1*sqrt(tC - t)),
         Vmax2*exp(-r2*sqrt(t - tC)))
}
plot(range(beta),range(residMx),type='n',xlab=expression(beta),ylab="residuals")
abline(h=0,lty=2,col=4,lwd=2)
points(rep(beta,each=iter-burn+1),residMx,pch='.',cex=3)
x <- sort(c(seq(0,3,by=0.01),bcrit))
lines(x, 3*sqrt(dfdt(x, bcrit, V0, 59019, 124668, 4.556, 6.691)), col=2, lwd=2)
lines(x, -3*sqrt(dfdt(x, bcrit, V0, 59019, 124668, 4.556, 6.691)), col=2, lwd=2)
This shows that 28 values of \(\beta\) were enough to obtain a high-quality fit between the true model and the surrogate. The location of \(\beta_{crit}\) is slightly off, because \(100 \times 100\) pixels isn’t quite enough for the asymptotics to kick in.
Now that we have our surrogate model, we can proceed to the final stage, which is to perform image segmentation using mcmcPotts. The fitted parameters of the surrogate (the integral curve and its derivative) are passed in via the list of algorithm options:
mh <- list(algorithm="aux", bandwidth=0.02, Vmax1=59019,
Vmax2=124668, E0=E0, Ecrit=14237, phi1=4.556,
phi2=6.691, factor=1, bcrit=bcrit, V0=V0)
priors <- list()
priors$k <- k
priors$mu <- c(0, 50, 100, 150, 200, 250)
priors$mu.sd <- rep(10,k)
priors$sigma <- rep(20,k)
priors$sigma.nu <- rep(5, k)
priors$beta <- c(0,3)
iter <- 1e4
burn <- iter/2
y <- as.vector(as.matrix(Menteith))
print(date())
## [1] "Wed Jun 6 16:26:22 2018"
tm3 <- system.time(resPFAB <- mcmcPotts(y, neigh, block, priors, mh, iter, burn))
## 10000 iterations of aux discarding the first 5000 as burn-in.
## user system elapsed
## 21.205 0.077 21.315
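Before comparing with the exchange algorithm, we can take a quick look at the marginal posterior for \(\beta\) from the PFAB run (a quick sketch, reusing the resPFAB object created above):
# posterior mean and 95% credible interval for the inverse temperature
mean(resPFAB$beta[burn:iter])
quantile(resPFAB$beta[burn:iter], probs=c(0.025, 0.975))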
Now we compare with the approximate exchange algorithm:
mh <- list(algorithm="ex", bandwidth=0.02, auxiliary=200)
tm4 <- system.time(resAEA <-
mcmcPotts(y,neigh,block,priors,mh,iter,burn))
resAEA$time <- tm4[3]
## elapsed
## 2306.565
That is more than a hundredfold speedup in comparison to AEA (roughly 21 seconds versus 2307). There is reasonably good agreement between the posterior distributions:
densPFAB <- density(resPFAB$beta[burn:iter])
densAEA <- density(resAEA$beta[burn:iter])
plot(densAEA, col=4, lty=2, lwd=2, main="", xlab=expression(beta),
xlim=range(resPFAB$beta[burn:iter],resAEA$beta[burn:iter]))
lines(densPFAB, col=2, lty=3, lwd=3)
abline(h=0,lty=2)
legend("topright",legend=c("AEA","PFAB"),col=c(4,2),lty=c(2,3),
lwd=3)
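As a rough numerical check of that agreement (a sketch, reusing the resPFAB and resAEA objects from above), we can compare the posterior means and standard deviations of \(\beta\) directly:
rbind(PFAB = c(mean=mean(resPFAB$beta[burn:iter]), sd=sd(resPFAB$beta[burn:iter])),
      AEA  = c(mean=mean(resAEA$beta[burn:iter]), sd=sd(resAEA$beta[burn:iter])))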