# Migrate-n convergence and analysis script for use with R # C R Young # R is a free software environment for statistical computing and graphics, and can be downloaded from: # http://www.r-project.org/ # You will need to make sure that the CODA # package is installed in R before # attempting to use this script!!!!!! # These functions assume that the file "bayesallfile" # is present in the working directory (root by default) # of R. This file is the output file generated by # MCMCthin. # Instructions for installing packages in R can be # found in the html help documentation of R. # This file contains a set of R functions that I find useful in diagnosing convergence of IM runs # and in understanding the fitted model parameters, both by visualizing them and by examining # numerical output. This complete file can be copied and pasted into the R console. Several plots # (*.pdf) are saved to the root directory of R. Most of the plotting functions are fairly automatic, # and the parameters of the plotting functions should be reasonable for most data. This script is # meant to provide a starting point for analysis of migrate runs, but you will probably need to # tailor it to your particular data. # Descriptions of some of the methods used in this script are included in notes from a short MCMC # workshop given by C R Young at MBARI in 2006. The R documentation is also a good resource. # Output of this script include: # ------------------------------ # Parameter traces over the length of the chain # Cumulative quantile plots (0.025%, 50%, and 97.5%) # Marginal kernel density estimates (KDEs) # Joint KDEs # 2D contour plots of joint KDEs # Summary statistics of marginal distributions (including cross correlations and autocorrelations) # Effective sample size estimates # Geweke's convergence diagnostic # Raftery and Lewis's diagnostic # Gelman-Rubin diagnostic (requires "MCMCdata_chain1.out" and "MCMCdata_chain2.out". Modifications to script are necessary if more chains are tested. # Heidelberger-Welch diagnostic # Passing MCMC diagnostics does NOT guarantee that a chain is stationary!!!!!! # Further information on these functions can be obtained from: # http://www.maths.lth.se/help/R/.R/library/coda/html/00Index.html # or the html documentation in R # -------------------------------------------------------------------------------------------- # Load the coda library # -------------------------------------------------------------------------------------------- library(coda) # -------------------------------------------------------------------------------------------- # Clear R's memory # -------------------------------------------------------------------------------------------- rm(list = ls(all = TRUE)) # -------------------------------------------------------------------------------------------- # Read the contents of the datafile "MCMCdata.out" that should be in R's working directory # -------------------------------------------------------------------------------------------- MCMCdata <- read.table("bayesallfile", header=FALSE) # -------------------------------------------------------------------------------------------- # Turn MCMCdata into MCMC object # -------------------------------------------------------------------------------------------- size = dim(MCMCdata)[1] num.parms = dim(MCMCdata)[2] num.popns = sqrt(num.parms - 3) Likelihood <- MCMCdata[,3] Theta.parms <- MCMCdata[,4:(num.popns+3)] Mig.parms <- MCMCdata[,(num.popns+4):num.parms] All.parms <- MCMCdata[, 4:num.parms] MCMCdata.Theta = mcmc(data=Theta.parms, start = 1, end = size, thin = 1) MCMCdata.Mig = mcmc(data=Mig.parms, start = 1, end = size, thin = 1) MCMCdata.All = mcmc(data=All.parms, start = 1, end = size, thin = 1) Likelihood = mcmc(data=Likelihood, start = 1, end = size, thin = 1) # -------------------------------------------------------------------------------------------- # Set plot layout so theta is diagonal and immigration is offdiagonal # -------------------------------------------------------------------------------------------- plotlayout = sqrt(dim(MCMCdata.All)[2]) mat <- matrix(1:(num.parms-3),plotlayout,plotlayout, byrow=TRUE) plot.num <- num.popns+1 for (i in 1:num.popns) { for (j in 1:num.popns) { if ( i==j ) mat[i,j] <- i else mat[i,j] <- plot.num if ( i!=j ) plot.num <- plot.num+1; } } layout(mat) # -------------------------------------------------------------------------------------------- # Plots Cumulative Estimates of 0.025%, 50%, and 97.5% quantiles (all parameters) # -------------------------------------------------------------------------------------------- cumuplot(MCMCdata.All, probs=c(0.025,0.5,0.975), ylab="",lty=c(2,1), lwd=c(1,2), type="l", ask=FALSE, auto.layout=FALSE, col=1) savePlot(filename="Cumulative_Quantiles", type=c("pdf"),device=dev.cur()) # -------------------------------------------------------------------------------------------- # Plot traces (all parameters) # -------------------------------------------------------------------------------------------- plot(MCMCdata.All, trace = TRUE, density = FALSE, smooth = TRUE, auto.layout = FALSE, ask = FALSE) savePlot(filename="Traces", type=c("pdf"), device=dev.cur()) # -------------------------------------------------------------------------------------------- # Plot Kernel density estimates (all parameters) # -------------------------------------------------------------------------------------------- plot(MCMCdata.All, trace = FALSE, density = TRUE, smooth = TRUE, auto.layout = FALSE, ask = FALSE) savePlot(filename="Densities", type=c("pdf"), device=dev.cur()) # -------------------------------------------------------------------------------------------- # Autocorrelation function (all parameters) # -------------------------------------------------------------------------------------------- autocorr.plot(MCMCdata.All, 50, auto.layout = TRUE, ask = FALSE) savePlot(filename="ACF",type=c("pdf"),device=dev.cur()) # -------------------------------------------------------------------------------------------- # Partial Autocorrelation function (all parameters) # -------------------------------------------------------------------------------------------- pacf(MCMCdata.All, lag.max = 20, plot=TRUE, ask = FALSE) savePlot(filename="PACF",type=c("pdf"),device=dev.cur()) # -------------------------------------------------------------------------------------------- # Cross correlation Plot # -------------------------------------------------------------------------------------------- mat <- matrix(1,1,1, byrow=TRUE) layout(mat) crosscorr.plot (MCMCdata.All, col = topo.colors(10)) savePlot(filename="CrossCorr",type=c("pdf"),device=dev.cur()) # -------------------------------------------------------------------------------------------- # Basic Summary Statistics (all parameters) # -------------------------------------------------------------------------------------------- summary(MCMCdata.All, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975)) # -------------------------------------------------------------------------------------------- # Numerical output for cross correlations # -------------------------------------------------------------------------------------------- crosscorr(MCMCdata.All) # -------------------------------------------------------------------------------------------- # Numerical output for autocorrelations (all parameters) # -------------------------------------------------------------------------------------------- autocorr.diag(MCMCdata.All, lags = c(0, 1, 5, 10, 50, 100, 500, 1000, 5000), relative=TRUE) # -------------------------------------------------------------------------------------------- # Calculates effective sample sizes (all parameters) # -------------------------------------------------------------------------------------------- effectiveSize(MCMCdata.All) # -------------------------------------------------------------------------------------------- # Convergence Diagnostic (Geweke's diagnostic) # -------------------------------------------------------------------------------------------- # z-score for difference in means of first 10% of chain and last 50% (stationarity) # -1.96 < z-score < 1.96 is reasonable # # Geweke (1992) proposed a convergence diagnostic for Markov chains based on a test for # equality of the means of the first and last part of a Markov chain (by default the first # 10% and the last 50%). If the samples are drawn from the stationary distribution of the # chain, the two means are equal and Geweke's statistic has an asymptotically standard # normal distribution. # # The test statistic is a standard Z-score: the difference between the two sample means # divided by its estimated standard error. The standard error is estimated from the spectral # density at zero and so takes into account any autocorrelation. # # The Z-score is calculated under the assumption that the two parts of the chain are # asymptotically independent, which requires that the sum of frac1 and frac2 be strictly # less than 1. geweke.diag(MCMCdata.All, frac1=0.1, frac2=0.5) # -------------------------------------------------------------------------------------------- # Convergence Diagnostic (Heidelberger-Welch diagnostic) # -------------------------------------------------------------------------------------------- # The convergence test uses the Cramer-von-Mises statistic to test the null hypothesis that the # sampled values come from a stationary distribution. The test is successively applied, firstly # to the whole chain, then after discarding the first 10%, 20%, ... of the chain until either the # null hypothesis is accepted, or 50% of the chain has been discarded. The latter outcome constitutes # `failure' of the stationarity test and indicates that a longer MCMC run is needed. If the stationarity # test is passed, the number of iterations to keep and the number to discard are reported. # # The half-width test calculates a 95% confidence interval for the mean, using the portion of the chain # which passed the stationarity test. Half the width of this interval is compared with the estimate of # the mean. If the ratio between the half-width and the mean is lower than eps, the halfwidth test is # passed. Otherwise the length of the sample is deemed not long enough to estimate the mean with sufficient # accuracy. heidel.diag(MCMCdata.All, eps=0.1, pvalue=0.05) # -------------------------------------------------------------------------------------------- # Convergence Diagnostic (Raftery and Lewis's diagnostic) # -------------------------------------------------------------------------------------------- # Estimate for how long to run the chain for specified accuracy # # raftery.diag is a run length control diagnostic based on a criterion of accuracy of # estimation of the quantile q. It is intended for use on a short pilot run of a Markov # chain. The number of iterations required to estimate the quantile q to within an accuracy # of +/- r with probability p is calculated. Separate calculations are performed for each # variable within each chain. # # If the number of iterations in data is too small, an error message is printed indicating # the minimum length of pilot run. The minimum length is the required sample size for a chain # with no correlation between consecutive samples. Positive autocorrelation will increase the # required sample size above this minimum value. An estimate I (the `dependence factor') of # the extent to which autocorrelation inflates the required sample size is also provided. # Values of I larger than 5 indicate strong autocorrelation which may be due to a poor choice # of starting value, high posterior correlations or `stickiness' of the MCMC algorithm. # # The number of `burn in' iterations to be discarded at the beginning of the chain is also calculated. raftery.diag(MCMCdata.All, q=0.05, r=0.01, s=0.95, converge.eps=0.001) raftery.diag(MCMCdata.All, q=0.50, r=0.025, s=0.95, converge.eps=0.001) raftery.diag(MCMCdata.All, q=0.95, r=0.01, s=0.95, converge.eps=0.001) # -------------------------------------------------------------------------------------------- # Convergence Diagnostic (Gelman and Rubin's diagnostic) # -------------------------------------------------------------------------------------------- # Gelman and Rubin (1992) propose a general approach to monitoring convergence of MCMC output in # which two or more parallel chains are run with starting values that are overdispersed relative # to the posterior distribution. Convergence is diagnosed when the chains have `forgotten' their # initial values, and the output from all chains is indistinguishable. The gelman.diag diagnostic # is applied to a single variable from the chain. It is based a comparison of within-chain and # between-chain variances, and is similar to a classical analysis of variance. # # This script assumes that you have TWO chains named: "bayesallfile_chain1" and "bayesallfile_chain1" # in the root directory of R that are THE SAME SIZE (in number of iterations)! rm(list = ls(all = TRUE)) MCMCdata_chain1 <- read.table("bayesallfile_chain1", header=FALSE) MCMCdata_chain2 <- read.table("bayesallfile_chain2", header=FALSE) size_chain1 <- dim(MCMCdata_chain1)[1] # CHAINS MUST BE THE SAME SIZE! size_chain2 <- dim(MCMCdata_chain2)[1] # CHAINS MUST BE THE SAME SIZE! MCMCdata_chain1 <- mcmc(data= MCMCdata_chain1, start = 1, end = size_chain1, thin = 1) MCMCdata_chain2 <- mcmc(data= MCMCdata_chain2, start = 1, end = size_chain2, thin = 1) MCMCData <- mcmc.list(MCMCdata_chain1, MCMCdata_chain2) gelman.diag(MCMCData, confidence = 0.95, transform=FALSE, autoburnin=FALSE) gelman.plot(MCMCData, bin.width = 10, max.bins = 50, confidence = 0.95, transform = FALSE, ask = FALSE) savePlot(filename="Gelman-Rubin_plot",type=c("pdf"),device=dev.cur())