# Migrate-n convergence and analysis script for use with R
# C R Young

# R is a free software environment for statistical computing and graphics, and can be downloaded from:
# http://www.r-project.org/

# You will need to make sure that the CODA
# package  is installed in R before 
# attempting to use this script!!!!!!

# These functions assume that the file "bayesallfile"
# is present in the working directory (root by default) 
# of R. This file is the output file generated by 
# MCMCthin. 

# Instructions for installing packages in R can be
# found in the html help documentation of R. 

# This file contains a set of R functions that I find useful in diagnosing convergence of IM runs
# and in understanding the fitted model parameters, both by visualizing them and by examining 
# numerical output. This complete file can be copied and pasted into the R console. Several plots
# (*.pdf) are saved to the root directory of R. Most of the plotting functions are fairly automatic,
# and the parameters of the plotting functions should be reasonable for most data. This script is 
# meant to provide a starting point for analysis of migrate runs, but you will probably need to 
# tailor it to your particular data.

# Descriptions of some of the methods used in this script are included in notes from a short MCMC
# workshop given by C R Young at MBARI in 2006. The R documentation is also a good resource. 

# Output of this script include:
# ------------------------------
# Parameter traces over the length of the chain
# Cumulative quantile plots (0.025%, 50%, and 97.5%)
# Marginal kernel density estimates (KDEs)
# Joint KDEs
# 2D contour plots of joint KDEs
# Summary statistics of marginal distributions (including cross correlations and autocorrelations)
# Effective sample size estimates
# Geweke's convergence diagnostic
# Raftery and Lewis's diagnostic
# Gelman-Rubin diagnostic (requires "MCMCdata_chain1.out" and "MCMCdata_chain2.out". Modifications to script are necessary if more chains are tested.
# Heidelberger-Welch diagnostic

# Passing MCMC diagnostics does NOT guarantee that a chain is stationary!!!!!!

# Further information on these functions can be obtained from:
# http://www.maths.lth.se/help/R/.R/library/coda/html/00Index.html
# or the html documentation in R


# --------------------------------------------------------------------------------------------
# Load the coda library
# --------------------------------------------------------------------------------------------

library(coda)


# --------------------------------------------------------------------------------------------
# Clear R's memory
# --------------------------------------------------------------------------------------------

rm(list = ls(all = TRUE))


# --------------------------------------------------------------------------------------------
# Read the contents of the datafile "MCMCdata.out" that should be in R's working directory
# --------------------------------------------------------------------------------------------

MCMCdata <- read.table("bayesallfile", header=FALSE)


# --------------------------------------------------------------------------------------------
# Turn MCMCdata into MCMC object
# --------------------------------------------------------------------------------------------
size = dim(MCMCdata)[1]
num.parms = dim(MCMCdata)[2]
num.popns = sqrt(num.parms - 3)
Likelihood <- MCMCdata[,3]
Theta.parms <- MCMCdata[,4:(num.popns+3)]
Mig.parms <- MCMCdata[,(num.popns+4):num.parms]
All.parms <- MCMCdata[, 4:num.parms]
MCMCdata.Theta = mcmc(data=Theta.parms, start = 1, end = size, thin = 1)
MCMCdata.Mig   = mcmc(data=Mig.parms, start = 1, end = size, thin = 1)
MCMCdata.All   = mcmc(data=All.parms, start = 1, end = size, thin = 1)
Likelihood     = mcmc(data=Likelihood, start = 1, end = size, thin = 1)


# --------------------------------------------------------------------------------------------
# Set plot layout so theta is diagonal and immigration is offdiagonal
# --------------------------------------------------------------------------------------------

plotlayout = sqrt(dim(MCMCdata.All)[2])
mat <- matrix(1:(num.parms-3),plotlayout,plotlayout, byrow=TRUE)
plot.num <- num.popns+1
for (i in 1:num.popns) {
	for (j in 1:num.popns) {
		if ( i==j ) 
			mat[i,j] <- i
		 else 
			mat[i,j] <- plot.num 
		
		 if ( i!=j ) 
			plot.num <- plot.num+1;	
			
	}
}
layout(mat)


# --------------------------------------------------------------------------------------------
# Plots Cumulative Estimates of 0.025%, 50%, and 97.5% quantiles (all parameters)
# --------------------------------------------------------------------------------------------

cumuplot(MCMCdata.All, probs=c(0.025,0.5,0.975), ylab="",lty=c(2,1), lwd=c(1,2), type="l", ask=FALSE, auto.layout=FALSE, col=1)
savePlot(filename="Cumulative_Quantiles", type=c("pdf"),device=dev.cur())

# --------------------------------------------------------------------------------------------
# Plot traces (all parameters)
# --------------------------------------------------------------------------------------------

plot(MCMCdata.All, trace = TRUE, density = FALSE, smooth = TRUE, auto.layout = FALSE, ask = FALSE)
savePlot(filename="Traces", type=c("pdf"), device=dev.cur())


# --------------------------------------------------------------------------------------------
# Plot Kernel density estimates (all parameters)
# --------------------------------------------------------------------------------------------

plot(MCMCdata.All, trace = FALSE, density = TRUE, smooth = TRUE, auto.layout = FALSE, ask = FALSE)
savePlot(filename="Densities", type=c("pdf"), device=dev.cur())


# --------------------------------------------------------------------------------------------
# Autocorrelation function (all parameters)
# --------------------------------------------------------------------------------------------

autocorr.plot(MCMCdata.All, 50, auto.layout = TRUE, ask = FALSE)
savePlot(filename="ACF",type=c("pdf"),device=dev.cur())

# --------------------------------------------------------------------------------------------
# Partial Autocorrelation function (all parameters)
# --------------------------------------------------------------------------------------------

pacf(MCMCdata.All, lag.max = 20, plot=TRUE, ask = FALSE)
savePlot(filename="PACF",type=c("pdf"),device=dev.cur())

# --------------------------------------------------------------------------------------------
# Cross correlation Plot
# --------------------------------------------------------------------------------------------
mat <- matrix(1,1,1, byrow=TRUE)
layout(mat)
crosscorr.plot (MCMCdata.All, col = topo.colors(10))
savePlot(filename="CrossCorr",type=c("pdf"),device=dev.cur())


# --------------------------------------------------------------------------------------------
# Basic Summary Statistics  (all parameters)
# --------------------------------------------------------------------------------------------

summary(MCMCdata.All, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975))


# --------------------------------------------------------------------------------------------
# Numerical output for cross correlations
# --------------------------------------------------------------------------------------------

crosscorr(MCMCdata.All)

# --------------------------------------------------------------------------------------------
# Numerical output for autocorrelations  (all parameters)
# --------------------------------------------------------------------------------------------

autocorr.diag(MCMCdata.All, lags = c(0, 1, 5, 10, 50, 100, 500, 1000, 5000), relative=TRUE)


# --------------------------------------------------------------------------------------------
# Calculates effective sample sizes  (all parameters)
# --------------------------------------------------------------------------------------------

effectiveSize(MCMCdata.All)


# --------------------------------------------------------------------------------------------
# Convergence Diagnostic (Geweke's diagnostic)
# --------------------------------------------------------------------------------------------
# z-score for difference in means of first 10% of chain and last 50% (stationarity)
# -1.96 < z-score < 1.96 is reasonable
# 
# Geweke (1992) proposed a convergence diagnostic for Markov chains based on a test for 
# equality of the means of the first and last part of a Markov chain (by default the first 
# 10% and the last 50%). If the samples are drawn from the stationary distribution of the 
# chain, the two means are equal and Geweke's statistic has an asymptotically standard 
# normal distribution.
# 
# The test statistic is a standard Z-score: the difference between the two sample means 
# divided by its estimated standard error. The standard error is estimated from the spectral 
# density at zero and so takes into account any autocorrelation.
# 
# The Z-score is calculated under the assumption that the two parts of the chain are 
# asymptotically independent, which requires that the sum of frac1 and frac2 be strictly 
# less than 1. 

geweke.diag(MCMCdata.All, frac1=0.1, frac2=0.5) 


# --------------------------------------------------------------------------------------------
# Convergence Diagnostic (Heidelberger-Welch diagnostic)
# --------------------------------------------------------------------------------------------
# The convergence test uses the Cramer-von-Mises statistic to test the null hypothesis that the 
# sampled values come from a stationary distribution. The test is successively applied, firstly 
# to the whole chain, then after discarding the first 10%, 20%, ... of the chain until either the 
# null hypothesis is accepted, or 50% of the chain has been discarded. The latter outcome constitutes 
# `failure' of the stationarity test and indicates that a longer MCMC run is needed. If the stationarity 
# test is passed, the number of iterations to keep and the number to discard are reported.
# 
# The half-width test calculates a 95% confidence interval for the mean, using the portion of the chain 
# which passed the stationarity test. Half the width of this interval is compared with the estimate of 
# the mean. If the ratio between the half-width and the mean is lower than eps, the halfwidth test is 
# passed. Otherwise the length of the sample is deemed not long enough to estimate the mean with sufficient 
# accuracy. 

heidel.diag(MCMCdata.All, eps=0.1, pvalue=0.05)


# --------------------------------------------------------------------------------------------
# Convergence Diagnostic (Raftery and Lewis's diagnostic)
# --------------------------------------------------------------------------------------------
# Estimate for how long to run the chain for specified accuracy
# 
# raftery.diag is a run length control diagnostic based on a criterion of accuracy of 
# estimation of the quantile q. It is intended for use on a short pilot run of a Markov 
# chain. The number of iterations required to estimate the quantile q to within an accuracy 
# of +/- r with probability p is calculated. Separate calculations are performed for each 
# variable within each chain.
# 
# If the number of iterations in data is too small, an error message is printed indicating 
# the minimum length of pilot run. The minimum length is the required sample size for a chain 
# with no correlation between consecutive samples. Positive autocorrelation will increase the
# required sample size above this minimum value. An estimate I (the `dependence factor') of 
# the extent to which autocorrelation inflates the required sample size is also provided. 
# Values of I larger than 5 indicate strong autocorrelation which may be due to a poor choice 
# of starting value, high posterior correlations or `stickiness' of the MCMC algorithm.
# 
# The number of `burn in' iterations to be discarded at the beginning of the chain is also calculated. 

raftery.diag(MCMCdata.All, q=0.05, r=0.01,  s=0.95, converge.eps=0.001)
raftery.diag(MCMCdata.All, q=0.50, r=0.025, s=0.95, converge.eps=0.001)
raftery.diag(MCMCdata.All, q=0.95, r=0.01,  s=0.95, converge.eps=0.001)


# --------------------------------------------------------------------------------------------
# Convergence Diagnostic (Gelman and Rubin's diagnostic)
# --------------------------------------------------------------------------------------------
# Gelman and Rubin (1992) propose a general approach to monitoring convergence of MCMC output in 
# which two or more parallel chains are run with starting values that are overdispersed relative 
# to the posterior distribution. Convergence is diagnosed when the chains have `forgotten' their 
# initial values, and the output from all chains is indistinguishable. The gelman.diag diagnostic 
# is applied to a single variable from the chain. It is based a comparison of within-chain and 
# between-chain variances, and is similar to a classical analysis of variance.
# 
# This script assumes that you have TWO chains named: "bayesallfile_chain1" and "bayesallfile_chain1"
# in the root directory of R that are THE SAME SIZE (in number of iterations)!

rm(list = ls(all = TRUE))
MCMCdata_chain1 <- read.table("bayesallfile_chain1", header=FALSE)
MCMCdata_chain2 <- read.table("bayesallfile_chain2", header=FALSE)
size_chain1 <- dim(MCMCdata_chain1)[1]                                                  # CHAINS MUST BE THE SAME SIZE!
size_chain2 <- dim(MCMCdata_chain2)[1]                                                  # CHAINS MUST BE THE SAME SIZE!
MCMCdata_chain1 <- mcmc(data= MCMCdata_chain1, start = 1, end = size_chain1, thin = 1)
MCMCdata_chain2 <- mcmc(data= MCMCdata_chain2, start = 1, end = size_chain2, thin = 1)
MCMCData <- mcmc.list(MCMCdata_chain1, MCMCdata_chain2)
gelman.diag(MCMCData, confidence = 0.95, transform=FALSE, autoburnin=FALSE)
gelman.plot(MCMCData, bin.width = 10, max.bins = 50, confidence = 0.95, transform = FALSE, ask = FALSE)
savePlot(filename="Gelman-Rubin_plot",type=c("pdf"),device=dev.cur())