/* Copyright (C) 1999, 2000, 2001, 2002, Massachusetts Institute of Technology.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include "../config.h"
#include <check.h>
#include <mpiglue.h>
#include <mpi_utils.h>

#include "maxwell.h"

/**************************************************************************/

/* Lapack eigenvalue functions */
#ifdef SCALAR_SINGLE_PREC
#  define HEEV F77_FUNC(cheev,CHEEV)
#  define SYEV F77_FUNC(ssyev,SSYEV)
#else
#  define HEEV F77_FUNC(zheev,ZHEEV)
#  define SYEV F77_FUNC(dsyev,DSYEV)
#endif
extern void HEEV(char *, char *, int *, scalar *, int *, real *,
		 scalar *, int *, real *, int *);
extern void SYEV(char *, char *, int *, real *, int *, real *,
		 real *, int *, int *);

/* compute the 3 real eigenvalues of the matrix V */
void maxwell_sym_matrix_eigs(real eigs[3], const symmetric_matrix *V)
{
     int n = 3, nw = 9, info;
#if defined(WITH_HERMITIAN_EPSILON)
     scalar_complex Vm[3][3], W[9], W2[9];
     CASSIGN_SCALAR(Vm[0][0], V->m00, 0);
     CASSIGN_SCALAR(Vm[1][1], V->m11, 0);
     CASSIGN_SCALAR(Vm[2][2], V->m22, 0);
     Vm[0][1] = V->m01; CASSIGN_CONJ(Vm[1][0], V->m01);
     Vm[0][2] = V->m02; CASSIGN_CONJ(Vm[2][0], V->m02);
     Vm[1][2] = V->m12; CASSIGN_CONJ(Vm[2][1], V->m12);
     HEEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, W2, &info);
#else
     real Vm[3][3], W[9];
     Vm[0][0] = V->m00;
     Vm[1][1] = V->m11;
     Vm[2][2] = V->m22;
     Vm[0][1] = Vm[1][0] = V->m01;
     Vm[0][2] = Vm[2][0] = V->m02;
     Vm[1][2] = Vm[2][1] = V->m12;
     SYEV("V", "U", &n, &Vm[0][0], &n, eigs, W, &nw, &info);
#endif
     CHECK(info >= 0, "invalid argument in heev");
     CHECK(info <= 0, "failure to converge in heev");
}

/* Set Vinv = inverse of V, where both V and Vinv are real-symmetric
   (or possibly complex-Hermitian) matrices. */
void maxwell_sym_matrix_invert(symmetric_matrix *Vinv, 
			       const symmetric_matrix *V)
{
     real m00 = V->m00, m11 = V->m11, m22 = V->m22;

#if defined(WITH_HERMITIAN_EPSILON)
     scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12;

     if (m01.re == 0.0 && m01.im == 0.0 &&
	 m02.re == 0.0 && m02.im == 0.0 &&
	 m12.re == 0.0 && m12.im == 0.0) {
	  /* optimize common case of a diagonal matrix: */
	  Vinv->m00 = 1.0 / m00;
	  Vinv->m11 = 1.0 / m11;
	  Vinv->m22 = 1.0 / m22;
	  CASSIGN_ZERO(Vinv->m01);
	  CASSIGN_ZERO(Vinv->m02);
	  CASSIGN_ZERO(Vinv->m12);
     }
     else {
	  double detinv;
	  
	  /* compute the determinant: */
	  detinv = m00*m11*m22 - m11*CSCALAR_NORMSQR(m02) -
	       CSCALAR_NORMSQR(m01)*m22 - CSCALAR_NORMSQR(m12)*m00 +
	       2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re +
		      (m01.re * m12.im + m01.im * m12.re) * m02.im);
	  
	  CHECK(detinv != 0.0, "singular 3x3 matrix");
	  
	  detinv = 1.0/detinv;
	  
	  Vinv->m00 = detinv * (m11*m22 - CSCALAR_NORMSQR(m12));
	  Vinv->m11 = detinv * (m00*m22 - CSCALAR_NORMSQR(m02));
	  Vinv->m22 = detinv * (m11*m00 - CSCALAR_NORMSQR(m01));
	  
	  CASSIGN_SCALAR(Vinv->m02,
			 detinv * (m01.re*m12.re-m01.im*m12.im - m11*m02.re),
			 -detinv*(-m01.re*m12.im-m01.im*m12.re + m11*m02.im));

	  CASSIGN_SCALAR(Vinv->m01,
			 detinv * (m12.re*m02.re+m12.im*m02.im - m22*m01.re),
			 -detinv * (m12.im*m02.re-m12.re*m02.im + m22*m01.im));

	  CASSIGN_SCALAR(Vinv->m12,
			 detinv * (m01.re*m02.re+m01.im*m02.im - m00*m12.re),
			 -detinv * (m01.im*m02.re-m01.re*m02.im + m00*m12.im));
     }
     
#else /* real matrix */
     real m01 = V->m01, m02 = V->m02, m12 = V->m12;

     if (m01 == 0.0 && m02 == 0.0 && m12 == 0.0) {
	  /* optimize common case of a diagonal matrix: */
	  Vinv->m00 = 1.0 / m00;
	  Vinv->m11 = 1.0 / m11;
	  Vinv->m22 = 1.0 / m22;
	  Vinv->m01 = Vinv->m02 = Vinv->m12 = 0.0;
     }
     else {
	  double detinv;
	  
	  /* compute the determinant: */
	  detinv = m00*m11*m22 - m02*m11*m02 + 2.0 * m01*m12*m02 -
	       m01*m01*m22 - m12*m12*m00;
	  
	  CHECK(detinv != 0.0, "singular 3x3 matrix");
	  
	  detinv = 1.0/detinv;
	  
	  Vinv->m00 = detinv * (m11*m22 - m12*m12);
	  Vinv->m11 = detinv * (m00*m22 - m02*m02);
	  Vinv->m22 = detinv * (m11*m00 - m01*m01);
	  
	  Vinv->m02 = detinv * (m01*m12 - m11*m02);
	  Vinv->m01 = detinv * (m12*m02 - m01*m22);
	  Vinv->m12 = detinv * (m01*m02 - m00*m12);
     }
#endif /* real matrix */
}

/* Returns whether or not V is positive-definite. */
static int sym_matrix_positive_definite(symmetric_matrix *V)
{
     real det2, det3;
     real m00 = V->m00, m11 = V->m11, m22 = V->m22;

#if defined(WITH_HERMITIAN_EPSILON)
     scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12;

     det2 = m00*m11 - CSCALAR_NORMSQR(m01);
     det3 = det2*m22 - m11*CSCALAR_NORMSQR(m02) - CSCALAR_NORMSQR(m12)*m00 +
	  2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re +
		 (m01.re * m12.im + m01.im * m12.re) * m02.im);
#else /* real matrix */
     real m01 = V->m01, m02 = V->m02, m12 = V->m12;

     det2 = m00*m11 - m01*m01;
     det3 = det2*m22 - m02*m11*m02 + 2.0 * m01*m12*m02 - m12*m12*m00;
#endif /* real matrix */
     
     return (m00 > 0.0 && det2 > 0.0 && det3 > 0.0);
}

/**************************************************************************/

int check_maxwell_dielectric(maxwell_data *d,
			     int negative_epsilon_okp)
{
     int i, require_2d;

     require_2d = d->nz == 1 && (d->parity & (EVEN_Z_PARITY | ODD_Z_PARITY));

     for (i = 0; i < d->fft_output_size; ++i) {
	  if (!negative_epsilon_okp &&
	      !sym_matrix_positive_definite(d->eps_inv + i))
	       return 1;
	  if (require_2d) {
#if defined(WITH_HERMITIAN_EPSILON)
	       if (d->eps_inv[i].m02.re != 0.0 ||
		   d->eps_inv[i].m02.im != 0.0 ||
		   d->eps_inv[i].m12.re != 0.0 ||
		   d->eps_inv[i].m12.im != 0.0)
		    return 2;
#else /* real matrix */
	       if (d->eps_inv[i].m02 != 0.0 || d->eps_inv[i].m12 != 0.0)
		    return 2;
#endif /* real matrix */
	  }
     }
     return 0;
}

/**************************************************************************/

#define SHIFT3(x,y,z) {real SHIFT3_dummy = z; z = y; y = x; x = SHIFT3_dummy;}

/* Compute quadrature points and weights for integrating on the
   unit sphere.  x, y, z, and weight should be arrays of num_sq_pts
   values to hold the coordinates and weights of the quadrature points
   on output.   Currently, num_sq_pts = 12, 50, and 72 are supported. */
void spherical_quadrature_points(real *x, real *y, real *z,
				 real *weight, int num_sq_pts)
{
     int i,j,k,l, n = 0;
     real x0, y0, z0, w;

     if (num_sq_pts == 50) {
	  /* Computes quadrature points and weights for 50-point 11th degree
	     integration formula on a unit sphere.  This particular quadrature
	     formula has the advantage, for our purposes, of preserving the
	     symmetry group of an octahedraon (i.e. simple cubic symmetry, with
	     respect to the Cartesian xyz axes).  References:
	
	     A. D. McLaren, "Optimal Numerical Integration on a Sphere,"
	     Math. Comp. 17, pp. 361-383 (1963).
	     
	     Also in: Arthur H. Stroud, "Approximate Calculation of Multiple
	     Integrals" (Prentice Hall, 1971) (formula number U3:11-1). 

	     This code was written with the help of example code by
	     John Burkardt: 
	           http://www.psc.edu/~burkardt/src/stroud/stroud.html */
     
	  x0 = 1; y0 = z0 = 0;
	  w = 9216 / 725760.0;
	  for (i = 0; i < 2; ++i) {
	       x0 = -x0;
	       for (j = 0; j < 3; ++j) {
		    SHIFT3(x0,y0,z0);
		    x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
	       }
	  }
	  
	  x0 = y0 = sqrt(0.5); z0 = 0;
	  w = 16384 / 725760.0;
	  for (i = 0; i < 2; ++i) {
	       x0 = -x0;
	       for (j = 0; j < 2; ++j) {
		    y0 = -y0;
		    for (k = 0; k < 3; ++k) {
			 SHIFT3(x0,y0,z0);
			 x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
		    }
	       }
	  }
	  
	  x0 = y0 = z0 = sqrt(1.0 / 3.0);
	  w = 15309 / 725760.0;
	  for (i = 0; i < 2; ++i) {
	       x0 = -x0;
	       for (j = 0; j < 2; ++j) {
		    y0 = -y0;
		    for (k = 0; k < 2; ++k) {
			 z0 = -z0;
			 x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
		    }
	       }
	  }
	  
	  x0 = y0 = sqrt(1.0 / 11.0); z0 = 3 * x0;
	  w = 14641 / 725760.0;
	  for (i = 0; i < 2; ++i) {
	       x0 = -x0;
	       for (j = 0; j < 2; ++j) {
		    y0 = -y0;
		    for (k = 0; k < 2; ++k) {
			 z0 = -z0;
			 for (l = 0; l < 3; ++l) {
			      SHIFT3(x0,y0,z0);
			      x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
			 }
		    }
	       }
	  }
     }  
     else if (num_sq_pts == 72 || num_sq_pts == 12) {
	  /* As above (same references), but with a 72-point 14th degree
	     formula, this time with the symmetry group of an icosohedron.
	     (Stroud formula number U3:14-1.)  Alternatively, just use
	     the 12-point 5th degree formula consisting of the vertices
	     of a regular icosohedron. */
	  
	  /* first, the vertices of an icosohedron: */
	  x0 = sqrt(0.5 - sqrt(0.05));
	  y0 = sqrt(0.5 + sqrt(0.05));
	  z0 = 0;
	  if (num_sq_pts == 72)
	       w = 125 / 10080.0;
	  else
	       w = 1 / 12.0;
	  for (i = 0; i < 2; ++i) {
	       x0 = -x0;
	       for (j = 0; j < 2; ++j) {
		    y0 = -y0;
		    for (k = 0; k < 3; ++k) {
			 SHIFT3(x0,y0,z0);
			 x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
		    }
	       }
	  }
	  
	  if (num_sq_pts == 72) {
	       /* it would be nice, for completeness, to have more
                  digits here: */
	       real coords[3][5] = {
		    { -0.151108275, 0.315838353, 0.346307112, -0.101808787, -0.409228403 },
		    { 0.155240600, 0.257049387, 0.666277790,  0.817386065, 0.501547712 },
		    { 0.976251323, 0.913330032, 0.660412970,  0.567022920, 0.762221757 }
	       };
	       
	       w = 143 / 10080.0;
	       for (l = 0; l < 5; ++l) {
		    x0 = coords[0][l]; y0 = coords[1][l]; z0 = coords[2][l];
		    for (i = 0; i < 3; ++i) {
			 double dummy = x0;
			 x0 = z0;
			 z0 = -y0;
			 y0 = -dummy;
			 for (j = 0; j < 3; ++j) {
			      SHIFT3(x0,y0,z0);
			      x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
			 }
			 y0 = -y0;
			 z0 = -z0;
			 x[n] = x0; y[n] = y0; z[n] = z0; weight[n++] = w;
		    }
	       }
	  }
     }
     else
	  mpi_die("spherical_quadrature_points: passed unknown # points!");

     CHECK(n == num_sq_pts,
	   "bug in spherical_quadrature_points: wrong number of points!");
}

#define K_PI 3.141592653589793238462643383279502884197
#define SMALL 1.0e-6
#define MAX2(a,b) ((a) > (b) ? (a) : (b))
#define MIN2(a,b) ((a) < (b) ? (a) : (b))

#define NUM_SQ_PTS 50  /* number of spherical quadrature points to use */
#define MAX_MOMENT_MESH NUM_SQ_PTS /* max # of moment-mesh vectors */
#define MOMENT_MESH_R 0.5

/* A function to set up the mesh given the grid dimensions, mesh size,
   and lattice/reciprocal vectors.  (Any mesh sizes < 1 are taken to
   be 1.)  The returned values are:

   mesh_center: the center of the mesh, relative to the integer
                mesh coordinates; e.g. the mesh_center for a 3x3x3
                mesh is the point (1,1,1). 
   mesh_prod: the product of the mesh sizes.
   moment_mesh: an array of size_moment_mesh vectors, in lattice
                coordinates, of a "spherically-symmetric" mesh of
                points centered on the origin, designed to be
		used for averaging the first moment of epsilon at 
		a grid point (for finding the local surface normal).
   moment_mesh_weights: an array of size_moment_mesh weights to multiply
                        the integrand values by.  */
static void get_mesh(int nx, int ny, int nz, const int mesh_size[3],
		     real R[3][3], real G[3][3],
		     real mesh_center[3], int *mesh_prod,
		     real moment_mesh[MAX_MOMENT_MESH][3],
		     real moment_mesh_weights[MAX_MOMENT_MESH],
		     int *size_moment_mesh)
{
     int i,j;
     real min_diam = 1e20;
     real mesh_total[3] = { 0, 0, 0 };
     int rank = nz > 1 ? 3 : (ny > 1 ? 2 : 1);
     real weight_sum = 0.0;
	  
     *mesh_prod = 1;
     for (i = 0; i < 3; ++i) {
	  int ms = MAX2(mesh_size[i], 1);
	  mesh_center[i] = (ms - 1) * 0.5;
	  *mesh_prod *= ms;
     }

     /* Set the mesh to compute the "dipole moment" of epsilon over,
	in Cartesian coordinates (used to compute the normal vector
	at surfaces).  Ideally, a spherically-symmetrical distribution
	of points on the radius MOMENT_MESH_R sphere. */
     if (rank == 1) {
	  /* one-dimensional: just two points (really, normal
	     vector is always along x): */
	  *size_moment_mesh = 2;
	  moment_mesh[0][0] = -MOMENT_MESH_R;
	  moment_mesh[0][1] = 0.0;
	  moment_mesh[0][2] = 0.0;
	  moment_mesh[1][0] = MOMENT_MESH_R;
	  moment_mesh[1][1] = 0.0;
	  moment_mesh[1][2] = 0.0;
	  moment_mesh_weights[0] = moment_mesh_weights[1] = 0.5;
     }
     else if (rank == 2) {
	  /* two-dimensional: 12 points at 30-degree intervals: */
	  *size_moment_mesh = 12;
	  for (i = 0; i < *size_moment_mesh; ++i) {
	       moment_mesh[i][0] =
		    cos(2*i * K_PI / *size_moment_mesh) * MOMENT_MESH_R;
	       moment_mesh[i][1] =
		    sin(2*i * K_PI / *size_moment_mesh) * MOMENT_MESH_R;
	       moment_mesh[i][2] = 0.0;
	       moment_mesh_weights[i] = 1.0 / *size_moment_mesh;
	  }
     }
     else {
	  real x[NUM_SQ_PTS], y[NUM_SQ_PTS], z[NUM_SQ_PTS];

	  *size_moment_mesh = NUM_SQ_PTS;
	  spherical_quadrature_points(x,y,z, moment_mesh_weights, NUM_SQ_PTS);
	  for (i = 0; i < *size_moment_mesh; ++i) {
	       moment_mesh[i][0] = x[i] * MOMENT_MESH_R;
	       moment_mesh[i][1] = y[i] * MOMENT_MESH_R;
	       moment_mesh[i][2] = z[i] * MOMENT_MESH_R;
	  }
     }

     CHECK(*size_moment_mesh <= MAX_MOMENT_MESH, "yikes, moment mesh too big");

     for (i = 0; i < *size_moment_mesh; ++i)
	  weight_sum += moment_mesh_weights[i];
     CHECK(fabs(weight_sum - 1.0) < SMALL, "bug, incorrect moment weights");

     /* scale the moment-mesh vectors so that the sphere has a
	diameter of 2*MOMENT_MESH_R times the diameter of the
	smallest grid direction: */

     /* first, find the minimum distance between grid points along the
	lattice directions (should we use the maximum instead?): */
     for (i = 0; i < rank; ++i) {
	  real ri = R[i][0] * R[i][0] + R[i][1] * R[i][1] + R[i][2] * R[i][2];
	  ri = sqrt(ri) / (i == 0 ? nx : (i == 1 ? ny : nz));
	  min_diam = MIN2(min_diam, ri);
     }
     
     /* scale moment_mesh by this diameter: */
     for (i = 0; i < *size_moment_mesh; ++i) {
	  real len = 0;
	  for (j = 0; j < 3; ++j) {
	       moment_mesh[i][j] *= min_diam;
	       len += moment_mesh[i][j] * moment_mesh[i][j];
	       mesh_total[j] += moment_mesh[i][j];
	  }
	  CHECK(fabs(len - min_diam*min_diam*(MOMENT_MESH_R*MOMENT_MESH_R)) 
		< SMALL,
		"bug in get_mesh: moment_mesh vector is wrong length");
     }
     CHECK(fabs(mesh_total[0]) + fabs(mesh_total[1]) + fabs(mesh_total[2])
	   < SMALL, "bug in get_mesh: moment_mesh does not average to zero");

     /* Now, convert the moment_mesh vectors to lattice/grid coordinates;
        to do this, we multiply by the G matrix (inverse of R transposed) */
     for (i = 0; i < *size_moment_mesh; ++i) {
	  real v[3];
	  for (j = 0; j < 3; ++j)    /* make a copy of moment_mesh[i] */
	       v[j] = moment_mesh[i][j];
	  for (j = 0; j < 3; ++j)
	       moment_mesh[i][j] = G[j][0]*v[0] + G[j][1]*v[1] + G[j][2]*v[2];
     }
}

/**************************************************************************/

/* The following function initializes the dielectric tensor md->eps_inv,
   using the dielectric function epsilon(&eps, &eps_inv, r, epsilon_data).

   epsilon is averaged over a rectangular mesh spanning the space between
   grid points; the size of the mesh is given by mesh_size.

   R[0..2] are the spatial lattice vectors, and are used to convert
   the discretization grid into spatial coordinates (with the origin
   at the (0,0,0) grid element.

   In most places, the dielectric tensor is equal to eps_inv, but at
   dielectric interfaces it varies depending upon the polarization of
   the field (for faster convergence).  In particular, it depends upon
   the direction of the field relative to the surface normal vector,
   so we must compute the latter.  The surface normal is approximated
   by the "dipole moment" of the dielectric function over a spherical
   mesh.

   Implementation note: md->eps_inv is chosen to have dimensions matching
   the output of the FFT.  Thus, its dimensions depend upon whether we are
   doing a real or complex and serial or parallel FFT. */

void set_maxwell_dielectric(maxwell_data *md,
			    const int mesh_size[3],
			    real R[3][3], real G[3][3],
			    maxwell_dielectric_function epsilon,
			    void *epsilon_data)
{
     real s1, s2, s3, m1, m2, m3;  /* grid/mesh steps */
     real mesh_center[3];
     real moment_mesh[MAX_MOMENT_MESH][3];
     real moment_mesh_weights[MAX_MOMENT_MESH];
     real eps_inv_total = 0.0;
     int i, j, k;
     int mesh_prod;
     real mesh_prod_inv;
     int size_moment_mesh = 0;
     int n1, n2, n3;
#ifdef HAVE_MPI
     int local_n2, local_y_start, local_n3;
#endif
#ifndef SCALAR_COMPLEX
     int n_other, n_last, rank;
#endif

     n1 = md->nx; n2 = md->ny; n3 = md->nz;

     get_mesh(n1, n2, n3, mesh_size, R, G, 
	      mesh_center, &mesh_prod, moment_mesh, moment_mesh_weights,
	      &size_moment_mesh);
     mesh_prod_inv = 1.0 / mesh_prod;

#ifdef DEBUG
     mpi_one_printf("Using moment mesh (%d):\n", size_moment_mesh);
     for (i = 0; i < size_moment_mesh; ++i)
	  mpi_one_printf("   (%g, %g, %g) (%g)\n",
		 moment_mesh[i][0], moment_mesh[i][1], moment_mesh[i][2],
		 moment_mesh_weights[i]);
#endif

     s1 = 1.0 / n1;
     s2 = 1.0 / n2;
     s3 = 1.0 / n3;
     m1 = s1 / MAX2(1, mesh_size[0] - 1);
     m2 = s2 / MAX2(1, mesh_size[1] - 1);
     m3 = s3 / MAX2(1, mesh_size[2] - 1);

     /* Here we have different loops over the coordinates, depending
	upon whether we are using complex or real and serial or
        parallel transforms.  Each loop must define, in its body,
        variables (i2,j2,k2) describing the coordinate of the current
        point, and eps_index describing the corresponding index in 
	the array md->eps_inv[]. */

#ifdef SCALAR_COMPLEX

#  ifndef HAVE_MPI
     
     for (i = 0; i < n1; ++i)
	  for (j = 0; j < n2; ++j)
	       for (k = 0; k < n3; ++k)
     {
#         define i2 i
#         define j2 j
#         define k2 k
	  int eps_index = ((i * n2 + j) * n3 + k);

#  else /* HAVE_MPI */

     local_n2 = md->local_ny;
     local_y_start = md->local_y_start;

     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
	       for (k = 0; k < n3; ++k)
     {
#         define i2 i
	  int j2 = j + local_y_start;
#         define k2 k
	  int eps_index = ((j * n1 + i) * n3 + k);

#  endif /* HAVE_MPI */

#else /* not SCALAR_COMPLEX */

#  ifndef HAVE_MPI

     n_other = md->other_dims;
     n_last = md->last_dim_size / 2;
     rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3;

     for (i = 0; i < n_other; ++i)
	  for (j = 0; j < n_last; ++j)
     {
	  int eps_index = i * n_last + j;
	  int i2, j2, k2;
	  switch (rank) {
	      case 2: i2 = i; j2 = j; k2 = 0; break;
	      case 3: i2 = i / n2; j2 = i % n2; k2 = j; break;
	      default: i2 = j; j2 = k2 = 0;  break;
	  }

#  else /* HAVE_MPI */

     local_n2 = md->local_ny;
     local_y_start = md->local_y_start;

     /* For a real->complex transform, the last dimension is cut in
	half.  For a 2d transform, this is taken into account in local_ny
	already, but for a 3d transform we must compute the new n3: */
     if (n3 > 1)
	  local_n3 = md->last_dim_size / 2;
     else
	  local_n3 = 1;
     
     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
	       for (k = 0; k < local_n3; ++k)
     {
#         define i2 i
	  int j2 = j + local_y_start;
#         define k2 k
	  int eps_index = ((j * n1 + i) * local_n3 + k);

#  endif  /* HAVE_MPI */

#endif /* not SCALAR_COMPLEX */

     {
	  int mi, mj, mk;
#ifdef WITH_HERMITIAN_EPSILON
	  symmetric_matrix eps_mean = {0,0,0,{0,0},{0,0},{0,0}}, 
			   eps_inv_mean = {0,0,0,{0,0},{0,0},{0,0}},
			   eps_mean_inv;
#else
	  symmetric_matrix eps_mean = {0,0,0,0,0,0}, 
			   eps_inv_mean = {0,0,0,0,0,0}, eps_mean_inv;
#endif
	  real norm_len;
	  real norm0, norm1, norm2;
	  short means_different_p, diag_eps_p;
	  
	  for (mi = 0; mi < mesh_size[0]; ++mi)
	       for (mj = 0; mj < mesh_size[1]; ++mj)
		    for (mk = 0; mk < mesh_size[2]; ++mk) {
			 real r[3];
			 symmetric_matrix eps, eps_inv;
			 r[0] = i2 * s1 + (mi - mesh_center[0]) * m1;
			 r[1] = j2 * s2 + (mj - mesh_center[1]) * m2;
			 r[2] = k2 * s3 + (mk - mesh_center[2]) * m3;
			 epsilon(&eps, &eps_inv, r, epsilon_data);
			 eps_mean.m00 += eps.m00;
			 eps_mean.m11 += eps.m11;
			 eps_mean.m22 += eps.m22;
			 eps_inv_mean.m00 += eps_inv.m00;
			 eps_inv_mean.m11 += eps_inv.m11;
			 eps_inv_mean.m22 += eps_inv.m22;
#ifdef WITH_HERMITIAN_EPSILON
			 CACCUMULATE_SUM(eps_mean.m01, eps.m01);
			 CACCUMULATE_SUM(eps_mean.m02, eps.m02);
			 CACCUMULATE_SUM(eps_mean.m12, eps.m12);
			 CACCUMULATE_SUM(eps_inv_mean.m01, eps_inv.m01);
			 CACCUMULATE_SUM(eps_inv_mean.m02, eps_inv.m02);
			 CACCUMULATE_SUM(eps_inv_mean.m12, eps_inv.m12);
#else
			 eps_mean.m01 += eps.m01;
			 eps_mean.m02 += eps.m02;
			 eps_mean.m12 += eps.m12;
			 eps_inv_mean.m01 += eps_inv.m01;
			 eps_inv_mean.m02 += eps_inv.m02;
			 eps_inv_mean.m12 += eps_inv.m12;
#endif
		    }
     
	  diag_eps_p = DIAG_SYMMETRIC_MATRIX(eps_mean);
	  if (diag_eps_p) { /* handle the common case of diagonal matrices: */
	       eps_mean_inv.m00 = mesh_prod / eps_mean.m00;
	       eps_mean_inv.m11 = mesh_prod / eps_mean.m11;
	       eps_mean_inv.m22 = mesh_prod / eps_mean.m22;
#ifdef WITH_HERMITIAN_EPSILON
	       CASSIGN_ZERO(eps_mean_inv.m01);
	       CASSIGN_ZERO(eps_mean_inv.m02);
	       CASSIGN_ZERO(eps_mean_inv.m12);
#else
	       eps_mean_inv.m01 = eps_mean_inv.m02 = eps_mean_inv.m12 = 0.0;
#endif
	       eps_inv_mean.m00 *= mesh_prod_inv;
	       eps_inv_mean.m11 *= mesh_prod_inv;
	       eps_inv_mean.m22 *= mesh_prod_inv;

	       means_different_p = 
		    fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL ||
		    fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL ||
		    fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL;
	  }
	  else {
	       eps_inv_mean.m00 *= mesh_prod_inv;
	       eps_inv_mean.m11 *= mesh_prod_inv;
	       eps_inv_mean.m22 *= mesh_prod_inv;
	       eps_mean.m00 *= mesh_prod_inv;
	       eps_mean.m11 *= mesh_prod_inv;
	       eps_mean.m22 *= mesh_prod_inv;
#ifdef WITH_HERMITIAN_EPSILON
	       eps_mean.m01.re *= mesh_prod_inv;
	       eps_mean.m01.im *= mesh_prod_inv;
	       eps_mean.m02.re *= mesh_prod_inv;
	       eps_mean.m02.im *= mesh_prod_inv;
	       eps_mean.m12.re *= mesh_prod_inv;
	       eps_mean.m12.im *= mesh_prod_inv;
	       eps_inv_mean.m01.re *= mesh_prod_inv;
	       eps_inv_mean.m01.im *= mesh_prod_inv;
	       eps_inv_mean.m02.re *= mesh_prod_inv;
	       eps_inv_mean.m02.im *= mesh_prod_inv;
	       eps_inv_mean.m12.re *= mesh_prod_inv;
	       eps_inv_mean.m12.im *= mesh_prod_inv;
#else
	       eps_mean.m01 *= mesh_prod_inv;
	       eps_mean.m02 *= mesh_prod_inv;
	       eps_mean.m12 *= mesh_prod_inv;
	       eps_inv_mean.m01 *= mesh_prod_inv;
	       eps_inv_mean.m02 *= mesh_prod_inv;
	       eps_inv_mean.m12 *= mesh_prod_inv;
#endif
	       maxwell_sym_matrix_invert(&eps_mean_inv, &eps_mean);
	       
	       means_different_p = 
		    fabs(eps_mean_inv.m00 - eps_inv_mean.m00) > SMALL ||
		    fabs(eps_mean_inv.m11 - eps_inv_mean.m11) > SMALL ||
		    fabs(eps_mean_inv.m22 - eps_inv_mean.m22) > SMALL;
#ifdef WITH_HERMITIAN_EPSILON
	       means_different_p = means_different_p ||
		    fabs(eps_mean_inv.m01.re - eps_inv_mean.m01.re) > SMALL ||
		    fabs(eps_mean_inv.m02.re - eps_inv_mean.m02.re) > SMALL ||
		    fabs(eps_mean_inv.m12.re - eps_inv_mean.m12.re) > SMALL ||
		    fabs(eps_mean_inv.m01.im - eps_inv_mean.m01.im) > SMALL ||
		    fabs(eps_mean_inv.m02.im - eps_inv_mean.m02.im) > SMALL ||
		    fabs(eps_mean_inv.m12.im - eps_inv_mean.m12.im) > SMALL;
#else
	       means_different_p = means_different_p ||
		    fabs(eps_mean_inv.m01 - eps_inv_mean.m01) > SMALL ||
		    fabs(eps_mean_inv.m02 - eps_inv_mean.m02) > SMALL ||
		    fabs(eps_mean_inv.m12 - eps_inv_mean.m12) > SMALL;
#endif
	  }

	  /* if the two averaging methods yielded different results,
	     which usually happens if epsilon is not constant, then
	     we need to find the normal vector to the dielectric interface: */
	  if (means_different_p) {
	       real moment0 = 0, moment1 = 0, moment2 = 0;

	       for (mi = 0; mi < size_moment_mesh; ++mi) {
		    real r[3], eps_trace;
		    symmetric_matrix eps, eps_inv;
		    r[0] = i2 * s1 + moment_mesh[mi][0];
		    r[1] = j2 * s2 + moment_mesh[mi][1];
		    r[2] = k2 * s3 + moment_mesh[mi][2];
		    epsilon(&eps, &eps_inv, r, epsilon_data);
		    eps_trace = eps.m00 + eps.m11 + eps.m22;
		    eps_trace *= moment_mesh_weights[mi];
		    moment0 += eps_trace * moment_mesh[mi][0];
		    moment1 += eps_trace * moment_mesh[mi][1];
		    moment2 += eps_trace * moment_mesh[mi][2];
	       }
	       /* need to convert moment from lattice to cartesian coords: */
	       norm0 = R[0][0]*moment0 + R[1][0]*moment1 + R[2][0]*moment2;
	       norm1 = R[0][1]*moment0 + R[1][1]*moment1 + R[2][1]*moment2;
	       norm2 = R[0][2]*moment0 + R[1][2]*moment1 + R[2][2]*moment2;
	  
	       norm_len = sqrt(norm0*norm0 + norm1*norm1 + norm2*norm2);
	  }
	  
	  if (means_different_p && norm_len > SMALL) {
	       real x0, x1, x2;

	       norm_len = 1.0/norm_len;
	       norm0 *= norm_len;
	       norm1 *= norm_len;
	       norm2 *= norm_len;

	       /* Compute the effective inverse dielectric tensor.
		  We define this as:
                     1/2 ( {eps_inv_mean, P} + {eps_mean_inv, 1-P} )
	          where P is the projection matrix onto the normal direction
                  (P = norm ^ norm), and {a,b} is the anti-commutator ab+ba.
		   = 1/2 {eps_inv_mean - eps_mean_inv, P} + eps_mean_inv
		   = 1/2 (n_i conj(x_j) + x_i n_j) + (eps_mean_inv)_ij
		  where n_k is the kth component of the normal vector and 
		     x_i = (eps_inv_mean - eps_mean_inv)_ik n_k  
		  Note the implied summations (Einstein notation).

		  Note that the resulting matrix is symmetric, and we get just
		  eps_inv_mean if eps_inv_mean == eps_mean_inv, as desired.

	          Note that P is idempotent, so for scalar epsilon this
	          is just eps_inv_mean * P + eps_mean_inv * (1-P)
                        = (1/eps_inv_mean * P + eps_mean * (1-P)) ^ (-1),
	          which corresponds to the expression in the Meade paper. */
	       
	       x0 = (eps_inv_mean.m00 - eps_mean_inv.m00) * norm0;
	       x1 = (eps_inv_mean.m11 - eps_mean_inv.m11) * norm1;
	       x2 = (eps_inv_mean.m22 - eps_mean_inv.m22) * norm2;
	       if (diag_eps_p) {
#ifdef WITH_HERMITIAN_EPSILON
		    md->eps_inv[eps_index].m01.re = 0.5*(x0*norm1 + x1*norm0);
		    md->eps_inv[eps_index].m01.im = 0.0;
		    md->eps_inv[eps_index].m02.re = 0.5*(x0*norm2 + x2*norm0);
		    md->eps_inv[eps_index].m02.im = 0.0;
		    md->eps_inv[eps_index].m12.re = 0.5*(x1*norm2 + x2*norm1);
		    md->eps_inv[eps_index].m12.im = 0.0;
#else
		    md->eps_inv[eps_index].m01 = 0.5*(x0*norm1 + x1*norm0);
		    md->eps_inv[eps_index].m02 = 0.5*(x0*norm2 + x2*norm0);
		    md->eps_inv[eps_index].m12 = 0.5*(x1*norm2 + x2*norm1);
#endif
	       }
	       else {
#ifdef WITH_HERMITIAN_EPSILON
		    real x0i, x1i, x2i;
		    x0 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re)*norm1 + 
			   (eps_inv_mean.m02.re - eps_mean_inv.m02.re)*norm2);
		    x1 += ((eps_inv_mean.m01.re - eps_mean_inv.m01.re)*norm0 + 
			   (eps_inv_mean.m12.re - eps_mean_inv.m12.re)*norm2);
		    x2 += ((eps_inv_mean.m02.re - eps_mean_inv.m02.re)*norm0 +
			   (eps_inv_mean.m12.re - eps_mean_inv.m12.re)*norm1);
		    x0i = ((eps_inv_mean.m01.im - eps_mean_inv.m01.im)*norm1 + 
			   (eps_inv_mean.m02.im - eps_mean_inv.m02.im)*norm2);
		    x1i = (-(eps_inv_mean.m01.im - eps_mean_inv.m01.im)*norm0+ 
			   (eps_inv_mean.m12.im - eps_mean_inv.m12.im)*norm2);
		    x2i = -((eps_inv_mean.m02.im - eps_mean_inv.m02.im)*norm0 +
			    (eps_inv_mean.m12.im - eps_mean_inv.m12.im)*norm1);

		    md->eps_inv[eps_index].m01.re = (0.5*(x0*norm1 + x1*norm0) 
						     + eps_mean_inv.m01.re);
		    md->eps_inv[eps_index].m02.re = (0.5*(x0*norm2 + x2*norm0) 
						     + eps_mean_inv.m02.re);
		    md->eps_inv[eps_index].m12.re = (0.5*(x1*norm2 + x2*norm1) 
						     + eps_mean_inv.m12.re);
		    md->eps_inv[eps_index].m01.im = (0.5*(x0i*norm1-x1i*norm0) 
						     + eps_mean_inv.m01.im);
		    md->eps_inv[eps_index].m02.im = (0.5*(x0i*norm2-x2i*norm0) 
						     + eps_mean_inv.m02.im);
		    md->eps_inv[eps_index].m12.im = (0.5*(x1i*norm2-x2i*norm1) 
						     + eps_mean_inv.m12.im);
#else
		    x0 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm1 + 
			   (eps_inv_mean.m02 - eps_mean_inv.m02) * norm2);
		    x1 += ((eps_inv_mean.m01 - eps_mean_inv.m01) * norm0 + 
			   (eps_inv_mean.m12 - eps_mean_inv.m12) * norm2);
		    x2 += ((eps_inv_mean.m02 - eps_mean_inv.m02) * norm0 +
			   (eps_inv_mean.m12 - eps_mean_inv.m12) * norm1);

		    md->eps_inv[eps_index].m01 = (0.5*(x0*norm1 + x1*norm0) 
						  + eps_mean_inv.m01);
		    md->eps_inv[eps_index].m02 = (0.5*(x0*norm2 + x2*norm0) 
						  + eps_mean_inv.m02);
		    md->eps_inv[eps_index].m12 = (0.5*(x1*norm2 + x2*norm1) 
						  + eps_mean_inv.m12);
#endif
	       }
	       md->eps_inv[eps_index].m00 = x0*norm0 + eps_mean_inv.m00;
	       md->eps_inv[eps_index].m11 = x1*norm1 + eps_mean_inv.m11;
	       md->eps_inv[eps_index].m22 = x2*norm2 + eps_mean_inv.m22;
	  }
	  else { /* undetermined normal vector and/or constant eps */
	       md->eps_inv[eps_index] = eps_mean_inv;
	  }
	  
	  eps_inv_total += (md->eps_inv[eps_index].m00 + 
			    md->eps_inv[eps_index].m11 + 
			    md->eps_inv[eps_index].m22);
     }}  /* end of loop body */

     mpi_allreduce_1(&eps_inv_total, real, SCALAR_MPI_TYPE,
		     MPI_SUM, MPI_COMM_WORLD);
     n1 = md->fft_output_size;
     mpi_allreduce_1(&n1, int, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
     md->eps_inv_mean = eps_inv_total / (3 * n1);
}