/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

/****************************************************************/
/*                       Routines for D                         */

/* D(i) is the basic operation of "EVALUATING" the derivative of a
   function in the ith direction on all points of the realspace grid
   given the function's coefficient space representation.

   Ddag(i) is it's Hermitian conjugate in the sense of identification
   of real space points with their ghosts.
*/

#include <stdio.h>
#include <math.h>
#include "WL_header.h"

/* 3d array addressing short-hand */
#define DAT(a,x,y,z,Nx,Ny,Nz) ( (a)->dat[((x)*Ny+(y))*Nz+(z)] )

void Dacc(struct Grid *self)
{
  /* Indices */
  int x,y,z;

  /* Go to bottom first */
  if (self->child)
    Dacc(self->child);

  /* Do all siblings BEFORE coming up levels */
  if (self->sibling)
    Dacc(self->sibling);

  /* Accumulate ghost data up to parent (if there is one!) */
  if (self->parent) {
   /* Convenient navigational parameters */
   int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;
   int Npx=self->parent->dim.x,Npy=self->parent->dim.y,Npz=self->parent->dim.z;

    for (x=0; x<self->dim.x/2; x++)
      for (y=0; y<self->dim.y/2; y++)
	for (z=0; z<self->dim.z/2; z++) 
	  self->
	    parent->
	    dat[((self->org.x+x)*Npy+(self->org.y+y))*Npz+(self->org.z+z) ]
	    += self->dat[(2*x*Ny+2*y)*Nz+2*z];
  }
}

/* Basic convolution kernel
   Convolves multiple data sets (points, lines, or planes).  Output stored
   contiguously as a three dimensional object; navigation through
   input accomplished through various strides.

   output: pointer to start of output array
   input: pointer to element in input array which corresponds to origin of
       output array
   periodic: tells whether input is to be treated as periodic (no/yes=0/1)
   sign: 0 (write into output), -1 (subtract from output), 1 (add to output)
   c: convolution coefficients for 0, 1, 2, ...
   hfl: half-filter length, extent of filter on either side beyond zero 
   symm: reflection symmetry of filter, +/-1 

   N: number of objects to be produced in output
   size: size of objects (points, lines, planes) convolved together
   stride_obj: stride between objects in input

   convs: number of convolutions to perform
   stride_conv: stride between starting convolution locations in output

   Notes:
     For convolutions along z we have the exploitable special case of
     stride_obj=1, size=1.

     In general, we have for convolutions along direction i
       N=N_i
       size=prod_{j>i} N_j
       stride_obj=prod_{j>i} N_j
       convs=prod_{j<i} N_j
       stride_conv=prod_{j>=i} N_j
*/
void convolve(double *output,double *input,int periodic,int sign,
	      double *c,int hfl,int symm,
	      int N,int size,int stride_obj,int convs,int stride_conv)
{
  int ic,n,el;

  if (sign==1) {
    printf("sign=1 not supported in convolve()!!!\n");
    exit(1);
  }

  if (sign==0) {
    /* Special case for optimization */
    if (symm==-1) {
      switch (hfl) {
      case 2:
	for (ic=0; ic<convs; ic++) {
	  n=0;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1+N)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]);

	  n=1;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]);

	  for (n=2; n<N-2; n++)
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2)*stride_obj+el]);
      
	  n=N-2;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])-
		c[2]*(input[(n-2)*stride_obj+el]);

	  n=N-1;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)=
		c[1]*(input[(n+1-N)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)=-
		c[1]*(input[(n-1)*stride_obj+el])-
		c[2]*(input[(n-2)*stride_obj+el]);

	  input+=stride_conv;
	}
	break;
      default:
	printf("hfl=%d not supported for convolve()\n");
	exit(1);
      }

      return;
    }

    /* General case */
    switch (hfl) {
    case 2:
      for (ic=0; ic<convs; ic++) {
	n=0;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1+N)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2+N)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]
		    )+
	      c[2]*(input[(n+2)*stride_obj+el]
		    );

	n=1;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2+N)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]
		    );

	for (n=2; n<N-2; n++)
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
      
	n=N-2;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2-N)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(
		    symm*input[(n-2)*stride_obj+el]);

	n=N-1;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1-N)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2-N)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(
		    symm*input[(n-2)*stride_obj+el]);

	input+=stride_conv;
      }
      break;
    default:
      printf("hfl=%d not supported for convolve()\n");
      exit(1);
    }
    return;
  }

  if (sign==-1) {
    /* Special case for optimization */
    if (symm==-1) {
      switch (hfl) {
      case 2:
	for (ic=0; ic<convs; ic++) {
	  n=0;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1+N)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]);

	  n=1;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]);

	  for (n=2; n<N-2; n++)
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2)*stride_obj+el]-input[(n-2)*stride_obj+el]);
      
	  n=N-2;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])-
		c[2]*(input[(n-2)*stride_obj+el]);

	  n=N-1;
	  if (periodic)
	    for (el=0; el<size; el++)
	      *(output++)-=
		c[1]*(input[(n+1-N)*stride_obj+el]-input[(n-1)*stride_obj+el])+
		c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
	  else
	    for (el=0; el<size; el++)
	      *(output++)-=-
		c[1]*(input[(n-1)*stride_obj+el])-
		c[2]*(input[(n-2)*stride_obj+el]);

	  input+=stride_conv;
	}
	break;
      default:
	printf("hfl=%d not supported for convolve()\n");
	exit(1);
      }

      return;
    }

    /* General case */
    switch (hfl) {
    case 2:
      for (ic=0; ic<convs; ic++) {
	n=0;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1+N)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2+N)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]
		    )+
	      c[2]*(input[(n+2)*stride_obj+el]
		    );

	n=1;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2+N)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]
		    );

	for (n=2; n<N-2; n++)
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
      
	n=N-2;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2-N)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(
		    symm*input[(n-2)*stride_obj+el]);

	n=N-1;
	if (periodic)
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(input[(n+1-N)*stride_obj+el]+
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(input[(n+2-N)*stride_obj+el]+
		    symm*input[(n-2)*stride_obj+el]);
	else
	  for (el=0; el<size; el++)
	    *(output++)-=
	      c[0]*(input[n*stride_obj+el])+
	      c[1]*(
		    symm*input[(n-1)*stride_obj+el])+
	      c[2]*(
		    symm*input[(n-2)*stride_obj+el]);

	input+=stride_conv;
      }
      break;
    default:
      printf("hfl=%d not supported for convolve()\n");
      exit(1);
    }
    return;
  }

  printf("Unrecognized case in convolve()!\n");
  exit(1);
}


void Dsame(int d,struct Grid *out, struct Grid *in)
{
  int x,y,z;

  /* Convenient aliases for important quantities */
  struct Ivec dim=in->dim;

  /* For getting appropriate filter scale */
  double d1[1+HFL_D1],D;

  int sign,symm,N,size,stride_obj,convs,stride_conv;

  /* Make version of filter scaled to appropo direction */
  switch (d) {
  case 0:
    D=1./in->scale.x;
    break;
  case 1:
    D=1./in->scale.y;
    break;
  case 2:
    D=1./in->scale.z;
    break;
  default:
    printf("Illegal direction d=%d in Dsame!\n",d);
    exit(1);
  }
  for (x=0; x<=HFL_D1; x++) d1[x]=D1[x]*D;

  switch (d) {
  case 2:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along z */
    convolve(out->dat,in->dat,in->ifperiodic,sign=0,
	     d1,HFL_D1,symm=-1,
	     N=dim.z,size=1,stride_obj=1,
	     convs=dim.x*dim.y,stride_conv=dim.z);
    break;
  case 1:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along y */
    convolve(out->dat,in->dat,in->ifperiodic,sign=0,
	     d1,HFL_D1,symm=-1,
	     N=dim.y,size=dim.z,stride_obj=dim.z,
	     convs=dim.x,stride_conv=dim.y*dim.z);
    break;
  case 0:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along x */
    convolve(out->dat,in->dat,in->ifperiodic,sign=0,
	     d1,HFL_D1,symm=-1,
	     N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
	     convs=1,stride_conv=dim.x*dim.y*dim.z);
    break;
  default:
    printf("Illegal direction d=%d in Dsame!\n");
    exit(1);
  }
    
  /* Continue on to all subgrids */      
  if (in->child) 
    Dsame(d,out->child,in->child);

  if (in->sibling) 
    Dsame(d,out->sibling,in->sibling);

  /* This should have the same effect as calling Dacc after Dsame, but
     only want this version for Dabove and not Dbelow, so simpler
     coding to just call Dacc explicitly.  Also, the explict call to
     Dacc doesn't seem to take much time (a few percent at most!). */

/*    if (out->parent) { */
/*          int Npx=out->parent->dim.x,Npy=out->parent->dim.y,Npz=out->parent->dim.z; */
/*          for (x=0; x<dim.x/2; x++) */
/*            for (y=0; y<dim.y/2; y++) */
/*      	for (z=0; z<dim.z/2; z++)  */
/*      	  out-> */
/*      	    parent-> */
/*      	    dat[((out->org.x+x)*Npy+(out->org.y+y))*Npz+(out->org.z+z) ] */
/*      	    += out->dat[(2*x*dim.y+2*y)*dim.z+2*z]; */
/*        } */


}

/* Just like Dsame, but subtracts "decrements" result from out */
void decDsame(int d,struct Grid *out, struct Grid *in)
{
  int x,y,z;

  /* Convenient aliases for important quantities */
  struct Ivec dim=in->dim;

  /* For getting appropriate filter scale */
  double d1[1+HFL_D1],D;

  int sign,symm,N,size,stride_obj,convs,stride_conv;

  /* Make version of filter scaled to appropo direction */
  switch (d) {
  case 0:
    D=1./in->scale.x;
    break;
  case 1:
    D=1./in->scale.y;
    break;
  case 2:
    D=1./in->scale.z;
    break;
  default:
    printf("Illegal direction d=%d in Dsame!\n",d);
    exit(1);
  }
  for (x=0; x<=HFL_D1; x++) d1[x]=D1[x]*D;

  switch (d) {
  case 2:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along z */
    convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
	     d1,HFL_D1,symm=-1,
	     N=dim.z,size=1,stride_obj=1,
	     convs=dim.x*dim.y,stride_conv=dim.z);
    break;
  case 1:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along y */
    convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
	     d1,HFL_D1,symm=-1,
	     N=dim.y,size=dim.z,stride_obj=dim.z,
	     convs=dim.x,stride_conv=dim.y*dim.z);
    break;
  case 0:
    /* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along x */
    convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
	     d1,HFL_D1,symm=-1,
	     N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
	     convs=1,stride_conv=dim.x*dim.y*dim.z);
    break;
  default:
    printf("Illegal direction d=%d in Dsame!\n");
    exit(1);
  }
    
  /* Continue on to all subgrids */      
  if (in->child) 
    decDsame(d,out->child,in->child);

  if (in->sibling) 
    decDsame(d,out->sibling,in->sibling);

  /* This should have the same effect as calling Dacc after Dsame, but
     only want this version for Dabove and not Dbelow, so simpler
     coding to just call Dacc explicitly.  Also, the explict call to
     Dacc doesn't seem to take much time (a few percent at most!). */

/*    if (out->parent) { */
/*          int Npx=out->parent->dim.x,Npy=out->parent->dim.y,Npz=out->parent->dim.z; */
/*          for (x=0; x<dim.x/2; x++) */
/*            for (y=0; y<dim.y/2; y++) */
/*      	for (z=0; z<dim.z/2; z++)  */
/*      	  out-> */
/*      	    parent-> */
/*      	    dat[((out->org.x+x)*Npy+(out->org.y+y))*Npz+(out->org.z+z) ] */
/*      	    += out->dat[(2*x*dim.y+2*y)*dim.z+2*z]; */
/*        } */


}



/* Basic coarser-convolution kernel
   Coarser-convolves (creates output at twice the input density)
   multiple data sets (points, lines, or planes).  Output stored
   contiguously as a three dimensional object; navigation through
   input accomplished through various strides.

   output: pointer to start of output array
   ifacc: tells whether to accumulate results onto output (true) or
          to overwrite it (false)
   input: pointer to element in input array which corresponds to origin of
       output array
   c: convolution coefficients for 0, 1, 2, ...
   hfl: half-filter length, extend of filter on either side beyond zero 
   symm: reflection symmetry of filter, +/-1 

   N: number of objects to be produced in output
   size: size of objects (points, lines, planes) convolved together
   stride_obj: stride between objects in input

   convs: number of convolutions to perform
   stride_conv: stride between starting convolution locations in input

   Notes:
     For convolutions along z we have the exploitable special case of
     stride_obj=1, size=1.

     In general, we have for convolutions along direction i
       N=Nout_i
       size=prod_{j>i} Nout_j
       stride_obj=prod_{j>i} Nin_j
       convs=prod_{j<i} Nout_j
       stride_conv=prod_{j>=i} Nin_j
*/
void expand(double *output,int ifacc,double *input,double *c,int hfl,int symm,
	    int N,int size,int stride_obj,int convs,int stride_conv)
{
  int ic,n2,n,el;

  switch (hfl) {
  case 5:
    if (ifacc)
      for (ic=0; ic<convs; ic++) {
	for (n2=0; n2<N/2; n2++) {
	  for (el=0; el<size; el++)
	    *(output++)+=
	      c[0]*(input[n2*stride_obj+el])+
	      c[2]*(input[(n2+1)*stride_obj+el]+
		    symm*input[(n2-1)*stride_obj+el])+
	      c[4]*(input[(n2+2)*stride_obj+el]+
		    symm*input[(n2-2)*stride_obj+el]);
	  for (el=0; el<size; el++)
	    *(output++)+=
	      c[1]*(input[(n2+1)*stride_obj+el]+
		    symm*input[n2*stride_obj+el])+
	      c[3]*(input[(n2+2)*stride_obj+el]+
		    symm*input[(n2-1)*stride_obj+el])+
	      c[5]*(input[(n2+3)*stride_obj+el]+
		    symm*input[(n2-2)*stride_obj+el]);
	}
	input+=stride_conv;
      }
    else
      for (ic=0; ic<convs; ic++) {
	for (n2=0; n2<N/2; n2++) {
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[0]*(input[n2*stride_obj+el])+
	      c[2]*(input[(n2+1)*stride_obj+el]+
		    symm*input[(n2-1)*stride_obj+el])+
	      c[4]*(input[(n2+2)*stride_obj+el]+
		    symm*input[(n2-2)*stride_obj+el]);
	  for (el=0; el<size; el++)
	    *(output++)=
	      c[1]*(input[(n2+1)*stride_obj+el]+
		    symm*input[n2*stride_obj+el])+
	      c[3]*(input[(n2+2)*stride_obj+el]+
		    symm*input[(n2-1)*stride_obj+el])+
	      c[5]*(input[(n2+3)*stride_obj+el]+
		    symm*input[(n2-2)*stride_obj+el]);
	}
	input+=stride_conv;
      }
      
    break;
  default:
    printf("hfl=%d not supported for expand()\n");
    exit(1);
  }
}

/* Basic finer-convolution kernel Finer-convolves (creates output at
   half the input density) multiple objects (points, lines, or
   planes).  Input presume stored in memory continguously as a three
   dimensional object; navigation through output accomplished through
   various strides.

   output: pointer to element in output array which corresponds to
     origin of input array
   input: pointer to start of input array
   c: convolution coefficients for 0, 1, 2, ...
   hfl: half-filter length, extend of filter on either side beyond zero 
   symm: reflection symmetry of filter, +/-1 

   N: number of objects on finer scale
   size: size of objects (points, lines, planes) convolved together
   stride_obj: stride between objects in output

   convs: number of convolutions to perform
   stride_conv: stride between starting convolution locations in output

   Notes:
     For convolutions along z we have the exploitable special case of
     stride_obj=1, size=1.

     In general, we have for convolutions along direction i
       N=Nin_i
       size=prod_{j>i} Nin_j
       stride_obj=prod_{j>i} Nout_j
       convs=prod_{j<i} Nin_j
       stride_conv=prod_{j>=i} Nout_j
*/
void contract(double *output,double *input,double *c,int hfl,int symm,
	    int N,int size,int stride_obj,int convs,int stride_conv)
{
  int ic,n2,el;
  register double tmp;

  switch (hfl) {
  case 5:
    for (ic=0; ic<convs; ic++) {
      for (n2=0; n2<N/2; n2++) {
	for (el=0; el<size; el++) {
	  output[n2*stride_obj+el]+=c[0]*input[el];

	  output[(n2+1)*stride_obj+el]+=(tmp=c[2]*input[el]);
	  output[(n2-1)*stride_obj+el]+=symm*tmp;

	  output[(n2+2)*stride_obj+el]+=(tmp=c[4]*input[el]);
	  output[(n2-2)*stride_obj+el]+=symm*tmp;
	}
	input+=size;

	for (el=0; el<size; el++) {
	  output[(n2+1)*stride_obj+el]+=(tmp=c[1]*input[el]); 
	  output[n2*stride_obj+el]+=symm*tmp;

	  output[(n2+2)*stride_obj+el]+=(tmp=c[3]*input[el]);
	  output[(n2-1)*stride_obj+el]+=symm*tmp;

	  output[(n2+3)*stride_obj+el]+=(tmp=c[5]*input[el]);
	  output[(n2-2)*stride_obj+el]+=symm*tmp;
	}
	input+=size;
      }
      output+=stride_conv;
    }
    break;
  default:
    printf("hfl=%d not supported for contract()\n");
    exit(1);
  }
}

/* This operation is somewhat like Idag, but uses upward accumulation
   instead of copying.  Copying appears in the true Idag because of
   the identification in real space of ghost points with their
   parents.  For cases where Idag appears in the formal transpose of a
   product operation, the copying cuts off data flow paths necessary
   in the true conjugate, thus the need for this "Idag_flow".
   
   This code was "derived" basically by taking Obelowsimple (which
   faced a similar issue) and cutting out those parts referring to O.
*/

void Idag_flow( struct Grid *self,
		double *workspace1, double *workspace2, int lwork)
{
  int i,j,k;

  /*Same process on siblings*/
  if (self->sibling) 
    Idag_flow(self->sibling, workspace1, workspace2, lwork);
  
  /*Same process on children*/
  if (self->child) 
    Idag_flow(self->child, workspace1, workspace2, lwork);

  
  /* Here is the accumulating upward "flow" of data */
  if (self->level>0)
      for (i=0; i<self->dim.x; i+=2)
          for (j=0; j<self->dim.y; j+=2)
              for (k=0; k<self->dim.z; k+=2)
                  *( self->parent->dat + 
                     ( (self->org.x+i/2) * self->parent->dim.y + 
                       (self->org.y+j/2) ) * self->parent->dim.z +
                     (self->org.z+k/2) )
                      +=
                      *( self->dat+(i*self->dim.y+j)*self->dim.z+k );
  
  /*  Interlevel convolution with full set of Idag coefficients. */
  if ( (self->parent) && (self->child) ) {
    if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
	 *(self->dim.z+3*HFL_T) ) {
      fprintf(stderr,"Insufficient work space in Idag_flow for ");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      self->dim.x,self->dim.y,self->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }

    aPconv3d( self->parent->dat+
	      ( (self->org.x) * self->parent->dim.y 
		+ (self->org.y)) * self->parent->dim.z
	      + (self->org.z),
	      self->dat,
	      1, tr, tf, ts,
	      self->parent->dim.x, 
	      self->parent->dim.y, 
	      self->parent->dim.z, 
	      self->dim.x,
	      self->dim.y,
	      self->dim.z,
	      HFL_T, workspace1, workspace2 );
  }

}  
#define IND(x,y,z,Nx,Ny,Nz) ( ((x)*(Ny)+(y))*(Nz)+(z) )

void Dabove(int d, struct Grid *self,double *work1,double *work2,int lwork)
{
  /* Indices */
  int x,y,z;
  int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;

  /* Sanity check on derivative direction */
  if (d<-1 || d>2) {
    printf("Illegal direction d=%d in Dabove!!!\n",d);
    exit(1);
  }
  /* Check on workspace */
  if ( self->level > 0 ) {
    if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
	 *(self->dim.z+3*HFL_T) ) {
      fprintf(stderr,"Insufficient work space in Dabove for I...");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      self->dim.x,self->dim.y,self->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }
    if ( lwork < (self->dim.x/2+5)*self->dim.y*self->dim.z ) {
      fprintf(stderr,"Insufficient work space in Dabove...");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      self->dim.x,self->dim.y,self->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }
  }


  /* This has same effect as taking I explicitly just before calling Dabove */
  if ( self->level > 0 )    
      aconvP3d( self->parent->dat+
		(self->org.x * self->parent->dim.y + 
		 self->org.y) * self->parent->dim.z
		+ self->org.z,
		self->dat,
		1, tr, tf, ts,
		self->parent->dim.x, 
		self->parent->dim.y,
		self->parent->dim.z,
		self->dim.x, self->dim.y,self->dim.z,
		HFL_T, work1, work2 );


  /* Work from bottom up */
  if (self->child)
    Dabove(d,self->child,work1,work2,lwork);

  /* Do siblings AFTER children to make inheritance streams for cache */
  if (self->sibling)
    Dabove(d,self->sibling,work1,work2,lwork);

  /* Convolve parent onto self */
  if (self->parent==NULL)
    /* Zero contribution if no parent */
    for (x=0; x<Nx*Ny*Nz; x++)
      self->dat[x]=0.;
  else {
    struct Ivec dim=self->dim;
    struct Ivec org=self->org;

    struct Ivec pdim=self->parent->dim;
    struct Dvec pscale=self->parent->scale;
    double *pdat=self->parent->dat;

    int x,ifacc,N,size,stride_obj,convs,stride_conv;

    double *F;
    int hfl,sign;

    /* Appropriately scaled filter for taking derivatives */
    double d2x[1+HFL_D2];
    double d2y[1+HFL_D2];
    double d2z[1+HFL_D2];

    int i;

    for (i=0; i<=HFL_D2; i++) {
      d2x[i]=D2[i]/pscale.x;
      d2y[i]=D2[i]/pscale.y;
      d2z[i]=D2[i]/pscale.z;
    }

    /* Expand the box
       [org.x-2,org.y-2,org.z]->(org.x+Nx/2+3,org.y+Ny/2+3,org.z+Nz/2)
       along z to make array of size [Nx/2+5,Ny/2+5,Nz].  The extra
       space is needed for borders for upcoming x and y
       convolutions. */

    /* For expansions along z, we loop over each plane as the
       stride between planes is not contiguous (and so can't be done
       in a single call to expand) due to expansion region not filling
       the full parent.

       AND, INTERLEAVE this with expanding along y to make array of
       size [Nx/2+5,Ny,Nz] with final optimization of noting that only
       one x plane of work1 is ever used and so we always use the first
       one. */
    for (x=0; x<dim.x/2+5; x++) {
      /* Select interpolet/derivative filter */
      if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      expand(work1,ifacc=0,
	     &pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
	     F,hfl,sign,
	     N=dim.z,size=1,stride_obj=1,
	     convs=dim.y/2+5,stride_conv=pdim.z);

    /* Select interpolet/derivative filter */
      if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      expand(&work2[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],ifacc=0,
	     &work1[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
	     F,hfl,sign,
	     N=dim.y,size=dim.z,stride_obj=dim.z,
	     convs=1,stride_conv=(dim.y/2+5)*dim.z);
    }

    /* Expand the last box [0,0,0]->(Nx/2+5,Ny,Nz) along z, making the
       final array of size [Nx,Ny,Nz] in the output data block */
    if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
    else {F=T2; hfl=HFL_T2; sign=1;}
    expand(self->dat,ifacc=0,
	   &work2[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
	   F,hfl,sign,
	   N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
	   convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);
  }
}

/* Accumlative version of Dabove. Identical arguments, except for --
   out: grid onto which results are accumulated */
void accDabove(int d, struct Grid *out,struct Grid *self,double *work1,double *work2,int lwork)
{
  /* Indices */
  int x,y,z;
  int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;

  /* Sanity check on derivative direction */
  if (d<0 || d>2) {
    printf("Illegal direction d=%d in Dabove!!!\n",d);
    exit(1);
  }
  
  /* Check on workspace */
  if (self->level > 0) {
    if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
	 *(self->dim.z+3*HFL_T) ) {
      fprintf(stderr,"Insufficient work space in accDabove for I...");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      self->dim.x,self->dim.y,self->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }
    if ( lwork < (self->dim.x/2+5)*self->dim.y*self->dim.z ) {
      fprintf(stderr,"Insufficient work space in accDabove...");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      self->dim.x,self->dim.y,self->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }
  }

  /* This has same effect as taking I explicitly just before calling Dabove */
  if ( self->level > 0 )    
      aconvP3d( self->parent->dat+
		(self->org.x * self->parent->dim.y + 
		 self->org.y) * self->parent->dim.z
		+ self->org.z,
		self->dat,
		1, tr, tf, ts,
		self->parent->dim.x, 
		self->parent->dim.y,
		self->parent->dim.z,
		self->dim.x, self->dim.y,self->dim.z,
		HFL_T, work1, work2 );


  /* Work from bottom up */
  if (self->child)
    accDabove(d,out->child,self->child,work1,work2,lwork);

  /* Do siblings AFTER children to make inheritance streams for cache */
  if (self->sibling)
    accDabove(d,out->sibling,self->sibling,work1,work2,lwork);

  /* Convolve parent onto self */
  /* Zero contribution if no parent */
  if (self->parent) {
    struct Ivec dim=self->dim;
    struct Ivec org=self->org;

    struct Ivec pdim=self->parent->dim;
    struct Dvec pscale=self->parent->scale;
    double *pdat=self->parent->dat;

    int x,ifacc,N,size,stride_obj,convs,stride_conv;

    double *F;
    int hfl,sign;

    /* Appropriately scaled filter for taking derivatives */
    double d2x[1+HFL_D2];
    double d2y[1+HFL_D2];
    double d2z[1+HFL_D2];

    int i;

    for (i=0; i<=HFL_D2; i++) {
      d2x[i]=D2[i]/pscale.x;
      d2y[i]=D2[i]/pscale.y;
      d2z[i]=D2[i]/pscale.z;
    }

    /* Expand the box
       [org.x-2,org.y-2,org.z]->(org.x+Nx/2+3,org.y+Ny/2+3,org.z+Nz/2)
       along z to make array of size [Nx/2+5,Ny/2+5,Nz].  The extra
       space is needed for borders for upcoming x and y
       convolutions. */

    /* For expansions along z, we loop over each plane as the
       stride between planes is not contiguous (and so can't be done
       in a single call to expand) due to expansion region not filling
       the full parent.

       AND INTERLEAVE this with expanding along y to make array of
       size [Nx/2+5,Ny,Nz] with final optimization of noting that only
       one x plane of work1 is ever used and so we always use the first
       one. */
    for (x=0; x<dim.x/2+5; x++) {
      /* Select interpolet/derivative filter */
      if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      expand(work1,ifacc=0,
	     &pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
	     F,hfl,sign,
	     N=dim.z,size=1,stride_obj=1,
	     convs=dim.y/2+5,stride_conv=pdim.z);

    /* Select interpolet/derivative filter */
      if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      expand(&work2[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],ifacc=0,
	     &work1[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
	     F,hfl,sign,
	     N=dim.y,size=dim.z,stride_obj=dim.z,
	     convs=1,stride_conv=(dim.y/2+5)*dim.z);
    }

    /* Expand the last box [0,0,0]->(Nx/2+5,Ny,Nz) along z, making the
       final array of size [Nx,Ny,Nz] in the output data block */
    if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
    else {F=T2; hfl=HFL_T2; sign=1;}
    expand(out->dat,ifacc=1,
	   &work2[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
	   F,hfl,sign,
	   N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
	   convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);
  }
}


/* Dagger of Dabove in the sense of identification of ghost images of
   real space points, thus including prezeroing of ghost points before
   convolution */
void Dbelow(int d, struct Grid *out, struct Grid *in,double *work1,double *work2,int lwork)
{
  /* Indices */
  int x,y,z;
  int Nx=out->dim.x,Ny=out->dim.y,Nz=out->dim.z;

  /* Sanity check on derivative direction */
  if (d<-1 || d>2) {
    printf("Illegal direction d=%d in Dabove!!!\n",d);
    exit(1);
  }

  /* Working from top down... */
  
  /* Accumulate-convolve onto parent (if any) */
  if (out->parent) {
    struct Ivec dim=out->dim;
    struct Ivec org=out->org;

    struct Ivec pdim=out->parent->dim;
    struct Dvec pscale=out->parent->scale;
    double *pdat=out->parent->dat;

    int x,N,size,stride_obj,convs,stride_conv;

    double *F;
    int hfl,sign;

    double d2x[1+HFL_D2];
    double d2y[1+HFL_D2];
    double d2z[1+HFL_D2];
    int i;

    for (i=0; i<=HFL_D2; i++) {
      d2x[i]=D2[i]/pscale.x;
      d2y[i]=D2[i]/pscale.y;
      d2z[i]=D2[i]/pscale.z;
    }

    /* Check on workspace */
    if ( lwork < (in->dim.x/2+5)*in->dim.y*in->dim.z) {
      fprintf(stderr,"Insufficient work space in Dbelow for ");
      fprintf(stderr,"data set of size (%dx%dx%d)\n",
	      in->dim.x,in->dim.y,in->dim.z);
      fprintf(stderr,"Exiting.\n");
      exit(1);
    }

    /* Copy input and zero ghosts (which don't contrib to Dbelow) */
    for (x=0; x<dim.x; x++)
      for (y=0; y<dim.y; y++)
	for (z=0; z<dim.z; z++)
	  if (x%2==0 && y%2==0 && z%2==0)
	    out->dat[IND(x,y,z,dim.x,dim.y,dim.z)]=0.;
	  else
	    out->dat[IND(x,y,z,dim.x,dim.y,dim.z)]=
	      in->dat[IND(x,y,z,dim.x,dim.y,dim.z)];

    /* Contract the box [0,0,0]->(Nx,Ny,Nz) from out along x, making
       the final array of size [Nx/2+5,Ny,Nz] into work1.  Note that
       the origin of the input box correpsonds to the point (2,0,0) in
       the output box because the effects of the lower level 'puff
       out' along x. 
       NOTE: must zero work1 first because contract() is accumulative */
    for (x=0; x<(dim.x/2+5)*dim.y*dim.z; x++)
      work1[x]=0.;

    if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
    else {F=T2; hfl=HFL_T2; sign=1;}
    contract(&work1[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
	     out->dat,
	     F,hfl,sign,
	     N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
	     convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);


    /* Contract the box [0,0,0]->(Nx/2+5,Ny,Nz) along y to make array
       of size [Nx/2+5,Ny/2+5,Nz] in work2.  Note that the origin of
       the input box corresponds to (0,2,0) in the output.     
       NOTE: must zero work2 first because contract() is accumulative.

       AND INTERLEAVE this operation with contracting the box
       [0,0,0]->(Nx/2+5,Ny/2+5,Nz) along z and accumulate onto parent
       with origin of work2 corresponding to (org.x-2,org.y-2,org.z).
       The -2's are because of the buffer we've built into work2.
    
       The final optimization was to note that only one plane of work2
       is ever used at once, so we just use the 0th one to reduce
       memory volume */
    for (x=0; x<dim.x/2+5; x++) {
      for (y=0; y<(dim.y/2+5)*dim.z; y++)
	work2[y]=0.;
      if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      contract(&work2[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
	       &work1[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],
	       F,hfl,sign,
	       N=dim.y,size=dim.z,stride_obj=dim.z,
	       convs=1,stride_conv=(dim.y/2+5)*dim.z);

      if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
      else {F=T2; hfl=HFL_T2; sign=1;}
      contract(&pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
	       &work2[IND(0,0,0,dim.x/2+5,dim.y/2+5,dim.z)],
	       F,hfl,sign,
	       N=dim.z,size=1,stride_obj=1,
	       convs=dim.y/2+5,stride_conv=pdim.z);
    }

  }

  /* Zero out in anticipation of accumulation by children */
  for (x=0; x<Nx*Ny*Nz; x++)
    out->dat[x]=0.;

  /* Do children */
  if (out->child)
    Dbelow(d,out->child,in->child,work1,work2,lwork);

  /* Do siblings AFTER children to make inheritance streams for cache */
  if (out->sibling)
    Dbelow(d,out->sibling,in->sibling,work1,work2,lwork);
}

/* Evaluate the "Dd" operator, the first derivative in dth direction
   in REAL SPACE given value of function in coefficient space ---
   d: direction of derivative
   out: output (in real space)
   in: input (in coeff space)
   work1: workspace
   work2: workspace
   lwork: size of workspaces
*/
void D(int d, struct Grid *out, struct Grid *in,struct Grid *work,double *work1,double *work2,int lwork)
{
  /* Save copy of input.  Needed so that D isn't destructive.  If
     destructive were permissible, eliminating this would lead to a
     noticeable time savings. */
  copygridnew(work,in);

  /* Contributions from levels below */
  Dsame(d,out,in);
  Dacc(out);

  /* Add on contributions from levels above */
  accDabove(d,out,work,work1,work2,lwork); 

  return;
}


/* Hermitian conjugate of D, with same inputs.
   Routine derived by "daggering" (reversing data flow and
   daggering) the routine for D: */
void Ddag(int d, struct Grid *out, struct Grid *in,struct Grid *work,double *work1,double *work2,int lwork)
{

  /* Find contributions from below current level */
  Dbelow(d,out,in,work1,work2,lwork);
  Idag_flow(out,work1,work2,lwork);

  /* Because the first derivative filter is asymmetric, its dagger is
     just minus itself.  Thus, decrement same convolution result from
     out. */
  decDsame(d,out,in);

  /* Impose those zeros on ghosts */
  zeroghostrec(out);
}


syntax highlighted by Code2HTML, v. 0.9.1