/*
DFT++ is a density functional package developed by the research group
of Professor Tomas Arias
Copyright 1996-2003 Sohrab Ismail-Beigi
This file is part of DFT++.
DFT++ is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
DFT++ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DFT++; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Please see the file CREDITS for a list of authors.
For academic users, we request that publications using results obtained with
this software reference
"New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).
and, if using the wavelet basis, further reference
"Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).
and
"Robust ab initio calculation of condensed matter: transparent convergence through
semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).
For your convenience, preprints of the above articles may be obtained from
http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/
/****************************************************************/
/* Routines for D */
/* D(i) is the basic operation of "EVALUATING" the derivative of a
function in the ith direction on all points of the realspace grid
given the function's coefficient space representation.
Ddag(i) is it's Hermitian conjugate in the sense of identification
of real space points with their ghosts.
*/
#include <stdio.h>
#include <math.h>
#include "WL_header.h"
/* 3d array addressing short-hand */
#define DAT(a,x,y,z,Nx,Ny,Nz) ( (a)->dat[((x)*Ny+(y))*Nz+(z)] )
void Dacc(struct Grid *self)
{
/* Indices */
int x,y,z;
/* Go to bottom first */
if (self->child)
Dacc(self->child);
/* Do all siblings BEFORE coming up levels */
if (self->sibling)
Dacc(self->sibling);
/* Accumulate ghost data up to parent (if there is one!) */
if (self->parent) {
/* Convenient navigational parameters */
int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;
int Npx=self->parent->dim.x,Npy=self->parent->dim.y,Npz=self->parent->dim.z;
for (x=0; x<self->dim.x/2; x++)
for (y=0; y<self->dim.y/2; y++)
for (z=0; z<self->dim.z/2; z++)
self->
parent->
dat[((self->org.x+x)*Npy+(self->org.y+y))*Npz+(self->org.z+z) ]
+= self->dat[(2*x*Ny+2*y)*Nz+2*z];
}
}
/* Basic convolution kernel
Convolves multiple data sets (points, lines, or planes). Output stored
contiguously as a three dimensional object; navigation through
input accomplished through various strides.
output: pointer to start of output array
input: pointer to element in input array which corresponds to origin of
output array
periodic: tells whether input is to be treated as periodic (no/yes=0/1)
sign: 0 (write into output), -1 (subtract from output), 1 (add to output)
c: convolution coefficients for 0, 1, 2, ...
hfl: half-filter length, extent of filter on either side beyond zero
symm: reflection symmetry of filter, +/-1
N: number of objects to be produced in output
size: size of objects (points, lines, planes) convolved together
stride_obj: stride between objects in input
convs: number of convolutions to perform
stride_conv: stride between starting convolution locations in output
Notes:
For convolutions along z we have the exploitable special case of
stride_obj=1, size=1.
In general, we have for convolutions along direction i
N=N_i
size=prod_{j>i} N_j
stride_obj=prod_{j>i} N_j
convs=prod_{j<i} N_j
stride_conv=prod_{j>=i} N_j
*/
void convolve(double *output,double *input,int periodic,int sign,
double *c,int hfl,int symm,
int N,int size,int stride_obj,int convs,int stride_conv)
{
int ic,n,el;
if (sign==1) {
printf("sign=1 not supported in convolve()!!!\n");
exit(1);
}
if (sign==0) {
/* Special case for optimization */
if (symm==-1) {
switch (hfl) {
case 2:
for (ic=0; ic<convs; ic++) {
n=0;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1+N)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]);
n=1;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]);
for (n=2; n<N-2; n++)
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2)*stride_obj+el]);
n=N-2;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])-
c[2]*(input[(n-2)*stride_obj+el]);
n=N-1;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n+1-N)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=-
c[1]*(input[(n-1)*stride_obj+el])-
c[2]*(input[(n-2)*stride_obj+el]);
input+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for convolve()\n");
exit(1);
}
return;
}
/* General case */
switch (hfl) {
case 2:
for (ic=0; ic<convs; ic++) {
n=0;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1+N)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]
)+
c[2]*(input[(n+2)*stride_obj+el]
);
n=1;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]
);
for (n=2; n<N-2; n++)
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
n=N-2;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(
symm*input[(n-2)*stride_obj+el]);
n=N-1;
if (periodic)
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1-N)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n*stride_obj+el])+
c[1]*(
symm*input[(n-1)*stride_obj+el])+
c[2]*(
symm*input[(n-2)*stride_obj+el]);
input+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for convolve()\n");
exit(1);
}
return;
}
if (sign==-1) {
/* Special case for optimization */
if (symm==-1) {
switch (hfl) {
case 2:
for (ic=0; ic<convs; ic++) {
n=0;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1+N)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]);
n=1;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]);
for (n=2; n<N-2; n++)
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]-input[(n-2)*stride_obj+el]);
n=N-2;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1)*stride_obj+el]-input[(n-1)*stride_obj+el])-
c[2]*(input[(n-2)*stride_obj+el]);
n=N-1;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[1]*(input[(n+1-N)*stride_obj+el]-input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]-input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=-
c[1]*(input[(n-1)*stride_obj+el])-
c[2]*(input[(n-2)*stride_obj+el]);
input+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for convolve()\n");
exit(1);
}
return;
}
/* General case */
switch (hfl) {
case 2:
for (ic=0; ic<convs; ic++) {
n=0;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1+N)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]
)+
c[2]*(input[(n+2)*stride_obj+el]
);
n=1;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2+N)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]
);
for (n=2; n<N-2; n++)
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
n=N-2;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(
symm*input[(n-2)*stride_obj+el]);
n=N-1;
if (periodic)
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(input[(n+1-N)*stride_obj+el]+
symm*input[(n-1)*stride_obj+el])+
c[2]*(input[(n+2-N)*stride_obj+el]+
symm*input[(n-2)*stride_obj+el]);
else
for (el=0; el<size; el++)
*(output++)-=
c[0]*(input[n*stride_obj+el])+
c[1]*(
symm*input[(n-1)*stride_obj+el])+
c[2]*(
symm*input[(n-2)*stride_obj+el]);
input+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for convolve()\n");
exit(1);
}
return;
}
printf("Unrecognized case in convolve()!\n");
exit(1);
}
void Dsame(int d,struct Grid *out, struct Grid *in)
{
int x,y,z;
/* Convenient aliases for important quantities */
struct Ivec dim=in->dim;
/* For getting appropriate filter scale */
double d1[1+HFL_D1],D;
int sign,symm,N,size,stride_obj,convs,stride_conv;
/* Make version of filter scaled to appropo direction */
switch (d) {
case 0:
D=1./in->scale.x;
break;
case 1:
D=1./in->scale.y;
break;
case 2:
D=1./in->scale.z;
break;
default:
printf("Illegal direction d=%d in Dsame!\n",d);
exit(1);
}
for (x=0; x<=HFL_D1; x++) d1[x]=D1[x]*D;
switch (d) {
case 2:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along z */
convolve(out->dat,in->dat,in->ifperiodic,sign=0,
d1,HFL_D1,symm=-1,
N=dim.z,size=1,stride_obj=1,
convs=dim.x*dim.y,stride_conv=dim.z);
break;
case 1:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along y */
convolve(out->dat,in->dat,in->ifperiodic,sign=0,
d1,HFL_D1,symm=-1,
N=dim.y,size=dim.z,stride_obj=dim.z,
convs=dim.x,stride_conv=dim.y*dim.z);
break;
case 0:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along x */
convolve(out->dat,in->dat,in->ifperiodic,sign=0,
d1,HFL_D1,symm=-1,
N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
convs=1,stride_conv=dim.x*dim.y*dim.z);
break;
default:
printf("Illegal direction d=%d in Dsame!\n");
exit(1);
}
/* Continue on to all subgrids */
if (in->child)
Dsame(d,out->child,in->child);
if (in->sibling)
Dsame(d,out->sibling,in->sibling);
/* This should have the same effect as calling Dacc after Dsame, but
only want this version for Dabove and not Dbelow, so simpler
coding to just call Dacc explicitly. Also, the explict call to
Dacc doesn't seem to take much time (a few percent at most!). */
/* if (out->parent) { */
/* int Npx=out->parent->dim.x,Npy=out->parent->dim.y,Npz=out->parent->dim.z; */
/* for (x=0; x<dim.x/2; x++) */
/* for (y=0; y<dim.y/2; y++) */
/* for (z=0; z<dim.z/2; z++) */
/* out-> */
/* parent-> */
/* dat[((out->org.x+x)*Npy+(out->org.y+y))*Npz+(out->org.z+z) ] */
/* += out->dat[(2*x*dim.y+2*y)*dim.z+2*z]; */
/* } */
}
/* Just like Dsame, but subtracts "decrements" result from out */
void decDsame(int d,struct Grid *out, struct Grid *in)
{
int x,y,z;
/* Convenient aliases for important quantities */
struct Ivec dim=in->dim;
/* For getting appropriate filter scale */
double d1[1+HFL_D1],D;
int sign,symm,N,size,stride_obj,convs,stride_conv;
/* Make version of filter scaled to appropo direction */
switch (d) {
case 0:
D=1./in->scale.x;
break;
case 1:
D=1./in->scale.y;
break;
case 2:
D=1./in->scale.z;
break;
default:
printf("Illegal direction d=%d in Dsame!\n",d);
exit(1);
}
for (x=0; x<=HFL_D1; x++) d1[x]=D1[x]*D;
switch (d) {
case 2:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along z */
convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
d1,HFL_D1,symm=-1,
N=dim.z,size=1,stride_obj=1,
convs=dim.x*dim.y,stride_conv=dim.z);
break;
case 1:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along y */
convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
d1,HFL_D1,symm=-1,
N=dim.y,size=dim.z,stride_obj=dim.z,
convs=dim.x,stride_conv=dim.y*dim.z);
break;
case 0:
/* Convolve box [0,0,0]->(Nx,Ny,Nz) with deriv along x */
convolve(out->dat,in->dat,in->ifperiodic,sign=-1,
d1,HFL_D1,symm=-1,
N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
convs=1,stride_conv=dim.x*dim.y*dim.z);
break;
default:
printf("Illegal direction d=%d in Dsame!\n");
exit(1);
}
/* Continue on to all subgrids */
if (in->child)
decDsame(d,out->child,in->child);
if (in->sibling)
decDsame(d,out->sibling,in->sibling);
/* This should have the same effect as calling Dacc after Dsame, but
only want this version for Dabove and not Dbelow, so simpler
coding to just call Dacc explicitly. Also, the explict call to
Dacc doesn't seem to take much time (a few percent at most!). */
/* if (out->parent) { */
/* int Npx=out->parent->dim.x,Npy=out->parent->dim.y,Npz=out->parent->dim.z; */
/* for (x=0; x<dim.x/2; x++) */
/* for (y=0; y<dim.y/2; y++) */
/* for (z=0; z<dim.z/2; z++) */
/* out-> */
/* parent-> */
/* dat[((out->org.x+x)*Npy+(out->org.y+y))*Npz+(out->org.z+z) ] */
/* += out->dat[(2*x*dim.y+2*y)*dim.z+2*z]; */
/* } */
}
/* Basic coarser-convolution kernel
Coarser-convolves (creates output at twice the input density)
multiple data sets (points, lines, or planes). Output stored
contiguously as a three dimensional object; navigation through
input accomplished through various strides.
output: pointer to start of output array
ifacc: tells whether to accumulate results onto output (true) or
to overwrite it (false)
input: pointer to element in input array which corresponds to origin of
output array
c: convolution coefficients for 0, 1, 2, ...
hfl: half-filter length, extend of filter on either side beyond zero
symm: reflection symmetry of filter, +/-1
N: number of objects to be produced in output
size: size of objects (points, lines, planes) convolved together
stride_obj: stride between objects in input
convs: number of convolutions to perform
stride_conv: stride between starting convolution locations in input
Notes:
For convolutions along z we have the exploitable special case of
stride_obj=1, size=1.
In general, we have for convolutions along direction i
N=Nout_i
size=prod_{j>i} Nout_j
stride_obj=prod_{j>i} Nin_j
convs=prod_{j<i} Nout_j
stride_conv=prod_{j>=i} Nin_j
*/
void expand(double *output,int ifacc,double *input,double *c,int hfl,int symm,
int N,int size,int stride_obj,int convs,int stride_conv)
{
int ic,n2,n,el;
switch (hfl) {
case 5:
if (ifacc)
for (ic=0; ic<convs; ic++) {
for (n2=0; n2<N/2; n2++) {
for (el=0; el<size; el++)
*(output++)+=
c[0]*(input[n2*stride_obj+el])+
c[2]*(input[(n2+1)*stride_obj+el]+
symm*input[(n2-1)*stride_obj+el])+
c[4]*(input[(n2+2)*stride_obj+el]+
symm*input[(n2-2)*stride_obj+el]);
for (el=0; el<size; el++)
*(output++)+=
c[1]*(input[(n2+1)*stride_obj+el]+
symm*input[n2*stride_obj+el])+
c[3]*(input[(n2+2)*stride_obj+el]+
symm*input[(n2-1)*stride_obj+el])+
c[5]*(input[(n2+3)*stride_obj+el]+
symm*input[(n2-2)*stride_obj+el]);
}
input+=stride_conv;
}
else
for (ic=0; ic<convs; ic++) {
for (n2=0; n2<N/2; n2++) {
for (el=0; el<size; el++)
*(output++)=
c[0]*(input[n2*stride_obj+el])+
c[2]*(input[(n2+1)*stride_obj+el]+
symm*input[(n2-1)*stride_obj+el])+
c[4]*(input[(n2+2)*stride_obj+el]+
symm*input[(n2-2)*stride_obj+el]);
for (el=0; el<size; el++)
*(output++)=
c[1]*(input[(n2+1)*stride_obj+el]+
symm*input[n2*stride_obj+el])+
c[3]*(input[(n2+2)*stride_obj+el]+
symm*input[(n2-1)*stride_obj+el])+
c[5]*(input[(n2+3)*stride_obj+el]+
symm*input[(n2-2)*stride_obj+el]);
}
input+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for expand()\n");
exit(1);
}
}
/* Basic finer-convolution kernel Finer-convolves (creates output at
half the input density) multiple objects (points, lines, or
planes). Input presume stored in memory continguously as a three
dimensional object; navigation through output accomplished through
various strides.
output: pointer to element in output array which corresponds to
origin of input array
input: pointer to start of input array
c: convolution coefficients for 0, 1, 2, ...
hfl: half-filter length, extend of filter on either side beyond zero
symm: reflection symmetry of filter, +/-1
N: number of objects on finer scale
size: size of objects (points, lines, planes) convolved together
stride_obj: stride between objects in output
convs: number of convolutions to perform
stride_conv: stride between starting convolution locations in output
Notes:
For convolutions along z we have the exploitable special case of
stride_obj=1, size=1.
In general, we have for convolutions along direction i
N=Nin_i
size=prod_{j>i} Nin_j
stride_obj=prod_{j>i} Nout_j
convs=prod_{j<i} Nin_j
stride_conv=prod_{j>=i} Nout_j
*/
void contract(double *output,double *input,double *c,int hfl,int symm,
int N,int size,int stride_obj,int convs,int stride_conv)
{
int ic,n2,el;
register double tmp;
switch (hfl) {
case 5:
for (ic=0; ic<convs; ic++) {
for (n2=0; n2<N/2; n2++) {
for (el=0; el<size; el++) {
output[n2*stride_obj+el]+=c[0]*input[el];
output[(n2+1)*stride_obj+el]+=(tmp=c[2]*input[el]);
output[(n2-1)*stride_obj+el]+=symm*tmp;
output[(n2+2)*stride_obj+el]+=(tmp=c[4]*input[el]);
output[(n2-2)*stride_obj+el]+=symm*tmp;
}
input+=size;
for (el=0; el<size; el++) {
output[(n2+1)*stride_obj+el]+=(tmp=c[1]*input[el]);
output[n2*stride_obj+el]+=symm*tmp;
output[(n2+2)*stride_obj+el]+=(tmp=c[3]*input[el]);
output[(n2-1)*stride_obj+el]+=symm*tmp;
output[(n2+3)*stride_obj+el]+=(tmp=c[5]*input[el]);
output[(n2-2)*stride_obj+el]+=symm*tmp;
}
input+=size;
}
output+=stride_conv;
}
break;
default:
printf("hfl=%d not supported for contract()\n");
exit(1);
}
}
/* This operation is somewhat like Idag, but uses upward accumulation
instead of copying. Copying appears in the true Idag because of
the identification in real space of ghost points with their
parents. For cases where Idag appears in the formal transpose of a
product operation, the copying cuts off data flow paths necessary
in the true conjugate, thus the need for this "Idag_flow".
This code was "derived" basically by taking Obelowsimple (which
faced a similar issue) and cutting out those parts referring to O.
*/
void Idag_flow( struct Grid *self,
double *workspace1, double *workspace2, int lwork)
{
int i,j,k;
/*Same process on siblings*/
if (self->sibling)
Idag_flow(self->sibling, workspace1, workspace2, lwork);
/*Same process on children*/
if (self->child)
Idag_flow(self->child, workspace1, workspace2, lwork);
/* Here is the accumulating upward "flow" of data */
if (self->level>0)
for (i=0; i<self->dim.x; i+=2)
for (j=0; j<self->dim.y; j+=2)
for (k=0; k<self->dim.z; k+=2)
*( self->parent->dat +
( (self->org.x+i/2) * self->parent->dim.y +
(self->org.y+j/2) ) * self->parent->dim.z +
(self->org.z+k/2) )
+=
*( self->dat+(i*self->dim.y+j)*self->dim.z+k );
/* Interlevel convolution with full set of Idag coefficients. */
if ( (self->parent) && (self->child) ) {
if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
*(self->dim.z+3*HFL_T) ) {
fprintf(stderr,"Insufficient work space in Idag_flow for ");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
self->dim.x,self->dim.y,self->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
aPconv3d( self->parent->dat+
( (self->org.x) * self->parent->dim.y
+ (self->org.y)) * self->parent->dim.z
+ (self->org.z),
self->dat,
1, tr, tf, ts,
self->parent->dim.x,
self->parent->dim.y,
self->parent->dim.z,
self->dim.x,
self->dim.y,
self->dim.z,
HFL_T, workspace1, workspace2 );
}
}
#define IND(x,y,z,Nx,Ny,Nz) ( ((x)*(Ny)+(y))*(Nz)+(z) )
void Dabove(int d, struct Grid *self,double *work1,double *work2,int lwork)
{
/* Indices */
int x,y,z;
int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;
/* Sanity check on derivative direction */
if (d<-1 || d>2) {
printf("Illegal direction d=%d in Dabove!!!\n",d);
exit(1);
}
/* Check on workspace */
if ( self->level > 0 ) {
if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
*(self->dim.z+3*HFL_T) ) {
fprintf(stderr,"Insufficient work space in Dabove for I...");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
self->dim.x,self->dim.y,self->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
if ( lwork < (self->dim.x/2+5)*self->dim.y*self->dim.z ) {
fprintf(stderr,"Insufficient work space in Dabove...");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
self->dim.x,self->dim.y,self->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
}
/* This has same effect as taking I explicitly just before calling Dabove */
if ( self->level > 0 )
aconvP3d( self->parent->dat+
(self->org.x * self->parent->dim.y +
self->org.y) * self->parent->dim.z
+ self->org.z,
self->dat,
1, tr, tf, ts,
self->parent->dim.x,
self->parent->dim.y,
self->parent->dim.z,
self->dim.x, self->dim.y,self->dim.z,
HFL_T, work1, work2 );
/* Work from bottom up */
if (self->child)
Dabove(d,self->child,work1,work2,lwork);
/* Do siblings AFTER children to make inheritance streams for cache */
if (self->sibling)
Dabove(d,self->sibling,work1,work2,lwork);
/* Convolve parent onto self */
if (self->parent==NULL)
/* Zero contribution if no parent */
for (x=0; x<Nx*Ny*Nz; x++)
self->dat[x]=0.;
else {
struct Ivec dim=self->dim;
struct Ivec org=self->org;
struct Ivec pdim=self->parent->dim;
struct Dvec pscale=self->parent->scale;
double *pdat=self->parent->dat;
int x,ifacc,N,size,stride_obj,convs,stride_conv;
double *F;
int hfl,sign;
/* Appropriately scaled filter for taking derivatives */
double d2x[1+HFL_D2];
double d2y[1+HFL_D2];
double d2z[1+HFL_D2];
int i;
for (i=0; i<=HFL_D2; i++) {
d2x[i]=D2[i]/pscale.x;
d2y[i]=D2[i]/pscale.y;
d2z[i]=D2[i]/pscale.z;
}
/* Expand the box
[org.x-2,org.y-2,org.z]->(org.x+Nx/2+3,org.y+Ny/2+3,org.z+Nz/2)
along z to make array of size [Nx/2+5,Ny/2+5,Nz]. The extra
space is needed for borders for upcoming x and y
convolutions. */
/* For expansions along z, we loop over each plane as the
stride between planes is not contiguous (and so can't be done
in a single call to expand) due to expansion region not filling
the full parent.
AND, INTERLEAVE this with expanding along y to make array of
size [Nx/2+5,Ny,Nz] with final optimization of noting that only
one x plane of work1 is ever used and so we always use the first
one. */
for (x=0; x<dim.x/2+5; x++) {
/* Select interpolet/derivative filter */
if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(work1,ifacc=0,
&pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
F,hfl,sign,
N=dim.z,size=1,stride_obj=1,
convs=dim.y/2+5,stride_conv=pdim.z);
/* Select interpolet/derivative filter */
if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(&work2[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],ifacc=0,
&work1[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
F,hfl,sign,
N=dim.y,size=dim.z,stride_obj=dim.z,
convs=1,stride_conv=(dim.y/2+5)*dim.z);
}
/* Expand the last box [0,0,0]->(Nx/2+5,Ny,Nz) along z, making the
final array of size [Nx,Ny,Nz] in the output data block */
if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(self->dat,ifacc=0,
&work2[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
F,hfl,sign,
N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);
}
}
/* Accumlative version of Dabove. Identical arguments, except for --
out: grid onto which results are accumulated */
void accDabove(int d, struct Grid *out,struct Grid *self,double *work1,double *work2,int lwork)
{
/* Indices */
int x,y,z;
int Nx=self->dim.x,Ny=self->dim.y,Nz=self->dim.z;
/* Sanity check on derivative direction */
if (d<0 || d>2) {
printf("Illegal direction d=%d in Dabove!!!\n",d);
exit(1);
}
/* Check on workspace */
if (self->level > 0) {
if ( lwork < (self->dim.x+3*HFL_T)*(self->dim.y+3*HFL_T)
*(self->dim.z+3*HFL_T) ) {
fprintf(stderr,"Insufficient work space in accDabove for I...");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
self->dim.x,self->dim.y,self->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
if ( lwork < (self->dim.x/2+5)*self->dim.y*self->dim.z ) {
fprintf(stderr,"Insufficient work space in accDabove...");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
self->dim.x,self->dim.y,self->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
}
/* This has same effect as taking I explicitly just before calling Dabove */
if ( self->level > 0 )
aconvP3d( self->parent->dat+
(self->org.x * self->parent->dim.y +
self->org.y) * self->parent->dim.z
+ self->org.z,
self->dat,
1, tr, tf, ts,
self->parent->dim.x,
self->parent->dim.y,
self->parent->dim.z,
self->dim.x, self->dim.y,self->dim.z,
HFL_T, work1, work2 );
/* Work from bottom up */
if (self->child)
accDabove(d,out->child,self->child,work1,work2,lwork);
/* Do siblings AFTER children to make inheritance streams for cache */
if (self->sibling)
accDabove(d,out->sibling,self->sibling,work1,work2,lwork);
/* Convolve parent onto self */
/* Zero contribution if no parent */
if (self->parent) {
struct Ivec dim=self->dim;
struct Ivec org=self->org;
struct Ivec pdim=self->parent->dim;
struct Dvec pscale=self->parent->scale;
double *pdat=self->parent->dat;
int x,ifacc,N,size,stride_obj,convs,stride_conv;
double *F;
int hfl,sign;
/* Appropriately scaled filter for taking derivatives */
double d2x[1+HFL_D2];
double d2y[1+HFL_D2];
double d2z[1+HFL_D2];
int i;
for (i=0; i<=HFL_D2; i++) {
d2x[i]=D2[i]/pscale.x;
d2y[i]=D2[i]/pscale.y;
d2z[i]=D2[i]/pscale.z;
}
/* Expand the box
[org.x-2,org.y-2,org.z]->(org.x+Nx/2+3,org.y+Ny/2+3,org.z+Nz/2)
along z to make array of size [Nx/2+5,Ny/2+5,Nz]. The extra
space is needed for borders for upcoming x and y
convolutions. */
/* For expansions along z, we loop over each plane as the
stride between planes is not contiguous (and so can't be done
in a single call to expand) due to expansion region not filling
the full parent.
AND INTERLEAVE this with expanding along y to make array of
size [Nx/2+5,Ny,Nz] with final optimization of noting that only
one x plane of work1 is ever used and so we always use the first
one. */
for (x=0; x<dim.x/2+5; x++) {
/* Select interpolet/derivative filter */
if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(work1,ifacc=0,
&pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
F,hfl,sign,
N=dim.z,size=1,stride_obj=1,
convs=dim.y/2+5,stride_conv=pdim.z);
/* Select interpolet/derivative filter */
if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(&work2[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],ifacc=0,
&work1[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
F,hfl,sign,
N=dim.y,size=dim.z,stride_obj=dim.z,
convs=1,stride_conv=(dim.y/2+5)*dim.z);
}
/* Expand the last box [0,0,0]->(Nx/2+5,Ny,Nz) along z, making the
final array of size [Nx,Ny,Nz] in the output data block */
if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
expand(out->dat,ifacc=1,
&work2[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
F,hfl,sign,
N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);
}
}
/* Dagger of Dabove in the sense of identification of ghost images of
real space points, thus including prezeroing of ghost points before
convolution */
void Dbelow(int d, struct Grid *out, struct Grid *in,double *work1,double *work2,int lwork)
{
/* Indices */
int x,y,z;
int Nx=out->dim.x,Ny=out->dim.y,Nz=out->dim.z;
/* Sanity check on derivative direction */
if (d<-1 || d>2) {
printf("Illegal direction d=%d in Dabove!!!\n",d);
exit(1);
}
/* Working from top down... */
/* Accumulate-convolve onto parent (if any) */
if (out->parent) {
struct Ivec dim=out->dim;
struct Ivec org=out->org;
struct Ivec pdim=out->parent->dim;
struct Dvec pscale=out->parent->scale;
double *pdat=out->parent->dat;
int x,N,size,stride_obj,convs,stride_conv;
double *F;
int hfl,sign;
double d2x[1+HFL_D2];
double d2y[1+HFL_D2];
double d2z[1+HFL_D2];
int i;
for (i=0; i<=HFL_D2; i++) {
d2x[i]=D2[i]/pscale.x;
d2y[i]=D2[i]/pscale.y;
d2z[i]=D2[i]/pscale.z;
}
/* Check on workspace */
if ( lwork < (in->dim.x/2+5)*in->dim.y*in->dim.z) {
fprintf(stderr,"Insufficient work space in Dbelow for ");
fprintf(stderr,"data set of size (%dx%dx%d)\n",
in->dim.x,in->dim.y,in->dim.z);
fprintf(stderr,"Exiting.\n");
exit(1);
}
/* Copy input and zero ghosts (which don't contrib to Dbelow) */
for (x=0; x<dim.x; x++)
for (y=0; y<dim.y; y++)
for (z=0; z<dim.z; z++)
if (x%2==0 && y%2==0 && z%2==0)
out->dat[IND(x,y,z,dim.x,dim.y,dim.z)]=0.;
else
out->dat[IND(x,y,z,dim.x,dim.y,dim.z)]=
in->dat[IND(x,y,z,dim.x,dim.y,dim.z)];
/* Contract the box [0,0,0]->(Nx,Ny,Nz) from out along x, making
the final array of size [Nx/2+5,Ny,Nz] into work1. Note that
the origin of the input box correpsonds to the point (2,0,0) in
the output box because the effects of the lower level 'puff
out' along x.
NOTE: must zero work1 first because contract() is accumulative */
for (x=0; x<(dim.x/2+5)*dim.y*dim.z; x++)
work1[x]=0.;
if (d==0) {F=d2x; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
contract(&work1[IND(2,0,0,dim.x/2+5,dim.y,dim.z)],
out->dat,
F,hfl,sign,
N=dim.x,size=dim.y*dim.z,stride_obj=dim.y*dim.z,
convs=1,stride_conv=(dim.x/2+5)*dim.y*dim.z);
/* Contract the box [0,0,0]->(Nx/2+5,Ny,Nz) along y to make array
of size [Nx/2+5,Ny/2+5,Nz] in work2. Note that the origin of
the input box corresponds to (0,2,0) in the output.
NOTE: must zero work2 first because contract() is accumulative.
AND INTERLEAVE this operation with contracting the box
[0,0,0]->(Nx/2+5,Ny/2+5,Nz) along z and accumulate onto parent
with origin of work2 corresponding to (org.x-2,org.y-2,org.z).
The -2's are because of the buffer we've built into work2.
The final optimization was to note that only one plane of work2
is ever used at once, so we just use the 0th one to reduce
memory volume */
for (x=0; x<dim.x/2+5; x++) {
for (y=0; y<(dim.y/2+5)*dim.z; y++)
work2[y]=0.;
if (d==1) {F=d2y; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
contract(&work2[IND(0,2,0,dim.x/2+5,dim.y/2+5,dim.z)],
&work1[IND(x,0,0,dim.x/2+5,dim.y,dim.z)],
F,hfl,sign,
N=dim.y,size=dim.z,stride_obj=dim.z,
convs=1,stride_conv=(dim.y/2+5)*dim.z);
if (d==2) {F=d2z; hfl=HFL_D2; sign=-1;}
else {F=T2; hfl=HFL_T2; sign=1;}
contract(&pdat[IND(org.x-2+x,org.y-2,org.z,pdim.x,pdim.y,pdim.z)],
&work2[IND(0,0,0,dim.x/2+5,dim.y/2+5,dim.z)],
F,hfl,sign,
N=dim.z,size=1,stride_obj=1,
convs=dim.y/2+5,stride_conv=pdim.z);
}
}
/* Zero out in anticipation of accumulation by children */
for (x=0; x<Nx*Ny*Nz; x++)
out->dat[x]=0.;
/* Do children */
if (out->child)
Dbelow(d,out->child,in->child,work1,work2,lwork);
/* Do siblings AFTER children to make inheritance streams for cache */
if (out->sibling)
Dbelow(d,out->sibling,in->sibling,work1,work2,lwork);
}
/* Evaluate the "Dd" operator, the first derivative in dth direction
in REAL SPACE given value of function in coefficient space ---
d: direction of derivative
out: output (in real space)
in: input (in coeff space)
work1: workspace
work2: workspace
lwork: size of workspaces
*/
void D(int d, struct Grid *out, struct Grid *in,struct Grid *work,double *work1,double *work2,int lwork)
{
/* Save copy of input. Needed so that D isn't destructive. If
destructive were permissible, eliminating this would lead to a
noticeable time savings. */
copygridnew(work,in);
/* Contributions from levels below */
Dsame(d,out,in);
Dacc(out);
/* Add on contributions from levels above */
accDabove(d,out,work,work1,work2,lwork);
return;
}
/* Hermitian conjugate of D, with same inputs.
Routine derived by "daggering" (reversing data flow and
daggering) the routine for D: */
void Ddag(int d, struct Grid *out, struct Grid *in,struct Grid *work,double *work1,double *work2,int lwork)
{
/* Find contributions from below current level */
Dbelow(d,out,in,work1,work2,lwork);
Idag_flow(out,work1,work2,lwork);
/* Because the first derivative filter is asymmetric, its dagger is
just minus itself. Thus, decrement same convolution result from
out. */
decDsame(d,out,in);
/* Impose those zeros on ghosts */
zeroghostrec(out);
}
syntax highlighted by Code2HTML, v. 0.9.1