/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

/*
 * Code by Sohrab Ismail-Beigi,  Mar. 1997
 * 
 * Non-local pseudopotentials, with multiple-projector extension of
 * basic Kleinman-Bylander form.
 *
 */

/* $Id: PW_nlpot.cpp,v 1.1.2.9 2003/05/29 18:54:03 ivan Exp $ */

#include "header.h"

void accumulate_Vnl(const ColumnBundle &Y, ColumnBundle &HspY, Everything &e)
{
  int i, sp, lm;

  // Nonlocal part of potential: sum(ions,l,m,...) { Vnl*Mnl*Vnl^Y }
  for (sp=0; sp < e.ioninfo.nspecies; sp++)
    for (lm=0; lm < e.ioninfo.species[sp].pot.nlm; lm++){
      if (e.ioninfo.species[sp].pot.ngamma[lm] > 1) {
	    dft_log(DFT_SILENCE,
                "\nMultiple-projectors:  running slow Hsp()!\n");
        
	    /* this is the slow way where we go one atom at a time...
	     * the smarter way would be to somehow make a new class
	     * which is a block-diagonal matrix class (a string of matrix
	     * classes on the diagonal of a bigger one), where each
	     * diagonal is just Mnl below, and to define an
	     * block_diag_matrix*matrix (returning matrix) operator.
	     * Then we can do what we do with the Kleinman-Bylander
	     * below with minimal changes. */
        
	    ColumnBundle Vnl(e.ioninfo.species[sp].pot.ngamma[lm],
                         Y.col[0]->basis,"local");
	    /* reference */
	    ComplexMatrix &Mnl = e.ioninfo.species[sp].pot.M[lm]; 
        
	    for (i=0; i < e.ioninfo.species[sp].natoms; i++) {
          Vnl_pseudo(sp,i,lm,&(e.ioninfo),Vnl);
          HspY += Vnl*(Mnl*(Vnl^Y));
        }
	  }
      /* Kleinman-Bylander:  bunch up all local potentials for
       * the atoms of this species and state into a big ColumnBundle
       * and work on them instead (should be faster due to ^ and *
       * operators being block-multiplies, etc.) */
      else {
	    // Vnl is created as distributed ColumnBundle.
	    // the dimension that's distributed is ioninfo.species[sp].natoms
        ColumnBundle Vnl(e.ioninfo.species[sp].natoms,Y.col[0]->basis,
                         "distributed");
        
        // Fill in Vnl with pseudopotential elements
        Vnl_pseudo_fill_matrix(&(e.ioninfo), Vnl,sp,lm);
        
        // Now use Vnl!
        // we want to do:   HspY += Vnl*(Mnl*(Vnl^Y));
        scalar Mnl = e.ioninfo.species[sp].pot.M[lm](0,0);
        do_ColumnBundle_matrix_mult(Vnl,Mnl*(Vnl^Y),HspY,1);
      }
    }
}

/*
 * Calculates the matrix elements of the non-local pseudopotential
 * (multiple-projector form) for the ion'th ion of species sp
 * for atomic state lm at k-vector kvec and returs the result
 * as a column bundle of ioninfo->species[sp].ngamma[lm] columns of
 * length basis->nbasis.  ngamma is the number of multiple projectors
 * for the species sp and state lm.
 *
 * Letting q = k + G, the matrix we are computing is Vnl with entries:
 * 
 * Vnl(q,gamma) = <q|V_gamma*phi_gamma>
 *
 * where |q> is a plane-wave state with <r|q> = exp(i*q*r)/sqrt(Vol)
 * and V_gamma is the non-local potential for the gamma'th state
 * and phi_gamma is the atomic wave-function for the gamma'th state.
 *
 * Now, V_gamma*phi_gamma(r) = V_l(|r|)*phi_l(|r|)*Ylm(rhat)
 *                                           (hat == unit vec.)
 *
 * and using the relation:
 *
 *   exp(iq.r) = sum_{l,m} { 4pi(i^l)j_l(|q|*|r|)Ylm*(qhat)Ylm(rhat) } 
 *
 * We arrivate at,
 *
 *   Vnl(q,gamma) = Y_lm(qhat)*(4*pi)*f_l(|q|)/sqrt(Vol)
 *
 * where
 *
 *   f_l(q) = integral{0,inf} { dr*r^2*V_l(r)*phi_l(r)*(i^l*j_l(|r|*|q|)) }
 *
 * We get f_l(q) from the internal tables stored in ioninfo via interpolation.
 *
 */
void
Vnl_pseudo(int sp,int ion,int lm,
           Ioninfo *ioninfo,
           ColumnBundle &Vnl)
{
  Basis *basis = Vnl.col[0]->basis;
  QuantumNumber *qnum = basis->qnum;
  Lattice *lattice = basis->basis_spec->lattice;
  complex S,Ylm;
  real dot,posx,posy,posz,kx,ky,kz,kplusGx,kplusGy,kplusGz;
  real qx,qy,qz,q,invq,qhatx,qhaty,qhatz;
  real flq=0.0,r,invdq=0.0;
  real GGTxx,GGTyy,GGTzz,GGTxy,GGTxz,GGTyz;
  real Vol,invsqrtVol;
  real ym1,y0,y1,y2,a,b,c,x;
  int ngamma,gamma,n,j,nmax=0,l,m;
  matrix3 G;

  /* pi et al. */
  const real pi = M_PI;
  const real twopi = (real)2.0*pi;
  const real fourpi = (real)4.0*pi;
  const real sixth = (real)1.0/(real)6.0;
  /* Ylm normalization constants */
  const real N00  =  sqrt(1.0/(4.0*pi));
  const real N10  =  sqrt(3.0/(4.0*pi));
  const real N11  = -sqrt(3.0/(8.0*pi));
  const real N1m1 =  sqrt(3.0/(8.0*pi));
  const real N20  =  sqrt(5.0/(16.0*pi));
  const real N21  = -sqrt(15.0/(8.0*pi));
  const real N2m1 =  sqrt(15.0/(8.0*pi));
  const real N22  =  sqrt(15.0/(32.0*pi));
  const real N2m2 =  sqrt(15.0/(32.0*pi));

  Speciesinfo *sp_info = &(ioninfo->species[sp]);

  /* Vnl will contain the final result */
  ngamma = sp_info->pot.ngamma[lm];
  if (Vnl.tot_ncols != ngamma)
    die("\nIn Vnl_pseudo(), ngamma != Vnl.ncols\n\n");
  if (Vnl.col_length != basis->nbasis)
    die("\nIn Vnl_pseudo(), nbasis != Vnl.col_length\n\n");
  Vnl.zero_out();

  /* Get ionic position, k-vector, cell volume, G, and GGT matrix entries */
  posx = sp_info->atpos[ion].v[0];
  posy = sp_info->atpos[ion].v[1];
  posz = sp_info->atpos[ion].v[2];
  kx = qnum->kvec.v[0];
  ky = qnum->kvec.v[1];
  kz = qnum->kvec.v[2];
  G = lattice->G;
  GGTxx = lattice->GGT.m[0][0];
  GGTyy = lattice->GGT.m[1][1];
  GGTzz = lattice->GGT.m[2][2];
  GGTxy = lattice->GGT.m[0][1];
  GGTxz = lattice->GGT.m[0][2];
  GGTyz = lattice->GGT.m[1][2];
  Vol = lattice->unit_cell_volume;
  invsqrtVol = (real)1.0/sqrt(Vol);

  /* Get l,m values for the potential */
  l = sp_info->pot.l[lm];
  m = sp_info->pot.m[lm];
  if (l < 0 || l > 2)
    die("l != {0,1,2} nonlocal pseudopotentials not supported yet!\n");
  if (m < -l || m > l)
    die("Nonlocal pseudopotential with |m| > l !?!?\n");

  /* Calculate the non-local pseudopotential matrix entries */
  for (gamma = 0; gamma < ngamma; gamma++){
    invdq = (real)1.0/sp_info->pot.dq_nl[lm][gamma];
    nmax = sp_info->pot.ngrid_nl[lm][gamma]-3;

    for (n = 0; n < basis->nbasis; n++){
	  /* q = k+G:  here q is in reciprocal lattice coordinates */
	  qx = kplusGx = kx + basis->Gx[n];
	  qy = kplusGy = ky + basis->Gy[n];
	  qz = kplusGz = kz + basis->Gz[n];
	  /* S = exp(-i*dot(q,pos)) */
	  dot = -twopi*(posx*qx + posy*qy + posz*qz);
	  S.x = cos(dot);
	  S.y = sin(dot);

	  
	  q = sqrt( qx*qx*GGTxx + qy*qy*GGTyy + qz*qz*GGTzz +
                2.0*(qx*qy*GGTxy + qx*qz*GGTxz + qy*qz*GGTyz) );

	  /* Interpolate flq using internal tables */
	  r = q*invdq;
	  j = (int)r;
	  if (j > nmax){
        dft_log(DFT_SILENCE,
                "G=[%d %d %d] q=[%g %g %g] k=[%g %g %g]\n",
                basis->Gx[n],basis->Gy[n],basis->Gz[n],
                qx,qy,qz,kx,ky,kz);
        die("Nonlocal pseudopotential grid is too small!\n");
      }

	  /* Do cubic interlopation on flq with j-1, j, j+1, and j+2
	   * points from internal tables.  If j==0, we clearly can't
	   * real j-1.  However f_l(-q) = (-1)^l*f_l(q) so we can
	   * fill in the j-1 entry with the j+1 entry times (-1)^l */
	  if (j == 0)
	    ym1 = (1-2*(l%2))*sp_info->pot.flq[lm][gamma][j+1];
	  else
	    ym1 = sp_info->pot.flq[lm][gamma][j-1];
	  y0  = sp_info->pot.flq[lm][gamma][j  ];
	  y1  = sp_info->pot.flq[lm][gamma][j+1];
	  y2  = sp_info->pot.flq[lm][gamma][j+2];
	  a = sixth*(y2-ym1+3.0*(y0-y1));
	  b = 0.5*(y1+ym1-y0-y0);
	  c = y1-sixth*(y2+3.0*y0+ym1+ym1);
	  x = r-j;
	  flq = ((a*x+b)*x+c)*x+y0;
	  /* The line below is quadratic interploation: */
	  /* flq = y0 + 0.5*x*(y1-ym1+x*(y1+ym1-y0-y0)); */

	  /* If we're at q=0, then we must handle the case separately
	   * since Ylm and qhat are not defined for q=0.  The cases are:
	   * if l==0, Ylm is const.
	   * if l>0, flq=0 anyways, so even though Ylm is undefined,
	   * it doesn't matter what it is since the final result is zero. */
	  if (q < (real)1.0e-13){
        if (l==0) { Ylm.x = N00; Ylm.y = 0.0; }
        else	{ Ylm.x = Ylm.y = 0.0; }
        goto skipcalcYlm;
      }


	  /* Replace the above if block by the one below to reporduce
	   * cgrad results. */
	  /* 	  if (j == 0) */
	  /* 	    { */
	  /* 	      flq = y0 + (y1-y0)*x; */
	  /* 	      if (l==0) { Ylm.x = N00; Ylm.y = 0.0; } */
	  /* 	      else	{ Ylm.x = Ylm.y = 0.0; } */
	  /* 	      goto skipcalcYlm; */
	  /* 	    } */


	  /* Otherwise, we have to calculate Ylm: 
	   * qhat is a unit vector in direction of q:  here q is in "real"
	   * bohr^-1 units and q = (k+G)*G where (k+G) is a row-vector. */
	  qx = kplusGx*G.m[0][0] + kplusGy*G.m[1][0] + kplusGz*G.m[2][0];
	  qy = kplusGx*G.m[0][1] + kplusGy*G.m[1][1] + kplusGz*G.m[2][1];
	  qz = kplusGx*G.m[0][2] + kplusGy*G.m[1][2] + kplusGz*G.m[2][2];
	  invq = (real)1.0/q;
	  qhatx = qx*invq;
	  qhaty = qy*invq;
	  qhatz = qz*invq;
	  /* Calculate Ylm */
	  switch (l){
        case 0:
          Ylm.x = N00;
          Ylm.y = 0.0; break;
        case 1:
          switch(m){
            case 0:
              Ylm.x = N10*qhatz;
              Ylm.y = 0.0; break;
            case 1:
              Ylm.x = N11*qhatx;
              Ylm.y = N11*qhaty; break;
            case -1:
              Ylm.x =  N1m1*qhatx;
              Ylm.y = -N1m1*qhaty; break;
            default:
              die("PW_nlpot -- invalid m: %d for l: %d\n", m, l);
          } break;
        case 2:
          switch (m){
            case 0:
              Ylm.x = N20*(3.0*qhatz*qhatz-1.0);
              Ylm.y = 0.0;  break;
            case 1:
              Ylm.x = N21*(qhatz*qhatx);
              Ylm.y = N21*(qhatz*qhaty); break;
            case -1:
              Ylm.x = N2m1*( qhatz*qhatx);
              Ylm.y = N2m1*(-qhatz*qhaty); break;
            case 2:
              Ylm.x = N22*(qhatx*qhatx-qhaty*qhaty);
              Ylm.y = N22*(2.0*qhatx*qhaty); break;
            case -2:
              Ylm.x = N2m2*(qhatx*qhatx-qhaty*qhaty);
              Ylm.y = N2m2*(-2.0*qhatx*qhaty); break;
            default:
              die("PW_nlpot -- invalid m: %d for l: %d\n", m, l);
          }  break;
        default:
          die("nonlocal pseudopot: l:d ?!?!!\n",l);
      }
      
      skipcalcYlm: ;
      /* Calculate Vnl */
	  Vnl.col[gamma]->data.d[n] = (S*Ylm);
	  Vnl.col[gamma]->data.d[n] *= fourpi*flq*invsqrtVol;
      
	}
  }
  
}

//
// This routine fills in a non-local pseudopot. matrix in the case
// of Kleinman-Bylander projectors, for columns start_col to
// start_col+n_cols_todo-1 of Vnl.
//
static void
Vnl_pseudo_fill_some(Ioninfo *ioninfo,
                     ColumnBundle &Vnl,
                     int sp,int lm,
                     int start_col,int n_cols_todo)
{
  if (n_cols_todo == 0) return;
  
  Basis *basis = Vnl.col[0]->basis;
  // Temporary space to hold the pseudopot for one atom
  ColumnBundle Vnloneatom(1,basis,"local");
  
  // Fill in the columns of Vnl
  int i,j;
  for (i=start_col; i < start_col+n_cols_todo; i++) {
    Vnl_pseudo(sp,i+Vnl.start_ncol,
               lm,ioninfo,Vnloneatom);
    for (j=0; j < basis->nbasis; j++)
      Vnl.col[i]->data.d[j] = Vnloneatom.col[0]->data.d[j];
  }
}

//
// A thread runs this routine.  The routine figures out which part of
// Vnl this thread should fill out, and then does exactly that by
// calling the above routine.
//
#ifdef DFT_THREAD
static void *
Vnl_pseudo_thread(void *arg)
{
  // Decode the inputs
  dft_thread_data *data = (dft_thread_data *)arg;
  Ioninfo *ioninfo = (Ioninfo *)data->p1;
  ColumnBundle *Vnl = (ColumnBundle *)data->p2;
  int start_col = data->start;
  int n_cols_todo = data->n;
  int sp = data->i1;
  int lm = data->i2;

  // Fill in the matrix
  Vnl_pseudo_fill_some(ioninfo,*Vnl,sp,lm,start_col,n_cols_todo);
  
  // Free up data-passing struct and exit
  myfree(arg);
  return NULL;
}
#endif

//
// This routine is just a coding-saver, called from the outside,
// that fills in Vnl with non-local pseudo-pot matrix elements.
//
void
Vnl_pseudo_fill_matrix(Ioninfo *ioninfo,ColumnBundle &Vnl,
                       int sp,int lm)
{
#ifdef DFT_THREAD
  // Threads!  Distribute the work of filling in Vnl
  dft_call_threads(Vnl.my_ncols,
                   (void *)ioninfo,(void *)&Vnl,NULL,NULL,NULL,
                   sp,lm,0,0,0,0,
                   Vnl_pseudo_thread);
#else
  // Serial:  just fill Vnl with one call
  Vnl_pseudo_fill_some(ioninfo,Vnl,sp,lm,0,Vnl.my_ncols);
#endif
}


/*
     Textbook formulae from Liboff:

  Y0,0  = 1/sqrt(4pi)

  Y1,0      = sqrt(3/(4*pi))*cos(theta)
  Y1,(+-)1  = (-+)sqrt(3/(8pi))*sin(theta)*exp((+-)i*phi)

  Y2,0      = sqrt(5/(16*pi))*(3*cos(theta)^2 -1)
  Y2,(+-)1  = (-+)sqrt(15/(8pi))(sin(theta)*cos(theta))*exp((+-)i*phi)
  Y2,(+-)2  = sqrt(15/(32*pi))(sin(theta)^2)*exp((+-)2i*phi)
*/

/*
 * This routine does what the above one does, and also fills in
 * dVnlx/y/z with the derivatives of Vnl versus the position of the
 * ion 'ion' of species 'sp' (lattice coords).
 */
void
dVnl_pseudo_datom_pos(int sp,int ion,int lm,
/*                      Basis *basis,
                        Lattice *lattice,
*/                    Ioninfo *ioninfo,
                      ColumnBundle &Vnl,
                      ColumnBundle &dVnlx,
                      ColumnBundle &dVnly,
                      ColumnBundle &dVnlz )
{
  Basis *basis = Vnl.col[0]->basis;
  QuantumNumber *qnum = basis->qnum;
  Lattice *lattice = basis->basis_spec->lattice;
  
  complex S,Ylm,negtwopiiVnl;
  real dot,posx,posy,posz,kx,ky,kz,kplusGx,kplusGy,kplusGz;
  real qx,qy,qz,q,invq,qhatx,qhaty,qhatz;
  real flq=0.0,r,invdq=0.0;
  real GGTxx,GGTyy,GGTzz,GGTxy,GGTxz,GGTyz;
  real Vol,invsqrtVol;
  real ym1,y0,y1,y2,a,b,c,x;
  int ngamma,gamma,n,j,nmax=0,l,m;
  matrix3 G;

  /* pi et al. */
  const real pi = M_PI;
  const real twopi = (real)2.0*pi;
  const real fourpi = (real)4.0*pi;
  const real sixth = (real)1.0/(real)6.0;
  /* Ylm normalization constants */
  const real N00  =  sqrt(1.0/(4.0*pi));
  const real N10  =  sqrt(3.0/(4.0*pi));
  const real N11  = -sqrt(3.0/(8.0*pi));
  const real N1m1 =  sqrt(3.0/(8.0*pi));
  const real N20  =  sqrt(5.0/(16.0*pi));
  const real N21  = -sqrt(15.0/(8.0*pi));
  const real N2m1 =  sqrt(15.0/(8.0*pi));
  const real N22  =  sqrt(15.0/(32.0*pi));
  const real N2m2 =  sqrt(15.0/(32.0*pi));

  Speciesinfo *sp_info = &(ioninfo->species[sp]);

  /* Vnl will contain the final result */
  ngamma = sp_info->pot.ngamma[lm];
  if (Vnl.tot_ncols != ngamma)
    die("\nIn Vnl_pseudo(), ngamma != Vnl.ncols\n\n");
  if (Vnl.col_length != basis->nbasis)
    die("\nIn Vnl_pseudo(), nbasis != Vnl.col_length\n\n");
  Vnl.zero_out();

  /* Get ionic position, k-vector, cell volume, G, and GGT matrix entries */
  posx = sp_info->atpos[ion].v[0];
  posy = sp_info->atpos[ion].v[1];
  posz = sp_info->atpos[ion].v[2];
  kx = qnum->kvec.v[0];
  ky = qnum->kvec.v[1];
  kz = qnum->kvec.v[2];
  G = lattice->G;
  GGTxx = lattice->GGT.m[0][0];
  GGTyy = lattice->GGT.m[1][1];
  GGTzz = lattice->GGT.m[2][2];
  GGTxy = lattice->GGT.m[0][1];
  GGTxz = lattice->GGT.m[0][2];
  GGTyz = lattice->GGT.m[1][2];
  Vol = lattice->unit_cell_volume;
  invsqrtVol = (real)1.0/sqrt(Vol);

  /* Get l,m values for the potential */
  l = sp_info->pot.l[lm];
  m = sp_info->pot.m[lm];
  if (l < 0 || l > 2)
    die("l != {0,1,2} nonlocal pseudopotentials not supported yet!\n");
  if (m < -l || m > l)
    die("Nonlocal pseudopotential with |m| > l !?!?\n");

  /* Calculate the non-local pseudopotential matrix entries */
  for (gamma = 0; gamma < ngamma; gamma++) {
    invdq = (real)1.0/sp_info->pot.dq_nl[lm][gamma];
    nmax = sp_info->pot.ngrid_nl[lm][gamma]-3;

    for (n = 0; n < basis->nbasis; n++) {
	  /* q = k+G:  here q is in reciprocal lattice coordinates */
	  qx = kplusGx = kx + basis->Gx[n];
	  qy = kplusGy = ky + basis->Gy[n];
	  qz = kplusGz = kz + basis->Gz[n];
	  /* S = exp(-i*dot(q,pos)) */
	  dot = -twopi*(posx*qx + posy*qy + posz*qz);
	  S.x = cos(dot);
	  S.y = sin(dot);

	  q = sqrt( qx*qx*GGTxx + qy*qy*GGTyy + qz*qz*GGTzz +
                2.0*(qx*qy*GGTxy + qx*qz*GGTxz + qy*qz*GGTyz) );

	  /* Interpolate flq using internal tables */
	  r = q*invdq;
	  j = (int)r;
	  if (j > nmax) {
        dft_log(DFT_SILENCE,
                "G=[%d %d %d] q=[%g %g %g] k=[%g %g %g]\n",
                basis->Gx[n],basis->Gy[n],basis->Gz[n],
                qx,qy,qz,kx,ky,kz);
        die("Nonlocal pseudopotential grid is too small!\n");
      }

	  /* Do cubic interlopation on flq with j-1, j, j+1, and j+2
	   * points from internal tables.  If j==0, we clearly can't
	   * real j-1.  However f_l(-q) = (-1)^l*f_l(q) so we can
	   * fill in the j-1 entry with the j+1 entry times (-1)^l */
	  if (j == 0)
	    ym1 = (1-2*(l%2))*sp_info->pot.flq[lm][gamma][j+1];
	  else
	    ym1 = sp_info->pot.flq[lm][gamma][j-1];
	  y0  = sp_info->pot.flq[lm][gamma][j  ];
	  y1  = sp_info->pot.flq[lm][gamma][j+1];
	  y2  = sp_info->pot.flq[lm][gamma][j+2];
	  a = sixth*(y2-ym1+3.0*(y0-y1));
	  b = 0.5*(y1+ym1-y0-y0);
	  c = y1-sixth*(y2+3.0*y0+ym1+ym1);
	  x = r-j;
	  flq = ((a*x+b)*x+c)*x+y0;
	  /* The line below is quadratic interploation: */
	  /* flq = y0 + 0.5*x*(y1-ym1+x*(y1+ym1-y0-y0)); */

	  /* If we're at q=0, then we must handle the case separately
	   * since Ylm and qhat are not defined for q=0.  The cases are:
	   * if l==0, Ylm is const.
	   * if l>0, flq=0 anyways, so even though Ylm is undefined,
	   * it doesn't matter what it is since the final result is zero. */
	  if (q < (real)1.0e-13) {
        if (l==0) { Ylm.x = N00; Ylm.y = 0.0; }
        else	{ Ylm.x = Ylm.y = 0.0; }
        goto skipcalcYlm;
      }

	  /* Otherwise, we have to calculate Ylm: 
	   * qhat is a unit vector in direction of q:  here q is in "real"
	   * bohr^-1 units and q = (k+G)*G where (k+G) is a row-vector. */
	  qx = kplusGx*G.m[0][0] + kplusGy*G.m[1][0] + kplusGz*G.m[2][0];
	  qy = kplusGx*G.m[0][1] + kplusGy*G.m[1][1] + kplusGz*G.m[2][1];
	  qz = kplusGx*G.m[0][2] + kplusGy*G.m[1][2] + kplusGz*G.m[2][2];
	  invq = (real)1.0/q;
	  qhatx = qx*invq;
	  qhaty = qy*invq;
	  qhatz = qz*invq;
	  /* Calculate Ylm */
	  switch(l){
        case 0:
          Ylm.x = N00;
          Ylm.y = 0.0; break;
        case 1:
          switch(m){
            case 0:
              Ylm.x = N10*qhatz;
              Ylm.y = 0.0; break;
            case 1:
              Ylm.x = N11*qhatx;
              Ylm.y = N11*qhaty; break;
            case -1:
              Ylm.x =  N1m1*qhatx;
              Ylm.y = -N1m1*qhaty; break;
            default:
              die("PW_atom_nlpot -- invalid m: %d for l: %d\n", m, l);
          } break;
        case 2:
          switch(m){
            case 0:
              Ylm.x = N20*(3.0*qhatz*qhatz-1.0);
              Ylm.y = 0.0; break;
            case 1:
              Ylm.x = N21*(qhatz*qhatx);
              Ylm.y = N21*(qhatz*qhaty); break;
            case -1:
              Ylm.x = N2m1*( qhatz*qhatx);
              Ylm.y = N2m1*(-qhatz*qhaty); break;
            case 2:
              Ylm.x = N22*(qhatx*qhatx-qhaty*qhaty);
              Ylm.y = N22*(2.0*qhatx*qhaty); break;
            case -2:
              Ylm.x = N2m2*(qhatx*qhatx-qhaty*qhaty);
              Ylm.y = N2m2*(-2.0*qhatx*qhaty); break;
            default:
              die("PW_atom_nlpot -- invalid m: %d for l: %d\n", m, l);
          } break;
        default:
	    die("nonlocal pseudopot: l: %d ?!?!!\n",l);
      }
      
      skipcalcYlm: ;
	  /* Calculate Vnl */
	  Vnl.col[gamma]->data.d[n] = (S*Ylm);
	  Vnl.col[gamma]->data.d[n] *= fourpi*flq*invsqrtVol;
	  /* dVnl(k+G)/dtau[j] = -2*pi*i*(k[j]+G[j])*Vnl(k+G) */
	  negtwopiiVnl.x =  twopi*Vnl.col[gamma]->data.d[n].y;
	  negtwopiiVnl.y = -twopi*Vnl.col[gamma]->data.d[n].x;
	  dVnlx.col[gamma]->data.d[n] = kplusGx*negtwopiiVnl;
	  dVnly.col[gamma]->data.d[n] = kplusGy*negtwopiiVnl;
	  dVnlz.col[gamma]->data.d[n] = kplusGz*negtwopiiVnl;
	}
  }
  
}

//
// Fills in columns start_col to start_col+n_cols_todo-1
// of Vnl and dVx,y,z by calling above routine.
//
static void
dVnl_pseudo_datom_fill_some(Ioninfo *ioninfo,
                            ColumnBundle &Vnl,
                            ColumnBundle &dVx,
                            ColumnBundle &dVy,
                            ColumnBundle &dVz,
                            int sp,int lm,
                            int start_col,int n_cols_todo)
{
  Basis *basis = Vnl.col[0]->basis;
  
  // Temporary workspaces to hold results for each atom
  ColumnBundle Vnloneatom(1,basis,"local");
  ColumnBundle dVxoneatom(1,basis,"local");
  ColumnBundle dVyoneatom(1,basis,"local");
  ColumnBundle dVzoneatom(1,basis,"local");

  // Fill in the columns!
  for (int i=start_col; i < start_col+n_cols_todo; i++) {
    int j;
    
    dVnl_pseudo_datom_pos(sp,i+Vnl.start_ncol,lm,ioninfo,
                          Vnloneatom,dVxoneatom,dVyoneatom,dVzoneatom);
    for (j=0; j < basis->nbasis; j++) {
      Vnl.col[i]->data.d[j] = Vnloneatom.col[0]->data.d[j];
      dVx.col[i]->data.d[j] = dVxoneatom.col[0]->data.d[j];
      dVy.col[i]->data.d[j] = dVyoneatom.col[0]->data.d[j];
      dVz.col[i]->data.d[j] = dVzoneatom.col[0]->data.d[j];
    }
  }
}

#ifdef DFT_THREAD
//
// This routine is what a thread runs:  it calls the above filling
// routine with the parts appropriate for this thread to fill out
//
static void *
dVnl_pseudo_datom_thread(void *arg)
{
    // Decode what's coming in
    dft_thread_data *data = (dft_thread_data *)arg;
    Ioninfo *ioninfo = (Ioninfo *)data->p1;
    ColumnBundle **Varray = (ColumnBundle **)data->p2;
    ColumnBundle *Vnl = Varray[0];
    ColumnBundle *dVx = Varray[1];
    ColumnBundle *dVy = Varray[2];
    ColumnBundle *dVz = Varray[3];
    int sp = data->i1;
    int lm = data->i2;
    int start_col = data->start;
    int n_cols_todo = data->n;

    
    // Fill in the parts that this thread should fill
    dVnl_pseudo_datom_fill_some(ioninfo,*Vnl,*dVx,*dVy,*dVz,
                                sp,lm,start_col,n_cols_todo);
    
    // Free up data-passing struct and exit thread
    myfree(arg);
    return NULL;
    
}
#endif

//
// This routine has Vnl and dVx,y,z filled in with appropriate
// matrix elements of non-local pseudopot. for Kleinman-Bylander case.
//
void
dVnl_pseudo_datom_fill(Ioninfo *ioninfo,
                       ColumnBundle &Vnl,
                       ColumnBundle &dVx,
                       ColumnBundle &dVy,
                       ColumnBundle &dVz,
                       int sp,int lm)
{  
#ifdef DFT_THREAD
  // Threads!  Distribute the filling among threads
  // We need to pack the addresses of Vnl and dVx/y/z since 
  // we can only pass 4 pointers in the current version of threads
  ColumnBundle *(Varray[4]);
  Varray[0] = &Vnl; Varray[1] = &dVx; Varray[2] = &dVy; Varray[3] = &dVz;
  dft_call_threads(Vnl.my_ncols,
                   (void *)ioninfo,(void *)Varray,NULL, NULL, NULL,
                   sp,lm,0,0,0,0,
                   dVnl_pseudo_datom_thread);
#else
  // Serial:  fill them in with one call
  dVnl_pseudo_datom_fill_some(ioninfo,Vnl,dVx,dVy,dVz,
                              sp,lm,0,Vnl.my_ncols);
#endif
}