/*
 * sff2.c:  simple force field: implement routines to read a "prmtop"
 * file from AMBER, and calculate the energy.  Implements only
 * some of AMBER's functionality: bonds, angles, dihedrals, and
 * nonbonded interactions with a distance-dependent dielectric.
 * Does not (yet) include support for period systems.
 *
 * Main interface is through routines "mme2" and "mme_init".
 *
 * Second derivatives, in particular the egb2 function, were
 * implemented by Russ Brown (russ.brown@sun.com).
 *
 * Changes to difang, eangl2 and ephi2 by David A. Case (case@scripps.edu).
 *
 * For the OpenMP implementation, only the egb2 function is parallelized
 * so the econs2, ebond2, eangl2, ephi2 and nbond2 functions execute in
 * single-threaded mode.
 *
 * For the ScaLAPACK implementation, only the egb2 function is parallelized
 * so the econs2, ebond2, eangl2, ephi2 and nbond2 functions all compute
 * each gradient vector and Hessian matrix element.  However, because the
 * gradient and Hessian are distributed, the ptr1d, ptr1f, ptr2d and ptr2f
 * functions are used to write the subvector and submatrix elements by only
 * the process that owns the subvector or submatrix.  The SUPERD_T type is
 * used to facilitate argument transmission to the ptr2f function.
 */

#ifdef SCALAPACK

typedef struct superd {
  INT_T dtype, ctxt, m, n, mb, nb, rsrc, csrc, lld, row, col;
  size_t size;
  INT_T *divmb, *divnb, *modmb, *modnb, *divrw, *divcl, *modrw, *modcl;
} SUPERD_T;

#endif

int numroc_(int *, int *, int *, int *, int *);

void descinit_(int *, int *, int *, int *, int *,
	       int *, int *, int *, int *, int *);

void pdgemr2d_(INT_T *, INT_T *,
	       REAL_T *, INT_T *, INT_T *, INT_T *, 
               REAL_T *, INT_T *, INT_T *, INT_T *,
	       INT_T *);

void pdgemm_(char *, char *, INT_T *, INT_T *, INT_T *, REAL_T *,
	     REAL_T *, INT_T *, INT_T *, INT_T *,
	     REAL_T *, INT_T *, INT_T *, INT_T *, REAL_T *,
	     REAL_T *, INT_T *, INT_T *, INT_T *);

void dgemm_( char *, char *, INT_T *, INT_T*, INT_T*, REAL_T *, REAL_T *,
	     INT_T *, REAL_T *, INT_T *, REAL_T *, REAL_T*, INT_T * );


/***********************************************************************
                            SUPERDESC()
************************************************************************/

/* Create and initialize a superdescriptor. */

#ifdef SCALAPACK

SUPERD_T *superdesc( INT_T *desc )
{
  SUPERD_T *sd;
  int nprow, npcol, myrow, mycol, i, zero=0;
  size_t locp, locq;

  /* Malloc a superdescriptor and copy values from the descriptor. */

  if ( ( sd = (SUPERD_T*)malloc( sizeof(SUPERD_T) ) ) == NULL ) {
    printf("Error allocating superdescriptor in superdesc!\n");
    fflush(stdout);
    mpierror(-1);
  }

  sd->dtype = desc[DTYPE_];
  sd->ctxt  = desc[CTXT_];
  sd->m     = desc[M_];
  sd->n     = desc[N_];
  sd->mb    = desc[MB_];
  sd->nb    = desc[NB_];
  sd->rsrc  = desc[RSRC_];
  sd->csrc  = desc[CSRC_];
  sd->lld   = desc[LLD_];

  if (gb2_debug) {
    printf("superdesc: context = %d\n", sd->ctxt);
    fflush(stdout);
  }

  /* Initialize the division and modulus array pointers to NULL. */

  sd->divmb = sd->modmb = sd->divrw = sd->modrw = NULL;
  sd->divnb = sd->modnb = sd->divcl = sd->modcl = NULL;

  /* Return NULL if the context is invalid. */

  if (sd->lld < 0) return (NULL);

  /* 
   * Calculate the maximum size of the submatrix array.
   * Store the row and column for this process.
   */

  blacs_gridinfo_( &(sd->ctxt), &nprow, &npcol, &myrow, &mycol);
  locp = numroc_( &(sd->m), &(sd->mb), &myrow, &zero, &nprow );
  locq = numroc_( &(sd->n), &(sd->nb), &mycol, &zero, &npcol );

  sd->size = locp * locq;
  sd->row = myrow;
  sd->col = mycol;

  /* Return NULL if the size or the matrix dimensions are invalid. */

  if (sd->size < 0 || sd->m < 0 || sd->n < 0) return (NULL);

  /* Create and fill the division and modulus arrays. */

  sd->divmb = ivector( 0, sd->m );
  sd->modmb = ivector( 0, sd->m );
  sd->divrw = ivector( 0, sd->m );
  sd->modrw = ivector( 0, sd->m );

  sd->divnb = ivector( 0, sd->n );
  sd->modnb = ivector( 0, sd->n );
  sd->divcl = ivector( 0, sd->n );
  sd->modcl = ivector( 0, sd->n );

  for (i = 0; i < sd->m; i++) {
    (sd->divmb)[i] = i/(sd->mb);
    (sd->modmb)[i] = i%(sd->mb);
    (sd->divrw)[i] = i/nprow;
    (sd->modrw)[i] = i%nprow;
  }

  for (i = 0; i < sd->n; i++) {
    (sd->divnb)[i] = i/(sd->nb);
    (sd->modnb)[i] = i%(sd->nb);
    (sd->divcl)[i] = i/npcol;
    (sd->modcl)[i] = i%npcol;
  }

  /* Return the superdescriptor. */

  return (sd);

}

#endif

/***********************************************************************
                            FREE_SUPERDESC()
************************************************************************/

#ifdef SCALAPACK

/* Free a superdescriptor including all of its dynamic arrays. */

void free_superdesc( SUPERD_T *sd )
{
  if (sd == NULL) return;

  if (sd->divmb != NULL) free_ivector( sd->divmb, 0, sd->m );
  if (sd->modmb != NULL) free_ivector( sd->modmb, 0, sd->m );
  if (sd->divrw != NULL) free_ivector( sd->divrw, 0, sd->m );
  if (sd->modrw != NULL) free_ivector( sd->modrw, 0, sd->m );

  if (sd->divnb != NULL) free_ivector( sd->divnb, 0, sd->n );
  if (sd->modnb != NULL) free_ivector( sd->modnb, 0, sd->n );
  if (sd->divcl != NULL) free_ivector( sd->divcl, 0, sd->n );
  if (sd->modcl != NULL) free_ivector( sd->modcl, 0, sd->n );

  free(sd);
}

#endif

/***********************************************************************
                            ADR1D()
************************************************************************/

/* Perform the 1D address mapping from the global to the local address. */

#ifdef SCALAPACK

static
size_t adr1d(int i, int mb, int nprow)
{
  return ( (i/(nprow*mb))*mb + i%mb );
}

#endif

/***********************************************************************
                            ADR2D()
************************************************************************/

#ifdef SCALAPACK

/* Perform the 2D address mapping from the global to the local address. */

static
size_t adr2d(int i, int j, int mb, int nb, int nprow, int npcol, int lld)
{
  size_t lbi, lbj, bai, baj;

  /* Calculate the local block address (lbi, lbj) */

  lbi = i/(nprow*mb);
  lbj = j/(npcol*nb);

  /* Calculate the offset (bai, baj) within the local block. */

  bai = i%mb;
  baj = j%nb;

  /* Calculate the local array address for column-major addressing. */

  return ( lld*(nb*lbj + baj) + mb*lbi + bai );
}

#endif

/***********************************************************************
                            PTR1D()
************************************************************************/

/* Return a pointer to a submatrix element or NULL if undefined. */

#ifdef SCALAPACK

REAL_T *ptr1d( REAL_T *a, int *desc, int i )
{
  int nprow, npcol, myrow, mycol, locp, locq, zero=0;
  size_t adr;

  /* Return NULL if the matrix isn't active in the context. */

  if (desc[CTXT_] < 0) return (NULL);

  /* Return NULL if the global index is out of range. */

  if (i > desc[M_]) {
    printf("Global vector address overflow, i=%d  imax=%d  context=%d\n",
	   i, desc[M_], desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* Get the process grid information. */

  blacs_gridinfo_(&desc[CTXT_], &nprow, &npcol, &myrow, &mycol);

  /*
   * Return NULL if the submatrix doesn't exist in this process.
   * Perform the column test first because the row test is likely
   * to be true for egb2 because a row test is performed at the
   * level of the loop index.
   */

  if ( mycol != 0 || !myroc(i, desc[MB_], nprow, myrow) ) return (NULL);

  /* Calculate the submatrix address and return NULL if out of range. */

  locp = numroc_(&desc[M_], &desc[MB_], &myrow, &zero, &nprow);
  locq = numroc_(&desc[N_], &desc[NB_], &mycol, &zero, &npcol);
  adr = adr1d(i, desc[MB_], nprow);
  if ( adr > (size_t)(locp*locq) ) {
    printf("Local subvector address overflow, adr=%lld  adrmax=%lld  context=%d\n",
	   adr, (size_t)(locp*locq), desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* If the vector array isn't allocated return NULL. */

  if (a == NULL) {
    printf("Local subvector is not allocated, context=%d\n", desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* Otherwise, return the address of the matrix element. */

  return (&a[adr]);
}

#endif

/***********************************************************************
                            PTR1F()
************************************************************************/

/* Return a pointer to a submatrix element or NULL if undefined. */

#ifdef SCALAPACK

REAL_T *ptr1f( REAL_T *a, SUPERD_T *sd, int i )
{
  int lbi, bai, adr;

  /* Return NULL if the superdescriptor is NULL. */

  if (sd == NULL) return (NULL);

  /* Return NULL if the matrix isn't active in the context. */

  if (sd->ctxt < 0) return (NULL);

  /* Return NULL if the global index is out of range. */

  if (i < 0 || i > sd->m) {
    printf("Global vector address overflow, i=%d  imax=%lld  context=%d\n",
	   i, sd->m, sd->ctxt);
    fflush(stdout);
    return (NULL);
  }

  /*
   * Return NULL if the submatrix doesn't exist in this process.
   * Perform the column test first because the row test is likely
   * to be true for egb2 because a row test is performed at the
   * level of the loop index.
   */

  lbi = (sd->divmb)[i];
  if ( sd->col != 0 || (sd->modrw)[lbi] != sd->row ) return (NULL);

  /* Calculate the submatrix address and return NULL if out of range. */

  lbi = (sd->divrw)[lbi];
  bai = (sd->modmb)[i];
  adr = (sd->mb) * lbi + bai;
  if ( adr < 0 || adr > sd->size ) {
    printf("Local subvector address overflow, adr=%lld  adrmax=%lld  context=%d\n",
	   adr, sd->size, sd->ctxt);
    fflush(stdout);
    return (NULL);
  }

  /* If the vector array isn't allocated return NULL. */

  if (a == NULL) {
    printf("Local subvector is not allocated, context=%d\n", sd->ctxt);
    fflush(stdout);
    return (NULL);
  }

  /* Otherwise, return the address of the matrix element. */

  return (&a[adr]);
}

#endif

/***********************************************************************
                            PTR2D()
************************************************************************/

/* Return a pointer to a submatrix element or NULL if undefined. */

#ifdef SCALAPACK

REAL_T *ptr2d( REAL_T *a, int *desc, int i, int j )
{
  int nprow, npcol, myrow, mycol, locp, locq, zero=0;
  size_t adr;

  /* Return NULL if the matrix isn't active in the context. */

  if (desc[CTXT_] < 0) return (NULL);

  /* Return NULL if the global indices are out of range. */

  if (i > desc[M_]) {
    printf("Global matrix address overflow, i=%d  imax=%lld  context=%d\n",
	   i, desc[M_], desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }
  if (j > desc[N_]) {
    printf("Global matrix address overflow, j=%d  jmax=%lld  context=%d\n",
	   j, desc[N_], desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* Get the process grid information. */

  blacs_gridinfo_(&desc[CTXT_], &nprow, &npcol, &myrow, &mycol);

  /* Return NULL if the submatrix doesn't exist in this process. */

  if ( !myroc(i + desc[RSRC_], desc[MB_], nprow, myrow) ||
       !myroc(j + desc[CSRC_], desc[NB_], npcol, mycol) ) return (NULL);

  /* Calculate the submatrix address and return NULL if out of range. */

  locp = numroc_(&desc[M_], &desc[MB_], &myrow, &zero, &nprow);
  locq = numroc_(&desc[N_], &desc[NB_], &mycol, &zero, &npcol);
  adr = adr2d(i, j, desc[MB_], desc[NB_], nprow, npcol, locp);
  if ( adr > (size_t)(locp*locq) ) {
    printf("Local submatrix address overflow, adr=%lld  adrmax=%lld  context=%d\n",
	   adr, (size_t)(locp*locq), desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* If the matrix array isn't allocated return NULL. */

  if (a == NULL) {
    printf("Local submatrix is not allocated, context=%d\n", desc[CTXT_]);
    fflush(stdout);
    return (NULL);
  }

  /* Otherwise, return the address of the matrix element. */

  return (&a[adr]);
}

#endif

/***********************************************************************
                            PTR2F()
************************************************************************/

/*
 * Return a pointer to a submatrix element or NULL if undefined.
 * Always print an error message for 'hard' errors, such as address
 * overflows, but only print an error message for normal NULL,
 * such as 'matrix isn't active in context' if gb2_debug != 0.
 */

#ifdef SCALAPACK

REAL_T *ptr2f( REAL_T *a, SUPERD_T *sd, int i, int j, int *info )
{
  size_t lbi, lbj, bai, baj, adr;

  /* Return NULL if the superdescriptor is NULL. */

  if (sd == NULL) {
    if (gb2_debug) {
      printf("ptr2f: superdescriptor is NULL\n");
      fflush(stdout);
    }
    *info = -1;
    return (NULL);
  }
 
  /* Return NULL if the matrix isn't active in the context. */

  if (sd->ctxt < 0) {
    if (gb2_debug) {
      printf("ptr2f: matrix isn't active in context %d\n", sd->ctxt);
      fflush(stdout);
    }
    *info = -2;
    return (NULL);
  }

  /* Return NULL if the global indices are out of range. */

  if (i < 0) {
    printf("ptr2f: matrix row address underflow, i=%d  imax=%d  context=%d\n",
	   i, sd->m, sd->ctxt);
    fflush(stdout);
    *info = -3;
    return (NULL);
  }
  if (i > sd->m) {
    printf("ptr2f: matrix row address overflow, i=%d  imax=%d  context=%d\n",
	   i, sd->m, sd->ctxt);
    fflush(stdout);
    *info = -4;
    return (NULL);
  }
  if (j < 0) {
    printf("ptr2f: matrix column address underflow, j=%d  jmax=%d  context=%d\n",
	   j, sd->n, sd->ctxt);
    fflush(stdout);
    *info = -5;
    return (NULL);
  }
  if (j > sd->n) {
    printf("ptr2f: matrix column address overflow, j=%d  jmax=%d  context=%d\n",
	   j, sd->n, sd->ctxt);
    fflush(stdout);
    *info = -6;
    return (NULL);
  } 

  /*
   * Return NULL if the submatrix doesn't exist in this process.
   * Perform the column test first because the row test is likely
   * to be true for egb2 because a row test is performed at the
   * level of the loop index.
   *
   * The test is (csrc+j/nb)%npcol != mycol || (rsrc+i/mb)%nprow != myrow
   */

  lbi = (sd->divmb)[i];
  lbj = (sd->divnb)[j];
  if ( (sd->modcl)[sd->csrc + lbj] != sd->col ||
       (sd->modrw)[sd->rsrc + lbi] != sd->row ) {
    if (gb2_debug) {
      printf("ptr2f: submatrix doesn't exist in process, row=%d  col=%d\n",
	     sd->row, sd->col);
      fflush(stdout);
    }
    *info = -7;
    return (NULL);
  }

  /* Calculate the local block address (lbi=i/mb/nprow, lbj=j/nb/npcol) */

  lbi = (sd->divrw)[lbi];
  lbj = (sd->divcl)[lbj];

  /* Calculate the offset (bai=i%mb, baj=j%nb) within the local block. */

  bai = (sd->modmb)[i];
  baj = (sd->modnb)[j];

  /* Calculate the submatrix address and return NULL if out of range. */

  adr = (sd->lld) * ( (sd->nb) * lbj + baj ) + (sd->mb) * lbi + bai;
  if ( adr < 0 || adr > sd->size ) {
    printf("ptr2f: submatrix address overflow, adr=%lld  adrmax=%lld  context=%d\n",
	   adr, sd->size, sd->ctxt);
    fflush(stdout);
    *info = -8;
    return (NULL);
  }

  /* If the matrix array isn't allocated return NULL. */

  if (a == NULL) {
    printf("ptr2f: submatrix is not allocated, context=%d\n", sd->ctxt);
    fflush(stdout);
    *info = -9;
    return (NULL);
  }

  /* Otherwise, return the address of the matrix element. */

  *info = 0;
  return (&a[adr]);
}

#endif

/***********************************************************************
                            ECONS2()
************************************************************************/
 
/* Calculate the constrained energy and derivatives. */

static
REAL_T econs2( REAL_T *x, REAL_T *f, REAL_T *g, 
	       int context_PxQ, int *descF_PxQ, int *descG_PxQ )

{
  int i;
  size_t n;
  REAL_T e_cons, rx, ry, rz;

#ifdef SCALAPACK
  REAL_T *ptr;
#endif

  n = 3*prm->Natom;
  e_cons = 0.0;
  for (i = 0; i < prm->Natom; i++) {
    if (constrained[i]) {
      rx = x[3 * i    ] - x0[3 * i    ];
      ry = x[3 * i + 1] - x0[3 * i + 1];
      rz = x[3 * i + 2] - x0[3 * i + 2];
      e_cons += wcons * (rx * rx + ry * ry + rz * rz);

#ifdef SCALAPACK

      ptr = ptr1d(f, descF_PxQ, 3*i+0);
      if (ptr != NULL) *ptr += 2. * wcons * rx;
      ptr = ptr1d(f, descF_PxQ, 3*i+1);
      if (ptr != NULL) *ptr += 2. * wcons * ry;
      ptr = ptr1d(f, descF_PxQ, 3*i+2);
      if (ptr != NULL) *ptr += 2. * wcons * rz;

      ptr = ptr2d(g, descG_PxQ, 3*i+0, 3*i+0);
      if (ptr != NULL) *ptr += 2. * wcons;
      ptr = ptr2d(g, descG_PxQ, 3*i+1, 3*i+1);
      if (ptr != NULL) *ptr += 2. * wcons;
      ptr = ptr2d(g, descG_PxQ, 3*i+1, 3*i+1);
      if (ptr != NULL) *ptr += 2. * wcons;
#else
      f[3 * i    ] += 2. * wcons * rx;
      f[3 * i + 1] += 2. * wcons * ry;
      f[3 * i + 2] += 2. * wcons * rz;

      g[3*i + 0 + n*(3*i + 0)] += 2. * wcons;
      g[3*i + 1 + n*(3*i + 1)] += 2. * wcons;
      g[3*i + 2 + n*(3*i + 2)] += 2. * wcons;
#endif
    }
  }
  return (e_cons);
}

/***********************************************************************
                            EBOND2()
************************************************************************/
 
/* Calculate bond stretching energy and derivatives. */

static
REAL_T ebond2( int nbond, int *a1, int *a2, int *atype,
	       REAL_T *Rk, REAL_T *Req, REAL_T *x, REAL_T *f, REAL_T *g,
	       int context_PxQ, int *descF_PxQ, int *descG_PxQ )

{
  int i, k, l, at1, at2, atyp;
  size_t n;
  REAL_T e_bond, r, r2, rinv, r2inv, rx, ry, rz, dfx, dfy, dfz, db, df, dg, e;
  REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3], ijterm;

#ifdef SCALAPACK
  REAL_T *ptr;
#endif

  n = 3*prm->Natom;
  e_bond = 0.0;
  for (i = 0; i < nbond; i++) {
    at1 = a1[i];
    at2 = a2[i];
    atyp = atype[i] - 1;

    /* Calculate the interatomic distance and its (x,y,z) components. */

    rx = x[at1    ] - x[at2    ];
    ry = x[at1 + 1] - x[at2 + 1];
    rz = x[at1 + 2] - x[at2 + 2];
    r2 = rx * rx + ry * ry + rz * rz;
    r2inv = 1.0 / r2;
    r = sqrt(r2);
    rinv = r * r2inv;

    /*
     * Calculate the first and second derivatives of the interatomic
     * distance Dij with respect to the cartesian coordinates of atoms
     * i and j.  The results are placed into five arrays:
     *
     *   di[] for the first derivatives with respect to atom i
     *   dj[] for the first derivatives with respect to atom j
     *   d2ii[] for the second derivatives with respect to atom i
     *   d2jj[] for the second derivatives with respect to atom j
     *   d2ij[] for the second derivatives with respect to atoms i and j
     *
     * Some useful symmetry obtains.  The d2ii, d2jj and d2ij arrays
     * are symmetric in that their lower and upper triangles are
     * the transposes of one another.  Also, d2jj equals d2ii, and
     * d2ij is the negative of d2ii.
     *
     * As was exploited by David Case and his colleagues, the vector dj
     * is the negative of the vector di.
     */
      
    di[0] = rx;
    di[1] = ry;
    di[2] = rz;

    dj[0] = -rx;
    dj[1] = -ry;
    dj[2] = -rz;

    /* Load the upper triangle of d2ii. */

    d2ii[0][0] = 1.0 - rx * rx * r2inv;
    d2ii[0][1] = -rx * ry * r2inv;
    d2ii[0][2] = -rx * rz * r2inv;

    d2ii[1][1] = 1.0 - ry * ry * r2inv;
    d2ii[1][2] = -ry * rz * r2inv;

    d2ii[2][2] = 1.0 - rz * rz * r2inv;

    /* Finish loading the rest of all of the matrices. */

    for (k = 0; k < 3; k++) {

      /* Load the upper triangles of d2jj and d2ij. */

      for (l = k; l < 3; l++) {
	d2jj[k][l] = d2ii[k][l];
	d2ij[k][l] = -d2ii[k][l];
      }

      /* Load the symmetric elements of d2ii, d2jj and d2ij. */

      for (l = k+1; l < 3; l++) {
	d2ii[l][k] = d2ii[k][l];
	d2jj[l][k] = d2jj[k][l];
	d2ij[l][k] = d2ij[k][l];
      }
    }

    /*
     * Calculate the bond stretching energy as:
     *
     *      e = Rk*(r - Req)^2
     */

    db = r - Req[atyp];
    df = Rk[atyp] * db;
    e = df * db;
    e_bond += e;

    /*
     * For the calculation of the 1st derivatives of the energy, the
     * 1st derivatives of the bond distance are multiplied by df:
     *
     *      de/dx = df*(dr/dx)
     *
     * where ds/dx represents the 1st derivative of the bond distance, and: 
     *
     *      df = de/dr = 2*Rk*(r - Req)
     *
     * because the chain rule gives:
     *
     *      de/dx = (de/dr)*(dr/dx) = df*(dr/dx)
     *
     * Divide df by r because this factor is not present in dr/dx.
     */

    df *= 2.0 * rinv;

    dfx = df * rx;
    dfy = df * ry;
    dfz = df * rz;

#ifdef SCALAPACK

    ptr = ptr1d(f, descF_PxQ, at1+0);
    if (ptr != NULL) *ptr += dfx;
    ptr = ptr1d(f, descF_PxQ, at1+1);
    if (ptr != NULL) *ptr += dfy;
    ptr = ptr1d(f, descF_PxQ, at1+2);
    if (ptr != NULL) *ptr += dfz;

    ptr = ptr1d(f, descF_PxQ, at2+0);
    if (ptr != NULL) *ptr -= dfx;
    ptr = ptr1d(f, descF_PxQ, at2+1);
    if (ptr != NULL) *ptr -= dfy;
    ptr = ptr1d(f, descF_PxQ, at2+2);
    if (ptr != NULL) *ptr -= dfz;

#else
    f[at1    ] += dfx;
    f[at1 + 1] += dfy;
    f[at1 + 2] += dfz;

    f[at2    ] -= dfx;
    f[at2 + 1] -= dfy;
    f[at2 + 2] -= dfz;
#endif

    /*
     * For the calculation of the 2nd derivatives of the energy, the 1st and 2nd
     * derivatives of the bond distance are multiplied by dg and df as follows:
     *
     *      d2e/dx2dx1 = df*(d2r/dx2dx1) + dg*(dr/dx2)*(dr/dx1)
     *
     * where d2s/dx2dx1 represents a 2nd derivative of the bond distance,
     * dr/dx1 and dr/dx2 represent 1st derivatives of the bond distance, and:
     *
     *      dg = (d/dr)df = (d/dr)(de/dr) = 2*Rk
     *
     * because the chain rule gives:
     *
     *      d2e/dx2dx1 = (d/dx2)(de/dx1)
     *
     *                 = (d/dx2)[(de/dr)*(dr/dx1)]
     *
     *                 = (d/dx2)[df*(dr/dx1)]
     *
     *                 = [(d/dx2)df]*(dr/dx1) + df*[(d/dx2)(dr/dx1)]
     *
     *                 = [(d/dx2)(de/dr)]*(dr/dx1) + df*(d2r/dx2dx1)
     *
     *                 = {[(d/dr)(de/dr)]*(dr/dx2)}*(dr/dx1) + df*(d2r/dx2dx1)
     *
     *                 = dg*(dr/dx2)*(dr/dx1) + df*(d2r/dx2dx1)
     *
     * Divide dg by r^2 because this factor is not present in (dr/dx1)*(dr/dx2).
     */

    dg = 2.0 * Rk[atyp] * r2inv;

    for (k = 0; k < 3; k++) {
      for (l = 0; l < 3; l++) {
	ijterm = df * d2ij[k][l] + dg * di[k] * dj[l];

#ifdef SCALAPACK

	ptr = ptr2d(g, descG_PxQ, at1+l, at1+k);
	if (ptr != NULL) *ptr += df * d2ii[k][l] + dg * di[k] * di[l];
	ptr = ptr2d(g, descG_PxQ, at2+l, at2+k);
	if (ptr != NULL) *ptr += df * d2jj[k][l] + dg * dj[k] * dj[l];
	ptr = ptr2d(g, descG_PxQ, at2+l, at1+k);
	if (ptr != NULL) *ptr += ijterm;
	ptr = ptr2d(g, descG_PxQ, at1+k, at2+l);
	if (ptr != NULL) *ptr += ijterm;
#else
        g[at1 + k + n*(at1 + l)] += df * d2ii[k][l] + dg * di[k] * di[l];
        g[at2 + k + n*(at2 + l)] += df * d2jj[k][l] + dg * dj[k] * dj[l];
        g[at1 + k + n*(at2 + l)] += ijterm;
        g[at2 + l + n*(at1 + k)] += ijterm;
#endif
      }
    }
  }

  if (e_debug)
    EXPR("%9.3f", e_bond);
  return (e_bond);
}

/***********************************************************************
                            DIFANG()
************************************************************************/
 
/*
 * This function sets up second derivatives for angle-like potentials,
 * and is used by the eangl2 and ephi2 functions.
 *
 * Calling parameters are as follows:
 *
 * cst - cosine of theta
 * s - bond vectors ij and kj
 * dc - first der. of costheta w/respect to the cartesian differences in s
 * ddc - output of second derivative of costheta w/respect to the cartesian
 *       differences
 */

static
void difang(REAL_T cst, REAL_T *s, REAL_T *dc, REAL_T **ddc)
{
  REAL_T bij, boi2, boij3, boji3, bkj, boj2, br, bt;
  int i, i1, j;

  /*     ----- first set up distances needed -----   */

  bij = s[1]*s[1] + s[2]*s[2] + s[3]*s[3];
  boi2 = 1.0 / bij;
  bij = sqrt(bij);
  bkj = s[4]*s[4] + s[5]*s[5] + s[6]*s[6];
  boj2 = 1.0 / bkj;
  bkj = sqrt(bkj);
  br = boi2;

  /*
   *     ----- set up ddc first as second derivative of costheta w/respect
   *           to cartesian differences -----
   */

  for (i = 1; i <= 6; i++) {
    if (i == 4) br = boj2;
    ddc[i][i] = -br*(2.0*s[i]*dc[i] + cst*(1.0 - s[i]*s[i]*br));
  }

  for (i = 1; i <= 3; i++) {
    ddc[i][i+3] = -boi2*s[i]*dc[i+3] - boj2*s[i+3]*dc[i]
      - boi2*boj2*(s[i]*s[i+3]*cst - bij*bkj);
  }

  for (i = 1; i <= 2; i++) {
    i1 = i + 1;
    for (j = i1; j <= 3; j++) {
      ddc[i][j] =
	-boi2*(s[i]*dc[j] + s[j]*dc[i] - boi2*cst*s[i]*s[j]);
    }
  }

  for (i = 4; i <= 5; i++) {
    i1 = i + 1;
    for (j = i1; j <= 6; j++) {
      ddc[i][j] =
	-boj2*(s[i]*dc[j] + s[j]*dc[i] - boj2*cst*s[i]*s[j]);
    }
  }

  boij3 = boj2/(bij*bkj);
  boji3 = boi2/(bij*bkj);
  bt = boi2*boj2*cst;
  for (i = 1; i <= 3; i++) {
    for (j = 4; j <= 6; j++) {
      if (j != i + 3)
	ddc[i][j] =
	  -boij3*s[i+3]*s[j] - boji3*s[i]*s[j-3] + bt*s[i]*s[j];
    }
  }

  /*    ----- symmetrize the ddc array -----   */

  for (i = 1; i <= 6; i++) {
    for (j = i; j <= 6; j++) {
      ddc[j][i] = ddc[i][j];
    }
  }
  return;
}

/***********************************************************************
                            EANGL2()
************************************************************************/
 
/* Calculate the bond bending energy and derivatives.*/

static
REAL_T eangl2( int nang, int *a1, int *a2, int *a3, int *atype,
	       REAL_T *Tk, REAL_T *Teq, REAL_T *x, REAL_T *f, REAL_T *g,
	       int context_PxQ, int *descF_PxQ, int *descG_PxQ )
{
  int i, j, ia, atyp, at1, at2, at3, ist, iof, inew, jst, jof, jnew;
  size_t n;
  REAL_T x1, y1, z1, x2, y2, z2, x3, y3, z3;
  REAL_T c, st, st2c, dtheta, theta, df, ddf, e, e_theta;
  REAL_T rij, rkj, rij2, rkj2, rrik;

  REAL_T s[7], dc[7], dr[10];
  REAL_T **ddc, **ddr;

#ifdef SCALAPACK
  REAL_T *ptr;
#endif

  ddc = matrix(1,6,1,6);
  ddr = matrix(1,9,1,9);

  n = 3*prm->Natom;
  e_theta = 0.0;
  for (ia = 0; ia < nang; ia++) {
    at1 = a1[ia];
    at2 = a2[ia];
    at3 = a3[ia];
    atyp = atype[ia] - 1;

    /* Set up the variables x1..x3, y1..y3 and z1..z3 for use later. */

    x1 = x[at1    ];
    y1 = x[at1 + 1];
    z1 = x[at1 + 2];

    x2 = x[at2    ];
    y2 = x[at2 + 1];
    z2 = x[at2 + 2];

    x3 = x[at3    ];
    y3 = x[at3 + 1];
    z3 = x[at3 + 2];

    s[1] = x1 - x2;
    s[2] = y1 - y2;
    s[3] = z1 - z2;
    s[4] = x3 - x2;
    s[5] = y3 - y2;
    s[6] = z3 - z2;
    rij2 = s[1]*s[1] + s[2]*s[2] + s[3]*s[3];
    rkj2 = s[4]*s[4] + s[5]*s[5] + s[6]*s[6];
    rij = sqrt(rij2);
    rkj = sqrt(rkj2);
    rrik = rij * rkj;
    c = (s[1]*s[4] + s[2]*s[5] + s[3]*s[6]) / rrik;

    c = c > 1.0 ? 1.0 : c;
    c = c < -1.0 ? -1.0 : c;
    theta = acos(c);
    dtheta = theta - Teq[atyp];

    /* df and ddf are derivatives of E with respect to c == costheta: */
    df = dtheta * Tk[atyp];
    ddf = 2. * Tk[atyp];
    e = df * dtheta;
    e_theta += e;

    /*
     * Calculate the sine then limit small values of the sine to the range
     * -10**-3..+10**-3 so as to avoid division by zero.
     */

    st = sin(theta);
    if (st > 0 && st < 1.e-3)
      st = 1.e-3;
    else if (st < 0 && st > -1.e-3)
      st = -1.e-3;
    df *= 2.0;   /* check this!!  */

    /* dc = derivative of c with respect to cartesian differences: */

    for( i=1; i<=3; i++ ){
      dc[i] = (s[i+3]/rkj - c*s[i]/rij)/rij;
      dc[i+3] = (s[i]/rij - c*s[i+3]/rkj)/rkj;
    }

    /* get ddc = second derivative of c with respect to 
       cartesian differences:  */
    difang( c, s, dc, ddc );

    /* change ddc to second. der. of theta with respect to 
       cartesian differences:  */
    st2c = c/(st*st);
    for( i=1; i<=6; i++ ){
      for( j=i; j<=6; j++ ){
	ddc[i][j] = -(ddc[i][j] + dc[i]*dc[j]*st2c)/st;
	ddc[j][i] = ddc[i][j];
      }
    }

    /* change dc to derivative of theta w/ respect to cartesian
       differences:  */
    for( i=1; i<=6; i++ ){
      dc[i] = -dc[i]/st;
    }

    /* dr will hold -derivates of theta w/ respect to cartesians: */
    for( i=1; i<=3; i++ ){
      dr[i] = -dc[i];
      dr[i+6] = -dc[i+3];
      dr[i+3] = dc[i] + dc[i+3];
    }

    /* update the forces:  */

#ifdef SCALAPACK

    ptr = ptr1d(f, descF_PxQ, at1+0);
    if (ptr != NULL) *ptr -= df*dr[1];
    ptr = ptr1d(f, descF_PxQ, at1+1);
    if (ptr != NULL) *ptr -= df*dr[2];
    ptr = ptr1d(f, descF_PxQ, at1+2);
    if (ptr != NULL) *ptr -= df*dr[3];

    ptr = ptr1d(f, descF_PxQ, at2+0);
    if (ptr != NULL) *ptr -= df*dr[4];
    ptr = ptr1d(f, descF_PxQ, at2+1);
    if (ptr != NULL) *ptr -= df*dr[5];
    ptr = ptr1d(f, descF_PxQ, at2+2);
    if (ptr != NULL) *ptr -= df*dr[6];

    ptr = ptr1d(f, descF_PxQ, at3+0);
    if (ptr != NULL) *ptr -= df*dr[7];
    ptr = ptr1d(f, descF_PxQ, at3+1);
    if (ptr != NULL) *ptr -= df*dr[8];
    ptr = ptr1d(f, descF_PxQ, at3+2);
    if (ptr != NULL) *ptr -= df*dr[9];

#else

    f[at1    ] -= df*dr[1];
    f[at1 + 1] -= df*dr[2];
    f[at1 + 2] -= df*dr[3];

    f[at2    ] -= df*dr[4];
    f[at2 + 1] -= df*dr[5];
    f[at2 + 2] -= df*dr[6];

    f[at3    ] -= df*dr[7];
    f[at3 + 1] -= df*dr[8];
    f[at3 + 2] -= df*dr[9];

#endif
		
    /* ddr will hold second derivatives of theta with respect to carts: */
    for( i=1; i<=3; i++ ){
      for( j=i; j<=3; j++ ){
	ddr[i][j] = ddc[i][j];
	ddr[i+6][j+6] = ddc[i+3][j+3];
      }
      for( j=1; j<=3; j++ ){
	ddr[i][j+6] = ddc[i][j+3];
	ddr[i][j+3] = -ddc[i][j] - ddc[i][j+3];
	ddr[i+3][j+6] = -ddc[i+3][j] - ddc[i+3][j+3];
      }
    }
    for( i=1; i<=3; i++ ){
      for( j=i; j<=3; j++ ){
	ddr[i+3][j+3] = -ddr[i][j+3] - ddr[i+3][j+6];
      }
    }
    /*
      for( i=1; i<=9; i++ ){
      for( j=i; j<=9; j++ ){
      ddr[j][i] = ddr[i][j];
      }
      }
    */

    for( i=1; i<=9; i++ ){
      ist = at3;
      if( i<=6 ) ist = at2;
      if( i<=3 ) ist = at1;
      iof = i%3 ; if( iof == 0 ) iof =3;
      inew = ist + iof - 1;

      for( j=i; j<=9; j++ ){
	jst = at3;
	if( j<=6 ) jst = at2;
	if( j<=3 ) jst = at1;
	jof = j%3; if( jof == 0 ) jof =3;
	jnew = jst + jof - 1;

#ifdef SCALAPACK

	ptr = ptr2d(g, descG_PxQ, jnew, inew);
	if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j];
	if ( inew != jnew ) {
	  ptr = ptr2d(g, descG_PxQ, inew, jnew);
	  if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j];
	}

#else
	g[inew + n*jnew] += ddf*dr[i]*dr[j] + df*ddr[i][j];
	if( inew != jnew )
	  g[jnew + n*inew] += ddf*dr[i]*dr[j] + df*ddr[i][j];
#endif
      }
    }
  }

  free_matrix( ddc, 1,6,1,6 );
  free_matrix( ddr, 1,6,1,6 );

  if (e_debug)
    EXPR("%9.3f", e_theta);
  return (e_theta);
}

/***********************************************************************
                            EPHI2()
************************************************************************/
 
/* Calculate the dihedral torsion energy and derivatives. */

static
REAL_T ephi2( int nphi, int *a1, int *a2,
	      int *a3, int *a4, int *atype,
	      REAL_T *Pk, REAL_T *Pn, REAL_T *Phase,
	      REAL_T *x, REAL_T *f, REAL_T *g,
	      int context_PxQ, int *descF_PxQ, int *descG_PxQ )
{
  int i, j, k, l, at1, at2, at3, at4, atyp, iper,iphi;
  size_t n;
  int inew, ist, iof, jnew, jst, jof;
  REAL_T e, df, ddf, e_tors;
  REAL_T xij, yij, zij, xkj, ykj, zkj, xkl, ykl, zkl;
  REAL_T dx, dy, dz, gx, gy, gz, bi, bk, ct, boi2, boj2, z1, z2, bioj, bjoi;
  REAL_T ct2, arg, z11, z12, z22;
  REAL_T t[7], dc[7], dr[13];
  REAL_T dum,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12;
  REAL_T **dtx, **ddr, **ddc;

#ifdef SCALAPACK
  REAL_T *ptr;
#endif

  ddc = matrix( 1,6, 1,6 );
  dtx = matrix( 1,6, 1,12 );
  ddr = matrix( 1,12, 1,12 );

  n = 3*prm->Natom;
  e_tors = 0.0;
  for (iphi = 0; iphi < nphi; iphi++) {

    /*
     * Get the atom numbers.  Atom 3 may be negative to specify that
     * the end group interactions are to be ignored.  Atom 4 may be 
     * negative to specify that the dihedral angle is an "improper
     * torsion".  However, neither of these conditions is actually
     * treated in this function, so just ignore negative values.
     */

    at1 = a1[iphi];
    at2 = a2[iphi];
    at3 = abs(a3[iphi]);
    at4 = abs(a4[iphi]);
    atyp = atype[iphi] - 1;


    /*     ----- calculation of the dihedral -----     */


    xij = x[at1    ] - x[at2    ];
    yij = x[at1 + 1] - x[at2 + 1];
    zij = x[at1 + 2] - x[at2 + 2];

    xkj = x[at3    ] - x[at2    ];
    ykj = x[at3 + 1] - x[at2 + 1];
    zkj = x[at3 + 2] - x[at2 + 2];

    xkl = x[at3    ] - x[at4    ];
    ykl = x[at3 + 1] - x[at4 + 1];
    zkl = x[at3 + 2] - x[at4 + 2];

    dx = yij * zkj - zij * ykj;
    dy = zij * xkj - xij * zkj;
    dz = xij * ykj - yij * xkj;

    gx = zkj * ykl - ykj * zkl;
    gy = xkj * zkl - zkj * xkl;
    gz = ykj * xkl - xkj * ykl;

    bi = dx * dx + dy * dy + dz * dz;
    bk = gx * gx + gy * gy + gz * gz;
    ct = dx * gx + dy * gy + dz * gz;

    /*    ----- branch if linear dihedral -----   */

    assert(bi > 0.01);
    assert(bk > 0.01);

    boi2 = 1. / bi;
    boj2 = 1. / bk;
    bi = sqrt(bi);
    bk = sqrt(bk);
    z1 = 1. / bi;
    z2 = 1. / bk;
    bioj = bi * z2;
    bjoi = bk * z1;
    ct = ct * z1 * z2;
    if (ct > 1.)
      ct = 1.;
    if (ct < (-1.))
      ct = -1.;

    /*     ----- ct value above is actually -cosphi; here we change
     *           its sign -----
     */
    ct = -ct;

    /*     ----- calculate the energy and derivatives -----


     *     ----- get df = first der. of potential w/respect to cosphi; and
     *           ddf = second der. of potntial w/respect to cosphi -----

     *           the torsional potential is assumed to have the form:
     *            e = pk(ic) * (1.0+phase*cos(pn(ic)*phi)
     *            where phase = 1.0 or -1.0, and pn = 1,2,3,4, or 6

     *     ----- energy terms for dihedrals are expressed in terms of
     *           cosphi in order to eliminate problems for planar angles ----
     */

  multi_term:
    iper = (int) fabs(Pn[atyp]) + 0.0001;
    assert(iper != (fabs(Pn[atyp]) - 0.0001));
    assert(iper >= 1 && iper <= 6);
    switch (iper) {

    case 1:
      e = ct;
      df = 1.0;
      ddf = 0.0;
      break;

    case 2:
      e = 2.0 * ct * ct - 1.0;
      df = 4.0 * ct;
      ddf = 4.0;
      break;

    case 3:
      ct2 = ct * ct;
      e = ct * (4.0 * ct2 - 3.0);
      df = 12.0 * ct2 - 3.0;
      ddf = 24.0 * ct;
      break;

    case 4:
      ct2 = ct * ct;
      e = 1.0 + ct2 * 8.0 * (ct2 - 1.0);
      df = 16.0 * ct * (ct2 + ct2 - 1.0);
      ddf = 16.0 * (6.0 * ct2 - 1.0);
      break;

    case 6:
      ct2 = ct * ct;
      e = ct2 * (ct2 * (ct2 * 32.0 - 48.0) + 18.0) - 1.0;
      df = ct * (ct2 * (ct2 * 192.0 - 192.0) + 36.0);
      ddf = ct2 * (ct2 * 960.0 - 576.0) + 36.0;

    default:
      fprintf(stderr, "bad periodicity: %d\n", iper);
      exit(1);
    }

    if (Phase[atyp] == 0.0) {
      arg = Pk[atyp];
    } else {
      arg = -Pk[atyp];
      assert(fabs(3.14159 - Phase[atyp]) < 0.01);
    }

    e = Pk[atyp] + arg * e;
    df = df * arg;
    ddf = ddf * arg;
    e_tors += e;

    t[1] = dx;
    t[2] = dy;
    t[3] = dz;

    t[4] = -gx;
    t[5] = -gy;
    t[6] = -gz;

    /*
     *     ----- now, set up array dc = first der. of cosphi w/respect
     *           to the cartesian differences t -----
     */

    z11 = z1 * z1;
    z12 = z1 * z2;
    z22 = z2 * z2;
    for (i = 1; i <= 3; i++) {
      dc[i] = t[i+3] * z12 - ct * t[i] * z11;
      dc[i+3] = t[i] * z12 - ct * t[i+3] * z22;
    }

    /*
     *     ----- subroutine difang will now create array ddc which is second
     *           derivative of cosphi with respect to the t s -----
     */

    difang(ct, t, dc, ddc);

    /* ---now set up array s, given on page 118 of cff book-- */

    s1 = xij;
    s2 = yij;
    s3 = zij;
    s4 = xkj;
    s5 = ykj;
    s6 = zkj;
    s7 = -xkj;
    s8 = -ykj;
    s9 = -zkj;
    s10 = -xkl;
    s11 = -ykl;
    s12 = -zkl;

    /*
     *     ----- set up dtx[i][j] = derivative of t[i] w/respect to x[j]
     *           see p. 120 of cff book -----
     */

    for (i = 1; i <= 6; i++) {
      for (j = 1; j <= 12; j++) {
	dtx[i][j] = 0.0;
      }
    }

    dtx[1][2] = s6;
    dtx[1][3] = -s5;
    dtx[1][5] = s3 - s6;
    dtx[1][6] = s5 - s2;
    dtx[1][8] = -s3;
    dtx[1][9] = s2;
    dtx[2][1] = -s6;
    dtx[2][3] = s4;
    dtx[2][4] = s6 - s3;
    dtx[2][6] = s1 - s4;
    dtx[2][7] = s3;
    dtx[2][9] = -s1;
    dtx[3][1] = s5;
    dtx[3][2] = -s4;
    dtx[3][4] = s2 - s5;
    dtx[3][5] = s4 - s1;
    dtx[3][7] = -s2;
    dtx[3][8] = s1;
    dtx[4][5] = s12;
    dtx[4][6] = -s11;
    dtx[4][8] = s9 - s12;
    dtx[4][9] = s11 - s8;
    dtx[4][11] = -s9;
    dtx[4][12] = s8;
    dtx[5][4] = -s12;
    dtx[5][6] = s10;
    dtx[5][7] = s12 - s9;
    dtx[5][9] = s7 - s10;
    dtx[5][10] = s9;
    dtx[5][12] = -s7;
    dtx[6][4] = s11;
    dtx[6][5] = -s10;
    dtx[6][7] = s8 - s11;
    dtx[6][8] = s10 - s7;
    dtx[6][10] = -s8;
    dtx[6][11] = s7;

    /*
     *     ----- set up dr array, containing -first derivative of cosphi with
     *           respect to cartesians -----
     */

    for (i = 1; i <= 12; i++) {
      dum = 0.0;
      for (j = 1; j <= 6; j++) {
	dum += dc[j]*dtx[j][i];
      }
      dr[i] = -dum;
    }
       

    /*     ----- update the force array ----- */

#ifdef SCALAPACK

    ptr = ptr1d(f, descF_PxQ, at1+0);
    if (ptr != NULL) *ptr -= df * dr[1];
    ptr = ptr1d(f, descF_PxQ, at1+1);
    if (ptr != NULL) *ptr -= df * dr[2];
    ptr = ptr1d(f, descF_PxQ, at1+2);
    if (ptr != NULL) *ptr -= df * dr[3];

    ptr = ptr1d(f, descF_PxQ, at2+0);
    if (ptr != NULL) *ptr -= df * dr[4];
    ptr = ptr1d(f, descF_PxQ, at2+1);
    if (ptr != NULL) *ptr -= df * dr[5];
    ptr = ptr1d(f, descF_PxQ, at2+2);
    if (ptr != NULL) *ptr -= df * dr[6];

    ptr = ptr1d(f, descF_PxQ, at3+0);
    if (ptr != NULL) *ptr -= df * dr[7];
    ptr = ptr1d(f, descF_PxQ, at3+1);
    if (ptr != NULL) *ptr -= df * dr[8];
    ptr = ptr1d(f, descF_PxQ, at3+2);
    if (ptr != NULL) *ptr -= df * dr[9];

    ptr = ptr1d(f, descF_PxQ, at4+0);
    if (ptr != NULL) *ptr -= df * dr[10];
    ptr = ptr1d(f, descF_PxQ, at4+1);
    if (ptr != NULL) *ptr -= df * dr[11];
    ptr = ptr1d(f, descF_PxQ, at4+2);
    if (ptr != NULL) *ptr -= df * dr[12];

#else

    f[at1    ] -= df * dr[1];
    f[at1 + 1] -= df * dr[2];
    f[at1 + 2] -= df * dr[3];

    f[at2    ] -= df * dr[4];
    f[at2 + 1] -= df * dr[5];
    f[at2 + 2] -= df * dr[6];

    f[at3    ] -= df * dr[7];
    f[at3 + 1] -= df * dr[8];
    f[at3 + 2] -= df * dr[9];

    f[at4    ] -= df * dr[10];
    f[at4 + 1] -= df * dr[11];
    f[at4 + 2] -= df * dr[12];

#endif

    /*
     *     ----- now set up the ddr array = second der. of cosphi w/respect
     *           to cartesians; first we take the first term of last formula
     *           on p. 113 of cff book -----
     */

    for (i = 1; i <= 12; i++) {
      for (j = i; j <= 12; j++) {
	ddr[i][j] = 0.0;
	for (k = 1; k <= 6; k++) {
	  for (l = 1; l <= 6; l++) {
	    ddr[i][j] = 
	      ddr[i][j] + ddc[k][l]*dtx[k][i]*dtx[l][j];
	  }
	}
      }
    }

    /*     ----- now do the second term of this equation -----  */

    ddr[2][9] += dc[1];
    ddr[3][5] += dc[1];
    ddr[6][8] += dc[1];
    ddr[2][6] -= dc[1];
    ddr[3][8] -= dc[1];
    ddr[5][9] -= dc[1];
    ddr[1][6] += dc[2];
    ddr[3][7] += dc[2];
    ddr[4][9] += dc[2];
    ddr[1][9] -= dc[2];
    ddr[3][4] -= dc[2];
    ddr[6][7] -= dc[2];
    ddr[1][8] += dc[3];
    ddr[2][4] += dc[3];
    ddr[5][7] += dc[3];
    ddr[1][5] -= dc[3];
    ddr[2][7] -= dc[3];
    ddr[4][8] -= dc[3];
    ddr[5][12] += dc[4];
    ddr[6][8] += dc[4];
    ddr[9][11] += dc[4];
    ddr[5][9] -= dc[4];
    ddr[6][11] -= dc[4];
    ddr[8][12] -= dc[4];
    ddr[4][9] += dc[5];
    ddr[6][10] += dc[5];
    ddr[7][12] += dc[5];
    ddr[4][12] -= dc[5];
    ddr[6][7] -= dc[5];
    ddr[9][10] -= dc[5];
    ddr[4][11] += dc[6];
    ddr[5][7] += dc[6];
    ddr[8][10] += dc[6];
    ddr[4][8] -= dc[6];
    ddr[5][10] -= dc[6];
    ddr[7][11] -= dc[6];

    /*     ----- Now form the second derivative matrix -----  */

    for (i = 1; i <= 12; i++) {
      ist = at4;
      if (i <= 9) ist = at3;
      if (i <= 6) ist = at2;
      if (i <= 3) ist = at1;
      iof = i % 3; if (iof == 0) iof = 3;
      inew = ist + iof - 1;
      for (j = i; j <= 12; j++) {
	jst = at4;
	if (j <= 9) jst = at3;
	if (j <= 6) jst = at2;
	if (j <= 3) jst = at1;
	jof = j % 3; if (jof == 0) jof = 3;
	jnew = jst + jof - 1;

#ifdef SCALAPACK

	ptr = ptr2d(g, descG_PxQ, jnew, inew);
	if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j];
	if (inew != jnew) {
	  ptr = ptr2d(g, descG_PxQ, inew, jnew);
	  if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j];
	}
#else
	g[inew + n*jnew] += ddf*dr[i]*dr[j] + df*ddr[i][j];
	if (inew != jnew)
	  g[jnew + n*inew] += ddf*dr[i]*dr[j] + df*ddr[i][j];
#endif
      }
    }


#ifdef PRINT_EPHI
    printf("%4d %4d %4d %4d %4d %9.4f\n", i + 1, at1 / 3, at2 / 3,
	   at3 / 3, at4 / 3, e);
#endif

    /* A negative value of Pn means that more terms are needed in the expansion. */

    if (Pn[atyp] < 0) {
      atyp++;
      goto multi_term;
    }
  }
  if (e_debug)
    EXPR("%9.3f", e_tors);

  free_matrix( dtx, 1,12, 1,12 );
  free_matrix( ddr, 1,12, 1,12 );

  return (e_tors);
}

/***********************************************************************
                            NBOND2()
************************************************************************/

/* 
 * Calculate the non-bonded energy, 1st and 2nd derivatives.
 * This function is complicated by the fact that it must
 * process two forms of pair lists: the 1-4 pair list and
 * the non-bonded pair list.  The non-bonded pair list
 * must be modified by the excluded atom list whereas the
 * 1-4 pair list is used unmodified.  Also, the non-bonded
 * pair list comprises lower and upper triangles whereas
 * the 1-4 pair list comprises an upper triangle only.
 *
 * Calling parameters are as follows:
 *
 * lpears - the number of pairs on the lower triangle pair list
 * upears - the number of pairs on the upper trianble pair list
 * pearlist - either the 1-4 pair list or the non-bonded pair list
 * N14 - set to 0 for the non-bonded pair list, 1 for the 1-4 pair list
 * x - the atomic coordinate array
 * f - the gradient vector
 * g - the Hessian matrix mapped onto a linear vector
 * enb - Van der Waals energy return value, passed by reference
 * eel - Coulombic energy return value, passed by reference
 * enbfac - scale factor for Van der Waals energy
 * eelfac - scale factor for Coulombic energy
 */

static
int nbond2( int *lpears, int *upears, int **pearlist, int N14,
	    REAL_T *x, REAL_T *f, REAL_T *g, REAL_T *enb, REAL_T *eel,
	    REAL_T enbfac, REAL_T eelfac,
	    int context_PxQ, int *descF_PxQ, int *descG_PxQ )
{
  int i, j, k, l, jn, ic, npr, lpair, iaci;
  size_t n;
  int *iexw;
  REAL_T r, r2, r2inv, df2, dg2, r6, r10, f1, f2, df, dg;
  REAL_T dis, kij, d0, diff, rinv, rs, rssq, eps1, epsi, cgijr, pow, cgi;
  REAL_T dumx, dumy, dumz, enbfaci, eelfaci;
  int nhbpair, ibig, isml;
  REAL_T xi, yi, zi, xij, yij, zij;
  REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3], ijterm;

#ifdef SCALAPACK
  REAL_T *ptr;
#endif

#if 0
  REAL_T hbener, nb14;
  nhbpair = 0;
  hbener = 0.0; 
  nb14 = 0.0;
#endif

#define SIG 0.3
#define DIW 78.0
#define C1 38.5

  /*
   * For the various methods of calculating the nonbonded energy
   * both the energy and its 1st and 2nd derivatives are calculated.
   *
   * For the calculation of the 1st derivatives of the energy, the
   * 1st derivatives the nonbonded distance r are multiplied by df:
   *
   *      de/dx = df*(dr/dx)
   *
   * where dr/dx represents the 1st derivative of the distance, and 
   * 
   *      df = de/dr
   *
   * is required by the chain rule:
   *
   *      de/dx = (de/dr)*(dr/dx) = df*(dr/dx)
   *
   * For the calculation of the 2nd derivatives of the energy, the 1st and
   * 2nd derivatives of the distance are multiplied by dg and df as follows:
   *
   *      d2e/dxjdxi = df*(d2r/dxjdxi) + dg*(dr/dxj)*(dr/dxi)
   *
   * where d2r/dxjdxi represents a 2nd derivative of the distance,
   * dr/dxi and dr/dxj represent 1st derivatives of the distance, and:
   *
   *      dg = (d/dr)df
   *
   * is required by the chain rule:
   *
   *      d2e/dxjdxi = (d/dxj)(de/dxi)
   *
   *                 = (d/dxj)[(de/dr)*(dr/dxi)]
   *
   *                 = (d/dxj)[df*(dr/dxi)]
   *
   *                 = [(d/dxj)df]*(dr/dxi) + df*[(d/dxj)(dr/dxi)]
   *
   *                 = [(d/dxj)(de/dr)]*(dr/dxi) + df*(d2r/dxjdxi)
   *
   *                 = {[(d/dr)(de/dr)]*(dr/dxj)}*(dr/dxi) + df*(d2r/dxjdxi)
   *
   *                 = dg*(dr/dxj)*(dr/dxi) + df*(d2r/dxjdxi)
   *
   */

  n = 3*prm->Natom;
  *enb = 0.;
  *eel = 0.;
  enbfaci = 1. / enbfac;
  eelfaci = 1. / eelfac;

  /*
   * Allocate and initialize the iexw array used for skipping excluded
   * atoms.  Note that because of the manner in which iexw is used, it
   * is necessary to initialize it before only the first iteration of
   * the following loop.
   */

  iexw = ivector(-1, prm->Natom);
  for (i = -1; i < prm->Natom; i++) {
    iexw[i] = -1;
  }

  /* Loop over all atoms i except for the final atom i. */

  for (i = 0; i < prm->Natom - 1; i++) {

    /* Check whether there are any atoms j on the pair list of atom i. */

    npr = upears[i];
    if (npr <= 0) continue;

    iaci = prm->Ntypes * (prm->Iac[i] - 1);
    dumx = 0.0;
    dumy = 0.0;
    dumz = 0.0;
    xi = x[3 * i    ];
    yi = x[3 * i + 1];
    zi = x[3 * i + 2];
    cgi = eelfaci * prm->Charges[i];

    /*
     * Expand the excluded list into the iexw array by storing i
     * at array address j.
     */

    for (j = 0; j < prm->Iblo[i]; j++) {
      iexw[IexclAt[i][j] - 1] = i;
    }

    /*
     * If the 'N14' calling parameter is clear, use the beginning
     * address of the upper triangle pair list, which happens
     * to be the number of atoms on the lower triangle pair list.
     * If the 'N14' calling parameter is set, the beginning
     * address is zero because no lower triangle pair list is
     * used for the N14 interactions.
     */

    if (N14 == 0) {
      lpair = lpears[i];
    } else {
      lpair = 0;
    }

    /* Select atom j from the pair list.  Non-graceful error handling. */

    for (jn = 0; jn < npr; jn++) {

      if (pearlist[i] == NULL) {
	printf("NULL pair list entry in nbond2, taskid = %d\n", mytaskid);
	fflush(stdout);
      }
      j = pearlist[i][lpair + jn];

      /*
       * If the 'N14' calling parameter is clear, check whether
       * this i,j pair is exempted by the excluded atom list.
       */

      if (N14 != 0 || iexw[j] != i) {
	xij = xi - x[3 * j    ];
	yij = yi - x[3 * j + 1];
	zij = zi - x[3 * j + 2];
	r2 = xij * xij + yij * yij + zij * zij;
	r2inv = 1.0 / r2;
	r = sqrt(r2);
	rinv = r * r2inv;

	/*
	 * Calculate the first and second derivatives of the interatomic
	 * distance Dij with respect to the cartesian coordinates of atoms
	 * i and j.  The results are placed into five arrays:
	 *
	 *   di[] for the first derivatives with respect to atom i
	 *   dj[] for the first derivatives with respect to atom j
	 *   d2ii[] for the second derivatives with respect to atom i
	 *   d2jj[] for the second derivatives with respect to atom j
	 *   d2ij[] for the second derivatives with respect to atoms i and j
	 *
	 * Some useful symmetry obtains.  The d2ii, d2jj and d2ij arrays
	 * are symmetric in that their lower and upper triangles are
	 * the transposes of one another.  Also, d2jj equals d2ii, and
	 * d2ij is the negative of d2ii.
	 *
	 * As was exploited by David Case and his colleagues, the vector dj
	 * is the negative of the vector di.
	 */
      
	di[0] = xij;
	di[1] = yij;
	di[2] = zij;

	dj[0] = -xij;
	dj[1] = -yij;
	dj[2] = -zij;

	/* Load the upper triangle of d2ii. */

	d2ii[0][0] = 1.0 - xij * xij * r2inv;
	d2ii[0][1] = -xij * yij * r2inv;
	d2ii[0][2] = -xij * zij * r2inv;

	d2ii[1][1] = 1.0 - yij * yij * r2inv;
	d2ii[1][2] = -yij * zij * r2inv;

	d2ii[2][2] = 1.0 - zij * zij * r2inv;

	/* Finish loading the rest of all of the matrices. */

	for (k = 0; k < 3; k++) {

	  /* Load the upper triangles of d2jj and d2ij. */

	  for (l = k; l < 3; l++) {
	    d2jj[k][l] = d2ii[k][l];
	    d2ij[k][l] = -d2ii[k][l];
	  }

	  /* Load the symmetric elements of d2ii, d2jj and d2ij. */

	  for (l = k+1; l < 3; l++) {
	    d2ii[l][k] = d2ii[k][l];
	    d2jj[l][k] = d2jj[k][l];
	    d2ij[l][k] = d2ij[k][l];
	  }
	}

	/* Calculate the energy and derivatives according to dield. */

	if (dield == -3) {

	  /* special code Ramstein & Lavery dielectric, 94 force field */

	  rs = SIG * r;
	  rssq = rs * rs;
	  pow = exp(-rs);
	  eps1 = rssq + rs + rs + 2.0;
	  epsi = 1.0 / (DIW - C1 * pow * eps1);
	  cgijr = cgi * prm->Charges[j] * rinv * epsi;
	  *eel += cgijr;
	  df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi);
	  dg2 =
	    cgijr*(epsi*(C1*pow*SIG*(rssq*(2.0*(epsi*C1*pow*SIG*rssq + rinv)
					   + SIG) - 2.0*SIG*rs)) + 2.0*r2inv);
	  ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	  if( ic >= 0 ){
	    r6 = r2inv * r2inv * r2inv;
	    f2 = prm->Cn2[ic] * r6;
	    f1 = prm->Cn1[ic] * r6 * r6;
	    *enb += (f1 - f2) * enbfaci;
	    df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    dg = dg2 + (156.0 * f1 - 42.0 * f2 * enbfaci) * r2inv;
	  } else {
	    df = df2 * rinv;
	    dg = dg2;
	  }

	} else if (dield == -4) {

	  /* distance-dependent dielectric code, 94 ff */
	  /* epsilon = r  */

	  rs = cgi * prm->Charges[j] * r2inv;
	  df2 = -2.0 * rs;
	  dg2 = 6.0 * rs * r2inv;
	  *eel += rs;
	  ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	  if( ic >= 0 ){
	    r6 = r2inv * r2inv * r2inv;
	    f2 = prm->Cn2[ic] * r6;
	    f1 = prm->Cn1[ic] * r6 * r6;
	    *enb += (f1 - f2) * enbfaci;
	    df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    dg = dg2 + (156.0 * f1 - 42.0 * f2 * enbfaci) * r2inv;
	  } else {
	    df = df2 * rinv;
	    dg = dg2;
	  }

	} else if (dield == -5) {

	  /* non-bonded term from yammp  */

	  dis = r;
	  ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	  d0 = prm->Cn2[ic];
	  if (dis < d0) {
	    kij = prm->Cn1[ic];
	    diff = dis - d0;
	    *enb += kij * diff * diff;
	    df = 2.0 * kij * diff;
	    dg = 2.0 * kij;
	  } else {
	    df = 0.0;
	    dg = 0.0;
	  }
	} else {

	  /*
	   * Code for various dielectric models.
	   * The df2 variable should hold r(dV/dr).
	   */

	  if (dield == 0) {

	    /* epsilon = r  */

	    rs = cgi * prm->Charges[j] * r2inv;
	    df2 = -2.0 * rs;
	    dg2 = 6.0 * rs * r2inv;
	    *eel += rs;

	  } else if (dield == 1) {
	      
	    /* epsilon = 1  */

	    rs = cgi * prm->Charges[j] * rinv;
	    df2 = -rs;
	    dg2 = 2.0 * rs * r2inv;
	    *eel += rs;

	  } else if (dield == -2) {

	    /* Ramstein & Lavery dielectric, PNAS 85, 7231 (1988). */

	    rs = SIG * r;
	    rssq = rs * rs;
	    pow = exp(-rs);
	    eps1 = rssq + rs + rs + 2.0;
	    epsi = 1.0 / (DIW - C1 * pow * eps1);
	    cgijr = cgi * prm->Charges[j] * rinv * epsi;
	    *eel += cgijr;
	    df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi);
	    dg2 =
	      cgijr*(epsi*(C1*pow*SIG*(rssq*(2.0*(epsi*C1*pow*SIG*rssq + rinv)
					     + SIG) - 2.0*SIG*rs)) + 2.0*r2inv);
	  }

	  /* Calculate either Van der Waals or hydrogen bonded term. */

	  ic = prm->Cno[iaci + prm->Iac[j] - 1];
	  if (ic > 0 || enbfac != 1.0) {
	    if (ic > 0) {
	      ic--;
	    } else {
	      ibig = prm->Iac[i] > prm->Iac[j] ?
		prm->Iac[i] : prm->Iac[j];
	      isml = prm->Iac[i] > prm->Iac[j] ?
		prm->Iac[j] : prm->Iac[i];
	      ic = ibig * (ibig - 1) / 2 + isml - 1;
	    }
	    r6 = r2inv * r2inv * r2inv;
	    f2 = prm->Cn2[ic] * r6;
	    f1 = prm->Cn1[ic] * r6 * r6;
	    *enb += (f1 - f2) * enbfaci;
	    df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    dg = dg2 + (156.0 * f1 - 42.0 * f2) * enbfaci * r2inv;
#if 0
	    if( enbfac != 1.0 ) nb14 += (f1-f2)*enbfaci;
#endif
	  } else {
	    ic = -ic - 1;
	    r10 = r2inv * r2inv * r2inv * r2inv * r2inv;
	    f2 = prm->HB10[ic] * r10;
	    f1 = prm->HB12[ic] * r10 * r2inv;
	    *enb += (f1 - f2) * enbfaci;
	    df = (df2 + (10.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    dg = dg2 + (156.0 * f1 - 110.0 * f2 * enbfaci) * r2inv;
	    nhbpair++;
#if 0
	    hbener += (f1-f2)*enbfaci;
#endif
	  }
	}

	/* Divide by r here instead dividing the derivatives by r. */

	df *= rinv;
	dg *= r2inv;

	/* Update the gradient and the Jacobian. */

	dumx += df * di[0];
	dumy += df * di[1];
	dumz += df * di[2];

#ifdef SCALAPACK

	ptr = ptr1d(f, descF_PxQ, 3*j+0);
	if (ptr != NULL) *ptr += df * dj[0];
	ptr = ptr1d(f, descF_PxQ, 3*j+1);
	if (ptr != NULL) *ptr += df * dj[1];
	ptr = ptr1d(f, descF_PxQ, 3*j+2);
	if (ptr != NULL) *ptr += df * dj[2];
#else
	f[3 * j    ] += df * dj[0];
	f[3 * j + 1] += df * dj[1];
	f[3 * j + 2] += df * dj[2];
#endif

	/*
	 * Calculate the second derivatives of Dij for the Van der Waals
	 * and Coulombic interactions.
	 */

	for (k = 0; k < 3; k++) {
	  for (l = 0; l < 3; l++) {
	    ijterm = df * d2ij[k][l] + dg * di[k] * dj[l];

#ifdef SCALAPACK

	    ptr = ptr2d(g, descG_PxQ, 3*i+l, 3*i+k);
	    if (ptr != NULL) *ptr += df * d2ii[k][l] + dg * di[k] * di[l];
	    ptr = ptr2d(g, descG_PxQ, 3*j+l, 3*j+k);
	    if (ptr != NULL) *ptr += df * d2jj[k][l] + dg * dj[k] * dj[l];
	    ptr = ptr2d(g, descG_PxQ, 3*j+l, 3*i+k);
	    if (ptr != NULL) *ptr += ijterm;
	    ptr = ptr2d(g, descG_PxQ, 3*i+k, 3*j+l);
	    if (ptr != NULL) *ptr += ijterm;
#else
	    g[3*i + k + n*(3*i + l)] += df * d2ii[k][l] + dg * di[k] * di[l];
	    g[3*j + k + n*(3*j + l)] += df * d2jj[k][l] + dg * dj[k] * dj[l];
	    g[3*i + k + n*(3*j + l)] += ijterm;
	    g[3*j + l + n*(3*i + k)] += ijterm;
#endif
	  }
	}
      }
    }

    /* For atom i, the gradient is updated in the i-loop only. */

#ifdef SCALAPACK

    ptr = ptr1d(f, descF_PxQ, 3*i+0);
    if (ptr != NULL) *ptr += dumx;

    ptr = ptr1d(f, descF_PxQ, 3*i+1);
    if (ptr != NULL) *ptr += dumy;

    ptr = ptr1d(f, descF_PxQ, 3*i+2);
    if (ptr != NULL) *ptr += dumz;

#else

    f[3 * i    ] += dumx;
    f[3 * i + 1] += dumy;
    f[3 * i + 2] += dumz;

#endif

  }

  /* Deallocate the iexw array. */

  free_ivector(iexw, -1, prm->Natom);

  return (0);
}

/***********************************************************************
                            EGB2()
************************************************************************/

/*
 * Calculate the generalized Born energy, 1st and 2nd derivatives.
 *
 * Calling parameters are as follows:
 *
 * lpears - the number of pairs on the lower triangle pair list
 * upears - the number of pairs on the upper trianble pair list
 * pearlist - the pair list, contiguous for the upper and lower triangles
 * x - input: the atomic (x,y,z) coordinates
 * f - updated: the gradient vector
 * g - updated: the Hessian matrix
 * grad - updated: the gradient vector in context_1x1
 * fs - input: overlap parameters
 * rborn - input: atomic radii
 * q - input: atomic charges
 * kappa - input: inverse of the Debye-Huckel length
 * diel_ext - input: solvent dielectric constant
 * enb - updated: Lennard-Jones energy
 * eelt - updated: gas-phase electrostatic energy
 * context_Nx1 - input: the general context for ScaLAPACK
 * context_1x1 - input: the non-distributed vector context for ScaLAPACK
 * context_PxQ - input: the distributed matrix context for ScaLAPACK
 * desc_3Nx1 - input: ScaLAPACK descriptor for the gradient vector in context_PxQ
 * desc_1x1 - input: ScaLAPACK descriptor for the gradient vector in context_1x1
 * desc_3Nx3N - input: ScaLAPACK descriptor for the Hessian matrix in context_PxQ
 * gridim - input: the ScaLAPACK process grid dimension (=1 for single task)
 * freevectors - input: if !=0 free the static vectors and return
 */

static
REAL_T egb2( int *lpears, int *upears, int **pearlist, REAL_T *x, REAL_T *f,
	     REAL_T *g, REAL_T *grad, REAL_T *fs, REAL_T *rborn, REAL_T *q,
	     REAL_T *kappa, REAL_T *diel_ext, REAL_T *enb, REAL_T *eelt,
	     int context_Nx1, int context_1x1, int context_PxQ,
	     int *desc_3Nx1, int *desc_1x1, int *desc_3Nx3N,
	     int gridim, int freevectors )

#define BOFFSET 0.09
#define KSCALE 0.73

#define PIx4 12.5663706143591724639918
#define PIx2  6.2831853071795862319959
#define PIx1  3.1415926535897931159979

  /*FGB taylor coefficients follow */
  /* from A to L :                 */
  /* 1/3 , 2/5 , 3/7 , 4/9 , 5/11  */
  /* 4/3 , 12/5 , 24/7 , 40/9 , 60/11 */
  /* 20/3, 84/5, 216/7, 440/9, 780/11 */

#define TA 0.33333333333333333333
#define TB 0.4
#define TC 0.42857142857142857143
#define TD 0.44444444444444444444
#define TDD 0.45454545454545454545

#define TE 1.33333333333333333333
#define TF 2.4
#define TG 3.42857142857142857143
#define TH 4.44444444444444444444
#define THH 5.45454545454545454545

#define TI 6.66666666666666666667
#define TJ 16.8
#define TK 30.8571428571428571429
#define TL 48.8888888888888888889
#define TLL 70.9090909090909090909

{

#ifdef SCALAPACK
  int zero=0, one=1, info=0;
  int myrow, mycol, nprow, npcol;
  int bs, bs3, ierror, lld;
  static int desc_NxN[DLEN_], desc_Nx3N[DLEN_];
  static SUPERD_T *super_3Nx3N=NULL, *super_NxN=NULL, *super_Nx3N=NULL;
  static REAL_T *reductarr=NULL, *gg=NULL;
  static REAL_T *sg=NULL, *sh=NULL, *sr=NULL, *sn=NULL;
  REAL_T *ptr;
  size_t ii, locp;
#endif

#ifdef OPENMP
  int iexcl_last;
#endif

  int i, j, k, l, m, n, n2, m3;
  int npairs, ic, iaci, threadnum, numthreads;
  int *iexw;
  size_t in, i3, in3, jn, j3, jn3, nn, n3, nn3;
  REAL_T elec, evdw, epol, dielfac, qi, qj, qiqj, fgbi, fgbk, rb2, expmkf;
  REAL_T xi, yi, zi, xj, yj, zj, xij, yij, zij;
  REAL_T dumbo,tmpsd;
  REAL_T df, dg, dx, dy, dz;
  REAL_T dedx, dedy, dedz;
  REAL_T dij1i, dij2i, dij3i, temp1, temp2, t1, t2;
  REAL_T qi2h, qid2h, datmp, da2tmp;
  REAL_T theta, ri1i;

  REAL_T dij, sumi, ijterm;
  REAL_T eel, f6, f12, rinv, r2inv, r4inv, r6inv;
  REAL_T r2, ri, ri2, riinv, rj, rjinv, rirjinv, rj2, sj, sj2, thi;
  REAL_T uij, efac, temp4, temp5, temp6, temp7, sumda;
  REAL_T d_dij, e_dij, f_dij, g_dij, h_dij, i_dij;
  REAL_T j_dij, l_dij, m_dij, n_dij;
  REAL_T a_dij, b_dij, c_dij, p_dij, r_dij;
  REAL_T temp8, temp9, temp10, temp11, temp12, temp13;
  REAL_T temp14, temp15, temp16, temp17, temp18;
  REAL_T temp20, temp21, temp22, temp23;

  int count, count2, icount;
  REAL_T si, sumAij, sumAjk, sumAijAjk, sumdAijddijdxi;
  REAL_T sumdAijddijdyi, sumdAijddijdzi, sumdAijddijdxiAjk;
  REAL_T sumdAijddijdyiAjk, sumdAijddijdziAjk, rij, tmpaij, Aij,
    dAijddij;
  REAL_T dAijddijdxj, dAijddijdyj, dAijddijdzj;
  REAL_T sumdAjkddjkdxj, sumdAjkddjkdyj, sumdAjkddjkdzj, p3p4Aij;
  REAL_T xk, yk, zk, rjk2, djk1i, rjk, vdw2dif, tmpajk, Ajk, sumAjk2,
    dAjkddjk;
  REAL_T dAjkddjkdxj, dAjkddjkdyj, dAjkddjkdzj, lastxj, lastyj, lastzj;
  REAL_T dAidxj, dAidyj, dAidzj, Ai, dAidxi, dAidyi, dAidzi;
  REAL_T totsasa;
  REAL_T rgbmax1i, rgbmax2i, rgbmaxpsmax2;
  REAL_T dxj, dyj, dzj;
  REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3];

  char transa, transb;
  REAL_T dblone=1.0;

  static REAL_T *reff=NULL, *dreff=NULL, *psi=NULL;
  static REAL_T *sumdeijda=NULL, *sumdeijdn=NULL;
  static REAL_T *sumdeijdg=NULL, *sumdeijdh=NULL;


  /* If PRINT_EGB2_TIMES is defined print some calculation times. */

#undef PRINT_EGB2_TIMES


  /*
   * If ROW_CYCLIC is defined allocate the Hessian and some temporary
   * matrices in a row cyclic manner via the "first touch" method.
   */

#define ROW_CYCLIC

  if (gb2_debug) {
    printf("enter egb2, n = %d\n", prm->Natom);
#ifdef SCALAPACK
    printf("egb2: context_Nx1=%d  context_1x1=%d  context_PxQ=%d\n",
	 context_Nx1, context_1x1, context_PxQ);
    printf("egb2: descF_PxQ[CTXT_]=%d  descF_1x1[CTXT_]=%d  descG_PxQ[CTXT_]=%d\n",
	   desc_3Nx1[CTXT_], desc_1x1[CTXT_], desc_3Nx3N[CTXT_]);
#endif
    fflush(stdout);
  }

  t1 = seconds();
  count = 0;
  totsasa = 0.0;

  /* Compute these because they are used a fair amount. */

  n = prm->Natom;
  nn = n * n;
  n2 = 2 * n;
  m3 = 3 * n;
  n3 = m3;
  nn3 = n * n3;

  /*
   * If freevectors != 0, deallocate the static arrays that have been
   * previously allocated and return.  Note that when ScaLAPACK is
   * defined, the size of the arrays is determined by the 'size' field
   * of the associated superdescriptor, and so in principle the array
   * should be free before its associated superdescriptor.  In practice,
   * however, the free_vector function doesn't use the size argument,
   * so any dummy value could be supplied.
   */

  if (freevectors != 0) {

#ifndef SCALAPACK

    free_vector( sumdeijdn, 0, nn );
    sumdeijdn = NULL;

    free_vector( sumdeijdg, 0, nn3 );
    sumdeijdg = NULL;

    free_vector( sumdeijdh, 0, nn3 );
    sumdeijdh = NULL;

    free_vector( dreff, 0, nn3 );
    dreff = NULL;

#else

    if (context_PxQ >= 0) {

      free_superdesc( super_3Nx3N );
      super_3Nx3N = NULL;

      free_vector( sumdeijdn, 0, super_NxN->size );
      sumdeijdn = NULL;

      free_superdesc( super_NxN );
      super_NxN = NULL;

      free_vector( sumdeijdg, 0, super_Nx3N->size );
      sumdeijdg = NULL;

      free_vector( sumdeijdh, 0, super_Nx3N->size );
      sumdeijdh = NULL;

      free_vector( dreff, 0, super_Nx3N->size );
      dreff = NULL;

      free_superdesc( super_Nx3N );
      super_Nx3N = NULL;

      free_vector( reductarr, 0, 3*n3 );
      reductarr = NULL;
    }

    free_vector( gg, 0, 3*n3 );
    gg = NULL;

    free_vector( sg, 0, n3 );
    sg = NULL;

    free_vector( sh, 0, n3 );
    sh = NULL;

    free_vector( sr, 0, n3 );
    sr = NULL;

    free_vector( sn, 0, n );
    sn = NULL;

#endif

    free_vector( reff, 0, n );
    reff = NULL;

    free_vector( sumdeijda, 0, n );
    sumdeijda = NULL;

    free_vector( psi, 0, n );
    psi = NULL;

    if (gb2_debug) {
      printf("freevectors\n");
      fflush(stdout);
    }

    return (0.0);
  }

  /* Allocate some static arrays if they have not been allocated already. */

  if (reff == NULL) reff = vector(0, n);
  if (sumdeijda == NULL) sumdeijda = vector(0, n);
  if ( (psi == NULL) && (gb==2 || gb==5) ) psi = vector(0, n);

#ifndef SCALAPACK

  /*
   * If SCALAPACK is not defined, allocate sumdeijdn, sumdeijdg, sumdeijdh
   * and dreff as NAB vectors.
   */

  if (sumdeijdn == NULL) sumdeijdn = vector(0, nn);
  if (sumdeijdg == NULL) sumdeijdg = vector(0, nn3);
  if (sumdeijdh == NULL) sumdeijdh = vector(0, nn3);
  if (dreff == NULL) dreff = vector(0, nn3);

#else

  /*
   * If SCALAPACK is defined, initialize matrix descriptors for the
   * sumdeindn, sumdeijdg, sumdeijdh and dreff matrices, then create
   * superdescriptors that contain the matrix size (for each process),
   * as well as precomputed division and modulus arrays to accelerate
   * the mapping from a global matrix (row, column) address to a local
   * submatrix array address.  Once the superdescriptors are created,
   * allocate the submatrix arrays.
   *
   * Get the block size for the square block and multiply by three.
   */

  bs = BLOCKSIZE;
  bs3 = 3*bs;

  /*
   * Get the number of rows and columns on the block cyclic (PxQ) process
   * grid, as well as this task's row and column on the grid.
   */

  blacs_gridinfo_(&context_PxQ, &nprow, &npcol, &myrow, &mycol);

  /*
   * Allocate the reduction array in all processes because MPI_Allreduce
   * will use MPI_COMM_WORLD as its communicator.  It is unclear that
   * it is possible to use a ScaLAPACK context as an MPI communicator,
   * so the default MPI_COMM_WORLD is used instead.
   */

  if (reductarr == NULL) reductarr = vector(0, 3*n3);

  /*
   * If this task is on the process grid, set up the array descriptors.
   * If this task isn't on the process grid, set desc_NxN[CTXT_] and
   * desc_Nx3N[CTXT_] to -1 in case any of the associated matrices are
   * called by the pdgemr2d_ function, and set the superdescriptors
   * to NULL.
   */

    /*
     * Distribute the sumdeijdg, sumdeijdh and dreff matrices on the
     * block cyclic process grid.  Because the number of columns is
     * three times the number of rows for these matrices, use a row
     * block size of BLOCKSIZE and a column block size of 3*BLOCKSIZE
     * so that a test of the row index i and the column index j (not 3*j)
     * may be used to identify the grid process to which a global matrix
     * address maps.
     *
     * Distribute the sumdeijdn matrix on the block cyclic process
     * grid.  Use row and column block sizes of BLOCKSIZE because
     * the number of rows equals the number of columns for this matrix.
     *
     * The numroc_ function is used to calculate the number of matrix
     * elements that are distributed across a PxQ processor grid.
     *
     * Once the descriptor has been initialized by the descinit function,
     * create a superdescriptor from the descriptor.
     *
     * Allocate the gg, sg, sh, sr and sn vectors for the diagonal
     * elements of the g, sumdeijdg, sumdeijdh, sumdeijdr and
     * sumdeijdn matrices.
     */

  if (context_PxQ >= 0) {

    if (super_3Nx3N == NULL) {
      if ( ( super_3Nx3N = superdesc(desc_3Nx3N) ) == NULL ) {
	printf("egb2: superdescriptor 3Nx3N == NULL, mytaskid = %d\n", mytaskid);
	fflush(stdout);
      }
    }

    if (super_NxN == NULL) {
      locp = numroc_(&n, &bs, &myrow, &zero, &nprow);
      lld = locp;
      descinit_(desc_NxN, &n, &n, &bs, &bs, &zero, &zero,
		&context_PxQ, &lld, &info);
      if (info) {
	printf("egb2: error in desc_NxN initialization, mytaskid = %d\n", mytaskid);
	fflush(stdout);
      }
      if ( ( super_NxN = superdesc(desc_NxN) ) == NULL) {
	printf("egb2: superdescriptor NxN == NULL, mytaskid = %d\n", mytaskid);
	fflush(stdout);
      }
    }
    if (sumdeijdn == NULL && super_NxN->size > 0) {
      sumdeijdn = vector( 0, super_NxN->size );
    }

    if (super_Nx3N == NULL) {
      locp = numroc_(&n, &bs, &myrow, &zero, &nprow);
      lld = locp;
      descinit_(desc_Nx3N, &n, &m3, &bs, &bs3, &zero, &zero,
		&context_PxQ, &lld, &info);
      if (info) {
	printf("egb2: error in desc_Nx3N initialization, mytaskid = %d\n", mytaskid);
	fflush(stdout);
      }
      if ( ( super_Nx3N = superdesc(desc_Nx3N) ) == NULL) {
	printf("egb2: superdescriptor Nx3N == NULL, mytaskid = %d\n", mytaskid);
	fflush(stdout);
      }
    }
    if (sumdeijdg == NULL && super_Nx3N->size > 0) {
      sumdeijdg = vector( 0, super_Nx3N->size );
    }
    if (sumdeijdh == NULL && super_Nx3N->size > 0) {
      sumdeijdh = vector( 0, super_Nx3N->size );
    }
    if (dreff == NULL && super_Nx3N->size > 0) {
      dreff = vector( 0, super_Nx3N->size );
    }
  } else {
    desc_NxN[CTXT_] = desc_Nx3N[CTXT_] =  -1;
    super_3Nx3N = super_NxN = super_Nx3N = NULL;
  }

  gg = vector( 0, 3*n3 );
  sr = vector( 0, n3 );
  sg = vector( 0, n3 );
  sh = vector( 0, n3 );
  sn = vector( 0, n );

  if (gb2_debug) {
    printf("vector allocation, mytaskid = %d\n", mytaskid);
    fflush(stdout);
  }

#endif

  /*
   * Smooth "cut-off" in calculating GB effective radii.
   * Implementd by Andreas Svrcek-Seiler and Alexey Onufriev.
   * The integration over solute is performed up to rgbmax and includes
   * parts of spheres; that is an atom is not just "in" or "out", as
   * with standard non-bonded cut.  As a result, calclated effective
   * radii are less than rgbmax. This saves time, and there is no
   * discontinuity in dReff/drij.
   *
   * Only the case rgbmax > 5*max(sij) = 5*fsmax ~ 9A is handled; this is
   * enforced in mdread().  Smaller values would not make much physical
   * sense anyway.
   *
   * Note: rgbmax must be less than or equal to cut so that the pairlist
   * generated from cut may be applied to calculation of the effective
   * radius and its derivatives.
   */

  if (rgbmax > cut) {
    printf("Error in egb2: rgbmax = %f is greater than cutoff = %f\n",
	   rgbmax, cut);
    exit(1);
  }
  rgbmax1i = 1.0/rgbmax;
  rgbmax2i = rgbmax1i*rgbmax1i;
  rgbmaxpsmax2 = (rgbmax+prm->Fsmax)*(rgbmax+prm->Fsmax);

#if 0
  if (gb2_debug) {
    printf("Effective Born radii:\n");
  }
#endif

  /* 
   * Get the "effective" Born radii via the approximate pairwise method
   * Use Eqs 9-11 of Hawkins, Cramer, Truhlar, J. Phys. Chem. 100:19824
   * (1996).
   *
   * This computation is not executed in parallel for the case of gbsa==1
   * due to the loop-carried dependence of the 'count' variable.  However,
   * an apparent bug in the Sun implementation of OpenMP may cause this
   * computation to execute more slowly than in serial mode in the case
   * where an 'if (gbsa != 1)' clause is added to the following pragma.
   * To avoid this problem, the code is replicated verbatim and selected
   * with an 'if (gbsa != 1)' statement.  When gbsa != 1 the pragma version
   * is selected; otherwise, the non-pragma version is selected.
   */

  if (gbsa != 1) {

    /*
     * For SCALAPACK or MPI, initialize all elements of the reff array
     * to 0 because although each task will calculate only a subset of the
     * elements, a reduction is used to combine the results from all tasks.
     * Initialize this array even for tasks that are not on the process grid
     * because MPI_Allreduce will use the default MPI_COMM_WORLD.
     *
     * For OpenMP, the combination of all threads loads all elements
     * of the reff array so there is no need to initialize it to 0.
     *
     * For single-threaded execution all elements of the reff array are
     * loaded so there is no need to initialize it to 0.
     */

#if defined(SCALAPACK) || defined(MPI)

    for (i = 0; i < n; i++) {
      reff[i] = 0.0;
    }

    if (gb2_debug) {
      printf("reff initialized, mytaskid = %d\n", mytaskid);
      fflush(stdout);
    }

#endif

#pragma omp parallel \
  private (i, xi, yi, zi, ri, ri1i, sumi, j, k, xij, yij, zij, \
	   r2, dij1i, dij, sj, sj2, uij, dij2i, tmpsd, dumbo, theta, \
           threadnum, numthreads )
    {
      /*
       * Get the thread number and the number of threads for multi-threaded
       * execution under OpenMP.  For ScaLAPACK, use the process column and
       * the number of process columns, respectively.  For single-threaded
       * execution and MPI there is no need to set these variables.
       */

#ifdef OPENMP

      threadnum = omp_get_thread_num();
      numthreads = omp_get_num_threads();

#elif defined(SCALAPACK)

      threadnum = mycol;
      numthreads = npcol;

#endif

      /*
       * Loop over all atoms i.
       *
       * Explicitly assign threads to loop indices for the following loop,
       * in a manner equivalent to (static, N) scheduling with OpenMP, and
       * identical to the manner in which threads are assigned in nblist.
       * 
       * OpenMP utilizes all threads.  ScaLAPACK utilizes all processes
       * on the process grid.  MPI is not parallelized.
       *
       * The reff array is written in the following loops.  It is necessary
       * to synchronize the OpenMP threads that execute these loops after
       * loop execution so that a race condition does not exist for reading
       * the reff array before it is written.  Even if all subsequent loops
       * use loop index to thread mapping that is identical to that of the
       * following loop, elements of the reff array are indexed by other
       * than such a loop indices, so synchronization is necessary.
       *
       * OpenMP synchronization is accomplished by the implied barrier
       * at the end of this parallel region.  ScaLAPACK synchronization is
       * accomplished by MPI_Allreduce.
       */

      for (i = 0; i < n; i++) {

#if defined(OPENMP) || defined(SCALAPACK)

	if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue;
#endif

	xi = x[3 * i    ];
	yi = x[3 * i + 1];
	zi = x[3 * i + 2];
	ri = rborn[i] - BOFFSET;
	ri1i = 1. / ri;
	sumi = 0.0;

	/* Select atom j from the pair list.  Non-graceful error handling. */

	for (k = 0; k < lpears[i] + upears[i]; k++) {

	  if (pearlist[i] == NULL) {
	    printf("NULL pair list entry in egb2 loop 1, mytaskid = %d\n", mytaskid);
	    fflush(stdout);
	  }
	  j = pearlist[i][k];

#ifdef SCALAPACK

	  if (gb2_debug) {
	    if ( !myroc(j, BLOCKSIZE, nprow, myrow) ) {
	      printf("pair list error!\n");
	      fflush(stdout);
	    }
	  }

#endif

	  xij = xi - x[3 * j    ];
	  yij = yi - x[3 * j + 1];
	  zij = zi - x[3 * j + 2];
	  r2 = xij * xij + yij * yij + zij * zij;
	  if ( r2 > rgbmaxpsmax2 ) continue;
	  dij1i = 1.0 / sqrt(r2);
	  dij = r2 * dij1i;
	  sj = fs[j] * (rborn[j] - BOFFSET);
	  sj2 = sj * sj;

	  /*
	   * The following equations are from the Appendix of Schaefer and Froemmel,
	   * JMB 216:1045-1066, 1990.  Taylor series expansion for d>>s is by Andreas
	   * Svrcek-Seiler.  Smooth rgbmax idea is from Andreas Svrcek-Seiler and
	   * Alexey Onufriev.
	   */
      
	  if ( dij > rgbmax + sj ) continue;

	  if ( dij > rgbmax - sj ) {

	    uij = 1./(dij - sj);

	    sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij +
				     rgbmax2i * (r2 - 4.0 * rgbmax * dij - sj2) -
				     2.0 * log(rgbmax * uij));

	  } else if ( dij > 4.0 * sj ) {

	    dij2i = dij1i * dij1i;
	    tmpsd = sj2 * dij2i;
	    dumbo = TA + tmpsd*(TB + tmpsd*(TC + tmpsd*(TD + tmpsd*TDD)));

	    sumi -= sj2 * sj * dij2i * dij2i * dumbo;

	  } else if ( dij > ri + sj ) {

	    sumi -= 0.5 * (sj / (r2 - sj2) +
			   0.5 * dij1i * log((dij - sj) / (dij + sj)));

	  } else if ( dij > fabs(ri - sj) ) {

	    theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
	    uij = 1. / (dij + sj);

	    sumi -= 0.25 * (ri1i * (2. - theta) - uij +
			    dij1i * log(ri * uij));

	  } else if ( ri < sj ) {

	    sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
			   0.5 * dij1i * log((sj - dij) / (sj + dij)));

	  }

	  if (gbsa == 1) {
	    if ((P0[i] + P0[j]) > dij) {
	      if (P0[i] > 2.5 && P0[j] > 2.5) {

		ineighbor[count] = j + 1;
		/* this +1 is VERY important */
		count += 1;
	      }
	    }
	  }
	}

#define GBALPHA 0.8
#define GBBETA 0.0
#define GBGAMMA 2.909125

	/*
	 * For ScaLAPACK store sums instead of reciprocal sums
	 * because more terms will be contributed to each sum
	 * from other process columns via the subsequent reduction.
	 */

#ifdef SCALAPACK

	if ( gb==1 ) {
	  
	  /* "standard" (HCT) effective radii:  */

	  reff[i] = sumi;

	} else {

	  /* "gbao" formulas:  */

	  psi[i] = -ri*sumi;

	  reff[i] = -tanh((GBALPHA + GBGAMMA*psi[i]*psi[i])*psi[i])/rborn[i];
	}
#else

	if ( gb==1 ) {
	  
	  /* "standard" (HCT) effective radii:  */

	  reff[i] = 1.0/(ri1i + sumi);

	} else {

	  /* "gbao" formulas:  */

	  psi[i] = -ri*sumi;

	  reff[i] = ri1i - tanh( (GBALPHA + GBGAMMA*psi[i]*psi[i])
				 *psi[i])/rborn[i];
	}
#endif

      }
    }

    if (gb2_debug) {
      printf("reff calculated, mytaskid = %d\n", mytaskid);
      fflush(stdout);
    }

    /*
     * The ScaLAPACK synchronization is accomplished via reduction
     * of the reff array.
     *
     * No reduction is accomplished for MPI because egb2 is not
     * parallelized for MPI.
     */

#ifdef SCALAPACK

    ierror = MPI_Allreduce(reff, reductarr, n,
			   MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (ierror != MPI_SUCCESS) {
      printf("Error in egb reff reduction, error = %d  myrow = %d  mycol = %d\n",
	     ierror, myrow, mycol);
    }

    if (gb2_debug) {
      printf("MPI_Allreduce reff, mytaskid = %d\n", mytaskid);
      fflush(stdout);
    }

    for (i = 0; i < n; i++) {
      reff[i] = reductarr[i];
      if (reff[i] == 0.0) {
	printf("reff[%d]=0!, myrow = %d  mycol = %d\n", i, myrow, mycol);
	fflush(stdout);
      }
    }

    /* For ScaLAPACK reciprocate the reduced sums in reff. */

    for (i = 0; i < n; i++) {
      ri = rborn[i] - BOFFSET;
      ri1i = 1. / ri;
      reff[i] = 1.0 / (ri1i + reff[i]);
    }

#endif

  } else {

    /*
     * Here is the second replication of the code, always executed serially.
     * Loop over all atoms i.
     */

    for (i = 0; i < n; i++) {

      xi = x[3 * i    ];
      yi = x[3 * i + 1];
      zi = x[3 * i + 2];
      ri = rborn[i] - BOFFSET;
      ri1i = 1. / ri;
      sumi = 0.0;

      /* Select atom j from the pair list.  Non-graceful error handling. */

      for (k = 0; k < lpears[i] + upears[i]; k++) {

	if (pearlist[i] == NULL) {
	  printf("NULL pair list entry in egb2 loop 2, taskid %d\n", mytaskid);
	  fflush(stdout);
	}
	j = pearlist[i][k];

	xij = xi - x[3 * j    ];
	yij = yi - x[3 * j + 1];
	zij = zi - x[3 * j + 2];
	r2 = xij * xij + yij * yij + zij * zij;
	if ( r2 > rgbmaxpsmax2 ) continue;
	dij1i = 1.0 / sqrt(r2);
	dij = r2 * dij1i;
	sj = fs[j] * (rborn[j] - BOFFSET);
	sj2 = sj * sj;

	/*
	 * The following equations are from the Appendix of Schaefer and Froemmel,
	 * JMB 216:1045-1066, 1990.  Taylor series expansion for d>>s is by Andreas
	 * Svrcek-Seiler.  Smooth rgbmax idea is from Andreas Svrcek-Seiler and
	 * Alexey Onufriev.
	 */
      
	if ( dij > rgbmax + sj ) continue;

	if ( dij > rgbmax - sj ) {

	  uij = 1./(dij - sj);

	  sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij +
				   rgbmax2i * (r2 - 4.0 * rgbmax * dij - sj2) -
				   2.0 * log(rgbmax * uij));

	} else if ( dij > 4.0 * sj ) {

	  dij2i = dij1i * dij1i;
	  tmpsd = sj2 * dij2i;
	  dumbo = TA + tmpsd*(TB + tmpsd*(TC + tmpsd*(TD + tmpsd*TDD)));

	  sumi -= sj2 * sj * dij2i * dij2i * dumbo;

	} else if ( dij > ri + sj ) {

	  sumi -= 0.5 * (sj / (r2 - sj2) +
			 0.5 * dij1i * log((dij - sj) / (dij + sj)));

	} else if ( dij > fabs(ri - sj) ) {

	  theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
	  uij = 1. / (dij + sj);

	  sumi -= 0.25 * (ri1i * (2. - theta) - uij +
			  dij1i * log(ri * uij));

	} else if ( ri < sj ) {

	  sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
			 0.5 * dij1i * log((sj - dij) / (sj + dij)));

	}

	if (gbsa == 1) {
	  if ((P0[i] + P0[j]) > dij) {
	    if (P0[i] > 2.5 && P0[j] > 2.5) {

	      ineighbor[count] = j + 1;
	      /* this +1 is VERY important */
	      count += 1;
	    }
	  }
	}
      }

      if ( gb==1 ){

	/* "standard" (HCT) effective radii:  */
	reff[i] = 1.0/(ri1i + sumi);

      } else {

	/* "gbao" formulas:  */

#define GBALPHA 0.8
#define GBBETA 0.0
#define GBGAMMA 2.909125

	psi[i] = -ri*sumi;
	reff[i] = 1.0 / (ri1i - tanh( (GBALPHA + GBGAMMA*psi[i]*psi[i])
				      *psi[i])/rborn[i] );
      }

      /* gbsa == 1 */

      ineighbor[count] = 0;
      count = count + 1;

#if 0
      reff[i] = rborn[i] > reff[i] ? rborn[i] : reff[i];
      if (gb2_debug) {
	printf("%d\t%15.7f\t%15.7f\n", i + 1, rborn[i], reff[i]);
      }
#endif

    }

    if (gb2_debug) {
      printf("reff calculated\n");
      fflush(stdout);
    }
  }

  /*
   * Main LCPO stuff follows.  No second derivative support for this.
   * Also no parallelization for OpenMP or ScaLAPACK due to the loop-carried
   * dependence of the 'count' variable.
   */

  if (gbsa == 1) {
    totsasa = 0.0;
    count = 0;
    for (i = 0; i < n; i++) {
      if (ineighbor[count] == 0) {
	count = count + 1;
      } else {

	si = PIx4 * P0[i] * P0[i];
	sumAij = 0.0;
	sumAjk = 0.0;
	sumAjk2 = 0.0;
	sumAijAjk = 0.0;
	sumdAijddijdxi = 0.0;
	sumdAijddijdyi = 0.0;
	sumdAijddijdzi = 0.0;
	sumdAijddijdxiAjk = 0.0;
	sumdAijddijdyiAjk = 0.0;
	sumdAijddijdziAjk = 0.0;

	icount = count;

      L70:j = ineighbor[count] - 1;
				/* mind the -1 , fortran again */
      xi = x[3 * i    ];
      yi = x[3 * i + 1];
      zi = x[3 * i + 2];

      xj = x[3 * j    ];
      yj = x[3 * j + 1];
      zj = x[3 * j + 2];


      r2 = (xi-xj) * (xi-xj) + (yi-yj) * (yi-yj) + (zi-zj) * (zi-zj);
      dij1i = 1. / sqrt(r2);
      rij = r2 * dij1i;
      tmpaij =
	P0[i] - rij * 0.5 - (P0[i] * P0[i] -
			     P0[j] * P0[j]) * 0.5 * dij1i;

      Aij = PIx2 * P0[i] * tmpaij;

      dAijddij =
	PIx1 * P0[i] * (dij1i * dij1i *
			(P0[i] * P0[i] - P0[j] * P0[j]) - 1.0);

      dAijddijdxj = dAijddij * (xj - xi) * dij1i;
      dAijddijdyj = dAijddij * (yj - yi) * dij1i;
      dAijddijdzj = dAijddij * (zj - zi) * dij1i;

      sumAij = sumAij + Aij;

      count2 = icount;
      sumAjk2 = 0.0;
      sumdAjkddjkdxj = 0.0;
      sumdAjkddjkdyj = 0.0;
      sumdAjkddjkdzj = 0.0;

      p3p4Aij = -surften * (P3[i] + P4[i] * Aij);

      L80:k = ineighbor[count2] - 1;
				/*same as above, -1 ! */
      if (j == k)
	goto L85;

      xk = x[3 * k];
      yk = x[3 * k + 1];
      zk = x[3 * k + 2];

      rjk2 =
	(xj - xk) * (xj - xk) + (yj - yk) * (yj - yk) + (zj -
							 zk) *
	(zj - zk);

      djk1i = 1.0 / sqrt(rjk2);

      rjk = rjk2 * djk1i;

      if (P0[j] + P0[k] > rjk) {
	vdw2dif = P0[j] * P0[j] - P0[k] * P0[k];
	tmpajk = 2.0 * P0[j] - rjk - vdw2dif * djk1i;

	Ajk = PIx1 * P0[j] * tmpajk;

	sumAjk = sumAjk + Ajk;
	sumAjk2 = sumAjk2 + Ajk;

	dAjkddjk =
	  PIx1 * P0[j] * djk1i * (djk1i * djk1i * vdw2dif -
				  1.0);

	dAjkddjkdxj = dAjkddjk * (xj - xk);
	dAjkddjkdyj = dAjkddjk * (yj - yk);
	dAjkddjkdzj = dAjkddjk * (zj - zk);

	f[3 * k] = f[3 * k] + dAjkddjkdxj * p3p4Aij;
	f[3 * k + 1] = f[3 * k + 1] + dAjkddjkdyj * p3p4Aij;
	f[3 * k + 2] = f[3 * k + 2] + dAjkddjkdzj * p3p4Aij;

	sumdAjkddjkdxj = sumdAjkddjkdxj + dAjkddjkdxj;
	sumdAjkddjkdyj = sumdAjkddjkdyj + dAjkddjkdyj;
	sumdAjkddjkdzj = sumdAjkddjkdzj + dAjkddjkdzj;

      }

      L85:count2 = count2 + 1;
      if (ineighbor[count2] != 0) {
	goto L80;
      } else {
	count2 = icount;
      }


      sumAijAjk = sumAijAjk + Aij * sumAjk2;

      sumdAijddijdxi = sumdAijddijdxi - dAijddijdxj;
      sumdAijddijdyi = sumdAijddijdyi - dAijddijdyj;
      sumdAijddijdzi = sumdAijddijdzi - dAijddijdzj;
      sumdAijddijdxiAjk =
	sumdAijddijdxiAjk - dAijddijdxj * sumAjk2;
      sumdAijddijdyiAjk =
	sumdAijddijdyiAjk - dAijddijdyj * sumAjk2;
      sumdAijddijdziAjk =
	sumdAijddijdziAjk - dAijddijdzj * sumAjk2;

      lastxj = dAijddijdxj * sumAjk2 + Aij * sumdAjkddjkdxj;
      lastyj = dAijddijdyj * sumAjk2 + Aij * sumdAjkddjkdyj;
      lastzj = dAijddijdzj * sumAjk2 + Aij * sumdAjkddjkdzj;

      dAidxj =
	surften * (P2[i] * dAijddijdxj +
		   P3[i] * sumdAjkddjkdxj + P4[i] * lastxj);
      dAidyj =
	surften * (P2[i] * dAijddijdyj +
		   P3[i] * sumdAjkddjkdyj + P4[i] * lastyj);
      dAidzj =
	surften * (P2[i] * dAijddijdzj +
		   P3[i] * sumdAjkddjkdzj + P4[i] * lastzj);

      f[3 * j] = f[3 * j] + dAidxj;
      f[3 * j + 1] = f[3 * j + 1] + dAidyj;
      f[3 * j + 2] = f[3 * j + 2] + dAidzj;

      count = count + 1;
      if (ineighbor[count] != 0) {
	goto L70;
      } else {
	count = count + 1;
      }

      Ai = P1[i] * si + P2[i] * sumAij + P3[i] * sumAjk +
	P4[i] * sumAijAjk;

      dAidxi =
	surften * (P2[i] * sumdAijddijdxi +
		   P4[i] * sumdAijddijdxiAjk);
      dAidyi =
	surften * (P2[i] * sumdAijddijdyi +
		   P4[i] * sumdAijddijdyiAjk);
      dAidzi =
	surften * (P2[i] * sumdAijddijdzi +
		   P4[i] * sumdAijddijdziAjk);

      f[3 * i] = f[3 * i] + dAidxi;
      f[3 * i + 1] = f[3 * i + 1] + dAidyi;
      f[3 * i + 2] = f[3 * i + 2] + dAidzi;

      totsasa = totsasa + Ai;
				/*  printf("totsasa up to %d %f\n",i,totsasa); */
      }
    }
    /*      printf("SASA %f , ESURF %f \n",totsasa,totsasa*surften); */
    esurf = totsasa * surften;
  }

  /*
   * Compute the generalized Born energy for all pair interactions, as
   * well as the Van der Waals and Coulombic energy for all pairs that
   * are not on the excluded list.  Compute the 1st and 2nd derivatives
   * of the of the Van der Waals and Coulombic energy for all non-excluded
   * pairs.
   *
   * In addition, compute the 1st derivative of the Born energy for all
   * pairs of atoms (selected by the indices i and j).  David Case and
   * his colleagues have shown that the first derivative for an atom pair
   * may be calculated by first building the following equations in
   * terms of Ri (the effective radius of atom i), Rj (the effective
   * radius of atom j) and Dij (the distance between atoms i and j):
   *
   *      rb2 = Ri * Rj
   *
   *      efac = exp(Dij^2 / (4 * rb2))
   *
   *      fgbi = 1 / sqrt(Dij^2 + rb2 * efac)
   *
   *      fgbk = -kappa * KSCALE / fgbi
   *
   *      expmkf = exp(fgbk) / DIELECTRIC_CONSTANT
   *
   *      dielfac = 1 - expmkf
   *
   *      temp4 = fgbi * fgbi * fgbi
   *
   *      temp6 = Qi * Qj * temp4 * (dielfac + fgbk * expmkf)
   *
   *      temp5 = efac * temp6 * (rb2 + 0.25 * r2) / 2
   *
   * From the above equations it is possible to express the first
   * derivative of the Born energy with respect to any cartesian coordinate
   * (designated by Xl) by summing contributions from the three following
   * equations:
   *
   * 1.   d(Dij)/d(Xl) * A(Dij) where
   *
   *      A(Dij) = (1 - efac / 4) * temp6
   *
   * 2.   d(Ri)/d(Xl) * B(Dij) where
   *
   *      B(Dij) = Ri * temp5
   *
   * 3.   d(Rj)/d(Xl) * C(Dij) where
   *
   *      C(Dij) = Rj * temp5
   *
   * Equation 1 is the simplest of the three to evaluate.  One calculates
   * all of the derivatives of Dij with respect to cartesian coordinates,
   * multiplies these derivatives by A(Dij), and sums the resulting products
   * to the gradient vector.  There are six derivatives involving atoms
   * i and j:
   *
   *      d(Dij)/d(Xi), d(Dij)/d(Yi), d(Dij)/d(Zi)
   *
   *      d(Dij)/d(Xj), d(Dij)/d(Yj), d(Dij)/d(Zj)
   *
   * The derivatives of Dij with respect to the cartesian coordinates
   * contain a factor of Dij in the denominator.  This factor is absorbed
   * by equation 1 (and all of the other equations that incorporate the
   * derivatives) rather than carried with the individual derivatives.
   *
   * Equations 2 and 3 are more complicated because the derivatives of the
   * effective radii Ri and Rj with respect to the cartesian coordinates are
   * sometimes series, but sometimes single terms.  For example:
   *
   *      d(Ri)/d(Xi) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xi)
   *
   *                  + d[f(Dik)]/d(Dik) * d(Dik)/d(Xi) + ...
   *
   *                  + d[f(Din)]/d(Din) * d(Din)/d(Xi)
   *
   *      d(Ri)/d(Xj) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xj)
   *
   *      d(Rj)/d(Xj) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xj)
   *
   *                  + d[f(Djk)]/d(Djk) * d(Djk)/d(Xj) + ...
   *
   *                  + d[f(Djn)]/d(Djn) * d(Djn)/d(Xj)
   *
   *      d(Rj)/d(Xi) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xi)
   *
   * where n is the number of atoms and f() designates a function of the
   * interatomic distance.
   *
   * Moreover, when all of the atom pairs are considered, one can refactor
   * the computation to obtain terms such as:
   *
   *      d(Ri)/d(Xi) * [B(Dij) + B(Dik) + ... + B(Din)]
   *
   *      d(Rj)/d(Xj) * [C(Dij) + C(Djk) + ... + C(Djn)]
   *
   * The square brackets [] that B(Dij) + B(Dik) + ... + B(Din) are summed
   * prior to multiplication by the derivatives of the effective radius.
   *
   * David Case and his colleagues have discovered an algorithm that permits
   * evaluation of the first derivatives while requiring that the first
   * derivatives of the effective radii be evaluated once only.  They
   * use an array called "sumdeijda" into which the above series are summed.
   * For example, the series B(Dij) + B(Dik) + ... + B(Din) is summed into
   * sumdeijda[i], and the series C(Dij) + C(Djk) + ... + C(Din) is summed
   * into sumdeijda[j].  One can think of the contents of sumdeijda[i]
   * as the sum of everything that gets multiplied by d(Ri)/d(Xl), where
   * d(Ri)/d(Xl) represents all of the derivatives of Ri with respect to the
   * cartesian coordinates.  Similarly, one can think of the contents of
   * sumdeijda[j] as the sum of everything that gets multiplied by d(Rj)/d(Xl),
   * where d(Rj)/d(Xl) represents all of the derivatives of Rj with respect
   * to the cartesian coordinates.
   *
   * Once the sumdeijda array is loaded, the d(Ri)/d(Xl) and d(Rj)/d(Xl) are
   * calculated and multiplied by the respective elements of sumdeijda.  For
   * example, for d(Ri)/d(Xl) the individual terms of the d(Ri)/d(Xi) series
   * are calculated and stored into another array called da.  Each term has
   * an x-, y- and z-component.  The components of the jth term are:
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dXi
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dYi
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dZi 
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dXj
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dYj
   *
   *      d[f(Dij)]/d(Dij) * d(Dij)/dZj
   *
   * Note that the jth term contains derivatives with respect to the cartesian
   * coordinates of atoms i and j.  The derivatives with respect to the cartesian
   * coordinates of atom j are stored in da[j].  The derivatives with respect to
   * the cartesian coordinates of atom i are summed into da[i] with the result
   * that da[i] contains derivatives from f(Dij), f(Dik), ..., f(Din).
   *
   * Each element of the da array is then multiplied by sumdeijda[i] to form:
   *
   * [ d(Rij)/d(Xl) + d(Rik)/d(Xl) + ... ] * [f(Dij) + f(Dik) + ... ]

   * where the series d(Rij)/d(Xl) + d(Rik)/d(Xl) + ... contains n terms each
   * of which has an (x,y,z) coordinate.  Therefore this result is a vector of
   * length 3*n where n is the number of atoms.  This vector represents
   * the derivatives of Ri with respect to all of the cartesian coordinates.
   * It is summed into the gradient vector.
   *
   * The process is then repeated for Rj, Rk, ..., Rn.
   *
   * There is one further refinement that relates to the first derivatives
   * of the "diagonal" terms, i.e., those energy terms that are functions
   * of the effective radius Ri but not of the interatomic distance Dij.
   * One begins by building up the following equations:
   *
   *      expmkf = exp(-kappa * KSCALE * Ri) / DIELECTRIC_CONSTANT
   *
   *      dielfac = 1 - expmkf
   *
   *      qi2h = Qi^2 / 2
   *
   *      qid2h = di2h * dielfac
   *
   * Then the first derivative with respect to Ri is the equation:
   *
   * 4.   d(Ri)/d(Xl) * R(Dij) where
   *
   *      R(Dij) = qid2h - kappa * KSCALE * qi2h * expmkf * Ri
   *
   *
   * Equation 4, like equation 2, is multiplied by d(Ri)/d(Xl) and
   * therefore one can add these two equations and store the result
   * in the ith element of the sumdeijda array.
   *
   * Note that equations 2, 3 and 4 include a factor of either -Ri*Ri
   * or -Rj*Rj which arises from the derivative of Ri or Rj, respectively.
   * The negation is introduced when  sumda is multiplied by an element
   * from the dreff array.
   *
   * The above algorithm may be extended to the calculation of the second
   * derivatives.  One proceeds by first building the following equations
   * in terms of Ri, Rj and Dij:
   *
   *      RiInv = 1 / Ri
   *
   *      RjInv = 1 / Rj
   *
   *      RiRjInv = RiInv * RjInv
   *
   *      temp4 = fgbi * fgbi * fgbi
   *
   *      temp5 = Qi * Qj * temp4 * (dielfac + fgbk * expmkf) / 2
   *
   *      temp6 = fgbi * fgbi * temp5
   *
   *      temp7 = Qi * Qj * kappa^2 * KSCALE^2 * temp4 * expmkf
   *
   *      temp8 = 1 - efac / 4
   *
   *      temp9 = temp7 / 4 - 3 * temp6 / 2
   *
   *      temp10 = Dij^2 / 4
   *
   *      temp11 = Ri * Rj + temp10
   *
   *      temp12 = Ri^2 * Rj^2
   *
   *      temp13 = Ri * temp11
   *
   *      temp14 = Rj * temp11
   *
   *      temp15 = temp10 * RjInv
   *
   *      temp16 = temp10 * RiInv
   *
   *      temp17 = temp9 * temp13 * efac
   *
   *      temp18 = temp9 * temp14 * efac
   *
   *      temp20 = temp8 * temp9
   *
   *      temp21 = temp20 * temp13
   *
   *      temp22 = temp20 * temp14
   *
   *      temp23 = 2 * efac;
   *
   *
   * From the above equations it is possible to express the second
   * derivative of the Born energy with respect to two cartesian coordinates
   * (designated by Xl and Xm) by summing contributions from the following
   * equations:
   *
   * 5.   d(Ri)/d(Xm) * [ d(Dij)/d(Xl) * D(Dij) ] where
   *
   *      D(Dij) = temp23 * (temp21 - temp15 * temp5 / 4)
   *
   *
   * 6.   d(Rj)/d(Xm) * [ d(Dij)/d(Xl) * E(Dij) ] where
   *
   *      E(Dij) = temp23 * (temp22 - temp16 * temp5 / 4)
   *
   *
   * 7.   d(Ri)/d(Xl) * [ d(Dij)/d(Xm) * J(Dij) ] where
   *
   *      J(Dij) = temp23 * ((Ri - RjInv * temp11) * temp5 / 4 + temp21)
   *
   *
   * 8.   d(Rj)/d(Xl) * [ d(Dij)/d(Xm) * N(Dij) ] where
   *
   *      N(Dij) = temp23 * ((Rj - RiInv * temp11) * temp5 / 4 + temp22)
   *
   *
   * 9.   d(Ri)/d(Xm) * [ d(Ri)/d(Xl) * H(Dij) ] where
   *
   *      H(Dij) = efac * (temp10 * (temp13 * RjInv - Ri^2) * temp5 + temp13 * temp17)
   *
   *
   * 10.  d(Rj)/d(Xm) * [ d(Rj)/d(Xl) * M(Dij) ] where
   *
   *      M(Dij) = efac * (temp10 * (temp14 * RiInv - Rj^2) * temp5 + temp14 * temp18)
   *
   *
   * 11.  d(Rj)/d(Xm) * [ d(Ri)/d(Xl) * I(Dij) ] where
   *
   *      I(Dij) = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18)
   *
   *
   * 12.  d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * L(Dij) ] where
   *
   *      L(Dij) = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17)
   *
   *
   * 15.  d2(Dij)/d(Xm)d(Xl) * F(Dij) where
   *
   *      F(Dij) = 2 * temp8 * temp5
   *
   *
   * 16.  d(Dij)/d(Xm) * [ d(Dij)/d(Xl) * G(Dij) ] where
   *
   *      G(Dij) = 4 * temp8 * temp20
   *
   *               + (2 * temp8 + efac * temp10 * RiRjInv) * temp5 / Dij^2
   *
   *
   * Using the above equations 5-16 it is possible to calculate the
   * second derivatives of the Born energy in an optimized manner.
   * Note, however, that computing the second derivatives of the effective
   * radius with respect to the cartesian coordinates involves some
   * complexity.  This fact can be appreciated by beginning with the
   * following expression for the effective radius Ri of atom i:
   *
   *
   * Ri = 1 / [1/Rborn(i) + f(Dij) + f(Dik) + ... + f(Din)]
   *
   *
   * where Rborn(i) represents the Born radius of atom i and f() respesents
   * a function of the interatomic distance.  Differentiating with respect to
   * a cartesian coordinate Xl gives:
   *
   *
   * d(Ri)/d(Xl) = -Ri^2 * { d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) +
   * 
   *                         d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... +
   *
   *                         d[f(Din)]/d(Din) * d(Din)/d(Xl) }
   *
   *
   * The above expression is the general case.  Many of the terms are
   * zero, for example, d(Dij)/d(Xl) is zero when l equals neither i nor j.
   *
   * Differentiating again with respect to a cartesian coordinate Xm gives:
   *
   *
   * d2(Ri)/d(Xm)d(Xl) =
   *
   * - Ri^2 * { d[f(Dij)]/d(Dij) * d2(Dij)/d(Xm)d(Xl) +
   *
   *            d[f(Dik)]/d(Dik) * d2(Dik)/d(Xm)d(Xl) + ... +
   *
   *            d[f(Din)]/d(Din) * d2(Din)/d(Xm)d(Xl) }
   *
   *
   * - Ri^2 * { d2[f(Dij)]/d(Dij)2 * d(Dij)/d(Xm) * d(Dij)/d(Xl) +
   *
   *            d2[f(Dik)]/d(Dik)2 * d(Dik)/d(Xm) * d(Dik)/d(Xl) + ... +
   *
   *            d2[f(Din)]/d(Din)2 * d(Din)/d(Xm) * d(Din)/d(Xl) }
   *
   *
   * + 2Ri^3 * { d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) +
   *
   *             d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... +
   *
   *             d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) +
   *
   * 
   *             d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) +
   *
   *             d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... +
   *
   *             d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) + ... +
   *
   *
   *             d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) +
   *
   *             d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... +
   *
   *             d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) }
   *
   *
   * Many of the terms in the above expression are zero, or example, when neither
   * l nor m equals i or j.  However, many non-zero terms remain, and in particular
   * there exists a large number of the terms that are multiplied by 2Ri^3.  But
   * fortunately these particular terms may be computed by simply multiplying
   * the derivative of the Ri with respect to all cartesian coordinates by itself.
   *
   * As can be appreciated from the above analysis, differentiation gives rise to
   * factors of -Ri^2, Ri^3, -Rj^2 and Rj^3.  A factor of Ri^2 or Rj^2 is included
   * equations 5-16, as well as equations 17-18 (below).
   *
   * Square brackets [ ] have been added to many of equations 5-18
   * to indicate the order of evaluation.  The derivative inside of the
   * brackets will be called the inner derivative.  The derivative outside
   * of the brackets will be called the outer derivative.  The inner
   * derivative will be evaluated prior to the outer derivative.  The
   * inner derivative that is multiplied by the accompanying functions
   * of Dij, e.g., H(Dij) of equation 7, produces a gradient vector.
   *
   * Equations 15 and 16 are analogous to equation 1 and are simple to
   * evaluate because they do not involve derivatives of the effective
   * radii Ri and Rj and so do not comprise series.  Instead, they comprise
   * the derivatives of Dij with respect to the cartesian coordinates
   * of atoms i and j only.
   *
   * Equations 13 and 14 are analogous to equations 2 and 3 except that
   * they involve second derivatives of the effective radii Ri and Rj instead
   * of first derivatives.  However, the second derivatives of the effective
   * radius Ri include terms such as d2(Ri)/d(Xl)d(Xm) which affect many Hessian
   * elements when neither l nor m equals i.
   *
   * Equations 9, 10, 11 and 12 imply two applications of the approach used
   * to evaluate equations 2 and 3.  However, the result of the multiplication
   * by the inner derivative (which is enclosed in [ ]) is not a single term
   * but rather is a gradient vector.  Therefore, multiplication by the outer
   * derivative involves multiplying each derivative of Ri or Rj by all of the
   * terms in the gradient vector.  Moreover, this process requires a
   * two-dimensional array, i.e., an array of gradient vectors which is named
   * either "sumdeijdg" or "sumdeijdh", and is analogous to the sumdeijda array
   * that is used to evaluate equations 2, 3 and 4.  The order of evaluation of
   * the derivatives appears to be unimportant for equations 9 and 10, i.e., the
   * inner derivatives may be swappped  with the outer derivatives without
   * affecting the results.
   *
   * However, there is a subtlety involving equations 11 and 12 which requires
   * that if the inner derivatives are swapped with the outer derivatives in
   * equation 11, they must be similarly swapped in equation 12.  This subtlety
   * arises because the the sumdeijdg array is used to collect all inner
   * derivatives with respect to Xl that will subsequently be multiplied by
   * outer derivatives with respect to Xm  An analogous sumdeijdh array is used
   * to collect all inner derivatives with respect to Xm that will subsequently
   * be multiplied by outer derivatives with respect to Xl.  The inner derivatives
   * of equations 11 and 12 are stored in the sumdeijdg array, which implies
   * that for these equations the inner derivatives are taken with respect to Xl.
   * Swapping the inner and outer derivatives for both of these equations preserves
   * the symmetry of the Hessian matrix, but swapping the inner and outer derivatives
   * for only one of these equations breaks the symmetry and leads to erroneous
   * results.  For example, if one were to swap the inner and outer derivatives
   * in equation 11, but not in equation 12, the following pair of equations would
   * obtain:
   *
   * 11'. d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * I(Dij) ]
   *
   * 12.  d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * L(Dij) ]
   *
   * This pair of equations is not symmetric and is therefore incorrect.
   *
   * Equations 5, 6, 7 and 8 are evaluated by first multiplying by
   * an inner derivative of Dij and then by multiplying by an outer
   * derivative of Ri or Rj.  The inner derivatives of Dij are
   * collected into either the sumdeijdg array or sumdeijdh array.
   * The derivatives of a particular Dij produce a gradient vector
   * that is stored in one row of either sumdeijdg or sumdeijdh.
   * Subsequent multiplication by the outer derivative of Ri or Rj
   * involves multiplying each derivative of Ri or Rj by all of the
   * terms in the gradient vector.  This process involves
   * a subtlety.  In equations 5 and 6 the inner derivative is with
   * respect to Xl whereas in equations 7 and 8 the inner derivative
   * is with respect to Xm.  For this reason, the inner derivatives
   * of equations 5 and 6 are collected into the sumdeijdg array,
   * whereas the inner derivatives of equations 7 and 8 are collected
   * into the sumdeijdh array, following the convention that was
   * discussed above for equations 11 and 12.  The outer derivatives
   * taken with respect to Xm are computed using the sumdeijdg array,
   * whereas the outer derivatives taken with respect to Xl are computed
   * using the sumdeijdh array.  These two sets of derivatives are
   * written into transpose-related elements of the Hessian matrix in
   * order to preserve the symmetry of the Hessian.
   *
   * On another topic, it is possible to generate equations for the second
   * derivatives of the non-pairwise ("diagonal") terms.  One begins by
   * building up the following equations:
   *
   *      expmkf = exp(-kappa * KSCALE * Ri) / DIELECTRIC_CONSTANT
   *
   *      dielfac = 1 - expmkf
   *
   *      temp18 = Ri * (kappa * KSCALE * expmkf * Ri * (2 + kappa * KSCALE * Ri)
   *
   *                     - 2 * dielfac)
   *
   *
   * Using the above equations it is possible to calculate the second
   * derivatives of the non-pairwise terms via the two following equations:
   *
   * 17.  d2(Ri)/d(Xm)d(Xl) * R(Dij) where R(Dij) is defined in equation 4.
   *
   *
   * 18.  d(Ri)/d(Xm) * [ d(Ri)/d(Xl) * P(Dij) ] where
   *
   *      P(Dij) = Qi^2 * temp18 / 2
   *
   * Equations 17 and 18 are handled in a manner analogous to equation 4.
   *
   *
   * Calculate the Born, van der Waals and Coulombic energies.  Complete
   * equations 15 and 16, and set up the remaining equations.  For OpenMP
   * in this collection of loops sum data relative only to atom i or to
   * pairs ii and ij so that OpenMP can be used to parallelize the outer loop.
   * For ScaLAPACK sum data relative to atoms i and j, as well as to pairs
   * ii and jj.  When in single-threaded mode sum data relative to atoms
   * i and j, as well as to pairs ii, ij, ji and jj.
   *
   * Initialize the energy by all OpenMP threads or ScaLAPACK processes.
   */

	epol = elec = evdw = 0.0;

#pragma omp parallel reduction (+: epol, elec, evdw) \
  private (i, i3, in, in3, xi, yi, zi, qi, qj, ri, ri2, riinv, expmkf, dielfac, \
	   qi2h, qid2h, temp17, temp18, p_dij, r_dij, iaci, j, j3, jn, jn3, \
	   xij, yij, zij, r2, r2inv, qiqj, rj, rj2, rjinv, rirjinv, rb2, efac, \
	   fgbi, fgbk, temp4, temp6, a_dij, temp5, b_dij, c_dij, di, dj, d2ii, d2ij, \
	   k, l, df, dg, rinv, r4inv, eel, ic, r6inv, f6, f12, dedx, dedy, dedz, \
	   temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15, \
	   temp16, temp20, temp21, temp22, temp23, d_dij, e_dij, f_dij, \
	   g_dij, h_dij, i_dij, j_dij, l_dij, m_dij, n_dij, \
	   npairs, xj, yj, zj, thi, threadnum, numthreads, \
	   d2jj, ri1i, temp1, dij3i, dx, dy, dz, m, iexw, \
	   datmp, tmpsd, dumbo, temp2, da2tmp, sj, sj2, dij1i, dij2i, dij, \
	   dxj, dyj, dzj, sumda, ijterm)
  {

    /*
     * Get the thread number and the number of threads for multi-threaded
     * execution under OpenMP.  For ScaLAPACK, use the process column and
     * the number of process columns, respectively.  These variables are
     * not used for MPI or single-threaded execution.
     */

#ifdef OPENMP

    threadnum = omp_get_thread_num();
    numthreads = omp_get_num_threads();

#elif defined(SCALAPACK)

    threadnum = mycol;
    numthreads = npcol;

#endif

    /*
     * For OpenMP, initialize the sumdeijda, sumdeijdg, sumdeijdh,
     * sumdeijdn and dreff arrays inside of the parallel region.
     *
     * The "first touch" memory allocation strategy will locate
     * the elements of these arrays that are initialized by
     * a particular CPU in memory that is local to that CPU.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     *
     * It is not necessary to synchronize OpenMP threads following
     * this loop because the next loops use identical loop index to
     * thread mapping.  Hence the sumdeijda array is guaranteed to
     * be initialized above before it is updated below.
     *
     * For ScaLAPACK, each process maintains a full copy of the sumdeijda
     * and grad arrays, so initialize the entirety of these arrays for each
     * task.  Note that because each process maintains a full copy of these
     * arrays they may be updated in i,j loops only and not in j,i loops.
     *
     * For ScaLAPACK initialize as well the gg, sg, sh, sr and sn vectors.
     *
     * Because at present MPI_Allreduce uses the default MPI_COMM_WORLD,
     * initialization must be performed for all tasks, not just the tasks
     * on the process grid.
     *
     * Each process on the process grid maintains a submatrix of the
     * distributed sumdeijdg, sumdeijdh, sumdeijdn and dreff matrices,
     * so initialize the submatrix that is maintained by each process.
     */

#ifdef SCALAPACK

    for (i = 0; i < n; i++) {
      sumdeijda[i] = sn[i] = 0.0;
    }

    for (i = 0; i < n3; i++) {
      grad[i] = sg[i] = sh[i] = sr[i] = 0.0;
    }

    for (i = 0; i < 3*n3; i++) {
      gg[i] = 0.0;
    }

    if (context_PxQ >= 0) {
      for (i = 0; i < super_NxN->size; i++) {
	sumdeijdn[i] = 0.0;
      }
      for (i = 0; i < super_Nx3N->size; i++) {
	sumdeijdg[i] = 0.0;
      }
      for (i = 0; i < super_Nx3N->size; i++) {
	sumdeijdh[i] = 0.0;
      }
      for (i = 0; i < super_Nx3N->size; i++) {
	dreff[i] = 0.0;
      }
    }

    if (gb2_debug) {
      printf("arrays initialized\n");
      fflush(stdout);
    }

#else

#ifndef ROW_CYCLIC
#pragma omp for
#endif

    for (i = 0; i < n; i++) {

#if defined(ROW_CYCLIC) && defined(OPENMP)

      if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue;
#endif

      in = n * i;
      in3 = 3 * in;

      sumdeijda[i] = 0.0;

      for (j = 0; j < n; j++) {
	sumdeijdn[in + j] = 0.0;
      }

      for (j = 0; j < n3; j++) {
	sumdeijdg[in3 + j] = sumdeijdh[in3 + j] = dreff[in3 + j] = 0.0;
      }
    }

#endif

    /*
     * Allocate and initialize the iexw array used for skipping excluded
     * atoms.  Note that because of the manner in which iexw is used, it
     * is necessary to initialize it before only the first iteration of
     * the following loop.
     */

    iexw = ivector(-1, n);
    for (i = -1; i < n; i++) {
      iexw[i] = -1;
    }

    /*
     * Loop over all atoms i.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     *
     * For OpenMP the threads are arranged in a row cyclic manner so each
     * iteration of the following loops is assigned to a different thread.
     *
     * For ScaLAPACK the processes are arranged in a block cyclic manner
     * so each iteration of the following loops is assigned to all process
     * columns of a given process row.  The mapping of iterations onto
     * process rows is accomplished such that the submatrices maintained
     * by the processes of the row are indexed by the global matrix row
     * index that is specified by the loop index.  Before a matrix element
     * is updated, a check is performed to select the correct process
     * column as well.
     *
     * The Hessian matrix g has dimensionality 3Nx3N.  The gradient
     * vector f has dimensionality 3Nx1.  The sumdeijdn array has
     * dimensionality NxN.  The sumdeijdg, sumdeijdh and dreff arrays
     * have dimensionality Nx3N.  To facilitate selection of the correct
     * process row and column for all of these matrices, the block
     * dimensions have been chosen to match the dimensionality, that is,
     * a dimensionality of 3N requires a block dimension of 3*BLOCKSIZE,
     * whereas a dimensionality of N requires a block dimension of
     * BLOCKSIZE.  This approach permits the use of the bs variable
     * instead of the bs3 variable in the calls to the myroc function
     * below.
     *
     * It is not necessary to synchronize OpenMP threads following
     * this loop because the next loop uses identical loop index to
     * thread mapping.  Hence there will be no race condition associated
     * with updates of the sumdeijda, sumdeijdn, sumdeijdg, sumdeijdh,
     * gradient and Hessian arrays.  In particlar, this (i,j) loop
     * updates i elements of the sumdeijda and f gradient vectors, and
     * (i,i) and (i,j) elements of the sumdeijdg, sumdeijdh and Hessian
     * matrices, whereas the next (j,i) loop updates j elements of the
     * sumdeijda and f gradient vectors,and (j,j) and (j,i) elements
     * of the sumdeijdg, sumdeijdh and Hessian matrices, so each loop
     * updates the same array elements.
     *
     * For ScaLAPACK this loop updates i and j elements of the sumdeijda
     * and (grad) gradient vectors, so there is no need to update these
     * vectors in the next (j,i) loop.  For ScaLAPACK this loop updates
     * matrices in the same manner as OpenMP, so the following (j,i) loop
     * is necessary.
     *
     * For MPI and single-threaded execution this loop updates all elements
     * of all vectors and matrices, so the following (j,i) loop is not
     * necessary.
     */

    for (i = 0; i < n; i++) {

#if defined(OPENMP) || defined(SCALAPACK)

      if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue;

#endif

      in = n * i;

      qi = q[i];
      ri = reff[i];

      /*
       * If atom i is not frozen, compute the "diagonal" energy that
       * is a function of only the effective radius Ri but not of the
       * interatomic distance Dij.  Compute also the contribution of
       * the diagonal energy term to the sum by which the derivatives
       * of Ri will be multiplied.
       */

      if (!frozen[i]) {
	expmkf = exp(-KSCALE * (*kappa) * ri) / (*diel_ext);
	dielfac = 1.0 - expmkf;
	qi2h = 0.5 * qi * qi;
	qid2h = qi2h * dielfac;

	/*
	 * For ScaLAPACK, update scalars such as epol by only one
	 * process row (such as row 0) of a given process column
	 * in this i-loop, but update scalars by all process rows
	 * in the i,j and j,i nested loops.
	 */

#ifdef SCALAPACK

	if (myrow == 0) {
	  epol += -qid2h / ri;
	}
#else
	epol += -qid2h / ri;
#endif

	temp18 = ( (*kappa) * KSCALE * expmkf * ri * (2.0 + (*kappa) * KSCALE * ri)
		   - 2.0 * dielfac ) * ri;
	p_dij = qi2h * temp18;
	r_dij = qid2h - KSCALE * (*kappa) * qi2h * expmkf * ri;

	/*
	 * Each of the ScaLAPACK process rows performs the following
	 * update to sumdeijda[i] because it occurs in the i-loop not
	 * the j-loop.  Therefore, disallow all but one update.
	 */

#ifdef SCALAPACK

	if (myrow == 0) {
	  sumdeijda[i] += r_dij;
	}
#else
	sumdeijda[i] += r_dij;
#endif

	/*
	 * For ScaLAPACK, the diagonal elements of matrices
	 * such as sumdeijdn are not updated; rather, an
	 * associated vector such as sn is updated instead.
	 * This approach is necessary because the diagonal
	 * elements (i,i) and (j,j) receive contributions
	 * from all (i,j) atom pairs; however, because the
	 * pair list is distributed across process columns,
	 * a process that calculates the contribution from
	 * atom pair (i,j) will be able to access element
	 * (i,j) of the matrix but that process may not be
	 * able to access element (i,i) or (j,j).  Hence
	 * a vector such as sn exists in each process for
	 * the purpose of updating the diagonal elements of
	 * these matrices.
	 *
	 * Also, as discussed above for sumdeijda[i], disallow
	 * all but one update of sn[i] in the i-loop.
	 */

#ifdef SCALAPACK

	if (myrow == 0) {
	  sn[i] += p_dij;
	}
#else
	sumdeijdn[in + i] += p_dij;
#endif
      }

      /*
       * Skip the pair calculations if there are no atoms j on the
       * pair list of atom i.
       */

      npairs = upears[i];
      if ( npairs <= 0 ) continue;

      i3 = 3 * i;
      in3 = 3 * in;

      xi = x[i3    ];
      yi = x[i3 + 1];
      zi = x[i3 + 2];

      iaci = prm->Ntypes * (prm->Iac[i] - 1);

      ri2 = ri * ri;
      riinv = 1.0 / ri;

      /* Initialize the first derivative accumulators. */

      dx = dy = dz = 0.0;

      /*
       * Expand the excluded atom list into the iexw array by storing i
       * at array address j.
       */

      for (j = 0; j <  prm->Iblo[i]; j++) {
	iexw[IexclAt[i][j] - 1] = i;
      }

      /* Select atom j from the pair list.  Non-graceful error handling. */

      for (m = lpears[i]; m < lpears[i] + npairs; m++) {
	
	if (pearlist[i] == NULL) {
	  printf("NULL pair list entry in egb2 loop 3, taskid = %d\n", mytaskid);
	  fflush(stdout);
	}
	j = pearlist[i][m];

	j3 = 3 * j;
	jn = n * j;
	jn3 = 3 * jn;

	/* Continue computing the non-diagonal energy term. */

	xij = xi - x[j3    ];
	yij = yi - x[j3 + 1];
	zij = zi - x[j3 + 2];
	r2 = xij * xij + yij * yij + zij * zij;

	/*
	 * Because index j is retrieved from the pairlist array it is
	 * not constrained to a particular range of values; therefore,
	 * the threads that have loaded the reff array must be
	 * synchronized prior to the use of reff below.
	 */

	r2inv = 1.0 / r2;
	qiqj = qi * q[j];
	rj = reff[j];
	rj2 = rj * rj;
	rjinv = 1.0 / rj;
	rirjinv = riinv * rjinv;
	rb2 = ri * rj;
	efac = exp(-r2 / (4.0 * rb2));
	fgbi = 1.0 / sqrt(r2 + rb2 * efac);
	fgbk = -(*kappa) * KSCALE / fgbi;

	/*
	 * Calculate the "non-diagonal" energy term, i.e., the term that is a
	 * function of the effective radius Ri and the interatomic distance Dij.
	 */ 

	expmkf = exp(fgbk) / (*diel_ext);
	dielfac = 1.0 - expmkf;

	epol += -qiqj * dielfac * fgbi;

	temp4 = fgbi * fgbi * fgbi;
	temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf);

	/* The following term is required by equation 1. */

	a_dij = temp6 * (1.0 - 0.25 * efac);

	/*
	 * Prepare to compute the 1st derivatives of the effective radii.
	 * The following code performs the sums indicated by equations 2 and 3.
	 * Compute the contribution of the non-diagonal energy term to the sum
	 * by which the derivatives of Ri and Rj will be multiplied. For OpenMP
	 * sum the contribution for the derivative of Ri only; otherwise, sum
	 * the contribution for the derivatives of Ri and Rj.
	 */

	temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2);
	b_dij = ri * temp5;
	c_dij = rj * temp5;

	sumdeijda[i] += b_dij;

#if !defined(OPENMP) && !defined(SCALAPACK)

	sumdeijda[j] += c_dij;
#endif


	/*
	 * Calculate the first and second derivatives of the interatomic
	 * distance Dij with respect to the cartesian coordinates of atoms
	 * i and j.  The results are placed into five arrays:
	 *
	 *   di[] for the first derivatives with respect to atom i
	 *   dj[] for the first derivatives with respect to atom j
	 *   d2ii[] for the second derivatives with respect to atom i
	 *   d2jj[] for the second derivatives with respect to atom j
	 *   d2ij[] for the second derivatives with respect to atoms i and j
	 *
	 * Some useful symmetry obtains.  The d2ii and d2ij arrays
	 * are symmetric in that their lower and upper triangles are
	 * the transposes of one another.  Also, d2jj equals d2ii, and
	 * d2ij is the negative of d2ii.
	 *
	 * Note: the additional factor of the interatomic distance Dij that
	 * ought to be included in the denominator of the following equations
	 * is provided by the absence of Dij in equations 15 and 16, as well as
	 * by one less power of Dij in the denominator of the equations for the
	 * derivatives of the Van der Waals and Coulombic energy terms.
	 */
      
	di[0] = xij;
	di[1] = yij;
	di[2] = zij;

	dj[0] = -xij;
	dj[1] = -yij;
	dj[2] = -zij;

	/* Load the upper triangle of d2ii. */

	d2ii[0][0] = 1.0 - xij * xij * r2inv;
	d2ii[0][1] = -xij * yij * r2inv;
	d2ii[0][2] = -xij * zij * r2inv;

	d2ii[1][1] = 1.0 - yij * yij * r2inv;
	d2ii[1][2] = -yij * zij * r2inv;

	d2ii[2][2] = 1.0 - zij * zij * r2inv;

	/* Finish loading the rest of all of the matrices. */

	for (k = 0; k < 3; k++) {

	  /* Load the upper triangles of d2jj and d2ij. */

	  for (l = k; l < 3; l++) {

#if !defined(OPENMP) && !defined(SCALAPACK)

	    d2jj[k][l] = d2ii[k][l];
#endif
	    d2ij[k][l] = -d2ii[k][l];
	  }

	  /* Load the symmetric elements of d2ii, d2jj and d2ij. */

	  for (l = k+1; l < 3; l++) {
	    d2ii[l][k] = d2ii[k][l];

#if !defined(OPENMP) && !defined(SCALAPACK)

	    d2jj[l][k] = d2jj[k][l];
#endif
	    d2ij[l][k] = d2ij[k][l];
	  }
	}

	/*
	 * Compute the Van der Waals and Coulomb energy here because
	 * if egb2 is called nbond2 is not called.  Remember that the
	 * derivatives of Dij with respect to x, y and z supply one more
	 * power of 1/Dij but that power of 1/Dij is included in the
	 * equations below, instead of with the derivatives of Dij.
	 *
	 * Compute the Van der Waals and Coulombic energies for only
	 * those pairs which are not on the excluded atom list.  Any
	 * pair on the excluded atom list will have atom i stored at
	 * address j of the iexw array.  It is not necessary to reset
	 * the elements of the iexw array to -1 between successive
	 * iterations in i because an i,j pair is uniquely identified
	 * by atom i stored at array address j.  Thus for example, the
	 * i+1,j pair would be stored at the same address as the i,j
	 * pair but after the i,j pair were used.
	 *
	 * Initialize df here instead of within the following if statement
	 * so that df is initialized unconditionally.
	 */

	df = 0.0;

	if (iexw[j] != i) {

	  /* Initialize dg. */

	  dg = 0.0;

	  rinv = 1. / sqrt(r2);
	  r4inv = r2inv * r2inv;

	  /*  gas-phase Coulomb energy:  */

	  eel = qiqj * rinv;

	  elec += eel;

	  df -= eel;
	  dg += 2. * eel;

	  /* Lennard-Jones energy:   */

	  ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	  if( ic >= 0 ){
	    r6inv = r2inv * r2inv * r2inv;
	    f6 = prm->Cn2[ic] * r6inv;
	    f12 = prm->Cn1[ic] * r6inv * r6inv;

	    evdw += f12 - f6;

	    df += 6. * f6 - 12. * f12;
	    dg += 156. * f12 - 42. * f6;
	  }

	  df *= r2inv;
	  dg *= r4inv;

	  /*
	   * Calculate the second derivatives of Dij for the Van der Waals
	   * and Coulombic interactions.  This calculation uses df and dg
	   * for these interactions only.
	   *
	   * When in single-threaded mode sum to the Hessian the ii, ij, ji
	   * and jj terms.  For OpenMP sum to the ii and ij terms only.
	   * For ScaLAPACK sum to the ji term only and sum to the gg vector
	   * instead of to the ii term.
	   */

	  for (k = 0; k < 3; k++) {
	    for (l = 0; l < 3; l++) {
	      ijterm = df*d2ij[k][l] + dg*di[k]*dj[l];

#ifdef SCALAPACK

	      gg[3*(i3 + k) + l] += df*d2ii[k][l] + dg*di[k]*di[l];

	      ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info);
	      if (ptr != NULL) *ptr += ijterm;
#else
	      g[n3*(i3 + k) + i3 + l] += df*d2ii[k][l] + dg*di[k]*di[l];
	      g[n3*(i3 + k) + j3 + l] += ijterm;
#endif

#if !defined(OPENMP) && !defined(SCALAPACK)

	      g[n3*(j3 + l) + i3 + k] += ijterm;
	      g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];
#endif
	    }
	  }
	}

	/*
	 * Add to df the contribution to the gradient vector from
	 * equation 1 so that it will be added to the gradient vector
	 * along with the contributions from the Van der Waals and
	 * Coulombic terms.
	 */

	df += a_dij;

	/*
	 * Complete the first derivatives of Dij for the Van der Waals
	 * and Coulombic interactions, as well as for the component of
	 * the first derivatives that is contributed by equation 1.
	 *
	 * Sum into the first derivative accumulators (dx, dy, dz)
	 * the derivatives that are computed relative to the Cartesian
	 * coordiantes of atom i.
	 *
	 * For ScaLAPACK sum into the gradient vector grad the derivative
	 * that is computed relative to the Cartesian coordinates of atom
	 * j.  For all other cases except OpenMP sum this derivative into
	 * the gradient vector f.
	 */

	dedx = df * xij;
	dedy = df * yij;
	dedz = df * zij;

	dx += dedx;
	dy += dedy;
	dz += dedz;

#ifdef SCALAPACK

	grad[j3    ] -= dedx;
	grad[j3 + 1] -= dedy;
	grad[j3 + 2] -= dedz;

#elif !defined(OPENMP)

	f[j3    ] -= dedx;
	f[j3 + 1] -= dedy;
	f[j3 + 2] -= dedz;
#endif

	/* The following code sets up the sums for equations 5-16. */

	temp5 = 0.5 * temp6;
	temp6 = fgbi * fgbi * temp5;
	temp7 = qiqj * (*kappa) * (*kappa) * KSCALE * KSCALE * temp4 * expmkf;
	temp8 = 1 - 0.25 * efac;
	temp9 = 0.25 * temp7 - 1.5 * temp6;
	temp10 = 0.25 * r2;
	temp11 = rb2 + temp10;
	temp12 = ri2 * rj2;
	temp13 = ri * temp11;
	temp14 = rj * temp11;
	temp15 = temp10 * rjinv;
	temp16 = temp10 * riinv;
	temp17 = temp9 * temp13 * efac;
	temp18 = temp9 * temp14 * efac;
	temp20 = temp8 * temp9;
	temp21 = temp20 * temp13;
	temp22 = temp20 * temp14;
	temp23 = 2.0 * efac;

	d_dij = temp23 * (temp21 - 0.25 * temp15 * temp5);

	e_dij = temp23 * (temp22 - 0.25 * temp16 * temp5);

	f_dij = 2.0 * temp8 * temp5;

	g_dij = 4.0 * temp8 * temp8 * temp9
	  + (2.0 * temp8 + efac * temp10 * rirjinv ) * r2inv * temp5;

	h_dij = efac * (temp10 * (temp13 * rjinv - ri2) * temp5 + temp13 * temp17);

	i_dij = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18);

	j_dij = temp23 * ((ri - rjinv * temp11) * 0.25 * temp5 + temp21);

	l_dij = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17);

	m_dij = efac * (temp10 * (temp14 * riinv - rj2) * temp5 + temp14 * temp18);

	n_dij = temp23 * ((rj - riinv * temp11) * 0.25 * temp5 + temp22);


	/*
	 * Evaluate equations 15 and 16 and sum the results to the Hessian
	 * matrix.  These equations involve the derivatives d(Dij)/d(Xl),
	 * d(Dij)/d(Xm) and d2(Dij)/d(Xl)d(Xm), which derivatives exist
	 * only for values of k==i, k==j, l==i and l==j.
	 *
	 * When in single-threaded mode sum to the Hessian the ii, ij, ji
	 * and jj terms.  For OpenMP sum to the ii and ij terms only.
	 * For ScaLAPACK sum to the ji term only and sum to the gg vector
	 * instead of to the ii term.
	 */

	df = f_dij;
	dg = g_dij;

	for (k = 0; k < 3; k++) {
	  for (l = 0; l < 3; l++) {
	    ijterm = df*d2ij[k][l] + dg*di[k]*dj[l];

#ifdef SCALAPACK

	    gg[3*(i3 + k) + l] += df*d2ii[k][l] + dg*di[k]*di[l];

	    ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info);
	    if (ptr != NULL) *ptr += ijterm;
#else
	    g[n3*(i3 + k) + i3 + l] += df*d2ii[k][l] + dg*di[k]*di[l];
	    g[n3*(i3 + k) + j3 + l] += ijterm;
#endif

#if !defined(OPENMP) && !defined(SCALAPACK)

	    g[n3*(j3 + l) + i3 + k] += ijterm;
	    g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];
#endif
	  }
	}

	/*
	 * Setup the inner differentiations indicated by equations 9 and 12.
	 * The column index selects the first derivative to be calculated,
	 * i.e., d(Ri) or d(Rj).  The row index is i because we are
	 * setting up the outer multiplication by d(Ri).
	 */

#ifdef SCALAPACK

	sn[i] += h_dij;

	ptr = ptr2f(sumdeijdn, super_NxN, j, i, &info);
	if (ptr != NULL) *ptr += i_dij;
#else
	sumdeijdn[in + i] += h_dij;
	sumdeijdn[in + j] += l_dij;
#endif

	/*
	 * Setup the inner differentiations indicated by equations 10 and 11.
	 * The column index selects the first derivative to be calculated,
	 * i.e., d(Ri) or d(Rj).  The row index is j because we are
	 * setting up the outer multiplication by d(Rj).  When in single-
	 * threaded mode sum both terms.  When in multi-threaded mode sum
	 * neither term.
	 */

#if !defined(OPENMP) && !defined(SCALAPACK)

	sumdeijdn[jn + i] += i_dij;
	sumdeijdn[jn + j] += m_dij;
#endif

	/*
	 * Perform the inner differentiation indicated by equations 5, 6, 7 and 8.
	 * The row or first index is selected by i because we are setting up the
	 * outer differentiation by the derivatives of Ri.  The column or second
	 * index is selected by i or j because the Dij term affects the derivatives
	 * of atoms i and j.  Both the sumdeijdg and sumdeijdh arrays are used
	 * because equations 5 and 6 perform inner differentiation with respect to
	 * Xl, whereas equations 7 and 8 perform inner differentiation with respect
	 * to Xm.  Distinguishing these two cases is accomplished by transposing
	 * the Hessian addresses, as will be done in the final two calls to the
	 * dgemm() function below.
	 *
	 * When in single-threaded mode sum into the sumdeigdg and sumdeijdh arrays
	 * all terms.  When in multi-threaded mode sum only those terms that comprise
	 * d(Ri) times d(Dij).
	 *
	 * For ScaLAPACK, update matrices such as sumdeijdg
	 * by the correct process column.  The correct process
	 * row has already been selected by the loop index i.
	 * Note that because sumdeijdg has dimensionality Nx3N
	 * both blocksize (bs) and 3*blocksize (bs3) are used in
	 * calculation of the local submatrix address.
	 */

	for (k = 0; k < 3; k++) {

#ifdef SCALAPACK

	  sg[i3 + k] += d_dij * di[k];
	  sh[i3 + k] += j_dij * di[k];

	  ptr = ptr2f(sumdeijdg, super_Nx3N, j, i3+k, &info);
	  if (ptr != NULL) *ptr += e_dij * di[k];

	  ptr = ptr2f(sumdeijdh, super_Nx3N, j, i3+k, &info);
	  if (ptr != NULL) *ptr += n_dij * di[k];

#else
	  sumdeijdg[in3 + i3 + k] += d_dij * di[k];
	  sumdeijdg[in3 + j3 + k] += d_dij * dj[k];
	  sumdeijdh[in3 + i3 + k] += j_dij * di[k];
	  sumdeijdh[in3 + j3 + k] += j_dij * dj[k];
#endif

#if !defined(OPENMP) && !defined(SCALAPACK)

	  sumdeijdg[jn3 + i3 + k] += e_dij * di[k];
	  sumdeijdg[jn3 + j3 + k] += e_dij * dj[k];
	  sumdeijdh[jn3 + i3 + k] += n_dij * di[k];
	  sumdeijdh[jn3 + j3 + k] += n_dij * dj[k];
#endif
	}
      }

      /* Sum the first derivative accumulators into the gradient vector. */

#ifdef SCALAPACK

      grad[i3    ] += dx;
      grad[i3 + 1] += dy;
      grad[i3 + 2] += dz;

#else

      f[i3    ] += dx;
      f[i3 + 1] += dy;
      f[i3 + 2] += dz;

#endif

    }

    if (gb2_debug) {
      printf("end loop 1\n");
      fflush(stdout);
    }

    /*
     * Do not execute the following nest of loops for single-threads or MPI.
     * For OpenMP update the j elements of the gradient vector f and of the
     * sumdeijda array.  For OpenMP and ScaLAPACK update the j,i and j,j
     * elements of the sumdeijdg, sumdeijdh and Hessian matrices.
     */

#if defined(OPENMP) || defined(SCALAPACK)

    /*
     * Initialize the iexw array used for skipping excluded atoms.
     * Note that because of the manner in which iexw is used, it
     * is necessary to initialize it before only the first iteration
     * of the following loop.
     */

    for (i = -1; i < n; i++) {
      iexw[i] = -1;
    }

    /*
     * Loop over all atoms j.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     */

    for (j = 0; j < n; j++) {

      if ( !myroc(j, BLOCKSIZE, numthreads, threadnum) ) continue;

      /*
       * Skip the pair calculations if there are no atoms i on the
       * pair list of atom j.
       */

      npairs = lpears[j];
      if ( npairs <= 0 ) continue;

      j3 = 3 * j;
      jn = n * j;
      jn3 = n * j3;

      xj = x[j3    ];
      yj = x[j3 + 1];
      zj = x[j3 + 2];

      qj = q[j];
      rj = reff[j];
      rj2 = rj * rj;
      rjinv = 1.0 / rj;

      /* Initialize the first derivative accumulators. */

      dx = dy = dz = 0.0;

      /*
       * Expand the excluded list into the iexw array by storing j
       * at array address i.
       */

      for (i = 0; i < Jblo[j]; i++) {
	iexw[JexclAt[j][i] - 1] = j;
      }

      /* Select atom i from the pair list.  Non-graceful error handling. */

      for (m = 0; m < npairs; m++) {

	if (pearlist[j] == NULL) {
	  printf("NULL pair list entry in egb2 loop 4, taskid = %d\n", mytaskid);
	  fflush(stdout);
	}
	i = pearlist[j][m];

	in = n * i;
	i3 = 3 * i;
	in3 = n * i3;

	xij = x[i3    ] - xj;
	yij = x[i3 + 1] - yj;
	zij = x[i3 + 2] - zj;
	r2 = xij * xij + yij * yij + zij * zij;
	r2inv = 1.0 / r2;

	iaci = prm->Ntypes * (prm->Iac[i] - 1);

	/*
	 * Because index i is retrieved from the pairlist array it is
	 * not constrained to a particular range of values; therefore,
	 * the threads that have loaded the reff array must be
	 * synchronized prior to the use of reff below.
	 */

	qiqj = q[i] * qj;
	ri = reff[i];
	ri2 = ri * ri;
	riinv = 1.0 / ri;
	rirjinv = riinv * rjinv;
	rb2 = ri * rj;
	efac = exp(-r2 / (4.0 * rb2));
	fgbi = 1.0 / sqrt(r2 + rb2 * efac);
	fgbk = -(*kappa) * KSCALE / fgbi;

	/*
	 * Calculate the "non-diagonal" energy term, i.e., the term that is a
	 * function of the effective radius Ri and the interatomic distance Dij.
	 */ 

	expmkf = exp(fgbk) / (*diel_ext);
	dielfac = 1.0 - expmkf;

	temp4 = fgbi * fgbi * fgbi;
	temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf);

	/* The following term is required by equation 1. */

	a_dij = temp6 * (1.0 - 0.25 * efac);

	/*
	 * Prepare to compute the 1st derivatives of the effective radii.
	 * The following code performs the sums indicated by equations 2 and 3.
	 * Compute the contribution of the non-diagonal energy term to the sum
	 * by which the derivative of Rj will be multiplied.
	 */

	temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2);
	c_dij = rj * temp5;

	/* Update the j element of the sumdeijda array. */

	sumdeijda[j] += c_dij;

	/*
	 * Calculate the first and second derivatives of the interatomic
	 * distance Dij with respect to the cartesian coordinates of atoms
	 * i and j.  The results are placed into two arrays:
	 *
	 *   di[] for the first derivatives with respect to atom i
	 *   dj[] for the first derivatives with respect to atom j
	 *   d2jj[] for the second derivatives with respect to atom j
	 *   d2ij[] for the second derivatives with respect to atoms i and j
	 *
	 * Some useful symmetry obtains.  The d2ii and d2ij arrays
	 * are symmetric in that their lower and upper triangles are
	 * the transposes of one another.  Also, d2jj equals d2ii, and
	 * d2ij is the negative of d2ii.
	 *
	 * Note: the additional factor of the interatomic distance Dij that
	 * ought to be included in the denominator of the following equations
	 * is provided by the absence of Dij in equations 15 and 16, as well as
	 * by one less power of Dij in the denominator of the equations for the
	 * derivatives of the Van der Waals and Coulombic energy terms.
	 */
      
	di[0] = xij;
	di[1] = yij;
	di[2] = zij;

	dj[0] = -xij;
	dj[1] = -yij;
	dj[2] = -zij;

	/* Load the upper triangle of d2jj. */

	d2jj[0][0] = 1.0 - xij * xij * r2inv;
	d2jj[0][1] = -xij * yij * r2inv;
	d2jj[0][2] = -xij * zij * r2inv;

	d2jj[1][1] = 1.0 - yij * yij * r2inv;
	d2jj[1][2] = -yij * zij * r2inv;

	d2jj[2][2] = 1.0 - zij * zij * r2inv;

	/* Finish loading the rest of all of the matrices. */

	for (k = 0; k < 3; k++) {

	  /* Load the upper triangle of d2ij. */

	  for (l = k; l < 3; l++) {
	    d2ij[k][l] = -d2jj[k][l];
	  }

	  /* Load the symmetric elements of d2jj and d2ij. */

	  for (l = k+1; l < 3; l++) {
	    d2jj[l][k] = d2jj[k][l];
	    d2ij[l][k] = d2ij[k][l];
	  }
	}

	/*
	 * Compute the Van der Waals and Coulomb energy here because
	 * if egb2 is called nbond2 is not called.  Remember that the
	 * derivatives of Dij with respect to x, y and z supply one more
	 * power of 1/Dij but that power of 1/Dij is included in the
	 * equations below, instead of with the derivatives of Dij.
	 *
	 * Compute the Van der Waals and Coulombic energies for only
	 * those pairs which are not on the excluded atom list.  Any
	 * pair on the excluded atom list will have atom i stored at
	 * address j of the iexw array.  It is not necessary to reset
	 * the elements of the iexw array to -1 between successive
	 * iterations in i because an i,j pair is uniquely identified
	 * by atom i stored at array address j.  Thus for example, the
	 * i,j+1 pair would be stored at the same address as the i,j
	 * pair but after the i,j pair were used.
	 *
	 * Initialize df here instead of within the following if statement
	 * so that df is initialized unconditionally.
	 */

	df = 0.0;

	if (iexw[i] != j) {

	  /* Initialize dg. */

	  dg = 0.0;

	  rinv = 1. / sqrt(r2);
	  r4inv = r2inv * r2inv;

	  /*  gas-phase Coulomb energy:  */

	  eel = qiqj * rinv;

	  df -= eel;
	  dg += 2. * eel;

	  /* Lennard-Jones energy:   */

	  ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	  if( ic >= 0 ){
	    r6inv = r2inv * r2inv * r2inv;
	    f6 = prm->Cn2[ic] * r6inv;
	    f12 = prm->Cn1[ic] * r6inv * r6inv;

	    df += 6. * f6 - 12. * f12;
	    dg += 156. * f12 - 42. * f6;
	  }

	  df *= r2inv;
	  dg *= r4inv;

	  /*
	   * Calculate the second derivatives of Dij for the Van der Waals
	   * and Coulombic interactions.  This calculation uses df and dg
	   * for these interactions only.
	   *
	   * For OpenMP sum the ji and jj terms into the Hessian.
	   * For ScaLAPACK sum the ij term into the Hessian but
	   * sum the jj term into the gg vector.
	   */

	  for (k = 0; k < 3; k++) {
	    for (l = 0; l < 3; l++) {

#ifdef SCALAPACK

	      ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info);
	      if (ptr != NULL) *ptr += df*d2ij[k][l] + dg*di[k]*dj[l];

	      gg[3*(j3 + k) + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];

#else

	      g[n3*(j3 + l) + i3 + k] += df*d2ij[k][l] + dg*di[k]*dj[l];
	      g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];
#endif
	    }
	  }
	}

	/*
	 * Add to df the contribution to the gradient vector from
	 * equation 1 so that it will be added to the gradient vector
	 * along with the contributions from the Van der Waals and
	 * Coulombic terms.
	 */

	df += a_dij;

	/*
	 * Complete the first derivatives of Dij for the Van der Waals
	 * and Coulombic interactions, as well as for the component of
	 * the first derivatives that is contributed by equation 1.
	 *
	 * Sum the derivatives into the derivative accumulators.
	 */

	dedx = df * xij;
	dedy = df * yij;
	dedz = df * zij;

	dx += dedx;
	dy += dedy;
	dz += dedz;

	/* The following code sets up the sums for equations 5-16. */

	temp5 = 0.5 * temp6;
	temp6 = fgbi * fgbi * temp5;
	temp7 = qiqj * (*kappa) * (*kappa) * KSCALE * KSCALE * temp4 * expmkf;
	temp8 = 1 - 0.25 * efac;
	temp9 = 0.25 * temp7 - 1.5 * temp6;
	temp10 = 0.25 * r2;
	temp11 = rb2 + temp10;
	temp12 = ri2 * rj2;
	temp13 = ri * temp11;
	temp14 = rj * temp11;
	temp15 = temp10 * rjinv;
	temp16 = temp10 * riinv;
	temp17 = temp9 * temp13 * efac;
	temp18 = temp9 * temp14 * efac;
	temp20 = temp8 * temp9;
	temp21 = temp20 * temp13;
	temp22 = temp20 * temp14;
	temp23 = 2.0 * efac;

	d_dij = temp23 * (temp21 - 0.25 * temp15 * temp5);

	e_dij = temp23 * (temp22 - 0.25 * temp16 * temp5);

	f_dij = 2.0 * temp8 * temp5;

	g_dij = 4.0 * temp8 * temp8 * temp9
	  + (2.0 * temp8 + efac * temp10 * rirjinv ) * r2inv * temp5;

	h_dij = efac * (temp10 * (temp13 * rjinv - ri2) * temp5 + temp13 * temp17);

	i_dij = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18);

	j_dij = temp23 * ((ri - rjinv * temp11) * 0.25 * temp5 + temp21);

	l_dij = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17);

	m_dij = efac * (temp10 * (temp14 * riinv - rj2) * temp5 + temp14 * temp18);

	n_dij = temp23 * ((rj - riinv * temp11) * 0.25 * temp5 + temp22);


	/*
	 * Evaluate equations 15 and 16 and sum the results to the Hessian
	 * matrix.  These equations involve the derivatives d(Dij)/d(Xl),
	 * d(Dij)/d(Xm) and d2(Dij)/d(Xl)d(Xm), which derivatives exist
	 * only for values of k==i, k==j, l==i and l==j.
	 *
	 * For OpenMP sum the ji and jj terms into the Hessian.
	 * For ScaLAPACK sum the ij term into the Hessian but
	 * sum the jj term into the gg vector.
	 */

	df = f_dij;
	dg = g_dij;

	for (k = 0; k < 3; k++) {
	  for (l = 0; l < 3; l++) {

#ifdef SCALAPACK

	    ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info);
	    if (ptr != NULL) *ptr += df*d2ij[k][l] + dg*di[k]*dj[l];

	    gg[3*(j3 + k) + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];

#else

	    g[n3*(j3 + l) + i3 + k] += df*d2ij[k][l] + dg*di[k]*dj[l];
	    g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l];
#endif
	  } 
	}

	/*
	 * Setup the inner differentiations indicated by equations 10 and 11.
	 * The column index selects the first derivative to be calculated,
	 * i.e., d(Rj).  The row index is j because we are setting up the
	 * outer multiplication by d(Rj).
	 */

#ifdef SCALAPACK

	ptr = ptr2f(sumdeijdn, super_NxN, i, j, &info);
	if (ptr != NULL) *ptr += l_dij;

	sn[j] += m_dij;
#else
	sumdeijdn[jn + i] += i_dij;
	sumdeijdn[jn + j] += m_dij;
#endif

	/*
	 * Perform the inner differentiation indicated by equations 5, 6, 7 and 8.
	 * The row or first index is selected by j because we are setting up the
	 * outer differentiation by the derivatives of Rj.  The column or second
	 * index is selected by i or j because the Dij term affects the derivatives
	 * of atoms i and j.  Both the sumdeijdg and sumdeijdh arrays are used
	 * because equations 5 and 6 perform inner differentiation with respect to
	 * Xl, whereas equations 7 and 8 perform inner differentiation with respect
	 * to Xm.  Distinguishing these two cases is accomplished by transposing
	 * the Hessian addresses, as will be done in the final two calls to the
	 * dgemm() function below.
	 *
	 * Sum those terms that comprise d(Rj) times d(Dij).
	 */

	for (k = 0; k < 3; k++) {

#ifdef SCALAPACK

	  sg[j3 + k] += e_dij * dj[k];
	  sh[j3 + k] += n_dij * dj[k];

	  ptr = ptr2f(sumdeijdg, super_Nx3N, i, j3+k, &info);
	  if (ptr != NULL) *ptr += d_dij * dj[k];

	  ptr = ptr2f(sumdeijdh, super_Nx3N, i, j3+k, &info);
	  if (ptr != NULL) *ptr += j_dij * dj[k];
#else
	  sumdeijdg[jn3 + i3 + k] += e_dij * di[k];
	  sumdeijdg[jn3 + j3 + k] += e_dij * dj[k];
	  sumdeijdh[jn3 + i3 + k] += n_dij * di[k];
	  sumdeijdh[jn3 + j3 + k] += n_dij * dj[k];
#endif
	}
      }

      /*
       * For OpenMP but not for ScaLAPACK, sum the first derivative
       * accumulators into the gradient vector f.
       */

#ifdef OPENMP

      f[j3    ] -= dx;
      f[j3 + 1] -= dy;
      f[j3 + 2] -= dz;

#endif

    }

    if (gb2_debug) {
      printf("end loop 2\n");
      fflush(stdout);
    }

#endif /* if defined(OPENMP) || defined(SCALAPACK) */


    /*
     * There is no pragma directive associated with the above 'for' loop
     * so explicit thread synchronization is performed here.
     */

#pragma omp barrier

    /*
     * The SCALAPACK synchronization is accomplised via reduction
     * of the sumdeijda array.  Each element of this array should
     * be non-zero for one process only.
     *
     * No reduction is needed for MPI because no parallelization is performed.
     *
     * Actually, for both OpenMP and SCALAPACK it is possible to defer
     * synchronization until after the following (i,j) nest of loops
     * because the mapping of loop indices to threads or tasks is
     * identical to that of the two previous nests of loops, and because
     * only the i elements of the sumdeijda array are read in the following
     * nest of loops.  However, synchronization is possible at this point,
     * and may serve to better balance the workload in the next two nests
     * of loops.
     */

#ifdef SCALAPACK

    ierror = MPI_Allreduce(sumdeijda, reductarr, n,
			   MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (ierror != MPI_SUCCESS) {
      printf("Error in egb sumdeijda reduction, error = %d  mytaskid = %d\n",
	     ierror, mytaskid);
    }

    if (gb2_debug) {
      printf("MPI_Allreduce sumdeijda\n");
      fflush(stdout);
    }

    for (i = 0; i < n; i++) {
      sumdeijda[i] = reductarr[i];
    }

#endif

    /* Free the iexw array within this potentially parallel region of code. */

    free_ivector(iexw, -1, n);

    /*
     * Compute the first and second derivatives of the effective radius.
     * Update the gradient vector using the first derivatives.  Complete
     * equations 2, 3 and 4.  Begin equations 13, 14 and 17 which update
     * the Hessian matrix.
     *
     * For OpenMP this loop updates the i elements of the gradient
     * vector f as well as the (i,i) and (i,j) elements of the dreff and
     * Hessian matrices.
     *
     * For ScaLAPACK this loop updates the i and j elements of the gradient
     * vector grad as well as the (i,i) and (i,j) elements of the dreff and
     * Hessian matrices.
     *
     * For MPI and single-threaded execution this loop updates all elements
     * of all vectors and matrices, so the following (j,i) loop is not
     * necessary.
     *
     * Loop over all atoms i.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     *
     * It is not necessary to synchronize OpenMP threads following
     * this loop because the next loop uses identical loop index to
     * thread mapping.  Hence there will be no race condition associated
     * with updates of the dreff, gradient and Hessian arrays.
     */

    for (i = 0; i < n; i++) {

#if defined(OPENMP) || defined(SCALAPACK)

      if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue;

#endif

      /*
       * Don't calculate derivatives of the effective radius of atom i
       * if atom i is frozen or if there are no pair atoms j associated
       * with atom i.
       */

      npairs = lpears[i] + upears[i];
      if ( frozen[i] || (npairs <= 0) ) continue;

      /* Initialize the first derivative accumulators. */

      dx = dy = dz = 0.0;

      i3 = 3 * i;
      in = n * i;
      in3 = 3 * in;

      /*
       * The following captures all terms that are to be multiplied by the
       * first derivative of the effective radius (equations 2, 3 and 4).
       */

      sumda = sumdeijda[i];

      if ( gb>1 ) {

	ri = rborn[i] - BOFFSET;
	thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] );
	sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i])
	  *(1.0 - thi*thi)*ri/rborn[i];
      }

      /*
       * The following captures all terms that are to be multiplied by the
       * second derivative of the effective radius (equations 13, 14 and 17).
       */

      sumda = sumdeijda[i];

      /* Prepare to compute the derivatives of the effective radius Ri. */

      xi = x[i3    ];
      yi = x[i3 + 1];
      zi = x[i3 + 2];
      ri = rborn[i] - BOFFSET;
      ri1i = 1. / ri;

      /* Select atom j from the pair list.  Non-graceful error handling. */

      for (m = 0; m < npairs; m++) {

	if (pearlist[i] == NULL) {
	  printf("NULL pair list entry in egb2 loop 5, taskid = %d\n", mytaskid);
	  fflush(stdout);
	}
	j = pearlist[i][m];

	j3 = 3 * j;
	jn3 = n * j3;

	xij = xi - x[j3    ];
	yij = yi - x[j3 + 1];
	zij = zi - x[j3 + 2];
	r2 = xij * xij + yij * yij + zij * zij;

	/* Ignore the ij atom pair if their separation exceeds the GB cutoff. */

	if ( r2 > rgbmaxpsmax2 ) continue;

	dij1i = 1.0 / sqrt(r2);
	dij2i = dij1i * dij1i;
	dij = r2 * dij1i;
	sj = fs[j] * (rborn[j] - BOFFSET);
	sj2 = sj * sj;

	/*
	 * The following are the numerator of the first derivatives of the
	 * equations from the Appendix of Schaefer and Froemmel and of the
	 * Taylor series expansion for d>>s by Andreas Svrcek-Seiler.  Smooth
	 * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev.  The
	 * derivatives are with respect to Dij.  The complete derivative is
	 * formed by multiplying the numerator by -Ri*Ri.  The factor of Ri*Ri
	 * has been moved to equations 5-18.  The negation is deferred until later.
	 * When the chain rule is used to form the first derivatives of the
	 * effective radius with respect to the cartesian coordinates, an
	 * additional factor of Dij appears in the denominator.  That factor
	 * is included in the following expressions.
	 */

	if ( dij > rgbmax + sj ) continue;

	if ( dij > rgbmax - sj ) {

	  temp1 = 1./(dij - sj);
	  dij3i = dij1i * dij2i;

	  datmp = 0.125 * dij3i * ((r2 + sj2) *
				   (temp1 * temp1 - rgbmax2i) -
				   2.0 * log(rgbmax * temp1));

	} else if ( dij > 4.0 * sj ) {

	  tmpsd = sj2 * dij2i;
	  dumbo = TE + tmpsd*(TF + tmpsd*(TG + tmpsd*(TH + tmpsd*THH)));
	  datmp = tmpsd * sj * dij2i * dij2i * dumbo;

	} else if ( dij > ri + sj ) {

	  temp1 = 1. / (r2 - sj2);
	  datmp = temp1 * sj * (-0.5 * dij2i + temp1)
	    + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj));

	} else if ( dij > fabs(ri - sj) ) {

	  temp1 = 1. / (dij + sj);
	  dij3i = dij2i * dij1i;
	  datmp =
	    -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i *
		     ri1i + dij1i * temp1 * (temp1 - dij1i)
		     - dij3i * log(ri * temp1));

	} else if ( ri < sj ) {

	  temp1 = 1. / (r2 - sj2);
	  datmp =
	    -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 -
		    0.5 * dij2i * dij1i * log((sj - dij) /
					      (sj + dij)));

	} else continue;

	/*
	 * Negate the first derivative of Ri with respect to the
	 * interatomic distance in order to account for the
	 * minus sign of -Ri*Ri.  The factor of Ri*Ri is omitted
	 * because it was included in equations 5-18.  These
	 * derivatives contain the appropriate factor of Dij
	 * in the denominator because it is supplied by datmp.
	 */

	df = -datmp;

	/*
	 * The following are the numerator of the second derivatives of the
	 * equations from the Appendix of Schaefer and Froemmel and of the
	 * Taylor series expansion for d>>s by Andreas Svrcek-Seiler.  Smooth
	 * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev.  The
	 * derivatives are with respect to Dij.
	 */

	if ( dij > rgbmax + sj ) continue;

	if ( dij > rgbmax - sj ) {

	  temp1 = 1./(dij - sj);
	  temp2 = temp1 * temp1;
	  dij3i = dij1i * dij2i;

	  da2tmp = -0.25 * (dij3i * (sj2 * rgbmax2i - 1.0) + dij1i * temp2 +
			    2.0 * (log(rgbmax * temp1) * dij3i +
				   temp1 * (dij2i - temp2)));

	} else if ( dij > 4.0 * sj ) {

	  tmpsd = sj2 * dij2i;
	  dumbo = TI + tmpsd*(TJ + tmpsd*(TK + tmpsd*(TL + tmpsd*TLL)));
	  da2tmp = tmpsd * sj * dij2i * dij2i * dumbo;


	} else if ( dij > ri + sj ) {

	  temp1 = 2. * dij * sj;
	  temp2 = (dij - sj) * (dij + sj);
	  dij3i = dij2i * dij1i;

	  da2tmp = (0.5 * log((dij - sj) / (dij + sj)) +
		    dij * sj * ((r2 + sj2) * (r2 - sj2) + temp1 * temp1) /
		    (temp2 * temp2 * temp2)) * dij3i;

	} else if ( dij > fabs(ri - sj) ) {

	  temp1 = dij + sj;
	  temp2 = dij * temp1;
	  ri2 = ri * ri;

	  da2tmp = ((2. * ri2 * log(ri / temp1) + sj2) * temp1 * temp1 * temp1
		    + ri2 * sj * (2. * dij * dij - dij * sj - sj2)) /
	    (4. * ri2 * temp2 * temp2 * temp2);

	} else if ( ri < sj ) {

	  temp1 = sj2 - r2;
	  temp2 = 2. * sj * dij;
	  dij3i = dij2i * dij1i;

	  da2tmp = (0.5 * log((sj - dij) / (sj + dij)) + sj * dij *
		    (temp1 * (sj2 + r2) - temp2 * temp2) /
		    (temp1 * temp1 * temp1)) * dij3i;

	} else continue;

	/* 
	 * Calculate the second derivative of the effective radius
	 * with respect to the interatomic distance.  A factor of
	 * the Ri*Ri has been omitted from the second derivative
	 * because it was included in equations 5-18.
	 *
	 * When the chain rule is used to form the second derivatives of the
	 * effective radius with respect to the cartesian coordinates, an
	 * additional factor of Dij*Dij appears in the denominator.  That
	 * factor is included in the following expression.
	 */

	dg = da2tmp * dij2i;

	/*
	 * Calculate the first and second derivatives of the interatomic
	 * distance Dij with respect to the cartesian coordinates of atoms
	 * i and j.  The results are placed into five arrays:
	 *
	 *   di[] for the first derivatives with respect to atom i
	 *   dj[] for the first derivatives with respect to atom j
	 *   d2ii[] for the second derivatives with respect to atom i
	 *   d2jj[] for the second derivatives with respect to atom j
	 *   d2ij[] for the second derivatives with respect to atoms i and j
	 *
	 * Some useful symmetry obtains.  The d2ii and d2ij arrays
	 * are symmetric in that their lower and upper triangles are
	 * the transposes of one another.  Also, d2jj equals d2ii, and
	 * d2ij is the negative of d2ii.
	 *
	 *
	 * Note: the additional factor of the interatomic distance Dij that
	 * ought to be included in the denominator of the following equations
	 * is included instead in the first and second derivatives of the
	 * effective radius with respect to Dij (df and dg) that were calculated
	 * above.
	 */
      
	di[0] = xij;
	di[1] = yij;
	di[2] = zij;

	dj[0] = -xij;
	dj[1] = -yij;
	dj[2] = -zij;

	/* Load the upper triangle of d2ii. */

	r2inv = 1.0 / r2;

	d2ii[0][0] = 1.0 - xij * xij * r2inv;
	d2ii[0][1] = -xij * yij * r2inv;
	d2ii[0][2] = -xij * zij * r2inv;

	d2ii[1][1] = 1.0 - yij * yij * r2inv;
	d2ii[1][2] = -yij * zij * r2inv;

	d2ii[2][2] = 1.0 - zij * zij * r2inv;

	/* Finish loading the rest of all of the matrices. */

	for (k = 0; k < 3; k++) {

	  /* Load the upper triangles of d2jj and d2ij. */

	  for (l = k; l < 3; l++) {

#if !defined(OPENMP) && !defined(SCALAPACK)

	    d2jj[k][l] = d2ii[k][l];
#endif
	    d2ij[k][l] = -d2ii[k][l];
	  }

	  /* Load the symmetric elements of d2ii, d2jj and d2ij. */

	  for (l = k+1; l < 3; l++) {
	    d2ii[l][k] = d2ii[k][l];

#if !defined(OPENMP) && !defined(SCALAPACK)

	    d2jj[l][k] = d2jj[k][l];
#endif
	    d2ij[l][k] = d2ij[k][l];
	  }
	}

	/*
	 * Store the 1st derivatives of Ri with respect to (xj,yj,zj)
	 * if SCALAPACK is not defined.  If SCALAPACK is defined, store
	 * these derivatives in the next nest of loops.
	 */

	dxj = df * dj[0];
	dyj = df * dj[1];
	dzj = df * dj[2];

#ifndef SCALAPACK

	dreff[in3 + j3    ] = dxj;
	dreff[in3 + j3 + 1] = dyj;
	dreff[in3 + j3 + 2] = dzj;
#endif

	/*
	 * If OPENMP is not defined, update the gradient vector f
	 * with d(Ri)/d(Xj).  If SCALAPACK is defined update the
	 * gradient vector grad instead of the gradient vector f.
	 */

#ifdef SCALAPACK

	grad[j3    ] += sumda * dxj;
	grad[j3 + 1] += sumda * dyj;
	grad[j3 + 2] += sumda * dzj;

#elif !defined(OPENMP)

	f[j3    ] += sumda * dxj;
	f[j3 + 1] += sumda * dyj;
	f[j3 + 2] += sumda * dzj;
#endif

	/*
	 * Sum the 1st derivatives of Ri with respect to (xi,yi,zi).
	 * Exploit the fact that d(Dij)/d(Xi) = -d(Dij)/d(Xj).
	 */

	dx -= dxj;
	dy -= dyj;
	dz -= dzj;

	/*
	 * Form the second derivatives of the effective radius with
	 * respect to the cartesian coordinates using the chain rule,
	 * multiply the result by sumda and sum to the Hessian matrix.
	 * This step implements equations 13, 14 and 17, except that
	 * terms such as d2(Ri)/d(Dij)d(Dik) are not handled here
	 * but instead are handled in the first call to dgemm() below.
	 *
	 * When in single-threaded mode update the Hessian using the ii, ij
	 * ji and jjterms.  For OpenMP update the Hessian using the ii and ij
	 * terms only.  For ScaLAPACK update the Hessian using the ji term
	 * only and update the i-element of the gg vector.
	 */

	for (k = 0; k < 3; k++) {
	  for (l = 0; l < 3; l++) {
	    ijterm = sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]);

#ifdef SCALAPACK

	    gg[3*(i3 + k) + l] += sumda*(df*d2ii[k][l] + dg*di[k]*di[l]);

	    ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info);
	    if (ptr != NULL) *ptr += ijterm;
#else
	    g[n3*(i3 + k) + i3 + l] += sumda*(df*d2ii[k][l] + dg*di[k]*di[l]);
	    g[n3*(i3 + k) + j3 + l] += ijterm;
#endif

#if !defined(OPENMP) && !defined(SCALAPACK)

	    g[n3*(j3 + l) + i3 + k] += ijterm;
	    g[n3*(j3 + k) + j3 + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]);
#endif
	  }
	}
      }

      /*
       * Store the sum of first derivatives of Ri with respect to (xi,yi,zi)
       * in the diagonal elements of the dreff array.  These derivatives
       * include the appropriate factor of Dij but not a factor of Ri*Ri.
       * They include the minus sign of -Ri*Ri that was supplied in df.
       */

#ifdef SCALAPACK

      sr[i3    ] = dx;
      sr[i3 + 1] = dy;
      sr[i3 + 2] = dz;

#else

      dreff[in3 + i3    ] = dx;
      dreff[in3 + i3 + 1] = dy;
      dreff[in3 + i3 + 2] = dz;

#endif

      /* Update the gradient vector with d(Ri)/d(Xi). */

#ifdef SCALAPACK

      grad[i3    ] += sumda * dx;
      grad[i3 + 1] += sumda * dy;
      grad[i3 + 2] += sumda * dz;

#else

      f[i3    ] += sumda * dx;
      f[i3 + 1] += sumda * dy;
      f[i3 + 2] += sumda * dz;

#endif

    }

    if (gb2_debug) {
      printf("end loop 3\n");
      fflush(stdout);
    }

    /*
     * This nest of loops updates the (j,i) and (j,j) elements of the
     * Hessian matrix for ScaLAPACK or OpenMP.  For OpenMP update
     * the j elements of the gradient vector f.  For ScaLAPACK do
     * not update the gradient vector grad.  This nest of loop is
     * not necessary for single-threaded execution or for MPI.
     * 
     * The control flow in this nest of loops must be identical to 
     * that of the preceeding nest of loops.  Perform the minimum
     * calculation necessary to reconstitute the conditions of
     * the preceeding nest of loops.
     *
     * Because no computation follows the following 'for' loop within
     * this parallel region, the implied barrier at the end of the
     * parallel region enforces the necessary thread synchronization
     * for OpenMP threads.
     */

#if defined(OPENMP) || defined(SCALAPACK)

    /*
     * Loop over all atoms j.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     */

    for (j = 0; j < n; j++) {

      if ( !myroc(j, BLOCKSIZE, numthreads, threadnum) ) continue;

      /*
       * Don't calculate derivatives with respect to atom j if there
       * are no atoms i associated with atom j.
       */

      npairs = lpears[j] + upears[j];
      if ( npairs <= 0 ) continue;

      j3 = 3 * j;
      jn3 = n * j3;

      xj = x[j3    ];
      yj = x[j3 + 1];
      zj = x[j3 + 2];

      sj = fs[j] * (rborn[j] - BOFFSET);
      sj2 = sj * sj;

      /* Initialize the first derivative accumulators. */

      dx = dy = dz = 0.0;

       /* Select atom i from the pair list.  Non-graceful error handling. */

      for (m = 0; m < npairs; m++) {

	if (pearlist[j] == NULL) {
	  printf("NULL pair list entry in egb2 loop 6, taskid = %d\n", mytaskid);
	  fflush(stdout);
	}
	i = pearlist[j][m];

	/*
	 * Don't calculate derivatives of the effective radius of atom i
	 * if atom i is frozen.
	 */

	if ( frozen[i] ) continue;

	i3 = 3 * i;
	in = (n * i);

	xij = x[i3    ] - xj;
	yij = x[i3 + 1] - yj;
	zij = x[i3 + 2] - zj;
	r2 = xij * xij + yij * yij + zij * zij;

	/* Ignore the ij atom pair if their separation exceeds the GB cutoff. */

	if ( r2 > rgbmaxpsmax2 ) continue;

	dij1i = 1.0 / sqrt(r2);
	dij2i = dij1i * dij1i;
	dij = r2 * dij1i;

	ri = rborn[i] - BOFFSET;
	ri1i = 1. / ri;

	/*
	 * The following are the numerator of the first derivatives of the
	 * equations from the Appendix of Schaefer and Froemmel and of the
	 * Taylor series expansion for d>>s by Andreas Svrcek-Seiler.  Smooth
	 * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev.  The
	 * derivatives are with respect to Dij.  The complete derivative is
	 * formed by multiplying the numerator by -Ri*Ri.  The factor of Ri*Ri
	 * has been moved to equations 5-18.  The negation is deferred until later.
	 * When the chain rule is used to form the first derivatives of the
	 * effective radius with respect to the cartesian coordinates, an
	 * additional factor of Dij appears in the denominator.  That factor
	 * is included in the following expressions.
	 */

	if ( dij > rgbmax + sj ) continue;

	if ( dij > rgbmax - sj ) {

	  temp1 = 1./(dij - sj);
	  dij3i = dij1i * dij2i;

	  datmp = 0.125 * dij3i * ((r2 + sj2) *
				   (temp1 * temp1 - rgbmax2i) -
				   2.0 * log(rgbmax * temp1));

	} else if ( dij > 4.0 * sj ) {

	  tmpsd = sj2 * dij2i;
	  dumbo = TE + tmpsd*(TF + tmpsd*(TG + tmpsd*(TH + tmpsd*THH)));
	  datmp = tmpsd * sj * dij2i * dij2i * dumbo;

	} else if ( dij > ri + sj ) {

	  temp1 = 1. / (r2 - sj2);
	  datmp = temp1 * sj * (-0.5 * dij2i + temp1)
	    + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj));

	} else if ( dij > fabs(ri - sj) ) {

	  temp1 = 1. / (dij + sj);
	  dij3i = dij2i * dij1i;
	  datmp =
	    -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i *
		     ri1i + dij1i * temp1 * (temp1 - dij1i)
		     - dij3i * log(ri * temp1));

	} else if ( ri < sj ) {

	  temp1 = 1. / (r2 - sj2);
	  datmp =
	    -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 -
		    0.5 * dij2i * dij1i * log((sj - dij) /
					      (sj + dij)));

	} else continue;

	/*
	 * Negate the first derivative of Ri with respect to the
	 * interatomic distance in order to account for the
	 * minus sign of -Ri*Ri.  The factor of Ri*Ri is omitted
	 * because it was included in equations 5-18.  These
	 * derivatives contain the appropriate factor of Dij
	 * in the denominator because it is supplied by datmp.
	 */

	df = -datmp;

	/*
	 * The following are the numerator of the second derivatives of the
	 * equations from the Appendix of Schaefer and Froemmel and of the
	 * Taylor series expansion for d>>s by Andreas Svrcek-Seiler.  Smooth
	 * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev.  The
	 * derivatives are with respect to Dij.
	 */

	if ( dij > rgbmax + sj ) continue;

	if ( dij > rgbmax - sj ) {

	  temp1 = 1./(dij - sj);
	  temp2 = temp1 * temp1;
	  dij3i = dij1i * dij2i;

	  da2tmp = -0.25 * (dij3i * (sj2 * rgbmax2i - 1.0) + dij1i * temp2 +
			    2.0 * (log(rgbmax * temp1) * dij3i +
				   temp1 * (dij2i - temp2)));

	} else if ( dij > 4.0 * sj ) {

	  tmpsd = sj2 * dij2i;
	  dumbo = TI + tmpsd*(TJ + tmpsd*(TK + tmpsd*(TL + tmpsd*TLL)));
	  da2tmp = tmpsd * sj * dij2i * dij2i * dumbo;


	} else if ( dij > ri + sj ) {

	  temp1 = 2. * dij * sj;
	  temp2 = (dij - sj) * (dij + sj);
	  dij3i = dij2i * dij1i;

	  da2tmp = (0.5 * log((dij - sj) / (dij + sj)) +
		    dij * sj * ((r2 + sj2) * (r2 - sj2) + temp1 * temp1) /
		    (temp2 * temp2 * temp2)) * dij3i;

	} else if ( dij > fabs(ri - sj) ) {

	  temp1 = dij + sj;
	  temp2 = dij * temp1;
	  ri2 = ri * ri;

	  da2tmp = ((2. * ri2 * log(ri / temp1) + sj2) * temp1 * temp1 * temp1
		    + ri2 * sj * (2. * dij * dij - dij * sj - sj2)) /
	    (4. * ri2 * temp2 * temp2 * temp2);

	} else if ( ri < sj ) {

	  temp1 = sj2 - r2;
	  temp2 = 2. * sj * dij;
	  dij3i = dij2i * dij1i;

	  da2tmp = (0.5 * log((sj - dij) / (sj + dij)) + sj * dij *
		    (temp1 * (sj2 + r2) - temp2 * temp2) /
		    (temp1 * temp1 * temp1)) * dij3i;

	} else continue;

	/* 
	 * Calculate the second derivative of the effective radius
	 * with respect to the interatomic distance.  A factor of
	 * the Ri*Ri has been omitted from the second derivative
	 * because it was included in equations 5-18.
	 *
	 * When the chain rule is used to form the second derivatives of the
	 * effective radius with respect to the cartesian coordinates, an
	 * additional factor of Dij*Dij appears in the denominator.  That
	 * factor is included in the following expression.
	 */

	dg = da2tmp * dij2i;

	/*
	 * Calculate the first and second derivatives of the interatomic
	 * distance Dij with respect to the cartesian coordinates of atoms
	 * i and j.  The results are placed into four arrays:
	 *
	 *   di[] for the first derivatives with respect to atom i
	 *   dj[] for the first derivatives with respect to atom j
	 *   d2jj[] for the second derivatives with respect to atom j
	 *   d2ij[] for the second derivatives with respect to atoms i and j
	 *
	 * Note: the additional factor of the interatomic distance Dij that
	 * ought to be included in the denominator of the following equations
	 * is provided by the absence of Dij in equations 15 and 16, as well as
	 * by one less power of Dij in the denominator of the equations for the
	 * derivatives of the Van der Waals and Coulombic energy terms.
	 */
      
	di[0] = xij;
	di[1] = yij;
	di[2] = zij;

	dj[0] = -xij;
	dj[1] = -yij;
	dj[2] = -zij;

	/* Load the upper triangle of d2jj. */

	r2inv = 1.0 / r2;

	d2jj[0][0] = 1.0 - xij * xij * r2inv;
	d2jj[0][1] = -xij * yij * r2inv;
	d2jj[0][2] = -xij * zij * r2inv;

	d2jj[1][1] = 1.0 - yij * yij * r2inv;
	d2jj[1][2] = -yij * zij * r2inv;

	d2jj[2][2] = 1.0 - zij * zij * r2inv;

	/* Finish loading the rest of all of the matrices. */

	for (k = 0; k < 3; k++) {

	  /* Load the upper triangle of d2ij. */

	  for (l = k; l < 3; l++) {
	    d2ij[k][l] = -d2jj[k][l];
	  }

	  /* Load the symmetric elements of d2ij and d2jj. */

	  for (l = k+1; l < 3; l++) {
	    d2ij[l][k] = d2ij[k][l];
	    d2jj[l][k] = d2jj[k][l];
	  }
	}

	/*
	 * The following captures all terms that are to be multiplied by the
	 * second derivative of the effective radius (equations 13, 14 and 17).
	 *
	 * Because index i is retrieved from the pairlist array it is
	 * not constrained to a particular range of values; therefore,
	 * the threads that have loaded the sumdeijda array have been
	 * synchronized above prior to the use of sumdeijda below.
	 */

	sumda = sumdeijda[i];

	/*
	 * Form the second derivatives of the effective radius with
	 * respect to the cartesian coordinates using the chain rule,
	 * multiply the result by sumda and sum to the Hessian matrix.
	 * This step implements equations 13, 14 and 17, except that
	 * terms such as d2(Ri)/d(Dij)d(Dik) are not handled here
	 * but instead are handled in the first call to dgemm() below.
	 *
	 * For OpenMP update the Hessian using the ji and jj terms only.
	 * For ScaLAPACK update the Hessian using the ij term only, and
	 * update the j element of the ff vector.
	 */

	for (k = 0; k < 3; k++) {
	  for (l = 0; l < 3; l++) {

#ifdef SCALAPACK

	    ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info);
	    if (ptr != NULL) *ptr += sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]);

	    gg[3*(j3 + k) + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]);

#else

	    g[n3*(j3 + l) + i3 + k] += sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]);
	    g[n3*(j3 + k) + j3 + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]);
#endif
	  }
	}

	/*
	 * The following captures all terms that are to be multiplied by the
	 * first derivative of the effective radius (equations 2, 3 and 4).
	 */

	if ( gb>1 ) {

	  ri = rborn[i] - BOFFSET;
	  thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] );
	  sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i])
	    *(1.0 - thi*thi)*ri/rborn[i];
	}

	/* Store the 1st derivatives of Ri with respect to (xj,yj,zj). */

	dxj = df * dj[0];
	dyj = df * dj[1];
	dzj = df * dj[2];

#ifdef SCALAPACK

	ptr = ptr2f(dreff, super_Nx3N, i, j3+0, &info);
	if (ptr != NULL) *ptr = dxj;

	ptr = ptr2f(dreff, super_Nx3N, i, j3+1, &info);
	if (ptr != NULL) *ptr = dyj;

	ptr = ptr2f(dreff, super_Nx3N, i, j3+2, &info);
	if (ptr != NULL) *ptr = dzj;
#endif

	/* Update the first derivative accumulators with d(Ri)/d(Xj). */

	dx += sumda * dxj;
	dy += sumda * dyj;
	dz += sumda * dzj;
      }

      /*
       * Update the gradient vector f from the first derivative accumulators,
       * but only for OpenMP, not for ScaLAPACK.
       */

#ifdef OPENMP

      f[j3    ] += dx;
      f[j3 + 1] += dy;
      f[j3 + 2] += dz;

#endif

    }

    if (gb2_debug) {
      printf("end loop 4\n");
      fflush(stdout);
    }

#endif /* if defined(OPENMP) || defined(SCALAPACK) */

  }

#ifdef SCALAPACK

  /*
   * Add the gradient vector grad to the gradient vector f.
   * This step is straightforward for a 1x1 process grid
   * but requires multiple steps for a larger process grid.
   */

  if (gridim == 1) {
    if (context_PxQ >= 0) {
      for (i = 0; i < m3; i++) {
	f[i] += grad[i];
      }

      if (gb2_debug) {
	printf("f += grad\n");
	fflush(stdout);
      }    
    }
  } else {

    /*
     * Perform a reduction over the gradient vector grad.
     * Leave the result in reductarr which exists in all processes.
     */

    ierror = MPI_Allreduce(grad, reductarr, m3,
			   MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (ierror != MPI_SUCCESS) {
      printf("Error in egb grad reduction, error = %d  mytaskid = %d\n",
	     ierror, mytaskid);
    }

    if (gb2_debug) {
      printf("MPI_Allreduce grad\n");
      fflush(stdout);
    }

    /* Gather f from context_PxQ into grad in context_1x1. */

    pdgemr2d_(&m3, &one,
	      f, &one, &one, desc_3Nx1,
	      grad, &one, &one, desc_1x1, &context_Nx1);

    if (gb2_debug) {
      printf("pgemr2d f\n");
      fflush(stdout);
    }

    /* Add reductarr to grad. */
    
    if (context_1x1 >= 0) {
      for (i = 0; i < m3; i++) {
	grad[i] += reductarr[i];
      }

      if (gb2_debug) {
	printf("f += reductarr\n");
	fflush(stdout);
      }
    }

    /* Distribute grad from context_1x1 into f in context_PxQ. */

    pdgemr2d_(&m3, &one,
	      grad, &one, &one, desc_1x1,
	      f, &one, &one, desc_3Nx1, &context_Nx1);

    if (gb2_debug) {
      printf("pgemr2d grad\n");
      fflush(stdout);
    }    
  }

  /*
   * The diagonal elements of the Hessian, sumdeijdg, sumdeijgh,
   * sumdeijdn and dreff matrices are stored in the gg, sg, sh,
   * sn and sr vectors.  Perform a reduction across each vector
   * and write the result into corresponding matrix.
   *
   * Here is the code for the Hessian.
   */

  ierror = MPI_Allreduce(gg, reductarr, 3*m3,
			 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (ierror != MPI_SUCCESS) {
    printf("Error in egb gg reduction, error = %d  mytaskid = %d\n",
	   ierror, mytaskid);
  }

  if (gb2_debug) {
    printf("MPI_Allreduce gg\n");
    fflush(stdout);
  }

  for (i = 0; i < n; i++) {
    i3 = 3*i;
    for (k = 0; k < 3; k++) {
      for (l = 0; l < 3; l++) {
	ptr = ptr2f(g, super_3Nx3N, i3+k, i3+l, &info);
	if (ptr != NULL) *ptr += reductarr[3*(i3 + k) + l];
      }
    }
  }

  /* Here is the code for sumdeijdg. */

  ierror = MPI_Allreduce(sg, reductarr, m3,
			 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (ierror != MPI_SUCCESS) {
    printf("Error in egb sg reduction, error = %d  mytaskid = %d\n",
	   ierror, mytaskid);
  }

  if (gb2_debug) {
    printf("MPI_Allreduce sg\n");
    fflush(stdout);
  }

  for (i = 0; i < n; i++) {
    i3 = 3*i;
    for (k = 0; k < 3; k++) {
      ptr = ptr2f(sumdeijdg, super_Nx3N, i, i3+k, &info);
      if (ptr != NULL) *ptr += reductarr[i3 + k];
    }
  }

  /* Here is the code for sumdeijdh. */

  ierror = MPI_Allreduce(sh, reductarr, m3,
			 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (ierror != MPI_SUCCESS) {
    printf("Error in egb sh reduction, error = %d  mytaskid = %d\n",
	   ierror, mytaskid);
  }

  if (gb2_debug) {
    printf("MPI_Allreduce sh\n");
    fflush(stdout);
  }

  for (i = 0; i < n; i++) {
    i3 = 3*i;
    for (k = 0; k < 3; k++) {
      ptr = ptr2f(sumdeijdh, super_Nx3N, i, i3+k, &info);
      if (ptr != NULL) *ptr += reductarr[i3 + k];
    }
  }

  /* Here is the code for dreff. */

  ierror = MPI_Allreduce(sr, reductarr, m3,
			 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (ierror != MPI_SUCCESS) {
    printf("Error in egb sr reduction, error = %d  mytaskid = %d\n",
	   ierror, mytaskid);
  }

  if (gb2_debug) {
    printf("MPI_Allreduce sr\n");
    fflush(stdout);
  }

  for (i = 0; i < n; i++) {
    i3 = 3*i;
    for (k = 0; k < 3; k++) {
      ptr = ptr2f(dreff, super_Nx3N, i, i3+k, &info);
      if (ptr != NULL) *ptr += reductarr[i3 + k];
    }
  }

  /* Here is the code for sumdeijdn. */

  ierror = MPI_Allreduce(sn, reductarr, n,
			 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if (ierror != MPI_SUCCESS) {
    printf("Error in egb sn reduction, error = %d  mytaskid = %d\n",
	   ierror, mytaskid);
  }

  if (gb2_debug) {
    printf("MPI_Allreduce sn\n");
    fflush(stdout);
  }

  for (i = 0; i < n; i++) {
    ptr = ptr2f(sumdeijdn, super_NxN, i, i, &info);
    if (ptr != NULL) *ptr += reductarr[i];
  }

  /*
   * Add to the diagonal elements of sumdeijdn the d2(Ri)/d(Dij)d(Dik)
   * terms weighted by Ri and sumdeijda[i].
   */

  for (i = 0; i < n; i++) {
    ptr = ptr2f(sumdeijdn, super_NxN, i, i, &info);
    if (ptr != NULL) {
      *ptr += 2.0 * sumdeijda[i] * reff[i];
    }
  }

  t2 = seconds();
  if (gb2_debug) {
    printf("\nmatrix preparation time = %10.2f  n = %d  m3 = %d\n\n",
	   t2 - t1, n, m3);
    fflush(stdout);
  }
  tloop += t2 - t1;
  t1 = t2;

  /*
   * Perform the inner multiplications of equations 9, 10, 11, 12
   * and 18 in a manner analogous to the multiplications of
   * of equations 2, 3 and 4.  Also, multiply each element of each
   * row of the dreff array by the d2(Ri)/d(Dij)d(Dik) terms.
   *
   * Next perform the outer multiplication for equations 5, 6, 9, 10,
   * 11 and 12.
   *
   * Finally, perform the outer multiplication for equations 7 and 8.
   *
   * Some ScaLAPACK functions appear to quit unexpectedly
   * for large matrices on a 1x1 process grid, e.g. the pdgemm_
   * function in newton.c, so bypass the ScaLAPACK pdgemm_ and
   * use the LAPACK dgemm_ function instead.
   *
   * The correct test is (gridim == 1), not (nprow == 1 && npcol == 1)
   * since processes that aren't on the 1x1 grid have nprow == npcol == -1,
   * which would direct control to pdgemr2d_ (below) that would hang because
   * it would not be called from all processes, specifically not from
   * the process that is on the 1x1 grid and has nprow == npcol == 1.
   */

 if (gridim == 1) {
    if (context_PxQ >= 0) {

      transa = 'T';
      transb = 'N';
      dgemm_(&transa, &transb, &n, &m3, &n, &dblone,
	     sumdeijdn, &n, dreff, &n, &dblone, sumdeijdg, &n);

      t2 = seconds();
      if (gb2_debug) {
	printf("dgemm1 time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
      tb2g1 += t2 - t1;
      t1 = t2;

      transa = 'T';
      transb = 'N';
      dgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	     dreff, &n, sumdeijdg, &n, &dblone, g, &m3);

      t2 = seconds();
      if (gb2_debug) {
	printf("dgemm2 time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
      tb2g2 += t2 - t1;
      t1 = t2;

      dgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	     sumdeijdh, &n, dreff, &n, &dblone, g, &m3);

      t2 = seconds();
      if (gb2_debug) {
	printf("dgemm3 time = %10.2f\n\n", t2 -t1);
	fflush(stdout);
      }
      tb2g3 += t2 - t1;
    }
  } else {
    if (context_PxQ >= 0) {
      transa = 'N';
      transb = 'N';
      pdgemm_(&transa, &transb, &n, &m3, &n, &dblone,
	      sumdeijdn, &one, &one, desc_NxN,
	      dreff, &one, &one, desc_Nx3N, &dblone,
	      sumdeijdg, &one, &one, desc_Nx3N);

      t2 = seconds();
      if (gb2_debug) {
	printf("pdgemm1 time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
      tb2g1 += t2 - t1;
      t1 = t2;

      transa = 'T';
      transb = 'N';
      pdgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	      dreff, &one, &one, desc_Nx3N,
	      sumdeijdg, &one, &one, desc_Nx3N, &dblone,
	      g, &one, &one, desc_3Nx3N);

      t2 = seconds();
      if (gb2_debug) {
	printf("pdgemm2 time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
      tb2g2 += t2 - t1;
      t1 = t2;

      transa = 'T';
      transb = 'N';
      pdgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	      sumdeijdh, &one, &one, desc_Nx3N,
	      dreff, &one, &one, desc_Nx3N, &dblone,
	      g, &one, &one, desc_3Nx3N);

      t2 = seconds();
      if (gb2_debug) {
	printf("pdgemm3 time = %10.2f\n\n", t2 -t1);
	fflush(stdout);
      }
      tb2g3 += t2 - t1;
    }
  }

#else

  /*
   * Here is the non-ScaLAPACK section of the code.  Note that the
   * sumdeijdg, sumdeijdh and dreff matrices are built in a transposed
   * manner relative to the manner in which they are built for
   * ScaLAPACK execution because the ScaLAPACK submatrices are built
   * in column major order.
   *
   * Add to the diagonal elements of sumdeijdn the d2(Ri)/d(Dij)d(Dik)
   * terms weighted by Ri and sumdeijda[i].
   */

  for (i = 0; i < n; i++) {
    in = (n * i);
    sumdeijdn[in + i] += 2.0 * sumdeijda[i] * reff[i];
  }

  t2 = seconds();
  if (gb2_debug) {
    printf("\nmatrix preparation time = %10.2f  n = %d  m3 = %d\n\n",
	   t2 - t1, n, m3);
    fflush(stdout);
  }
  tloop += t2 - t1;
  t1 = t2;

  /*
   * Perform the inner multiplications of equations 9, 10, 11, 12
   * and 18 in a manner analogous to the multiplications of
   * of equations 2, 3 and 4.  Also, multiply each element of each
   * row of the dreff array by the d2(Ri)/d(Dij)d(Dik) terms.
   *
   * Next perform the outer multiplication for equations 5, 6, 9, 10,
   * 11 and 12.
   *
   * Finally, perform the outer multiplication for equations 7 and 8.
   */

  transa = 'N';
  transb = 'N';
  dgemm_(&transa, &transb, &m3, &n, &n, &dblone,
	 dreff, &m3, sumdeijdn, &n, &dblone, sumdeijdg, &m3);

  t2 = seconds();
  if (gb2_debug) {
    printf("dgemm1 time = %10.2f\n\n", t2 - t1);
    fflush(stdout);
  }
  tb2g1 += t2 - t1;
  t1 = t2;

  transa = 'N';
  transb = 'T';
  dgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	 sumdeijdg, &m3, dreff, &m3, &dblone, g, &m3);

  t2 = seconds();
  if (gb2_debug) {
    printf("dgemm2 time = %10.2f\n\n", t2 - t1);
    fflush(stdout);
  }
  tb2g2 += t2 - t1;
  t1 = t2;

  transa = 'N';
  transb = 'T';
  dgemm_(&transa, &transb, &m3, &m3, &n, &dblone,
	 dreff, &m3, sumdeijdh, &m3, &dblone, g, &m3);

  t2 = seconds();
  if (gb2_debug) {
    printf("dgemm3 time = %10.2f\n\n", t2 -t1);
    fflush(stdout);
  }
  tb2g3 += t2 - t1;

#endif

  /* Free the static arrays if STATIC_ARRAYS is undefined. */

#ifndef STATIC_ARRAYS

#ifndef SCALAPACK

  free_vector( sumdeijdn, 0, nn );
  sumdeijdn = NULL;

  free_vector( sumdeijdg, 0, nn3 );
  sumdeijdg = NULL;

  free_vector( sumdeijdh, 0, nn3 );
  sumdeijdh = NULL;

  free_vector( dreff, 0, nn3 );
  dreff = NULL;

#else

  if (context_PxQ >= 0) {

    free_superdesc( super_3Nx3N );
    super_3Nx3N = NULL;

    free_vector( sumdeijdn, 0, super_NxN->size );
    sumdeijdn = NULL;

    free_superdesc( super_NxN );
    super_NxN = NULL;

    free_vector( sumdeijdg, 0, super_Nx3N->size );
    sumdeijdg = NULL;

    free_vector( sumdeijdh, 0, super_Nx3N->size );
    sumdeijdh = NULL;

    free_vector( dreff, 0, super_Nx3N->size );
    dreff = NULL;

    free_superdesc( super_Nx3N );
    super_Nx3N = NULL;

    free_vector( reductarr, 0, 3*n3 );
    reductarr = NULL;
  }

  free_vector( gg, 0, 3*n3 );
  gg = NULL;

  free_vector( sg, 0, n3 );
  sg = NULL;

  free_vector( sh, 0, n3 );
  sh = NULL;

  free_vector( sr, 0, n3 );
  sr = NULL;

  free_vector( sn, 0, n );
  sn = NULL;

#endif

  free_vector( reff, 0, n );
  reff = NULL;

  free_vector( sumdeijda, 0, n );
  sumdeijda = NULL;

  free_vector( psi, 0, n );
  psi = NULL;

#endif

  if (gb2_debug) {
    printf("exit egb2\n");
    fflush(stdout);
  }

  /* Return elec and evdw through the reference parameters eelt and enb. */

  *eelt = elec;
  *enb = evdw;

  return (epol);
}

/***********************************************************************
                            MME2()
************************************************************************/

/*
 * Compute the energy and first and second derivatives.
 *
 * Calling parameters are as follows:
 *
 * x - input: the atomic (x,y,z) coordinates
 * f - updated: the gradient vector
 * g - updated: the Hessian matrix
 * m - returned: the atomic masses
 * grad - updated: the gradient vector in context_1x1
 * descF_PxQ - input: the ScaLAPACK descriptor for vector f in context_PxQ
 * descF_1x1 - input: the ScaLAPACK descriptor for vector grad in context_1x1
 * descG_PxQ - input: the ScaLAPACK descriptor for matrix g in context_PxQ
 * context_PxQ - input: the distributed vector and matrix context for ScaLAPACK
 * context_1x1 - input: the non-distributed vector context for ScaLAPACK
 * context_Nx1 - input: the general context for ScaLAPACK
 * gridim - input: the ScaLAPACK process grid dimension (=1 for single task)
 * natom - returned: the number of atoms
 * iter - input: the iteration counter, which if negative selects the following:
 *        -1 print some energy values
 *        -3 call egb to deallocate static arrays, then deallocate grad
 *        -(any other value) normal execution
 */

REAL_T mme2(REAL_T *x, REAL_T *f, REAL_T *g, REAL_T *m, REAL_T *grad,
	    INT_T *descF_PxQ, INT_T *descF_1x1, INT_T *descG_PxQ,
	    INT_T *context_PxQ, INT_T *context_1x1, INT_T *context_Nx1,
	    INT_T *gridim, INT_T *natom, INT_T *iter)
{
  REAL_T ebh, eba, eth, eta, eph, epa, enb, eel, enb14, eel14, ecn, frms;
  REAL_T ene[20], t1, t2;
  extern REAL_T tcholesky;
  int i, j, k, n, i3, threadnum, numthreads;
  size_t in9, ii, jj, kk;

#ifdef SCALAPACK
  int myrow, mycol, nprow, npcol, mb, nb;
  size_t locpF_PxQ, locqF_PxQ, locpG_PxQ, locqG_PxQ;
  int zero=0, one=1;
  REAL_T *ptr, reductarr[12];
  char ctrl;
#endif

  /* Enforce 3D (and disallow 4D) for second derivatives. */

  t1 = seconds();
  dim = 3;
  n = 3*prm->Natom;

  /*
   * If the iteration count equals -3, call egb2 to deallocate the
   * static arrays, then return; otherwise, simply return.
   */

  if (*iter == -3) {
    egb2(lpairs, upairs, pairlist, x, f, g, grad, prm->Fs,
	 prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel,
	 *context_Nx1, *context_1x1, *context_PxQ,
	 descF_PxQ, descF_1x1, descG_PxQ,
	 *gridim, 1);
    return (0.0);
  }

  /*
   * If the iteration count is zero or one, print the energy header
   * for task 0 only.
   */

  if ( (*iter == 0 || *iter == 1) && mytaskid == 0 ) {
    printf
      ("    iter    Total    bad         vdW     elect.     cons.   genBorn      frms\n");
    fflush(stdout);
  }

  /*
   * If the iteration count is zero or one, initialize the hi-res
   * timer variables.
   */

  if (*iter == 0 || *iter == 1) {
    tnonb2 = tpair = tbond2 = tangl2 = tphi2 = tborn2 = tcons2 = 0.0;
    tmme2 = tb2g1 = tb2g2 = tb2g3 = tloop = 0.0;
    tlevel = tdiag = tmmetwo = tcholesky = tnewton = 0.0;
  }

  /*
   * Write the checkpoint file every nchk2 iteration if the chknm
   * variable is non-NULL, but for task 0 only.
   */

  if (chknm != NULL && (*iter > 0 && *iter % nchk2 == 0) && mytaskid == 0) {
    checkpoint(chknm, prm->Natom, x, *iter);
  }

  /*
   * Build the pair list if it hasn't already been built; rebuild
   * it every nsnb iteration.  This pair list will be used for
   * first derivative calculations (for line minimization within
   * Newton-Raphson) and for non-14 nonbonded interactions (i.e.,
   * when nbond2 is called instead of egb2).
   *
   * If SCALAPACK or MPI is defined, build another pair list that
   * is dedicated for use by egb2.  For ScaLAPACK this pair list
   * will conform to the block cyclic process grid.  For MPI this
   * pair list will be fully populated for each task because the
   * Born second derivatives are not parallelized under MPI.
  */

  if (nb_pairs < 0 || (*iter > 0 && *iter % nsnb == 0)) {
    t1 = seconds();
    nb_pairs = nblist(lpairs, upairs, pairlist, x, *context_PxQ, 1);
    t2 = seconds();
    tpair += t2 - t1;
    t1 = t2;
    /*		printf( "nb_pairs = %d\n", nb_pairs );          */
  }

#if defined(SCALAPACK) || defined(MPI)

  if (nb_pairs2 < 0 || (*iter > 0 && *iter % nsnb == 0)) {
    t1 = seconds();
    nb_pairs2 = nblist(lpairs2, upairs2, pairlist2, x, *context_PxQ, -1);
    t2 = seconds();
    tpair += t2 - t1;
    t1 = t2;
    /*		printf( "nb_pairs = %d\n", nb_pairs );          */
  }

#endif

  /* Initialize the gradient and Hessian arrays. */

#ifdef SCALAPACK

  /*
   * Get the number of rows and columns on the row cyclic (PxQ) process grid,
   * as well as this task's row and column on the grid.
   */

  blacs_gridinfo_(context_PxQ, &nprow, &npcol, &myrow, &mycol);

  /*
   * Only processes that are active on the PxQ grid initialize
   * the gradient vector and Hessian matrix.
   */

  if (*context_PxQ >= 0) {

    /*
     * The numroc_ function is used to calculate the number of vector or
     * matrix elements that are distributed across a PxQ processor grid
     * for the gradient vector or the Hessian matrix.  Only processes
     * in grid column zero initialize the gradient vector.
     */

    mb = descF_PxQ[MB_];
    nb = descF_PxQ[NB_];

    locpF_PxQ = numroc_(&n, &mb, &myrow, &zero, &nprow);
    locqF_PxQ = numroc_(&one, &nb, &mycol, &zero, &npcol);

    if (mycol == 0) {
      for (i = 0; i < locpF_PxQ*locqF_PxQ; i++) {
	f[i] = 0.0;
      }
    }

    /* Get the block sizes for the Hessian matrix. */

    mb = descG_PxQ[MB_];
    nb = descG_PxQ[NB_];

    locpG_PxQ = numroc_(&n, &mb, &myrow, &zero, &nprow);
    locqG_PxQ = numroc_(&n, &nb, &mycol, &zero, &npcol);

    for (ii = 0; ii < locpG_PxQ*locqG_PxQ; ii++) {
      g[ii] = 0.0;
    }
  }

#else

#pragma omp parallel private (i, j, i3, ii, jj, in9, threadnum, numthreads)
  {

    /*
     * Get the thread number and the number of threads for multi-threaded
     * execution.
     */

#ifdef OPENMP
    threadnum = omp_get_thread_num();
    numthreads = omp_get_num_threads();
#endif

    /*
     * Initialize the gradient and Hessian arrays inside of the
     * OpenMP parallel region.
     *
     * The "first touch" memory allocation strategy will locate
     * the elements of these arrays that are initialized by
     * a particular CPU in memory that is local to that CPU.
     *
     * Explicitly assign threads to loop indices for the following loop,
     * in a manner equivalent to (static, N) scheduling with OpenMP, and
     * identical to the manner in which threads are assigned in nblist.
     *
     * Because no computation follows the following 'for' loop within
     * this parallel region, the implied barrier at the end of the
     * parallel region enforces the necessary thread synchronization.
     */

#ifndef ROW_CYCLIC
#pragma omp for
#endif

    for (i = 0; i < prm->Natom; i++) {

#if defined(ROW_CYCLIC) && defined(OPENMP)
      if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue;
#endif

      i3 = 3 * i;
      ii = (size_t)i;
      in9 = (9 * prm->Natom * ii);

      f[i3    ] = f[i3 + 1] = f[i3 + 2] = 0.0;

      for (j = 0; j < 9 * prm->Natom; j++) {
	jj = (size_t)j;
	g[in9 + jj] = 0.0;
      }
    }
  }

#endif

  t2 = seconds();
  tmme2 += t2 - t1;
  t1 = t2;

  ebh = ebond2(prm->Nbonh, prm->BondHAt1, prm->BondHAt2,
	       prm->BondHNum, prm->Rk, prm->Req, x, f, g,
	       *context_PxQ, descF_PxQ, descG_PxQ);
  eba = ebond2(prm->Mbona, prm->BondAt1, prm->BondAt2,
	       prm->BondNum, prm->Rk, prm->Req, x, f, g,
	       *context_PxQ, descF_PxQ, descG_PxQ);
  ene[3] = ebh + eba;
  t2 = seconds();
  tbond2 += t2 - t1;
  t1 = t2;

  eth = eangl2(prm->Ntheth, prm->AngleHAt1, prm->AngleHAt2,
	       prm->AngleHAt3, prm->AngleHNum,
	       prm->Tk, prm->Teq, x, f, g,
	       *context_PxQ, descF_PxQ, descG_PxQ);
  eta = eangl2(prm->Ntheta, prm->AngleAt1, prm->AngleAt2,
	       prm->AngleAt3, prm->AngleNum,
	       prm->Tk, prm->Teq, x, f, g,
	       *context_PxQ, descF_PxQ, descG_PxQ);
  ene[4] = eth + eta;
  t2 = seconds();
  tangl2 += t2 - t1;
  t1 = t2;

  t1 = seconds();
  eph = ephi2(prm->Nphih, prm->DihHAt1, prm->DihHAt2,
	      prm->DihHAt3, prm->DihHAt4, prm->DihHNum,
	      prm->Pk, prm->Pn, prm->Phase, x, f, g,
	      *context_PxQ, descF_PxQ, descG_PxQ);
  epa = ephi2(prm->Mphia, prm->DihAt1, prm->DihAt2,
	      prm->DihAt3, prm->DihAt4, prm->DihNum,
	      prm->Pk, prm->Pn, prm->Phase, x, f, g,
	      *context_PxQ, descF_PxQ, descG_PxQ);
  ene[5] = eph + epa;
  t2 = seconds();
  tphi2 += t2 - t1;
  t1 = t2;

  ene[6] = 0.0;        /*  hbond term not in Amber-94 force field */

  nbond2(lpairs, prm->N14pairs, N14pearlist, 1, x, f, g, &enb14, &eel14,
	 scnb, scee, *context_PxQ, descF_PxQ, descG_PxQ);
  t2 = seconds();
  tnonb2 += t2 - t1;
  t1 = t2;

  if (e_debug) {
    EXPR("%9.3f", enb14);
    EXPR("%9.3f", eel14);
  }
  ene[7] = enb14;
  ene[8] = eel14;

  if (nconstrained) {
    ecn = econs2(x, f, g, *context_PxQ, descF_PxQ, descG_PxQ);
    t2 = seconds();
    tcons2 += t2 - t1;
    t1 = t2;
  } else
    ecn = 0.0;
  ene[9] = ecn;

  /* Calculate the generalized Born energy and derivatives. */

  if (gb) {

#if defined(SCALAPACK) || defined(MPI)
    ene[10] = egb2(lpairs2, upairs2, pairlist2, x, f, g, grad, prm->Fs,
		   prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel,
		   *context_Nx1, *context_1x1, *context_PxQ,
		   descF_PxQ, descF_1x1, descG_PxQ,
		   *gridim, 0);
#else
    ene[10] = egb2(lpairs, upairs, pairlist, x, f, g, grad, prm->Fs,
		   prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel,
		   *context_Nx1, *context_1x1, *context_PxQ,
		   descF_PxQ, descF_1x1, descG_PxQ,
		   *gridim, 0);
#endif

    t2 = seconds();
    tborn2 += t2 - t1;
    t1 = t2;
    ene[1] = enb;
    ene[2] = eel;
  } else {

#if defined(SCALAPACK) || defined(MPI)
    nbond2(lpairs2, upairs2, pairlist2, 0, x, f, g, &enb, &eel, 1.0, 1.0,
	   *context_PxQ, descF_PxQ, descG_PxQ);
#else
    nbond2(lpairs, upairs, pairlist, 0, x, f, g, &enb, &eel, 1.0, 1.0,
	   *context_PxQ, descF_PxQ, descG_PxQ);
#endif

    t2 = seconds();
    tnonb2 += t2 - t1;
    t1 = t2;
    ene[1] = enb;
    ene[2] = eel;
    ene[10] = 0.0;
  }

  /*
   * Zero out the frozen forces.  Use the local leading dimension of
   * the distributed Hessian matrix for the ScaLAPACK implementation.
   */

#pragma omp parallel private (k, j, kk, jj, threadnum, numthreads)
  {
    /*
     * Get the thread number and the number of threads for multi-threaded
     * execution.  Use default values of 0 and 1, respectively, for
     * single-threaded execution.
     */

#ifdef SCALAPACK
    threadnum = myrow;
    numthreads = nprow;
#elif defined(OPENMP)
    threadnum = omp_get_thread_num();
    numthreads = omp_get_num_threads();
#else
    threadnum = 0;
    numthreads = 1;
#endif

    /*
     * Use BLOCKSIZE to assign the loop index i to OpenMP threads
     * or ScaLAPACK process rows.  This approach is identical to
     * the approach used in the egb2 function.
     */

#pragma omp for

    for (k = 0; k < prm->Natom; k++) {

#if defined(OPENMP) || defined(SCALAPACK)
      if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue;
#endif

      kk = (size_t)k;

      if (frozen[k]) {

#ifndef SCALAPACK
	f[3 * k    ] = 0.0;
	f[3 * k + 1] = 0.0;
	f[3 * k + 2] = 0.0;
#else
	/*
	 * Only processes in grid column 0 access the gradient vector.
	 * The correct grid row was selected by the above continue statement,
	 * which selects three rows at a time because the block size of
	 * the gradient vector is three times BLOCKSIZE.
	 */

	if (mycol == 0) {
	  ptr = ptr1d(f, descF_PxQ, 3*k+0);
	  if (ptr != NULL) *ptr = 0.0;
	  ptr = ptr1d(f, descF_PxQ, 3*k+1);
	  if (ptr != NULL) *ptr = 0.0;
	  ptr = ptr1d(f, descF_PxQ, 3*k+2);
	  if (ptr != NULL) *ptr = 0.0;
	}
#endif
      }
    }

    /* Zero the [k,j] elements of the Hessian matrix. */

    for (k = 0; k < prm->Natom; k++) {

      if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue;
      kk = (size_t)k;

      if (frozen[k]) {

	for (j = 0; j < prm->Natom; j++) {
	  jj = (size_t)j;
	
#ifndef SCALAPACK
	  g[3*jj + 0 + n*(3*kk + 0)] = 0.0;
	  g[3*jj + 1 + n*(3*kk + 1)] = 0.0;
	  g[3*jj + 2 + n*(3*kk + 2)] = 0.0;
#else
	  /*
	   * The correct grid row was selected by the above continue statement.
	   * Determine the correct processor grid column for Hessian matrix
	   * access using the ptr2d function.
	   */

	    ptr = ptr2d(g, descG_PxQ, 3*kk+0, 3*jj+0);
	    if (ptr != NULL) *ptr = 0.0;
	    ptr = ptr2d(g, descG_PxQ, 3*kk+1, 3*jj+1);
	    if (ptr != NULL) *ptr = 0.0;
	    ptr = ptr2d(g, descG_PxQ, 3*kk+2, 3*jj+2);
	    if (ptr != NULL) *ptr = 0.0;
#endif
	}
      }
    }

#ifdef SCALAPACK

    /* This step is unnecessary for a square process grid. */

    threadnum = mycol;
    numthreads = npcol;

#endif

    /* Zero the [j,k] elements of the Hessian matrix. */

    for (k = 0; k < prm->Natom; k++) {

      if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue;
      kk = (size_t)k;

      if (frozen[k]) {

	for (j = 0; j < prm->Natom; j++) {
	  jj = (size_t)j;
	
#ifndef SCALAPACK
	  g[3*kk + 0 + n*(3*jj + 0)] = 0.0;
	  g[3*kk + 1 + n*(3*jj + 1)] = 0.0;
	  g[3*kk + 2 + n*(3*jj + 2)] = 0.0;
#else
	  /*
	   * The correct grid column was selected by the above continue statement.
	   * Determine the correct processor grid row for Hessian matrix
	   * access using the ptr2d function.
	   */

	  ptr = ptr2d(g, descG_PxQ, 3*jj+0, 3*kk+0);
	  if (ptr != NULL) *ptr = 0.0;
	  ptr = ptr2d(g, descG_PxQ, 3*jj+1, 3*kk+1);
	  if (ptr != NULL) *ptr = 0.0;
	  ptr = ptr2d(g, descG_PxQ, 3*jj+2, 3*kk+2);
	  if (ptr != NULL) *ptr = 0.0;
#endif
	}
      }
    }
  }

  /*
   * Calculate the RMS error of the gradient vector.
   *
   * For non-ScaLAPACK execution, set some variables to values that
   * will select the proper sections of code below.
   */

#ifndef SCALAPACK

  *gridim = 1;
  *context_PxQ = 0;

#endif

  /*
   * Some ScaLAPACK functions appear to quit unexpectedly
   * for large matrices on a 1x1 process grid, e.g. the pdgemm_
   * function in newton.c, so bypass the ScaLAPACK pdgemr2d_
   * function and calculate the RMS error using f instead of grad.
   *
   * The correct test is (gridim == 1), not (nprow == 1 && npcol == 1)
   * since processes that aren't on the 1x1 grid have nprow == npcol == -1,
   * which would direct control to pdgemr2d_ (below) that would hang because
   * it would not be called from all processes, specifically not from
   * the process that is on the 1x1 grid and has nprow == npcol == 1.
   *
   * Non-ScaLAPACK execution will select this section of code because
   * gridim and context_PxQ were set to 1 and 0, respectively, above.
   */

  if (*gridim == 1) {
    if (*context_PxQ >= 0) {
      frms = 0.0;
      for (i = 0; i < n; i++)
	frms += f[i] * f[i];
      frms = sqrt(frms / ((REAL_T)n));
    }
  } else {

#ifdef SCALAPACK

    /*
     * For ScaLAPACK, begin by gathering the distributed gradient vector g
     * into the non-distributed gradient vector grad.  The function pdgemr2d_
     * will hang unless called from all processes.
     */

    pdgemr2d_(&n, &one,
	      f, &one, &one, descF_PxQ,
	      grad, &one, &one, descF_1x1, context_Nx1);

    /*
     * Calculate the RMS error for the gradient vector grad in context_1x1
     * where grad exists.
     */

    if (*context_1x1 >= 0) {
      frms = 0.0;
      for (i = 0; i < n; i++) {
	frms += grad[i] * grad[i];
      }
      frms = sqrt(frms / ((REAL_T)n));
    }

#endif

  }

  /*
   * If SCALAPACK is defined perform a reduction of the ene array,
   * but only the Van der Waals, Coulombic, and Born energies have
   * been computed in parallel by egb2, not nbond2.
   */

#ifdef SCALAPACK
    if (gb) {
      MPI_Allreduce(ene, reductarr, 12, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      ene[1] = reductarr[1];
      ene[2] = reductarr[2];
      ene[10] = reductarr[10];
    }
#endif

    /* Calculate the total energy. */

    ene[0] = 0.0;
    for (k = 1; k <= 10; k++) {
      ene[0] += ene[k];
    }

  /*
   * Print the energies and rms gradient but only for positive values
   * of the iteration counter, and if SCALAPACK is defined, only for
   * context_1x1 where the gradient vector grad is valid.
   */

#ifdef SCALAPACK
  if (*context_1x1 >= 0)
#elif defined(MPI)
  if (mytaskid == 0)
#endif
    {
      if (*iter > -1 && (*iter == 1 || *iter % ntpr == 0)) {
	printf("ff:%4d %9.2f %9.2f %9.2f %9.2f %9.2f %9.2f %9.2e\n",
	       *iter, ene[0] + esurf, ene[3] + ene[4] + ene[5],
	       ene[1] + ene[7], ene[2] + ene[8], ene[9], ene[10], frms);
	fflush(stdout);
      }
    }

  /*
   * Load the atomic masses into the array m because the calling
   * function (newton) needs these masses to "level shift" to high
   * frequencies the eigenvalues associated with overall translation
   * and rotation of the molecule.
   *
   * Load the number of atoms as well.
   */

  for (i = 0; i < prm->Natom; i++) {
    m[i] = prm->Masses[i];
  }

  *natom = prm->Natom;

  /* A value of -1 for the iteration counter is reserved for printing. */

  if(*iter == -1){
    printf( "Inside mme2:\n" );
    printf( "     bond:  %15.9f\n", ene[3] );
    printf( "    angle:  %15.9f\n", ene[4] );
    printf( " dihedral:  %15.9f\n", ene[5] );
    printf( "    enb14:  %15.9f\n", ene[7] );
    printf( "    eel14:  %15.9f\n", ene[8] );
    printf( "      enb:  %15.9f\n", ene[1] );
    printf( "      eel:  %15.9f\n", ene[2] );
    printf( "      egb:  %15.9f\n", ene[10] );
    printf( "    econs:  %15.9f\n", ene[9] );
    printf( "    esurf:  %15.9f\n", ene[11] );
    printf( "    Total:  %15.9f\n", ene[0] );
  }

  t2 = seconds();
  tmme2 += t2 - t1;

  return (ene[0] + esurf);
}

/***********************************************************************
                            MME2_TIMER()
************************************************************************/

int mme2_timer( void )
{
  /* Use the maximum time from all ScaLAPACK processes. */

#ifdef SCALAPACK

  REAL_T timarr[17], reductarr[17];

  timarr[0] = tcons2;
  timarr[1] = tbond2;
  timarr[2] = tangl2;
  timarr[3] = tphi2;
  timarr[4] = tpair;
  timarr[5] = tnonb2;
  timarr[6] = tborn2;
  timarr[7] = tmme2;
  timarr[8] = tb2g1;
  timarr[9] = tb2g2;
  timarr[10] = tb2g3;
  timarr[11] = tmmetwo;
  timarr[12] = tlevel;
  timarr[13] = tdiag;
  timarr[14] = tcholesky;
  timarr[15] = tnewton;
  timarr[16] = tloop;

  MPI_Allreduce(timarr, reductarr, 17, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

  tcons2 = reductarr[0];
  tbond2 = reductarr[1];
  tangl2 = reductarr[2];
  tphi2 = reductarr[3];
  tpair = reductarr[4];
  tnonb2 = reductarr[5];
  tborn2 = reductarr[6];
  tmme2 = reductarr[7];
  tb2g1 = reductarr[8];
  tb2g2 = reductarr[9];
  tb2g3 = reductarr[10];
  tmmetwo = reductarr[11];
  tlevel = reductarr[12];
  tdiag = reductarr[13];
  tcholesky = reductarr[14];
  tnewton = reductarr[15];
  tloop = reductarr[16];

#endif

  if (mytaskid == 0) {
    printf("\nSecond derivative timing summary:\n");
    printf("   constraints %10.2f\n", tcons2);
    printf("   bonds       %10.2f\n", tbond2);
    printf("   angles      %10.2f\n", tangl2);
    printf("   torsions    %10.2f\n", tphi2);
    printf("   pairlist    %10.2f\n", tpair);
    printf("   nonbonds    %10.2f\n", tnonb2);
    printf("   gen. Born   %10.2f\n", tborn2);
    printf("   mme2        %10.2f\n", tmme2);
    printf("   Total deriv %10.2f\n", tbond2 + tangl2 + tphi2 +
	   tpair + tnonb2 + tborn2 + tcons2 + tmme2);

    printf("\nBorn dgemm timing summary:\n");
    printf("   dgemm1      %10.2f\n", tb2g1);
    printf("   dgemm2      %10.2f\n", tb2g2);
    printf("   dgemm3      %10.2f\n", tb2g3);
    printf("   total dgemm %10.2f\n", tb2g1 + tb2g2 + tb2g3);
    printf("   Born-dgemm  %10.2f\n", tloop);

    printf("\nOverall timing summary:\n");
    printf("   derivatives %10.2f\n", tmmetwo);
    printf("   level       %10.2f\n", tlevel);
    printf("   dsyev       %10.2f\n", tdiag);
    printf("   dposv       %10.2f\n", tcholesky);
    printf("   newton      %10.2f\n", tnewton);
    printf("   Total       %10.2f\n\n",
	   tmmetwo + tlevel + tdiag + tcholesky + tnewton);
    fflush(stdout);
  }

  return (0);
}