/* * sff2.c: simple force field: implement routines to read a "prmtop" * file from AMBER, and calculate the energy. Implements only * some of AMBER's functionality: bonds, angles, dihedrals, and * nonbonded interactions with a distance-dependent dielectric. * Does not (yet) include support for period systems. * * Main interface is through routines "mme2" and "mme_init". * * Second derivatives, in particular the egb2 function, were * implemented by Russ Brown (russ.brown@sun.com). * * Changes to difang, eangl2 and ephi2 by David A. Case (case@scripps.edu). * * For the OpenMP implementation, only the egb2 function is parallelized * so the econs2, ebond2, eangl2, ephi2 and nbond2 functions execute in * single-threaded mode. * * For the ScaLAPACK implementation, only the egb2 function is parallelized * so the econs2, ebond2, eangl2, ephi2 and nbond2 functions all compute * each gradient vector and Hessian matrix element. However, because the * gradient and Hessian are distributed, the ptr1d, ptr1f, ptr2d and ptr2f * functions are used to write the subvector and submatrix elements by only * the process that owns the subvector or submatrix. The SUPERD_T type is * used to facilitate argument transmission to the ptr2f function. */ #ifdef SCALAPACK typedef struct superd { INT_T dtype, ctxt, m, n, mb, nb, rsrc, csrc, lld, row, col; size_t size; INT_T *divmb, *divnb, *modmb, *modnb, *divrw, *divcl, *modrw, *modcl; } SUPERD_T; #endif int numroc_(int *, int *, int *, int *, int *); void descinit_(int *, int *, int *, int *, int *, int *, int *, int *, int *, int *); void pdgemr2d_(INT_T *, INT_T *, REAL_T *, INT_T *, INT_T *, INT_T *, REAL_T *, INT_T *, INT_T *, INT_T *, INT_T *); void pdgemm_(char *, char *, INT_T *, INT_T *, INT_T *, REAL_T *, REAL_T *, INT_T *, INT_T *, INT_T *, REAL_T *, INT_T *, INT_T *, INT_T *, REAL_T *, REAL_T *, INT_T *, INT_T *, INT_T *); void dgemm_( char *, char *, INT_T *, INT_T*, INT_T*, REAL_T *, REAL_T *, INT_T *, REAL_T *, INT_T *, REAL_T *, REAL_T*, INT_T * ); /*********************************************************************** SUPERDESC() ************************************************************************/ /* Create and initialize a superdescriptor. */ #ifdef SCALAPACK SUPERD_T *superdesc( INT_T *desc ) { SUPERD_T *sd; int nprow, npcol, myrow, mycol, i, zero=0; size_t locp, locq; /* Malloc a superdescriptor and copy values from the descriptor. */ if ( ( sd = (SUPERD_T*)malloc( sizeof(SUPERD_T) ) ) == NULL ) { printf("Error allocating superdescriptor in superdesc!\n"); fflush(stdout); mpierror(-1); } sd->dtype = desc[DTYPE_]; sd->ctxt = desc[CTXT_]; sd->m = desc[M_]; sd->n = desc[N_]; sd->mb = desc[MB_]; sd->nb = desc[NB_]; sd->rsrc = desc[RSRC_]; sd->csrc = desc[CSRC_]; sd->lld = desc[LLD_]; if (gb2_debug) { printf("superdesc: context = %d\n", sd->ctxt); fflush(stdout); } /* Initialize the division and modulus array pointers to NULL. */ sd->divmb = sd->modmb = sd->divrw = sd->modrw = NULL; sd->divnb = sd->modnb = sd->divcl = sd->modcl = NULL; /* Return NULL if the context is invalid. */ if (sd->lld < 0) return (NULL); /* * Calculate the maximum size of the submatrix array. * Store the row and column for this process. */ blacs_gridinfo_( &(sd->ctxt), &nprow, &npcol, &myrow, &mycol); locp = numroc_( &(sd->m), &(sd->mb), &myrow, &zero, &nprow ); locq = numroc_( &(sd->n), &(sd->nb), &mycol, &zero, &npcol ); sd->size = locp * locq; sd->row = myrow; sd->col = mycol; /* Return NULL if the size or the matrix dimensions are invalid. */ if (sd->size < 0 || sd->m < 0 || sd->n < 0) return (NULL); /* Create and fill the division and modulus arrays. */ sd->divmb = ivector( 0, sd->m ); sd->modmb = ivector( 0, sd->m ); sd->divrw = ivector( 0, sd->m ); sd->modrw = ivector( 0, sd->m ); sd->divnb = ivector( 0, sd->n ); sd->modnb = ivector( 0, sd->n ); sd->divcl = ivector( 0, sd->n ); sd->modcl = ivector( 0, sd->n ); for (i = 0; i < sd->m; i++) { (sd->divmb)[i] = i/(sd->mb); (sd->modmb)[i] = i%(sd->mb); (sd->divrw)[i] = i/nprow; (sd->modrw)[i] = i%nprow; } for (i = 0; i < sd->n; i++) { (sd->divnb)[i] = i/(sd->nb); (sd->modnb)[i] = i%(sd->nb); (sd->divcl)[i] = i/npcol; (sd->modcl)[i] = i%npcol; } /* Return the superdescriptor. */ return (sd); } #endif /*********************************************************************** FREE_SUPERDESC() ************************************************************************/ #ifdef SCALAPACK /* Free a superdescriptor including all of its dynamic arrays. */ void free_superdesc( SUPERD_T *sd ) { if (sd == NULL) return; if (sd->divmb != NULL) free_ivector( sd->divmb, 0, sd->m ); if (sd->modmb != NULL) free_ivector( sd->modmb, 0, sd->m ); if (sd->divrw != NULL) free_ivector( sd->divrw, 0, sd->m ); if (sd->modrw != NULL) free_ivector( sd->modrw, 0, sd->m ); if (sd->divnb != NULL) free_ivector( sd->divnb, 0, sd->n ); if (sd->modnb != NULL) free_ivector( sd->modnb, 0, sd->n ); if (sd->divcl != NULL) free_ivector( sd->divcl, 0, sd->n ); if (sd->modcl != NULL) free_ivector( sd->modcl, 0, sd->n ); free(sd); } #endif /*********************************************************************** ADR1D() ************************************************************************/ /* Perform the 1D address mapping from the global to the local address. */ #ifdef SCALAPACK static size_t adr1d(int i, int mb, int nprow) { return ( (i/(nprow*mb))*mb + i%mb ); } #endif /*********************************************************************** ADR2D() ************************************************************************/ #ifdef SCALAPACK /* Perform the 2D address mapping from the global to the local address. */ static size_t adr2d(int i, int j, int mb, int nb, int nprow, int npcol, int lld) { size_t lbi, lbj, bai, baj; /* Calculate the local block address (lbi, lbj) */ lbi = i/(nprow*mb); lbj = j/(npcol*nb); /* Calculate the offset (bai, baj) within the local block. */ bai = i%mb; baj = j%nb; /* Calculate the local array address for column-major addressing. */ return ( lld*(nb*lbj + baj) + mb*lbi + bai ); } #endif /*********************************************************************** PTR1D() ************************************************************************/ /* Return a pointer to a submatrix element or NULL if undefined. */ #ifdef SCALAPACK REAL_T *ptr1d( REAL_T *a, int *desc, int i ) { int nprow, npcol, myrow, mycol, locp, locq, zero=0; size_t adr; /* Return NULL if the matrix isn't active in the context. */ if (desc[CTXT_] < 0) return (NULL); /* Return NULL if the global index is out of range. */ if (i > desc[M_]) { printf("Global vector address overflow, i=%d imax=%d context=%d\n", i, desc[M_], desc[CTXT_]); fflush(stdout); return (NULL); } /* Get the process grid information. */ blacs_gridinfo_(&desc[CTXT_], &nprow, &npcol, &myrow, &mycol); /* * Return NULL if the submatrix doesn't exist in this process. * Perform the column test first because the row test is likely * to be true for egb2 because a row test is performed at the * level of the loop index. */ if ( mycol != 0 || !myroc(i, desc[MB_], nprow, myrow) ) return (NULL); /* Calculate the submatrix address and return NULL if out of range. */ locp = numroc_(&desc[M_], &desc[MB_], &myrow, &zero, &nprow); locq = numroc_(&desc[N_], &desc[NB_], &mycol, &zero, &npcol); adr = adr1d(i, desc[MB_], nprow); if ( adr > (size_t)(locp*locq) ) { printf("Local subvector address overflow, adr=%lld adrmax=%lld context=%d\n", adr, (size_t)(locp*locq), desc[CTXT_]); fflush(stdout); return (NULL); } /* If the vector array isn't allocated return NULL. */ if (a == NULL) { printf("Local subvector is not allocated, context=%d\n", desc[CTXT_]); fflush(stdout); return (NULL); } /* Otherwise, return the address of the matrix element. */ return (&a[adr]); } #endif /*********************************************************************** PTR1F() ************************************************************************/ /* Return a pointer to a submatrix element or NULL if undefined. */ #ifdef SCALAPACK REAL_T *ptr1f( REAL_T *a, SUPERD_T *sd, int i ) { int lbi, bai, adr; /* Return NULL if the superdescriptor is NULL. */ if (sd == NULL) return (NULL); /* Return NULL if the matrix isn't active in the context. */ if (sd->ctxt < 0) return (NULL); /* Return NULL if the global index is out of range. */ if (i < 0 || i > sd->m) { printf("Global vector address overflow, i=%d imax=%lld context=%d\n", i, sd->m, sd->ctxt); fflush(stdout); return (NULL); } /* * Return NULL if the submatrix doesn't exist in this process. * Perform the column test first because the row test is likely * to be true for egb2 because a row test is performed at the * level of the loop index. */ lbi = (sd->divmb)[i]; if ( sd->col != 0 || (sd->modrw)[lbi] != sd->row ) return (NULL); /* Calculate the submatrix address and return NULL if out of range. */ lbi = (sd->divrw)[lbi]; bai = (sd->modmb)[i]; adr = (sd->mb) * lbi + bai; if ( adr < 0 || adr > sd->size ) { printf("Local subvector address overflow, adr=%lld adrmax=%lld context=%d\n", adr, sd->size, sd->ctxt); fflush(stdout); return (NULL); } /* If the vector array isn't allocated return NULL. */ if (a == NULL) { printf("Local subvector is not allocated, context=%d\n", sd->ctxt); fflush(stdout); return (NULL); } /* Otherwise, return the address of the matrix element. */ return (&a[adr]); } #endif /*********************************************************************** PTR2D() ************************************************************************/ /* Return a pointer to a submatrix element or NULL if undefined. */ #ifdef SCALAPACK REAL_T *ptr2d( REAL_T *a, int *desc, int i, int j ) { int nprow, npcol, myrow, mycol, locp, locq, zero=0; size_t adr; /* Return NULL if the matrix isn't active in the context. */ if (desc[CTXT_] < 0) return (NULL); /* Return NULL if the global indices are out of range. */ if (i > desc[M_]) { printf("Global matrix address overflow, i=%d imax=%lld context=%d\n", i, desc[M_], desc[CTXT_]); fflush(stdout); return (NULL); } if (j > desc[N_]) { printf("Global matrix address overflow, j=%d jmax=%lld context=%d\n", j, desc[N_], desc[CTXT_]); fflush(stdout); return (NULL); } /* Get the process grid information. */ blacs_gridinfo_(&desc[CTXT_], &nprow, &npcol, &myrow, &mycol); /* Return NULL if the submatrix doesn't exist in this process. */ if ( !myroc(i + desc[RSRC_], desc[MB_], nprow, myrow) || !myroc(j + desc[CSRC_], desc[NB_], npcol, mycol) ) return (NULL); /* Calculate the submatrix address and return NULL if out of range. */ locp = numroc_(&desc[M_], &desc[MB_], &myrow, &zero, &nprow); locq = numroc_(&desc[N_], &desc[NB_], &mycol, &zero, &npcol); adr = adr2d(i, j, desc[MB_], desc[NB_], nprow, npcol, locp); if ( adr > (size_t)(locp*locq) ) { printf("Local submatrix address overflow, adr=%lld adrmax=%lld context=%d\n", adr, (size_t)(locp*locq), desc[CTXT_]); fflush(stdout); return (NULL); } /* If the matrix array isn't allocated return NULL. */ if (a == NULL) { printf("Local submatrix is not allocated, context=%d\n", desc[CTXT_]); fflush(stdout); return (NULL); } /* Otherwise, return the address of the matrix element. */ return (&a[adr]); } #endif /*********************************************************************** PTR2F() ************************************************************************/ /* * Return a pointer to a submatrix element or NULL if undefined. * Always print an error message for 'hard' errors, such as address * overflows, but only print an error message for normal NULL, * such as 'matrix isn't active in context' if gb2_debug != 0. */ #ifdef SCALAPACK REAL_T *ptr2f( REAL_T *a, SUPERD_T *sd, int i, int j, int *info ) { size_t lbi, lbj, bai, baj, adr; /* Return NULL if the superdescriptor is NULL. */ if (sd == NULL) { if (gb2_debug) { printf("ptr2f: superdescriptor is NULL\n"); fflush(stdout); } *info = -1; return (NULL); } /* Return NULL if the matrix isn't active in the context. */ if (sd->ctxt < 0) { if (gb2_debug) { printf("ptr2f: matrix isn't active in context %d\n", sd->ctxt); fflush(stdout); } *info = -2; return (NULL); } /* Return NULL if the global indices are out of range. */ if (i < 0) { printf("ptr2f: matrix row address underflow, i=%d imax=%d context=%d\n", i, sd->m, sd->ctxt); fflush(stdout); *info = -3; return (NULL); } if (i > sd->m) { printf("ptr2f: matrix row address overflow, i=%d imax=%d context=%d\n", i, sd->m, sd->ctxt); fflush(stdout); *info = -4; return (NULL); } if (j < 0) { printf("ptr2f: matrix column address underflow, j=%d jmax=%d context=%d\n", j, sd->n, sd->ctxt); fflush(stdout); *info = -5; return (NULL); } if (j > sd->n) { printf("ptr2f: matrix column address overflow, j=%d jmax=%d context=%d\n", j, sd->n, sd->ctxt); fflush(stdout); *info = -6; return (NULL); } /* * Return NULL if the submatrix doesn't exist in this process. * Perform the column test first because the row test is likely * to be true for egb2 because a row test is performed at the * level of the loop index. * * The test is (csrc+j/nb)%npcol != mycol || (rsrc+i/mb)%nprow != myrow */ lbi = (sd->divmb)[i]; lbj = (sd->divnb)[j]; if ( (sd->modcl)[sd->csrc + lbj] != sd->col || (sd->modrw)[sd->rsrc + lbi] != sd->row ) { if (gb2_debug) { printf("ptr2f: submatrix doesn't exist in process, row=%d col=%d\n", sd->row, sd->col); fflush(stdout); } *info = -7; return (NULL); } /* Calculate the local block address (lbi=i/mb/nprow, lbj=j/nb/npcol) */ lbi = (sd->divrw)[lbi]; lbj = (sd->divcl)[lbj]; /* Calculate the offset (bai=i%mb, baj=j%nb) within the local block. */ bai = (sd->modmb)[i]; baj = (sd->modnb)[j]; /* Calculate the submatrix address and return NULL if out of range. */ adr = (sd->lld) * ( (sd->nb) * lbj + baj ) + (sd->mb) * lbi + bai; if ( adr < 0 || adr > sd->size ) { printf("ptr2f: submatrix address overflow, adr=%lld adrmax=%lld context=%d\n", adr, sd->size, sd->ctxt); fflush(stdout); *info = -8; return (NULL); } /* If the matrix array isn't allocated return NULL. */ if (a == NULL) { printf("ptr2f: submatrix is not allocated, context=%d\n", sd->ctxt); fflush(stdout); *info = -9; return (NULL); } /* Otherwise, return the address of the matrix element. */ *info = 0; return (&a[adr]); } #endif /*********************************************************************** ECONS2() ************************************************************************/ /* Calculate the constrained energy and derivatives. */ static REAL_T econs2( REAL_T *x, REAL_T *f, REAL_T *g, int context_PxQ, int *descF_PxQ, int *descG_PxQ ) { int i; size_t n; REAL_T e_cons, rx, ry, rz; #ifdef SCALAPACK REAL_T *ptr; #endif n = 3*prm->Natom; e_cons = 0.0; for (i = 0; i < prm->Natom; i++) { if (constrained[i]) { rx = x[3 * i ] - x0[3 * i ]; ry = x[3 * i + 1] - x0[3 * i + 1]; rz = x[3 * i + 2] - x0[3 * i + 2]; e_cons += wcons * (rx * rx + ry * ry + rz * rz); #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, 3*i+0); if (ptr != NULL) *ptr += 2. * wcons * rx; ptr = ptr1d(f, descF_PxQ, 3*i+1); if (ptr != NULL) *ptr += 2. * wcons * ry; ptr = ptr1d(f, descF_PxQ, 3*i+2); if (ptr != NULL) *ptr += 2. * wcons * rz; ptr = ptr2d(g, descG_PxQ, 3*i+0, 3*i+0); if (ptr != NULL) *ptr += 2. * wcons; ptr = ptr2d(g, descG_PxQ, 3*i+1, 3*i+1); if (ptr != NULL) *ptr += 2. * wcons; ptr = ptr2d(g, descG_PxQ, 3*i+1, 3*i+1); if (ptr != NULL) *ptr += 2. * wcons; #else f[3 * i ] += 2. * wcons * rx; f[3 * i + 1] += 2. * wcons * ry; f[3 * i + 2] += 2. * wcons * rz; g[3*i + 0 + n*(3*i + 0)] += 2. * wcons; g[3*i + 1 + n*(3*i + 1)] += 2. * wcons; g[3*i + 2 + n*(3*i + 2)] += 2. * wcons; #endif } } return (e_cons); } /*********************************************************************** EBOND2() ************************************************************************/ /* Calculate bond stretching energy and derivatives. */ static REAL_T ebond2( int nbond, int *a1, int *a2, int *atype, REAL_T *Rk, REAL_T *Req, REAL_T *x, REAL_T *f, REAL_T *g, int context_PxQ, int *descF_PxQ, int *descG_PxQ ) { int i, k, l, at1, at2, atyp; size_t n; REAL_T e_bond, r, r2, rinv, r2inv, rx, ry, rz, dfx, dfy, dfz, db, df, dg, e; REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3], ijterm; #ifdef SCALAPACK REAL_T *ptr; #endif n = 3*prm->Natom; e_bond = 0.0; for (i = 0; i < nbond; i++) { at1 = a1[i]; at2 = a2[i]; atyp = atype[i] - 1; /* Calculate the interatomic distance and its (x,y,z) components. */ rx = x[at1 ] - x[at2 ]; ry = x[at1 + 1] - x[at2 + 1]; rz = x[at1 + 2] - x[at2 + 2]; r2 = rx * rx + ry * ry + rz * rz; r2inv = 1.0 / r2; r = sqrt(r2); rinv = r * r2inv; /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into five arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2ii[] for the second derivatives with respect to atom i * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Some useful symmetry obtains. The d2ii, d2jj and d2ij arrays * are symmetric in that their lower and upper triangles are * the transposes of one another. Also, d2jj equals d2ii, and * d2ij is the negative of d2ii. * * As was exploited by David Case and his colleagues, the vector dj * is the negative of the vector di. */ di[0] = rx; di[1] = ry; di[2] = rz; dj[0] = -rx; dj[1] = -ry; dj[2] = -rz; /* Load the upper triangle of d2ii. */ d2ii[0][0] = 1.0 - rx * rx * r2inv; d2ii[0][1] = -rx * ry * r2inv; d2ii[0][2] = -rx * rz * r2inv; d2ii[1][1] = 1.0 - ry * ry * r2inv; d2ii[1][2] = -ry * rz * r2inv; d2ii[2][2] = 1.0 - rz * rz * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangles of d2jj and d2ij. */ for (l = k; l < 3; l++) { d2jj[k][l] = d2ii[k][l]; d2ij[k][l] = -d2ii[k][l]; } /* Load the symmetric elements of d2ii, d2jj and d2ij. */ for (l = k+1; l < 3; l++) { d2ii[l][k] = d2ii[k][l]; d2jj[l][k] = d2jj[k][l]; d2ij[l][k] = d2ij[k][l]; } } /* * Calculate the bond stretching energy as: * * e = Rk*(r - Req)^2 */ db = r - Req[atyp]; df = Rk[atyp] * db; e = df * db; e_bond += e; /* * For the calculation of the 1st derivatives of the energy, the * 1st derivatives of the bond distance are multiplied by df: * * de/dx = df*(dr/dx) * * where ds/dx represents the 1st derivative of the bond distance, and: * * df = de/dr = 2*Rk*(r - Req) * * because the chain rule gives: * * de/dx = (de/dr)*(dr/dx) = df*(dr/dx) * * Divide df by r because this factor is not present in dr/dx. */ df *= 2.0 * rinv; dfx = df * rx; dfy = df * ry; dfz = df * rz; #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, at1+0); if (ptr != NULL) *ptr += dfx; ptr = ptr1d(f, descF_PxQ, at1+1); if (ptr != NULL) *ptr += dfy; ptr = ptr1d(f, descF_PxQ, at1+2); if (ptr != NULL) *ptr += dfz; ptr = ptr1d(f, descF_PxQ, at2+0); if (ptr != NULL) *ptr -= dfx; ptr = ptr1d(f, descF_PxQ, at2+1); if (ptr != NULL) *ptr -= dfy; ptr = ptr1d(f, descF_PxQ, at2+2); if (ptr != NULL) *ptr -= dfz; #else f[at1 ] += dfx; f[at1 + 1] += dfy; f[at1 + 2] += dfz; f[at2 ] -= dfx; f[at2 + 1] -= dfy; f[at2 + 2] -= dfz; #endif /* * For the calculation of the 2nd derivatives of the energy, the 1st and 2nd * derivatives of the bond distance are multiplied by dg and df as follows: * * d2e/dx2dx1 = df*(d2r/dx2dx1) + dg*(dr/dx2)*(dr/dx1) * * where d2s/dx2dx1 represents a 2nd derivative of the bond distance, * dr/dx1 and dr/dx2 represent 1st derivatives of the bond distance, and: * * dg = (d/dr)df = (d/dr)(de/dr) = 2*Rk * * because the chain rule gives: * * d2e/dx2dx1 = (d/dx2)(de/dx1) * * = (d/dx2)[(de/dr)*(dr/dx1)] * * = (d/dx2)[df*(dr/dx1)] * * = [(d/dx2)df]*(dr/dx1) + df*[(d/dx2)(dr/dx1)] * * = [(d/dx2)(de/dr)]*(dr/dx1) + df*(d2r/dx2dx1) * * = {[(d/dr)(de/dr)]*(dr/dx2)}*(dr/dx1) + df*(d2r/dx2dx1) * * = dg*(dr/dx2)*(dr/dx1) + df*(d2r/dx2dx1) * * Divide dg by r^2 because this factor is not present in (dr/dx1)*(dr/dx2). */ dg = 2.0 * Rk[atyp] * r2inv; for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ijterm = df * d2ij[k][l] + dg * di[k] * dj[l]; #ifdef SCALAPACK ptr = ptr2d(g, descG_PxQ, at1+l, at1+k); if (ptr != NULL) *ptr += df * d2ii[k][l] + dg * di[k] * di[l]; ptr = ptr2d(g, descG_PxQ, at2+l, at2+k); if (ptr != NULL) *ptr += df * d2jj[k][l] + dg * dj[k] * dj[l]; ptr = ptr2d(g, descG_PxQ, at2+l, at1+k); if (ptr != NULL) *ptr += ijterm; ptr = ptr2d(g, descG_PxQ, at1+k, at2+l); if (ptr != NULL) *ptr += ijterm; #else g[at1 + k + n*(at1 + l)] += df * d2ii[k][l] + dg * di[k] * di[l]; g[at2 + k + n*(at2 + l)] += df * d2jj[k][l] + dg * dj[k] * dj[l]; g[at1 + k + n*(at2 + l)] += ijterm; g[at2 + l + n*(at1 + k)] += ijterm; #endif } } } if (e_debug) EXPR("%9.3f", e_bond); return (e_bond); } /*********************************************************************** DIFANG() ************************************************************************/ /* * This function sets up second derivatives for angle-like potentials, * and is used by the eangl2 and ephi2 functions. * * Calling parameters are as follows: * * cst - cosine of theta * s - bond vectors ij and kj * dc - first der. of costheta w/respect to the cartesian differences in s * ddc - output of second derivative of costheta w/respect to the cartesian * differences */ static void difang(REAL_T cst, REAL_T *s, REAL_T *dc, REAL_T **ddc) { REAL_T bij, boi2, boij3, boji3, bkj, boj2, br, bt; int i, i1, j; /* ----- first set up distances needed ----- */ bij = s[1]*s[1] + s[2]*s[2] + s[3]*s[3]; boi2 = 1.0 / bij; bij = sqrt(bij); bkj = s[4]*s[4] + s[5]*s[5] + s[6]*s[6]; boj2 = 1.0 / bkj; bkj = sqrt(bkj); br = boi2; /* * ----- set up ddc first as second derivative of costheta w/respect * to cartesian differences ----- */ for (i = 1; i <= 6; i++) { if (i == 4) br = boj2; ddc[i][i] = -br*(2.0*s[i]*dc[i] + cst*(1.0 - s[i]*s[i]*br)); } for (i = 1; i <= 3; i++) { ddc[i][i+3] = -boi2*s[i]*dc[i+3] - boj2*s[i+3]*dc[i] - boi2*boj2*(s[i]*s[i+3]*cst - bij*bkj); } for (i = 1; i <= 2; i++) { i1 = i + 1; for (j = i1; j <= 3; j++) { ddc[i][j] = -boi2*(s[i]*dc[j] + s[j]*dc[i] - boi2*cst*s[i]*s[j]); } } for (i = 4; i <= 5; i++) { i1 = i + 1; for (j = i1; j <= 6; j++) { ddc[i][j] = -boj2*(s[i]*dc[j] + s[j]*dc[i] - boj2*cst*s[i]*s[j]); } } boij3 = boj2/(bij*bkj); boji3 = boi2/(bij*bkj); bt = boi2*boj2*cst; for (i = 1; i <= 3; i++) { for (j = 4; j <= 6; j++) { if (j != i + 3) ddc[i][j] = -boij3*s[i+3]*s[j] - boji3*s[i]*s[j-3] + bt*s[i]*s[j]; } } /* ----- symmetrize the ddc array ----- */ for (i = 1; i <= 6; i++) { for (j = i; j <= 6; j++) { ddc[j][i] = ddc[i][j]; } } return; } /*********************************************************************** EANGL2() ************************************************************************/ /* Calculate the bond bending energy and derivatives.*/ static REAL_T eangl2( int nang, int *a1, int *a2, int *a3, int *atype, REAL_T *Tk, REAL_T *Teq, REAL_T *x, REAL_T *f, REAL_T *g, int context_PxQ, int *descF_PxQ, int *descG_PxQ ) { int i, j, ia, atyp, at1, at2, at3, ist, iof, inew, jst, jof, jnew; size_t n; REAL_T x1, y1, z1, x2, y2, z2, x3, y3, z3; REAL_T c, st, st2c, dtheta, theta, df, ddf, e, e_theta; REAL_T rij, rkj, rij2, rkj2, rrik; REAL_T s[7], dc[7], dr[10]; REAL_T **ddc, **ddr; #ifdef SCALAPACK REAL_T *ptr; #endif ddc = matrix(1,6,1,6); ddr = matrix(1,9,1,9); n = 3*prm->Natom; e_theta = 0.0; for (ia = 0; ia < nang; ia++) { at1 = a1[ia]; at2 = a2[ia]; at3 = a3[ia]; atyp = atype[ia] - 1; /* Set up the variables x1..x3, y1..y3 and z1..z3 for use later. */ x1 = x[at1 ]; y1 = x[at1 + 1]; z1 = x[at1 + 2]; x2 = x[at2 ]; y2 = x[at2 + 1]; z2 = x[at2 + 2]; x3 = x[at3 ]; y3 = x[at3 + 1]; z3 = x[at3 + 2]; s[1] = x1 - x2; s[2] = y1 - y2; s[3] = z1 - z2; s[4] = x3 - x2; s[5] = y3 - y2; s[6] = z3 - z2; rij2 = s[1]*s[1] + s[2]*s[2] + s[3]*s[3]; rkj2 = s[4]*s[4] + s[5]*s[5] + s[6]*s[6]; rij = sqrt(rij2); rkj = sqrt(rkj2); rrik = rij * rkj; c = (s[1]*s[4] + s[2]*s[5] + s[3]*s[6]) / rrik; c = c > 1.0 ? 1.0 : c; c = c < -1.0 ? -1.0 : c; theta = acos(c); dtheta = theta - Teq[atyp]; /* df and ddf are derivatives of E with respect to c == costheta: */ df = dtheta * Tk[atyp]; ddf = 2. * Tk[atyp]; e = df * dtheta; e_theta += e; /* * Calculate the sine then limit small values of the sine to the range * -10**-3..+10**-3 so as to avoid division by zero. */ st = sin(theta); if (st > 0 && st < 1.e-3) st = 1.e-3; else if (st < 0 && st > -1.e-3) st = -1.e-3; df *= 2.0; /* check this!! */ /* dc = derivative of c with respect to cartesian differences: */ for( i=1; i<=3; i++ ){ dc[i] = (s[i+3]/rkj - c*s[i]/rij)/rij; dc[i+3] = (s[i]/rij - c*s[i+3]/rkj)/rkj; } /* get ddc = second derivative of c with respect to cartesian differences: */ difang( c, s, dc, ddc ); /* change ddc to second. der. of theta with respect to cartesian differences: */ st2c = c/(st*st); for( i=1; i<=6; i++ ){ for( j=i; j<=6; j++ ){ ddc[i][j] = -(ddc[i][j] + dc[i]*dc[j]*st2c)/st; ddc[j][i] = ddc[i][j]; } } /* change dc to derivative of theta w/ respect to cartesian differences: */ for( i=1; i<=6; i++ ){ dc[i] = -dc[i]/st; } /* dr will hold -derivates of theta w/ respect to cartesians: */ for( i=1; i<=3; i++ ){ dr[i] = -dc[i]; dr[i+6] = -dc[i+3]; dr[i+3] = dc[i] + dc[i+3]; } /* update the forces: */ #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, at1+0); if (ptr != NULL) *ptr -= df*dr[1]; ptr = ptr1d(f, descF_PxQ, at1+1); if (ptr != NULL) *ptr -= df*dr[2]; ptr = ptr1d(f, descF_PxQ, at1+2); if (ptr != NULL) *ptr -= df*dr[3]; ptr = ptr1d(f, descF_PxQ, at2+0); if (ptr != NULL) *ptr -= df*dr[4]; ptr = ptr1d(f, descF_PxQ, at2+1); if (ptr != NULL) *ptr -= df*dr[5]; ptr = ptr1d(f, descF_PxQ, at2+2); if (ptr != NULL) *ptr -= df*dr[6]; ptr = ptr1d(f, descF_PxQ, at3+0); if (ptr != NULL) *ptr -= df*dr[7]; ptr = ptr1d(f, descF_PxQ, at3+1); if (ptr != NULL) *ptr -= df*dr[8]; ptr = ptr1d(f, descF_PxQ, at3+2); if (ptr != NULL) *ptr -= df*dr[9]; #else f[at1 ] -= df*dr[1]; f[at1 + 1] -= df*dr[2]; f[at1 + 2] -= df*dr[3]; f[at2 ] -= df*dr[4]; f[at2 + 1] -= df*dr[5]; f[at2 + 2] -= df*dr[6]; f[at3 ] -= df*dr[7]; f[at3 + 1] -= df*dr[8]; f[at3 + 2] -= df*dr[9]; #endif /* ddr will hold second derivatives of theta with respect to carts: */ for( i=1; i<=3; i++ ){ for( j=i; j<=3; j++ ){ ddr[i][j] = ddc[i][j]; ddr[i+6][j+6] = ddc[i+3][j+3]; } for( j=1; j<=3; j++ ){ ddr[i][j+6] = ddc[i][j+3]; ddr[i][j+3] = -ddc[i][j] - ddc[i][j+3]; ddr[i+3][j+6] = -ddc[i+3][j] - ddc[i+3][j+3]; } } for( i=1; i<=3; i++ ){ for( j=i; j<=3; j++ ){ ddr[i+3][j+3] = -ddr[i][j+3] - ddr[i+3][j+6]; } } /* for( i=1; i<=9; i++ ){ for( j=i; j<=9; j++ ){ ddr[j][i] = ddr[i][j]; } } */ for( i=1; i<=9; i++ ){ ist = at3; if( i<=6 ) ist = at2; if( i<=3 ) ist = at1; iof = i%3 ; if( iof == 0 ) iof =3; inew = ist + iof - 1; for( j=i; j<=9; j++ ){ jst = at3; if( j<=6 ) jst = at2; if( j<=3 ) jst = at1; jof = j%3; if( jof == 0 ) jof =3; jnew = jst + jof - 1; #ifdef SCALAPACK ptr = ptr2d(g, descG_PxQ, jnew, inew); if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j]; if ( inew != jnew ) { ptr = ptr2d(g, descG_PxQ, inew, jnew); if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j]; } #else g[inew + n*jnew] += ddf*dr[i]*dr[j] + df*ddr[i][j]; if( inew != jnew ) g[jnew + n*inew] += ddf*dr[i]*dr[j] + df*ddr[i][j]; #endif } } } free_matrix( ddc, 1,6,1,6 ); free_matrix( ddr, 1,6,1,6 ); if (e_debug) EXPR("%9.3f", e_theta); return (e_theta); } /*********************************************************************** EPHI2() ************************************************************************/ /* Calculate the dihedral torsion energy and derivatives. */ static REAL_T ephi2( int nphi, int *a1, int *a2, int *a3, int *a4, int *atype, REAL_T *Pk, REAL_T *Pn, REAL_T *Phase, REAL_T *x, REAL_T *f, REAL_T *g, int context_PxQ, int *descF_PxQ, int *descG_PxQ ) { int i, j, k, l, at1, at2, at3, at4, atyp, iper,iphi; size_t n; int inew, ist, iof, jnew, jst, jof; REAL_T e, df, ddf, e_tors; REAL_T xij, yij, zij, xkj, ykj, zkj, xkl, ykl, zkl; REAL_T dx, dy, dz, gx, gy, gz, bi, bk, ct, boi2, boj2, z1, z2, bioj, bjoi; REAL_T ct2, arg, z11, z12, z22; REAL_T t[7], dc[7], dr[13]; REAL_T dum,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12; REAL_T **dtx, **ddr, **ddc; #ifdef SCALAPACK REAL_T *ptr; #endif ddc = matrix( 1,6, 1,6 ); dtx = matrix( 1,6, 1,12 ); ddr = matrix( 1,12, 1,12 ); n = 3*prm->Natom; e_tors = 0.0; for (iphi = 0; iphi < nphi; iphi++) { /* * Get the atom numbers. Atom 3 may be negative to specify that * the end group interactions are to be ignored. Atom 4 may be * negative to specify that the dihedral angle is an "improper * torsion". However, neither of these conditions is actually * treated in this function, so just ignore negative values. */ at1 = a1[iphi]; at2 = a2[iphi]; at3 = abs(a3[iphi]); at4 = abs(a4[iphi]); atyp = atype[iphi] - 1; /* ----- calculation of the dihedral ----- */ xij = x[at1 ] - x[at2 ]; yij = x[at1 + 1] - x[at2 + 1]; zij = x[at1 + 2] - x[at2 + 2]; xkj = x[at3 ] - x[at2 ]; ykj = x[at3 + 1] - x[at2 + 1]; zkj = x[at3 + 2] - x[at2 + 2]; xkl = x[at3 ] - x[at4 ]; ykl = x[at3 + 1] - x[at4 + 1]; zkl = x[at3 + 2] - x[at4 + 2]; dx = yij * zkj - zij * ykj; dy = zij * xkj - xij * zkj; dz = xij * ykj - yij * xkj; gx = zkj * ykl - ykj * zkl; gy = xkj * zkl - zkj * xkl; gz = ykj * xkl - xkj * ykl; bi = dx * dx + dy * dy + dz * dz; bk = gx * gx + gy * gy + gz * gz; ct = dx * gx + dy * gy + dz * gz; /* ----- branch if linear dihedral ----- */ assert(bi > 0.01); assert(bk > 0.01); boi2 = 1. / bi; boj2 = 1. / bk; bi = sqrt(bi); bk = sqrt(bk); z1 = 1. / bi; z2 = 1. / bk; bioj = bi * z2; bjoi = bk * z1; ct = ct * z1 * z2; if (ct > 1.) ct = 1.; if (ct < (-1.)) ct = -1.; /* ----- ct value above is actually -cosphi; here we change * its sign ----- */ ct = -ct; /* ----- calculate the energy and derivatives ----- * ----- get df = first der. of potential w/respect to cosphi; and * ddf = second der. of potntial w/respect to cosphi ----- * the torsional potential is assumed to have the form: * e = pk(ic) * (1.0+phase*cos(pn(ic)*phi) * where phase = 1.0 or -1.0, and pn = 1,2,3,4, or 6 * ----- energy terms for dihedrals are expressed in terms of * cosphi in order to eliminate problems for planar angles ---- */ multi_term: iper = (int) fabs(Pn[atyp]) + 0.0001; assert(iper != (fabs(Pn[atyp]) - 0.0001)); assert(iper >= 1 && iper <= 6); switch (iper) { case 1: e = ct; df = 1.0; ddf = 0.0; break; case 2: e = 2.0 * ct * ct - 1.0; df = 4.0 * ct; ddf = 4.0; break; case 3: ct2 = ct * ct; e = ct * (4.0 * ct2 - 3.0); df = 12.0 * ct2 - 3.0; ddf = 24.0 * ct; break; case 4: ct2 = ct * ct; e = 1.0 + ct2 * 8.0 * (ct2 - 1.0); df = 16.0 * ct * (ct2 + ct2 - 1.0); ddf = 16.0 * (6.0 * ct2 - 1.0); break; case 6: ct2 = ct * ct; e = ct2 * (ct2 * (ct2 * 32.0 - 48.0) + 18.0) - 1.0; df = ct * (ct2 * (ct2 * 192.0 - 192.0) + 36.0); ddf = ct2 * (ct2 * 960.0 - 576.0) + 36.0; default: fprintf(stderr, "bad periodicity: %d\n", iper); exit(1); } if (Phase[atyp] == 0.0) { arg = Pk[atyp]; } else { arg = -Pk[atyp]; assert(fabs(3.14159 - Phase[atyp]) < 0.01); } e = Pk[atyp] + arg * e; df = df * arg; ddf = ddf * arg; e_tors += e; t[1] = dx; t[2] = dy; t[3] = dz; t[4] = -gx; t[5] = -gy; t[6] = -gz; /* * ----- now, set up array dc = first der. of cosphi w/respect * to the cartesian differences t ----- */ z11 = z1 * z1; z12 = z1 * z2; z22 = z2 * z2; for (i = 1; i <= 3; i++) { dc[i] = t[i+3] * z12 - ct * t[i] * z11; dc[i+3] = t[i] * z12 - ct * t[i+3] * z22; } /* * ----- subroutine difang will now create array ddc which is second * derivative of cosphi with respect to the t s ----- */ difang(ct, t, dc, ddc); /* ---now set up array s, given on page 118 of cff book-- */ s1 = xij; s2 = yij; s3 = zij; s4 = xkj; s5 = ykj; s6 = zkj; s7 = -xkj; s8 = -ykj; s9 = -zkj; s10 = -xkl; s11 = -ykl; s12 = -zkl; /* * ----- set up dtx[i][j] = derivative of t[i] w/respect to x[j] * see p. 120 of cff book ----- */ for (i = 1; i <= 6; i++) { for (j = 1; j <= 12; j++) { dtx[i][j] = 0.0; } } dtx[1][2] = s6; dtx[1][3] = -s5; dtx[1][5] = s3 - s6; dtx[1][6] = s5 - s2; dtx[1][8] = -s3; dtx[1][9] = s2; dtx[2][1] = -s6; dtx[2][3] = s4; dtx[2][4] = s6 - s3; dtx[2][6] = s1 - s4; dtx[2][7] = s3; dtx[2][9] = -s1; dtx[3][1] = s5; dtx[3][2] = -s4; dtx[3][4] = s2 - s5; dtx[3][5] = s4 - s1; dtx[3][7] = -s2; dtx[3][8] = s1; dtx[4][5] = s12; dtx[4][6] = -s11; dtx[4][8] = s9 - s12; dtx[4][9] = s11 - s8; dtx[4][11] = -s9; dtx[4][12] = s8; dtx[5][4] = -s12; dtx[5][6] = s10; dtx[5][7] = s12 - s9; dtx[5][9] = s7 - s10; dtx[5][10] = s9; dtx[5][12] = -s7; dtx[6][4] = s11; dtx[6][5] = -s10; dtx[6][7] = s8 - s11; dtx[6][8] = s10 - s7; dtx[6][10] = -s8; dtx[6][11] = s7; /* * ----- set up dr array, containing -first derivative of cosphi with * respect to cartesians ----- */ for (i = 1; i <= 12; i++) { dum = 0.0; for (j = 1; j <= 6; j++) { dum += dc[j]*dtx[j][i]; } dr[i] = -dum; } /* ----- update the force array ----- */ #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, at1+0); if (ptr != NULL) *ptr -= df * dr[1]; ptr = ptr1d(f, descF_PxQ, at1+1); if (ptr != NULL) *ptr -= df * dr[2]; ptr = ptr1d(f, descF_PxQ, at1+2); if (ptr != NULL) *ptr -= df * dr[3]; ptr = ptr1d(f, descF_PxQ, at2+0); if (ptr != NULL) *ptr -= df * dr[4]; ptr = ptr1d(f, descF_PxQ, at2+1); if (ptr != NULL) *ptr -= df * dr[5]; ptr = ptr1d(f, descF_PxQ, at2+2); if (ptr != NULL) *ptr -= df * dr[6]; ptr = ptr1d(f, descF_PxQ, at3+0); if (ptr != NULL) *ptr -= df * dr[7]; ptr = ptr1d(f, descF_PxQ, at3+1); if (ptr != NULL) *ptr -= df * dr[8]; ptr = ptr1d(f, descF_PxQ, at3+2); if (ptr != NULL) *ptr -= df * dr[9]; ptr = ptr1d(f, descF_PxQ, at4+0); if (ptr != NULL) *ptr -= df * dr[10]; ptr = ptr1d(f, descF_PxQ, at4+1); if (ptr != NULL) *ptr -= df * dr[11]; ptr = ptr1d(f, descF_PxQ, at4+2); if (ptr != NULL) *ptr -= df * dr[12]; #else f[at1 ] -= df * dr[1]; f[at1 + 1] -= df * dr[2]; f[at1 + 2] -= df * dr[3]; f[at2 ] -= df * dr[4]; f[at2 + 1] -= df * dr[5]; f[at2 + 2] -= df * dr[6]; f[at3 ] -= df * dr[7]; f[at3 + 1] -= df * dr[8]; f[at3 + 2] -= df * dr[9]; f[at4 ] -= df * dr[10]; f[at4 + 1] -= df * dr[11]; f[at4 + 2] -= df * dr[12]; #endif /* * ----- now set up the ddr array = second der. of cosphi w/respect * to cartesians; first we take the first term of last formula * on p. 113 of cff book ----- */ for (i = 1; i <= 12; i++) { for (j = i; j <= 12; j++) { ddr[i][j] = 0.0; for (k = 1; k <= 6; k++) { for (l = 1; l <= 6; l++) { ddr[i][j] = ddr[i][j] + ddc[k][l]*dtx[k][i]*dtx[l][j]; } } } } /* ----- now do the second term of this equation ----- */ ddr[2][9] += dc[1]; ddr[3][5] += dc[1]; ddr[6][8] += dc[1]; ddr[2][6] -= dc[1]; ddr[3][8] -= dc[1]; ddr[5][9] -= dc[1]; ddr[1][6] += dc[2]; ddr[3][7] += dc[2]; ddr[4][9] += dc[2]; ddr[1][9] -= dc[2]; ddr[3][4] -= dc[2]; ddr[6][7] -= dc[2]; ddr[1][8] += dc[3]; ddr[2][4] += dc[3]; ddr[5][7] += dc[3]; ddr[1][5] -= dc[3]; ddr[2][7] -= dc[3]; ddr[4][8] -= dc[3]; ddr[5][12] += dc[4]; ddr[6][8] += dc[4]; ddr[9][11] += dc[4]; ddr[5][9] -= dc[4]; ddr[6][11] -= dc[4]; ddr[8][12] -= dc[4]; ddr[4][9] += dc[5]; ddr[6][10] += dc[5]; ddr[7][12] += dc[5]; ddr[4][12] -= dc[5]; ddr[6][7] -= dc[5]; ddr[9][10] -= dc[5]; ddr[4][11] += dc[6]; ddr[5][7] += dc[6]; ddr[8][10] += dc[6]; ddr[4][8] -= dc[6]; ddr[5][10] -= dc[6]; ddr[7][11] -= dc[6]; /* ----- Now form the second derivative matrix ----- */ for (i = 1; i <= 12; i++) { ist = at4; if (i <= 9) ist = at3; if (i <= 6) ist = at2; if (i <= 3) ist = at1; iof = i % 3; if (iof == 0) iof = 3; inew = ist + iof - 1; for (j = i; j <= 12; j++) { jst = at4; if (j <= 9) jst = at3; if (j <= 6) jst = at2; if (j <= 3) jst = at1; jof = j % 3; if (jof == 0) jof = 3; jnew = jst + jof - 1; #ifdef SCALAPACK ptr = ptr2d(g, descG_PxQ, jnew, inew); if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j]; if (inew != jnew) { ptr = ptr2d(g, descG_PxQ, inew, jnew); if (ptr != NULL) *ptr += ddf*dr[i]*dr[j] + df*ddr[i][j]; } #else g[inew + n*jnew] += ddf*dr[i]*dr[j] + df*ddr[i][j]; if (inew != jnew) g[jnew + n*inew] += ddf*dr[i]*dr[j] + df*ddr[i][j]; #endif } } #ifdef PRINT_EPHI printf("%4d %4d %4d %4d %4d %9.4f\n", i + 1, at1 / 3, at2 / 3, at3 / 3, at4 / 3, e); #endif /* A negative value of Pn means that more terms are needed in the expansion. */ if (Pn[atyp] < 0) { atyp++; goto multi_term; } } if (e_debug) EXPR("%9.3f", e_tors); free_matrix( dtx, 1,12, 1,12 ); free_matrix( ddr, 1,12, 1,12 ); return (e_tors); } /*********************************************************************** NBOND2() ************************************************************************/ /* * Calculate the non-bonded energy, 1st and 2nd derivatives. * This function is complicated by the fact that it must * process two forms of pair lists: the 1-4 pair list and * the non-bonded pair list. The non-bonded pair list * must be modified by the excluded atom list whereas the * 1-4 pair list is used unmodified. Also, the non-bonded * pair list comprises lower and upper triangles whereas * the 1-4 pair list comprises an upper triangle only. * * Calling parameters are as follows: * * lpears - the number of pairs on the lower triangle pair list * upears - the number of pairs on the upper trianble pair list * pearlist - either the 1-4 pair list or the non-bonded pair list * N14 - set to 0 for the non-bonded pair list, 1 for the 1-4 pair list * x - the atomic coordinate array * f - the gradient vector * g - the Hessian matrix mapped onto a linear vector * enb - Van der Waals energy return value, passed by reference * eel - Coulombic energy return value, passed by reference * enbfac - scale factor for Van der Waals energy * eelfac - scale factor for Coulombic energy */ static int nbond2( int *lpears, int *upears, int **pearlist, int N14, REAL_T *x, REAL_T *f, REAL_T *g, REAL_T *enb, REAL_T *eel, REAL_T enbfac, REAL_T eelfac, int context_PxQ, int *descF_PxQ, int *descG_PxQ ) { int i, j, k, l, jn, ic, npr, lpair, iaci; size_t n; int *iexw; REAL_T r, r2, r2inv, df2, dg2, r6, r10, f1, f2, df, dg; REAL_T dis, kij, d0, diff, rinv, rs, rssq, eps1, epsi, cgijr, pow, cgi; REAL_T dumx, dumy, dumz, enbfaci, eelfaci; int nhbpair, ibig, isml; REAL_T xi, yi, zi, xij, yij, zij; REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3], ijterm; #ifdef SCALAPACK REAL_T *ptr; #endif #if 0 REAL_T hbener, nb14; nhbpair = 0; hbener = 0.0; nb14 = 0.0; #endif #define SIG 0.3 #define DIW 78.0 #define C1 38.5 /* * For the various methods of calculating the nonbonded energy * both the energy and its 1st and 2nd derivatives are calculated. * * For the calculation of the 1st derivatives of the energy, the * 1st derivatives the nonbonded distance r are multiplied by df: * * de/dx = df*(dr/dx) * * where dr/dx represents the 1st derivative of the distance, and * * df = de/dr * * is required by the chain rule: * * de/dx = (de/dr)*(dr/dx) = df*(dr/dx) * * For the calculation of the 2nd derivatives of the energy, the 1st and * 2nd derivatives of the distance are multiplied by dg and df as follows: * * d2e/dxjdxi = df*(d2r/dxjdxi) + dg*(dr/dxj)*(dr/dxi) * * where d2r/dxjdxi represents a 2nd derivative of the distance, * dr/dxi and dr/dxj represent 1st derivatives of the distance, and: * * dg = (d/dr)df * * is required by the chain rule: * * d2e/dxjdxi = (d/dxj)(de/dxi) * * = (d/dxj)[(de/dr)*(dr/dxi)] * * = (d/dxj)[df*(dr/dxi)] * * = [(d/dxj)df]*(dr/dxi) + df*[(d/dxj)(dr/dxi)] * * = [(d/dxj)(de/dr)]*(dr/dxi) + df*(d2r/dxjdxi) * * = {[(d/dr)(de/dr)]*(dr/dxj)}*(dr/dxi) + df*(d2r/dxjdxi) * * = dg*(dr/dxj)*(dr/dxi) + df*(d2r/dxjdxi) * */ n = 3*prm->Natom; *enb = 0.; *eel = 0.; enbfaci = 1. / enbfac; eelfaci = 1. / eelfac; /* * Allocate and initialize the iexw array used for skipping excluded * atoms. Note that because of the manner in which iexw is used, it * is necessary to initialize it before only the first iteration of * the following loop. */ iexw = ivector(-1, prm->Natom); for (i = -1; i < prm->Natom; i++) { iexw[i] = -1; } /* Loop over all atoms i except for the final atom i. */ for (i = 0; i < prm->Natom - 1; i++) { /* Check whether there are any atoms j on the pair list of atom i. */ npr = upears[i]; if (npr <= 0) continue; iaci = prm->Ntypes * (prm->Iac[i] - 1); dumx = 0.0; dumy = 0.0; dumz = 0.0; xi = x[3 * i ]; yi = x[3 * i + 1]; zi = x[3 * i + 2]; cgi = eelfaci * prm->Charges[i]; /* * Expand the excluded list into the iexw array by storing i * at array address j. */ for (j = 0; j < prm->Iblo[i]; j++) { iexw[IexclAt[i][j] - 1] = i; } /* * If the 'N14' calling parameter is clear, use the beginning * address of the upper triangle pair list, which happens * to be the number of atoms on the lower triangle pair list. * If the 'N14' calling parameter is set, the beginning * address is zero because no lower triangle pair list is * used for the N14 interactions. */ if (N14 == 0) { lpair = lpears[i]; } else { lpair = 0; } /* Select atom j from the pair list. Non-graceful error handling. */ for (jn = 0; jn < npr; jn++) { if (pearlist[i] == NULL) { printf("NULL pair list entry in nbond2, taskid = %d\n", mytaskid); fflush(stdout); } j = pearlist[i][lpair + jn]; /* * If the 'N14' calling parameter is clear, check whether * this i,j pair is exempted by the excluded atom list. */ if (N14 != 0 || iexw[j] != i) { xij = xi - x[3 * j ]; yij = yi - x[3 * j + 1]; zij = zi - x[3 * j + 2]; r2 = xij * xij + yij * yij + zij * zij; r2inv = 1.0 / r2; r = sqrt(r2); rinv = r * r2inv; /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into five arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2ii[] for the second derivatives with respect to atom i * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Some useful symmetry obtains. The d2ii, d2jj and d2ij arrays * are symmetric in that their lower and upper triangles are * the transposes of one another. Also, d2jj equals d2ii, and * d2ij is the negative of d2ii. * * As was exploited by David Case and his colleagues, the vector dj * is the negative of the vector di. */ di[0] = xij; di[1] = yij; di[2] = zij; dj[0] = -xij; dj[1] = -yij; dj[2] = -zij; /* Load the upper triangle of d2ii. */ d2ii[0][0] = 1.0 - xij * xij * r2inv; d2ii[0][1] = -xij * yij * r2inv; d2ii[0][2] = -xij * zij * r2inv; d2ii[1][1] = 1.0 - yij * yij * r2inv; d2ii[1][2] = -yij * zij * r2inv; d2ii[2][2] = 1.0 - zij * zij * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangles of d2jj and d2ij. */ for (l = k; l < 3; l++) { d2jj[k][l] = d2ii[k][l]; d2ij[k][l] = -d2ii[k][l]; } /* Load the symmetric elements of d2ii, d2jj and d2ij. */ for (l = k+1; l < 3; l++) { d2ii[l][k] = d2ii[k][l]; d2jj[l][k] = d2jj[k][l]; d2ij[l][k] = d2ij[k][l]; } } /* Calculate the energy and derivatives according to dield. */ if (dield == -3) { /* special code Ramstein & Lavery dielectric, 94 force field */ rs = SIG * r; rssq = rs * rs; pow = exp(-rs); eps1 = rssq + rs + rs + 2.0; epsi = 1.0 / (DIW - C1 * pow * eps1); cgijr = cgi * prm->Charges[j] * rinv * epsi; *eel += cgijr; df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi); dg2 = cgijr*(epsi*(C1*pow*SIG*(rssq*(2.0*(epsi*C1*pow*SIG*rssq + rinv) + SIG) - 2.0*SIG*rs)) + 2.0*r2inv); ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1; if( ic >= 0 ){ r6 = r2inv * r2inv * r2inv; f2 = prm->Cn2[ic] * r6; f1 = prm->Cn1[ic] * r6 * r6; *enb += (f1 - f2) * enbfaci; df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv; dg = dg2 + (156.0 * f1 - 42.0 * f2 * enbfaci) * r2inv; } else { df = df2 * rinv; dg = dg2; } } else if (dield == -4) { /* distance-dependent dielectric code, 94 ff */ /* epsilon = r */ rs = cgi * prm->Charges[j] * r2inv; df2 = -2.0 * rs; dg2 = 6.0 * rs * r2inv; *eel += rs; ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1; if( ic >= 0 ){ r6 = r2inv * r2inv * r2inv; f2 = prm->Cn2[ic] * r6; f1 = prm->Cn1[ic] * r6 * r6; *enb += (f1 - f2) * enbfaci; df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv; dg = dg2 + (156.0 * f1 - 42.0 * f2 * enbfaci) * r2inv; } else { df = df2 * rinv; dg = dg2; } } else if (dield == -5) { /* non-bonded term from yammp */ dis = r; ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1; d0 = prm->Cn2[ic]; if (dis < d0) { kij = prm->Cn1[ic]; diff = dis - d0; *enb += kij * diff * diff; df = 2.0 * kij * diff; dg = 2.0 * kij; } else { df = 0.0; dg = 0.0; } } else { /* * Code for various dielectric models. * The df2 variable should hold r(dV/dr). */ if (dield == 0) { /* epsilon = r */ rs = cgi * prm->Charges[j] * r2inv; df2 = -2.0 * rs; dg2 = 6.0 * rs * r2inv; *eel += rs; } else if (dield == 1) { /* epsilon = 1 */ rs = cgi * prm->Charges[j] * rinv; df2 = -rs; dg2 = 2.0 * rs * r2inv; *eel += rs; } else if (dield == -2) { /* Ramstein & Lavery dielectric, PNAS 85, 7231 (1988). */ rs = SIG * r; rssq = rs * rs; pow = exp(-rs); eps1 = rssq + rs + rs + 2.0; epsi = 1.0 / (DIW - C1 * pow * eps1); cgijr = cgi * prm->Charges[j] * rinv * epsi; *eel += cgijr; df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi); dg2 = cgijr*(epsi*(C1*pow*SIG*(rssq*(2.0*(epsi*C1*pow*SIG*rssq + rinv) + SIG) - 2.0*SIG*rs)) + 2.0*r2inv); } /* Calculate either Van der Waals or hydrogen bonded term. */ ic = prm->Cno[iaci + prm->Iac[j] - 1]; if (ic > 0 || enbfac != 1.0) { if (ic > 0) { ic--; } else { ibig = prm->Iac[i] > prm->Iac[j] ? prm->Iac[i] : prm->Iac[j]; isml = prm->Iac[i] > prm->Iac[j] ? prm->Iac[j] : prm->Iac[i]; ic = ibig * (ibig - 1) / 2 + isml - 1; } r6 = r2inv * r2inv * r2inv; f2 = prm->Cn2[ic] * r6; f1 = prm->Cn1[ic] * r6 * r6; *enb += (f1 - f2) * enbfaci; df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv; dg = dg2 + (156.0 * f1 - 42.0 * f2) * enbfaci * r2inv; #if 0 if( enbfac != 1.0 ) nb14 += (f1-f2)*enbfaci; #endif } else { ic = -ic - 1; r10 = r2inv * r2inv * r2inv * r2inv * r2inv; f2 = prm->HB10[ic] * r10; f1 = prm->HB12[ic] * r10 * r2inv; *enb += (f1 - f2) * enbfaci; df = (df2 + (10.0 * f2 - 12.0 * f1) * enbfaci) * rinv; dg = dg2 + (156.0 * f1 - 110.0 * f2 * enbfaci) * r2inv; nhbpair++; #if 0 hbener += (f1-f2)*enbfaci; #endif } } /* Divide by r here instead dividing the derivatives by r. */ df *= rinv; dg *= r2inv; /* Update the gradient and the Jacobian. */ dumx += df * di[0]; dumy += df * di[1]; dumz += df * di[2]; #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, 3*j+0); if (ptr != NULL) *ptr += df * dj[0]; ptr = ptr1d(f, descF_PxQ, 3*j+1); if (ptr != NULL) *ptr += df * dj[1]; ptr = ptr1d(f, descF_PxQ, 3*j+2); if (ptr != NULL) *ptr += df * dj[2]; #else f[3 * j ] += df * dj[0]; f[3 * j + 1] += df * dj[1]; f[3 * j + 2] += df * dj[2]; #endif /* * Calculate the second derivatives of Dij for the Van der Waals * and Coulombic interactions. */ for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ijterm = df * d2ij[k][l] + dg * di[k] * dj[l]; #ifdef SCALAPACK ptr = ptr2d(g, descG_PxQ, 3*i+l, 3*i+k); if (ptr != NULL) *ptr += df * d2ii[k][l] + dg * di[k] * di[l]; ptr = ptr2d(g, descG_PxQ, 3*j+l, 3*j+k); if (ptr != NULL) *ptr += df * d2jj[k][l] + dg * dj[k] * dj[l]; ptr = ptr2d(g, descG_PxQ, 3*j+l, 3*i+k); if (ptr != NULL) *ptr += ijterm; ptr = ptr2d(g, descG_PxQ, 3*i+k, 3*j+l); if (ptr != NULL) *ptr += ijterm; #else g[3*i + k + n*(3*i + l)] += df * d2ii[k][l] + dg * di[k] * di[l]; g[3*j + k + n*(3*j + l)] += df * d2jj[k][l] + dg * dj[k] * dj[l]; g[3*i + k + n*(3*j + l)] += ijterm; g[3*j + l + n*(3*i + k)] += ijterm; #endif } } } } /* For atom i, the gradient is updated in the i-loop only. */ #ifdef SCALAPACK ptr = ptr1d(f, descF_PxQ, 3*i+0); if (ptr != NULL) *ptr += dumx; ptr = ptr1d(f, descF_PxQ, 3*i+1); if (ptr != NULL) *ptr += dumy; ptr = ptr1d(f, descF_PxQ, 3*i+2); if (ptr != NULL) *ptr += dumz; #else f[3 * i ] += dumx; f[3 * i + 1] += dumy; f[3 * i + 2] += dumz; #endif } /* Deallocate the iexw array. */ free_ivector(iexw, -1, prm->Natom); return (0); } /*********************************************************************** EGB2() ************************************************************************/ /* * Calculate the generalized Born energy, 1st and 2nd derivatives. * * Calling parameters are as follows: * * lpears - the number of pairs on the lower triangle pair list * upears - the number of pairs on the upper trianble pair list * pearlist - the pair list, contiguous for the upper and lower triangles * x - input: the atomic (x,y,z) coordinates * f - updated: the gradient vector * g - updated: the Hessian matrix * grad - updated: the gradient vector in context_1x1 * fs - input: overlap parameters * rborn - input: atomic radii * q - input: atomic charges * kappa - input: inverse of the Debye-Huckel length * diel_ext - input: solvent dielectric constant * enb - updated: Lennard-Jones energy * eelt - updated: gas-phase electrostatic energy * context_Nx1 - input: the general context for ScaLAPACK * context_1x1 - input: the non-distributed vector context for ScaLAPACK * context_PxQ - input: the distributed matrix context for ScaLAPACK * desc_3Nx1 - input: ScaLAPACK descriptor for the gradient vector in context_PxQ * desc_1x1 - input: ScaLAPACK descriptor for the gradient vector in context_1x1 * desc_3Nx3N - input: ScaLAPACK descriptor for the Hessian matrix in context_PxQ * gridim - input: the ScaLAPACK process grid dimension (=1 for single task) * freevectors - input: if !=0 free the static vectors and return */ static REAL_T egb2( int *lpears, int *upears, int **pearlist, REAL_T *x, REAL_T *f, REAL_T *g, REAL_T *grad, REAL_T *fs, REAL_T *rborn, REAL_T *q, REAL_T *kappa, REAL_T *diel_ext, REAL_T *enb, REAL_T *eelt, int context_Nx1, int context_1x1, int context_PxQ, int *desc_3Nx1, int *desc_1x1, int *desc_3Nx3N, int gridim, int freevectors ) #define BOFFSET 0.09 #define KSCALE 0.73 #define PIx4 12.5663706143591724639918 #define PIx2 6.2831853071795862319959 #define PIx1 3.1415926535897931159979 /*FGB taylor coefficients follow */ /* from A to L : */ /* 1/3 , 2/5 , 3/7 , 4/9 , 5/11 */ /* 4/3 , 12/5 , 24/7 , 40/9 , 60/11 */ /* 20/3, 84/5, 216/7, 440/9, 780/11 */ #define TA 0.33333333333333333333 #define TB 0.4 #define TC 0.42857142857142857143 #define TD 0.44444444444444444444 #define TDD 0.45454545454545454545 #define TE 1.33333333333333333333 #define TF 2.4 #define TG 3.42857142857142857143 #define TH 4.44444444444444444444 #define THH 5.45454545454545454545 #define TI 6.66666666666666666667 #define TJ 16.8 #define TK 30.8571428571428571429 #define TL 48.8888888888888888889 #define TLL 70.9090909090909090909 { #ifdef SCALAPACK int zero=0, one=1, info=0; int myrow, mycol, nprow, npcol; int bs, bs3, ierror, lld; static int desc_NxN[DLEN_], desc_Nx3N[DLEN_]; static SUPERD_T *super_3Nx3N=NULL, *super_NxN=NULL, *super_Nx3N=NULL; static REAL_T *reductarr=NULL, *gg=NULL; static REAL_T *sg=NULL, *sh=NULL, *sr=NULL, *sn=NULL; REAL_T *ptr; size_t ii, locp; #endif #ifdef OPENMP int iexcl_last; #endif int i, j, k, l, m, n, n2, m3; int npairs, ic, iaci, threadnum, numthreads; int *iexw; size_t in, i3, in3, jn, j3, jn3, nn, n3, nn3; REAL_T elec, evdw, epol, dielfac, qi, qj, qiqj, fgbi, fgbk, rb2, expmkf; REAL_T xi, yi, zi, xj, yj, zj, xij, yij, zij; REAL_T dumbo,tmpsd; REAL_T df, dg, dx, dy, dz; REAL_T dedx, dedy, dedz; REAL_T dij1i, dij2i, dij3i, temp1, temp2, t1, t2; REAL_T qi2h, qid2h, datmp, da2tmp; REAL_T theta, ri1i; REAL_T dij, sumi, ijterm; REAL_T eel, f6, f12, rinv, r2inv, r4inv, r6inv; REAL_T r2, ri, ri2, riinv, rj, rjinv, rirjinv, rj2, sj, sj2, thi; REAL_T uij, efac, temp4, temp5, temp6, temp7, sumda; REAL_T d_dij, e_dij, f_dij, g_dij, h_dij, i_dij; REAL_T j_dij, l_dij, m_dij, n_dij; REAL_T a_dij, b_dij, c_dij, p_dij, r_dij; REAL_T temp8, temp9, temp10, temp11, temp12, temp13; REAL_T temp14, temp15, temp16, temp17, temp18; REAL_T temp20, temp21, temp22, temp23; int count, count2, icount; REAL_T si, sumAij, sumAjk, sumAijAjk, sumdAijddijdxi; REAL_T sumdAijddijdyi, sumdAijddijdzi, sumdAijddijdxiAjk; REAL_T sumdAijddijdyiAjk, sumdAijddijdziAjk, rij, tmpaij, Aij, dAijddij; REAL_T dAijddijdxj, dAijddijdyj, dAijddijdzj; REAL_T sumdAjkddjkdxj, sumdAjkddjkdyj, sumdAjkddjkdzj, p3p4Aij; REAL_T xk, yk, zk, rjk2, djk1i, rjk, vdw2dif, tmpajk, Ajk, sumAjk2, dAjkddjk; REAL_T dAjkddjkdxj, dAjkddjkdyj, dAjkddjkdzj, lastxj, lastyj, lastzj; REAL_T dAidxj, dAidyj, dAidzj, Ai, dAidxi, dAidyi, dAidzi; REAL_T totsasa; REAL_T rgbmax1i, rgbmax2i, rgbmaxpsmax2; REAL_T dxj, dyj, dzj; REAL_T di[3], dj[3], d2ii[3][3], d2jj[3][3], d2ij[3][3]; char transa, transb; REAL_T dblone=1.0; static REAL_T *reff=NULL, *dreff=NULL, *psi=NULL; static REAL_T *sumdeijda=NULL, *sumdeijdn=NULL; static REAL_T *sumdeijdg=NULL, *sumdeijdh=NULL; /* If PRINT_EGB2_TIMES is defined print some calculation times. */ #undef PRINT_EGB2_TIMES /* * If ROW_CYCLIC is defined allocate the Hessian and some temporary * matrices in a row cyclic manner via the "first touch" method. */ #define ROW_CYCLIC if (gb2_debug) { printf("enter egb2, n = %d\n", prm->Natom); #ifdef SCALAPACK printf("egb2: context_Nx1=%d context_1x1=%d context_PxQ=%d\n", context_Nx1, context_1x1, context_PxQ); printf("egb2: descF_PxQ[CTXT_]=%d descF_1x1[CTXT_]=%d descG_PxQ[CTXT_]=%d\n", desc_3Nx1[CTXT_], desc_1x1[CTXT_], desc_3Nx3N[CTXT_]); #endif fflush(stdout); } t1 = seconds(); count = 0; totsasa = 0.0; /* Compute these because they are used a fair amount. */ n = prm->Natom; nn = n * n; n2 = 2 * n; m3 = 3 * n; n3 = m3; nn3 = n * n3; /* * If freevectors != 0, deallocate the static arrays that have been * previously allocated and return. Note that when ScaLAPACK is * defined, the size of the arrays is determined by the 'size' field * of the associated superdescriptor, and so in principle the array * should be free before its associated superdescriptor. In practice, * however, the free_vector function doesn't use the size argument, * so any dummy value could be supplied. */ if (freevectors != 0) { #ifndef SCALAPACK free_vector( sumdeijdn, 0, nn ); sumdeijdn = NULL; free_vector( sumdeijdg, 0, nn3 ); sumdeijdg = NULL; free_vector( sumdeijdh, 0, nn3 ); sumdeijdh = NULL; free_vector( dreff, 0, nn3 ); dreff = NULL; #else if (context_PxQ >= 0) { free_superdesc( super_3Nx3N ); super_3Nx3N = NULL; free_vector( sumdeijdn, 0, super_NxN->size ); sumdeijdn = NULL; free_superdesc( super_NxN ); super_NxN = NULL; free_vector( sumdeijdg, 0, super_Nx3N->size ); sumdeijdg = NULL; free_vector( sumdeijdh, 0, super_Nx3N->size ); sumdeijdh = NULL; free_vector( dreff, 0, super_Nx3N->size ); dreff = NULL; free_superdesc( super_Nx3N ); super_Nx3N = NULL; free_vector( reductarr, 0, 3*n3 ); reductarr = NULL; } free_vector( gg, 0, 3*n3 ); gg = NULL; free_vector( sg, 0, n3 ); sg = NULL; free_vector( sh, 0, n3 ); sh = NULL; free_vector( sr, 0, n3 ); sr = NULL; free_vector( sn, 0, n ); sn = NULL; #endif free_vector( reff, 0, n ); reff = NULL; free_vector( sumdeijda, 0, n ); sumdeijda = NULL; free_vector( psi, 0, n ); psi = NULL; if (gb2_debug) { printf("freevectors\n"); fflush(stdout); } return (0.0); } /* Allocate some static arrays if they have not been allocated already. */ if (reff == NULL) reff = vector(0, n); if (sumdeijda == NULL) sumdeijda = vector(0, n); if ( (psi == NULL) && (gb==2 || gb==5) ) psi = vector(0, n); #ifndef SCALAPACK /* * If SCALAPACK is not defined, allocate sumdeijdn, sumdeijdg, sumdeijdh * and dreff as NAB vectors. */ if (sumdeijdn == NULL) sumdeijdn = vector(0, nn); if (sumdeijdg == NULL) sumdeijdg = vector(0, nn3); if (sumdeijdh == NULL) sumdeijdh = vector(0, nn3); if (dreff == NULL) dreff = vector(0, nn3); #else /* * If SCALAPACK is defined, initialize matrix descriptors for the * sumdeindn, sumdeijdg, sumdeijdh and dreff matrices, then create * superdescriptors that contain the matrix size (for each process), * as well as precomputed division and modulus arrays to accelerate * the mapping from a global matrix (row, column) address to a local * submatrix array address. Once the superdescriptors are created, * allocate the submatrix arrays. * * Get the block size for the square block and multiply by three. */ bs = BLOCKSIZE; bs3 = 3*bs; /* * Get the number of rows and columns on the block cyclic (PxQ) process * grid, as well as this task's row and column on the grid. */ blacs_gridinfo_(&context_PxQ, &nprow, &npcol, &myrow, &mycol); /* * Allocate the reduction array in all processes because MPI_Allreduce * will use MPI_COMM_WORLD as its communicator. It is unclear that * it is possible to use a ScaLAPACK context as an MPI communicator, * so the default MPI_COMM_WORLD is used instead. */ if (reductarr == NULL) reductarr = vector(0, 3*n3); /* * If this task is on the process grid, set up the array descriptors. * If this task isn't on the process grid, set desc_NxN[CTXT_] and * desc_Nx3N[CTXT_] to -1 in case any of the associated matrices are * called by the pdgemr2d_ function, and set the superdescriptors * to NULL. */ /* * Distribute the sumdeijdg, sumdeijdh and dreff matrices on the * block cyclic process grid. Because the number of columns is * three times the number of rows for these matrices, use a row * block size of BLOCKSIZE and a column block size of 3*BLOCKSIZE * so that a test of the row index i and the column index j (not 3*j) * may be used to identify the grid process to which a global matrix * address maps. * * Distribute the sumdeijdn matrix on the block cyclic process * grid. Use row and column block sizes of BLOCKSIZE because * the number of rows equals the number of columns for this matrix. * * The numroc_ function is used to calculate the number of matrix * elements that are distributed across a PxQ processor grid. * * Once the descriptor has been initialized by the descinit function, * create a superdescriptor from the descriptor. * * Allocate the gg, sg, sh, sr and sn vectors for the diagonal * elements of the g, sumdeijdg, sumdeijdh, sumdeijdr and * sumdeijdn matrices. */ if (context_PxQ >= 0) { if (super_3Nx3N == NULL) { if ( ( super_3Nx3N = superdesc(desc_3Nx3N) ) == NULL ) { printf("egb2: superdescriptor 3Nx3N == NULL, mytaskid = %d\n", mytaskid); fflush(stdout); } } if (super_NxN == NULL) { locp = numroc_(&n, &bs, &myrow, &zero, &nprow); lld = locp; descinit_(desc_NxN, &n, &n, &bs, &bs, &zero, &zero, &context_PxQ, &lld, &info); if (info) { printf("egb2: error in desc_NxN initialization, mytaskid = %d\n", mytaskid); fflush(stdout); } if ( ( super_NxN = superdesc(desc_NxN) ) == NULL) { printf("egb2: superdescriptor NxN == NULL, mytaskid = %d\n", mytaskid); fflush(stdout); } } if (sumdeijdn == NULL && super_NxN->size > 0) { sumdeijdn = vector( 0, super_NxN->size ); } if (super_Nx3N == NULL) { locp = numroc_(&n, &bs, &myrow, &zero, &nprow); lld = locp; descinit_(desc_Nx3N, &n, &m3, &bs, &bs3, &zero, &zero, &context_PxQ, &lld, &info); if (info) { printf("egb2: error in desc_Nx3N initialization, mytaskid = %d\n", mytaskid); fflush(stdout); } if ( ( super_Nx3N = superdesc(desc_Nx3N) ) == NULL) { printf("egb2: superdescriptor Nx3N == NULL, mytaskid = %d\n", mytaskid); fflush(stdout); } } if (sumdeijdg == NULL && super_Nx3N->size > 0) { sumdeijdg = vector( 0, super_Nx3N->size ); } if (sumdeijdh == NULL && super_Nx3N->size > 0) { sumdeijdh = vector( 0, super_Nx3N->size ); } if (dreff == NULL && super_Nx3N->size > 0) { dreff = vector( 0, super_Nx3N->size ); } } else { desc_NxN[CTXT_] = desc_Nx3N[CTXT_] = -1; super_3Nx3N = super_NxN = super_Nx3N = NULL; } gg = vector( 0, 3*n3 ); sr = vector( 0, n3 ); sg = vector( 0, n3 ); sh = vector( 0, n3 ); sn = vector( 0, n ); if (gb2_debug) { printf("vector allocation, mytaskid = %d\n", mytaskid); fflush(stdout); } #endif /* * Smooth "cut-off" in calculating GB effective radii. * Implementd by Andreas Svrcek-Seiler and Alexey Onufriev. * The integration over solute is performed up to rgbmax and includes * parts of spheres; that is an atom is not just "in" or "out", as * with standard non-bonded cut. As a result, calclated effective * radii are less than rgbmax. This saves time, and there is no * discontinuity in dReff/drij. * * Only the case rgbmax > 5*max(sij) = 5*fsmax ~ 9A is handled; this is * enforced in mdread(). Smaller values would not make much physical * sense anyway. * * Note: rgbmax must be less than or equal to cut so that the pairlist * generated from cut may be applied to calculation of the effective * radius and its derivatives. */ if (rgbmax > cut) { printf("Error in egb2: rgbmax = %f is greater than cutoff = %f\n", rgbmax, cut); exit(1); } rgbmax1i = 1.0/rgbmax; rgbmax2i = rgbmax1i*rgbmax1i; rgbmaxpsmax2 = (rgbmax+prm->Fsmax)*(rgbmax+prm->Fsmax); #if 0 if (gb2_debug) { printf("Effective Born radii:\n"); } #endif /* * Get the "effective" Born radii via the approximate pairwise method * Use Eqs 9-11 of Hawkins, Cramer, Truhlar, J. Phys. Chem. 100:19824 * (1996). * * This computation is not executed in parallel for the case of gbsa==1 * due to the loop-carried dependence of the 'count' variable. However, * an apparent bug in the Sun implementation of OpenMP may cause this * computation to execute more slowly than in serial mode in the case * where an 'if (gbsa != 1)' clause is added to the following pragma. * To avoid this problem, the code is replicated verbatim and selected * with an 'if (gbsa != 1)' statement. When gbsa != 1 the pragma version * is selected; otherwise, the non-pragma version is selected. */ if (gbsa != 1) { /* * For SCALAPACK or MPI, initialize all elements of the reff array * to 0 because although each task will calculate only a subset of the * elements, a reduction is used to combine the results from all tasks. * Initialize this array even for tasks that are not on the process grid * because MPI_Allreduce will use the default MPI_COMM_WORLD. * * For OpenMP, the combination of all threads loads all elements * of the reff array so there is no need to initialize it to 0. * * For single-threaded execution all elements of the reff array are * loaded so there is no need to initialize it to 0. */ #if defined(SCALAPACK) || defined(MPI) for (i = 0; i < n; i++) { reff[i] = 0.0; } if (gb2_debug) { printf("reff initialized, mytaskid = %d\n", mytaskid); fflush(stdout); } #endif #pragma omp parallel \ private (i, xi, yi, zi, ri, ri1i, sumi, j, k, xij, yij, zij, \ r2, dij1i, dij, sj, sj2, uij, dij2i, tmpsd, dumbo, theta, \ threadnum, numthreads ) { /* * Get the thread number and the number of threads for multi-threaded * execution under OpenMP. For ScaLAPACK, use the process column and * the number of process columns, respectively. For single-threaded * execution and MPI there is no need to set these variables. */ #ifdef OPENMP threadnum = omp_get_thread_num(); numthreads = omp_get_num_threads(); #elif defined(SCALAPACK) threadnum = mycol; numthreads = npcol; #endif /* * Loop over all atoms i. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. * * OpenMP utilizes all threads. ScaLAPACK utilizes all processes * on the process grid. MPI is not parallelized. * * The reff array is written in the following loops. It is necessary * to synchronize the OpenMP threads that execute these loops after * loop execution so that a race condition does not exist for reading * the reff array before it is written. Even if all subsequent loops * use loop index to thread mapping that is identical to that of the * following loop, elements of the reff array are indexed by other * than such a loop indices, so synchronization is necessary. * * OpenMP synchronization is accomplished by the implied barrier * at the end of this parallel region. ScaLAPACK synchronization is * accomplished by MPI_Allreduce. */ for (i = 0; i < n; i++) { #if defined(OPENMP) || defined(SCALAPACK) if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue; #endif xi = x[3 * i ]; yi = x[3 * i + 1]; zi = x[3 * i + 2]; ri = rborn[i] - BOFFSET; ri1i = 1. / ri; sumi = 0.0; /* Select atom j from the pair list. Non-graceful error handling. */ for (k = 0; k < lpears[i] + upears[i]; k++) { if (pearlist[i] == NULL) { printf("NULL pair list entry in egb2 loop 1, mytaskid = %d\n", mytaskid); fflush(stdout); } j = pearlist[i][k]; #ifdef SCALAPACK if (gb2_debug) { if ( !myroc(j, BLOCKSIZE, nprow, myrow) ) { printf("pair list error!\n"); fflush(stdout); } } #endif xij = xi - x[3 * j ]; yij = yi - x[3 * j + 1]; zij = zi - x[3 * j + 2]; r2 = xij * xij + yij * yij + zij * zij; if ( r2 > rgbmaxpsmax2 ) continue; dij1i = 1.0 / sqrt(r2); dij = r2 * dij1i; sj = fs[j] * (rborn[j] - BOFFSET); sj2 = sj * sj; /* * The following equations are from the Appendix of Schaefer and Froemmel, * JMB 216:1045-1066, 1990. Taylor series expansion for d>>s is by Andreas * Svrcek-Seiler. Smooth rgbmax idea is from Andreas Svrcek-Seiler and * Alexey Onufriev. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { uij = 1./(dij - sj); sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij + rgbmax2i * (r2 - 4.0 * rgbmax * dij - sj2) - 2.0 * log(rgbmax * uij)); } else if ( dij > 4.0 * sj ) { dij2i = dij1i * dij1i; tmpsd = sj2 * dij2i; dumbo = TA + tmpsd*(TB + tmpsd*(TC + tmpsd*(TD + tmpsd*TDD))); sumi -= sj2 * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { sumi -= 0.5 * (sj / (r2 - sj2) + 0.5 * dij1i * log((dij - sj) / (dij + sj))); } else if ( dij > fabs(ri - sj) ) { theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2); uij = 1. / (dij + sj); sumi -= 0.25 * (ri1i * (2. - theta) - uij + dij1i * log(ri * uij)); } else if ( ri < sj ) { sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i + 0.5 * dij1i * log((sj - dij) / (sj + dij))); } if (gbsa == 1) { if ((P0[i] + P0[j]) > dij) { if (P0[i] > 2.5 && P0[j] > 2.5) { ineighbor[count] = j + 1; /* this +1 is VERY important */ count += 1; } } } } #define GBALPHA 0.8 #define GBBETA 0.0 #define GBGAMMA 2.909125 /* * For ScaLAPACK store sums instead of reciprocal sums * because more terms will be contributed to each sum * from other process columns via the subsequent reduction. */ #ifdef SCALAPACK if ( gb==1 ) { /* "standard" (HCT) effective radii: */ reff[i] = sumi; } else { /* "gbao" formulas: */ psi[i] = -ri*sumi; reff[i] = -tanh((GBALPHA + GBGAMMA*psi[i]*psi[i])*psi[i])/rborn[i]; } #else if ( gb==1 ) { /* "standard" (HCT) effective radii: */ reff[i] = 1.0/(ri1i + sumi); } else { /* "gbao" formulas: */ psi[i] = -ri*sumi; reff[i] = ri1i - tanh( (GBALPHA + GBGAMMA*psi[i]*psi[i]) *psi[i])/rborn[i]; } #endif } } if (gb2_debug) { printf("reff calculated, mytaskid = %d\n", mytaskid); fflush(stdout); } /* * The ScaLAPACK synchronization is accomplished via reduction * of the reff array. * * No reduction is accomplished for MPI because egb2 is not * parallelized for MPI. */ #ifdef SCALAPACK ierror = MPI_Allreduce(reff, reductarr, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb reff reduction, error = %d myrow = %d mycol = %d\n", ierror, myrow, mycol); } if (gb2_debug) { printf("MPI_Allreduce reff, mytaskid = %d\n", mytaskid); fflush(stdout); } for (i = 0; i < n; i++) { reff[i] = reductarr[i]; if (reff[i] == 0.0) { printf("reff[%d]=0!, myrow = %d mycol = %d\n", i, myrow, mycol); fflush(stdout); } } /* For ScaLAPACK reciprocate the reduced sums in reff. */ for (i = 0; i < n; i++) { ri = rborn[i] - BOFFSET; ri1i = 1. / ri; reff[i] = 1.0 / (ri1i + reff[i]); } #endif } else { /* * Here is the second replication of the code, always executed serially. * Loop over all atoms i. */ for (i = 0; i < n; i++) { xi = x[3 * i ]; yi = x[3 * i + 1]; zi = x[3 * i + 2]; ri = rborn[i] - BOFFSET; ri1i = 1. / ri; sumi = 0.0; /* Select atom j from the pair list. Non-graceful error handling. */ for (k = 0; k < lpears[i] + upears[i]; k++) { if (pearlist[i] == NULL) { printf("NULL pair list entry in egb2 loop 2, taskid %d\n", mytaskid); fflush(stdout); } j = pearlist[i][k]; xij = xi - x[3 * j ]; yij = yi - x[3 * j + 1]; zij = zi - x[3 * j + 2]; r2 = xij * xij + yij * yij + zij * zij; if ( r2 > rgbmaxpsmax2 ) continue; dij1i = 1.0 / sqrt(r2); dij = r2 * dij1i; sj = fs[j] * (rborn[j] - BOFFSET); sj2 = sj * sj; /* * The following equations are from the Appendix of Schaefer and Froemmel, * JMB 216:1045-1066, 1990. Taylor series expansion for d>>s is by Andreas * Svrcek-Seiler. Smooth rgbmax idea is from Andreas Svrcek-Seiler and * Alexey Onufriev. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { uij = 1./(dij - sj); sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij * uij + rgbmax2i * (r2 - 4.0 * rgbmax * dij - sj2) - 2.0 * log(rgbmax * uij)); } else if ( dij > 4.0 * sj ) { dij2i = dij1i * dij1i; tmpsd = sj2 * dij2i; dumbo = TA + tmpsd*(TB + tmpsd*(TC + tmpsd*(TD + tmpsd*TDD))); sumi -= sj2 * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { sumi -= 0.5 * (sj / (r2 - sj2) + 0.5 * dij1i * log((dij - sj) / (dij + sj))); } else if ( dij > fabs(ri - sj) ) { theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2); uij = 1. / (dij + sj); sumi -= 0.25 * (ri1i * (2. - theta) - uij + dij1i * log(ri * uij)); } else if ( ri < sj ) { sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i + 0.5 * dij1i * log((sj - dij) / (sj + dij))); } if (gbsa == 1) { if ((P0[i] + P0[j]) > dij) { if (P0[i] > 2.5 && P0[j] > 2.5) { ineighbor[count] = j + 1; /* this +1 is VERY important */ count += 1; } } } } if ( gb==1 ){ /* "standard" (HCT) effective radii: */ reff[i] = 1.0/(ri1i + sumi); } else { /* "gbao" formulas: */ #define GBALPHA 0.8 #define GBBETA 0.0 #define GBGAMMA 2.909125 psi[i] = -ri*sumi; reff[i] = 1.0 / (ri1i - tanh( (GBALPHA + GBGAMMA*psi[i]*psi[i]) *psi[i])/rborn[i] ); } /* gbsa == 1 */ ineighbor[count] = 0; count = count + 1; #if 0 reff[i] = rborn[i] > reff[i] ? rborn[i] : reff[i]; if (gb2_debug) { printf("%d\t%15.7f\t%15.7f\n", i + 1, rborn[i], reff[i]); } #endif } if (gb2_debug) { printf("reff calculated\n"); fflush(stdout); } } /* * Main LCPO stuff follows. No second derivative support for this. * Also no parallelization for OpenMP or ScaLAPACK due to the loop-carried * dependence of the 'count' variable. */ if (gbsa == 1) { totsasa = 0.0; count = 0; for (i = 0; i < n; i++) { if (ineighbor[count] == 0) { count = count + 1; } else { si = PIx4 * P0[i] * P0[i]; sumAij = 0.0; sumAjk = 0.0; sumAjk2 = 0.0; sumAijAjk = 0.0; sumdAijddijdxi = 0.0; sumdAijddijdyi = 0.0; sumdAijddijdzi = 0.0; sumdAijddijdxiAjk = 0.0; sumdAijddijdyiAjk = 0.0; sumdAijddijdziAjk = 0.0; icount = count; L70:j = ineighbor[count] - 1; /* mind the -1 , fortran again */ xi = x[3 * i ]; yi = x[3 * i + 1]; zi = x[3 * i + 2]; xj = x[3 * j ]; yj = x[3 * j + 1]; zj = x[3 * j + 2]; r2 = (xi-xj) * (xi-xj) + (yi-yj) * (yi-yj) + (zi-zj) * (zi-zj); dij1i = 1. / sqrt(r2); rij = r2 * dij1i; tmpaij = P0[i] - rij * 0.5 - (P0[i] * P0[i] - P0[j] * P0[j]) * 0.5 * dij1i; Aij = PIx2 * P0[i] * tmpaij; dAijddij = PIx1 * P0[i] * (dij1i * dij1i * (P0[i] * P0[i] - P0[j] * P0[j]) - 1.0); dAijddijdxj = dAijddij * (xj - xi) * dij1i; dAijddijdyj = dAijddij * (yj - yi) * dij1i; dAijddijdzj = dAijddij * (zj - zi) * dij1i; sumAij = sumAij + Aij; count2 = icount; sumAjk2 = 0.0; sumdAjkddjkdxj = 0.0; sumdAjkddjkdyj = 0.0; sumdAjkddjkdzj = 0.0; p3p4Aij = -surften * (P3[i] + P4[i] * Aij); L80:k = ineighbor[count2] - 1; /*same as above, -1 ! */ if (j == k) goto L85; xk = x[3 * k]; yk = x[3 * k + 1]; zk = x[3 * k + 2]; rjk2 = (xj - xk) * (xj - xk) + (yj - yk) * (yj - yk) + (zj - zk) * (zj - zk); djk1i = 1.0 / sqrt(rjk2); rjk = rjk2 * djk1i; if (P0[j] + P0[k] > rjk) { vdw2dif = P0[j] * P0[j] - P0[k] * P0[k]; tmpajk = 2.0 * P0[j] - rjk - vdw2dif * djk1i; Ajk = PIx1 * P0[j] * tmpajk; sumAjk = sumAjk + Ajk; sumAjk2 = sumAjk2 + Ajk; dAjkddjk = PIx1 * P0[j] * djk1i * (djk1i * djk1i * vdw2dif - 1.0); dAjkddjkdxj = dAjkddjk * (xj - xk); dAjkddjkdyj = dAjkddjk * (yj - yk); dAjkddjkdzj = dAjkddjk * (zj - zk); f[3 * k] = f[3 * k] + dAjkddjkdxj * p3p4Aij; f[3 * k + 1] = f[3 * k + 1] + dAjkddjkdyj * p3p4Aij; f[3 * k + 2] = f[3 * k + 2] + dAjkddjkdzj * p3p4Aij; sumdAjkddjkdxj = sumdAjkddjkdxj + dAjkddjkdxj; sumdAjkddjkdyj = sumdAjkddjkdyj + dAjkddjkdyj; sumdAjkddjkdzj = sumdAjkddjkdzj + dAjkddjkdzj; } L85:count2 = count2 + 1; if (ineighbor[count2] != 0) { goto L80; } else { count2 = icount; } sumAijAjk = sumAijAjk + Aij * sumAjk2; sumdAijddijdxi = sumdAijddijdxi - dAijddijdxj; sumdAijddijdyi = sumdAijddijdyi - dAijddijdyj; sumdAijddijdzi = sumdAijddijdzi - dAijddijdzj; sumdAijddijdxiAjk = sumdAijddijdxiAjk - dAijddijdxj * sumAjk2; sumdAijddijdyiAjk = sumdAijddijdyiAjk - dAijddijdyj * sumAjk2; sumdAijddijdziAjk = sumdAijddijdziAjk - dAijddijdzj * sumAjk2; lastxj = dAijddijdxj * sumAjk2 + Aij * sumdAjkddjkdxj; lastyj = dAijddijdyj * sumAjk2 + Aij * sumdAjkddjkdyj; lastzj = dAijddijdzj * sumAjk2 + Aij * sumdAjkddjkdzj; dAidxj = surften * (P2[i] * dAijddijdxj + P3[i] * sumdAjkddjkdxj + P4[i] * lastxj); dAidyj = surften * (P2[i] * dAijddijdyj + P3[i] * sumdAjkddjkdyj + P4[i] * lastyj); dAidzj = surften * (P2[i] * dAijddijdzj + P3[i] * sumdAjkddjkdzj + P4[i] * lastzj); f[3 * j] = f[3 * j] + dAidxj; f[3 * j + 1] = f[3 * j + 1] + dAidyj; f[3 * j + 2] = f[3 * j + 2] + dAidzj; count = count + 1; if (ineighbor[count] != 0) { goto L70; } else { count = count + 1; } Ai = P1[i] * si + P2[i] * sumAij + P3[i] * sumAjk + P4[i] * sumAijAjk; dAidxi = surften * (P2[i] * sumdAijddijdxi + P4[i] * sumdAijddijdxiAjk); dAidyi = surften * (P2[i] * sumdAijddijdyi + P4[i] * sumdAijddijdyiAjk); dAidzi = surften * (P2[i] * sumdAijddijdzi + P4[i] * sumdAijddijdziAjk); f[3 * i] = f[3 * i] + dAidxi; f[3 * i + 1] = f[3 * i + 1] + dAidyi; f[3 * i + 2] = f[3 * i + 2] + dAidzi; totsasa = totsasa + Ai; /* printf("totsasa up to %d %f\n",i,totsasa); */ } } /* printf("SASA %f , ESURF %f \n",totsasa,totsasa*surften); */ esurf = totsasa * surften; } /* * Compute the generalized Born energy for all pair interactions, as * well as the Van der Waals and Coulombic energy for all pairs that * are not on the excluded list. Compute the 1st and 2nd derivatives * of the of the Van der Waals and Coulombic energy for all non-excluded * pairs. * * In addition, compute the 1st derivative of the Born energy for all * pairs of atoms (selected by the indices i and j). David Case and * his colleagues have shown that the first derivative for an atom pair * may be calculated by first building the following equations in * terms of Ri (the effective radius of atom i), Rj (the effective * radius of atom j) and Dij (the distance between atoms i and j): * * rb2 = Ri * Rj * * efac = exp(Dij^2 / (4 * rb2)) * * fgbi = 1 / sqrt(Dij^2 + rb2 * efac) * * fgbk = -kappa * KSCALE / fgbi * * expmkf = exp(fgbk) / DIELECTRIC_CONSTANT * * dielfac = 1 - expmkf * * temp4 = fgbi * fgbi * fgbi * * temp6 = Qi * Qj * temp4 * (dielfac + fgbk * expmkf) * * temp5 = efac * temp6 * (rb2 + 0.25 * r2) / 2 * * From the above equations it is possible to express the first * derivative of the Born energy with respect to any cartesian coordinate * (designated by Xl) by summing contributions from the three following * equations: * * 1. d(Dij)/d(Xl) * A(Dij) where * * A(Dij) = (1 - efac / 4) * temp6 * * 2. d(Ri)/d(Xl) * B(Dij) where * * B(Dij) = Ri * temp5 * * 3. d(Rj)/d(Xl) * C(Dij) where * * C(Dij) = Rj * temp5 * * Equation 1 is the simplest of the three to evaluate. One calculates * all of the derivatives of Dij with respect to cartesian coordinates, * multiplies these derivatives by A(Dij), and sums the resulting products * to the gradient vector. There are six derivatives involving atoms * i and j: * * d(Dij)/d(Xi), d(Dij)/d(Yi), d(Dij)/d(Zi) * * d(Dij)/d(Xj), d(Dij)/d(Yj), d(Dij)/d(Zj) * * The derivatives of Dij with respect to the cartesian coordinates * contain a factor of Dij in the denominator. This factor is absorbed * by equation 1 (and all of the other equations that incorporate the * derivatives) rather than carried with the individual derivatives. * * Equations 2 and 3 are more complicated because the derivatives of the * effective radii Ri and Rj with respect to the cartesian coordinates are * sometimes series, but sometimes single terms. For example: * * d(Ri)/d(Xi) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xi) * * + d[f(Dik)]/d(Dik) * d(Dik)/d(Xi) + ... * * + d[f(Din)]/d(Din) * d(Din)/d(Xi) * * d(Ri)/d(Xj) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xj) * * d(Rj)/d(Xj) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xj) * * + d[f(Djk)]/d(Djk) * d(Djk)/d(Xj) + ... * * + d[f(Djn)]/d(Djn) * d(Djn)/d(Xj) * * d(Rj)/d(Xi) = d[f(Dij)]/d(Dij) * d(Dij)/d(Xi) * * where n is the number of atoms and f() designates a function of the * interatomic distance. * * Moreover, when all of the atom pairs are considered, one can refactor * the computation to obtain terms such as: * * d(Ri)/d(Xi) * [B(Dij) + B(Dik) + ... + B(Din)] * * d(Rj)/d(Xj) * [C(Dij) + C(Djk) + ... + C(Djn)] * * The square brackets [] that B(Dij) + B(Dik) + ... + B(Din) are summed * prior to multiplication by the derivatives of the effective radius. * * David Case and his colleagues have discovered an algorithm that permits * evaluation of the first derivatives while requiring that the first * derivatives of the effective radii be evaluated once only. They * use an array called "sumdeijda" into which the above series are summed. * For example, the series B(Dij) + B(Dik) + ... + B(Din) is summed into * sumdeijda[i], and the series C(Dij) + C(Djk) + ... + C(Din) is summed * into sumdeijda[j]. One can think of the contents of sumdeijda[i] * as the sum of everything that gets multiplied by d(Ri)/d(Xl), where * d(Ri)/d(Xl) represents all of the derivatives of Ri with respect to the * cartesian coordinates. Similarly, one can think of the contents of * sumdeijda[j] as the sum of everything that gets multiplied by d(Rj)/d(Xl), * where d(Rj)/d(Xl) represents all of the derivatives of Rj with respect * to the cartesian coordinates. * * Once the sumdeijda array is loaded, the d(Ri)/d(Xl) and d(Rj)/d(Xl) are * calculated and multiplied by the respective elements of sumdeijda. For * example, for d(Ri)/d(Xl) the individual terms of the d(Ri)/d(Xi) series * are calculated and stored into another array called da. Each term has * an x-, y- and z-component. The components of the jth term are: * * d[f(Dij)]/d(Dij) * d(Dij)/dXi * * d[f(Dij)]/d(Dij) * d(Dij)/dYi * * d[f(Dij)]/d(Dij) * d(Dij)/dZi * * d[f(Dij)]/d(Dij) * d(Dij)/dXj * * d[f(Dij)]/d(Dij) * d(Dij)/dYj * * d[f(Dij)]/d(Dij) * d(Dij)/dZj * * Note that the jth term contains derivatives with respect to the cartesian * coordinates of atoms i and j. The derivatives with respect to the cartesian * coordinates of atom j are stored in da[j]. The derivatives with respect to * the cartesian coordinates of atom i are summed into da[i] with the result * that da[i] contains derivatives from f(Dij), f(Dik), ..., f(Din). * * Each element of the da array is then multiplied by sumdeijda[i] to form: * * [ d(Rij)/d(Xl) + d(Rik)/d(Xl) + ... ] * [f(Dij) + f(Dik) + ... ] * where the series d(Rij)/d(Xl) + d(Rik)/d(Xl) + ... contains n terms each * of which has an (x,y,z) coordinate. Therefore this result is a vector of * length 3*n where n is the number of atoms. This vector represents * the derivatives of Ri with respect to all of the cartesian coordinates. * It is summed into the gradient vector. * * The process is then repeated for Rj, Rk, ..., Rn. * * There is one further refinement that relates to the first derivatives * of the "diagonal" terms, i.e., those energy terms that are functions * of the effective radius Ri but not of the interatomic distance Dij. * One begins by building up the following equations: * * expmkf = exp(-kappa * KSCALE * Ri) / DIELECTRIC_CONSTANT * * dielfac = 1 - expmkf * * qi2h = Qi^2 / 2 * * qid2h = di2h * dielfac * * Then the first derivative with respect to Ri is the equation: * * 4. d(Ri)/d(Xl) * R(Dij) where * * R(Dij) = qid2h - kappa * KSCALE * qi2h * expmkf * Ri * * * Equation 4, like equation 2, is multiplied by d(Ri)/d(Xl) and * therefore one can add these two equations and store the result * in the ith element of the sumdeijda array. * * Note that equations 2, 3 and 4 include a factor of either -Ri*Ri * or -Rj*Rj which arises from the derivative of Ri or Rj, respectively. * The negation is introduced when sumda is multiplied by an element * from the dreff array. * * The above algorithm may be extended to the calculation of the second * derivatives. One proceeds by first building the following equations * in terms of Ri, Rj and Dij: * * RiInv = 1 / Ri * * RjInv = 1 / Rj * * RiRjInv = RiInv * RjInv * * temp4 = fgbi * fgbi * fgbi * * temp5 = Qi * Qj * temp4 * (dielfac + fgbk * expmkf) / 2 * * temp6 = fgbi * fgbi * temp5 * * temp7 = Qi * Qj * kappa^2 * KSCALE^2 * temp4 * expmkf * * temp8 = 1 - efac / 4 * * temp9 = temp7 / 4 - 3 * temp6 / 2 * * temp10 = Dij^2 / 4 * * temp11 = Ri * Rj + temp10 * * temp12 = Ri^2 * Rj^2 * * temp13 = Ri * temp11 * * temp14 = Rj * temp11 * * temp15 = temp10 * RjInv * * temp16 = temp10 * RiInv * * temp17 = temp9 * temp13 * efac * * temp18 = temp9 * temp14 * efac * * temp20 = temp8 * temp9 * * temp21 = temp20 * temp13 * * temp22 = temp20 * temp14 * * temp23 = 2 * efac; * * * From the above equations it is possible to express the second * derivative of the Born energy with respect to two cartesian coordinates * (designated by Xl and Xm) by summing contributions from the following * equations: * * 5. d(Ri)/d(Xm) * [ d(Dij)/d(Xl) * D(Dij) ] where * * D(Dij) = temp23 * (temp21 - temp15 * temp5 / 4) * * * 6. d(Rj)/d(Xm) * [ d(Dij)/d(Xl) * E(Dij) ] where * * E(Dij) = temp23 * (temp22 - temp16 * temp5 / 4) * * * 7. d(Ri)/d(Xl) * [ d(Dij)/d(Xm) * J(Dij) ] where * * J(Dij) = temp23 * ((Ri - RjInv * temp11) * temp5 / 4 + temp21) * * * 8. d(Rj)/d(Xl) * [ d(Dij)/d(Xm) * N(Dij) ] where * * N(Dij) = temp23 * ((Rj - RiInv * temp11) * temp5 / 4 + temp22) * * * 9. d(Ri)/d(Xm) * [ d(Ri)/d(Xl) * H(Dij) ] where * * H(Dij) = efac * (temp10 * (temp13 * RjInv - Ri^2) * temp5 + temp13 * temp17) * * * 10. d(Rj)/d(Xm) * [ d(Rj)/d(Xl) * M(Dij) ] where * * M(Dij) = efac * (temp10 * (temp14 * RiInv - Rj^2) * temp5 + temp14 * temp18) * * * 11. d(Rj)/d(Xm) * [ d(Ri)/d(Xl) * I(Dij) ] where * * I(Dij) = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18) * * * 12. d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * L(Dij) ] where * * L(Dij) = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17) * * * 15. d2(Dij)/d(Xm)d(Xl) * F(Dij) where * * F(Dij) = 2 * temp8 * temp5 * * * 16. d(Dij)/d(Xm) * [ d(Dij)/d(Xl) * G(Dij) ] where * * G(Dij) = 4 * temp8 * temp20 * * + (2 * temp8 + efac * temp10 * RiRjInv) * temp5 / Dij^2 * * * Using the above equations 5-16 it is possible to calculate the * second derivatives of the Born energy in an optimized manner. * Note, however, that computing the second derivatives of the effective * radius with respect to the cartesian coordinates involves some * complexity. This fact can be appreciated by beginning with the * following expression for the effective radius Ri of atom i: * * * Ri = 1 / [1/Rborn(i) + f(Dij) + f(Dik) + ... + f(Din)] * * * where Rborn(i) represents the Born radius of atom i and f() respesents * a function of the interatomic distance. Differentiating with respect to * a cartesian coordinate Xl gives: * * * d(Ri)/d(Xl) = -Ri^2 * { d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) + * * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... + * * d[f(Din)]/d(Din) * d(Din)/d(Xl) } * * * The above expression is the general case. Many of the terms are * zero, for example, d(Dij)/d(Xl) is zero when l equals neither i nor j. * * Differentiating again with respect to a cartesian coordinate Xm gives: * * * d2(Ri)/d(Xm)d(Xl) = * * - Ri^2 * { d[f(Dij)]/d(Dij) * d2(Dij)/d(Xm)d(Xl) + * * d[f(Dik)]/d(Dik) * d2(Dik)/d(Xm)d(Xl) + ... + * * d[f(Din)]/d(Din) * d2(Din)/d(Xm)d(Xl) } * * * - Ri^2 * { d2[f(Dij)]/d(Dij)2 * d(Dij)/d(Xm) * d(Dij)/d(Xl) + * * d2[f(Dik)]/d(Dik)2 * d(Dik)/d(Xm) * d(Dik)/d(Xl) + ... + * * d2[f(Din)]/d(Din)2 * d(Din)/d(Xm) * d(Din)/d(Xl) } * * * + 2Ri^3 * { d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) + * * d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... + * * d[f(Dij)]/d(Dij) * d(Dij)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) + * * * d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) + * * d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... + * * d[f(Dik)]/d(Dik) * d(Dik)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) + ... + * * * d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Dij)]/d(Dij) * d(Dij)/d(Xl) + * * d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Dik)]/d(Dik) * d(Dik)/d(Xl) + ... + * * d[f(Din)]/d(Din) * d(Din)/d(Xm) * d[f(Din)]/d(Din) * d(Din)/d(Xl) } * * * Many of the terms in the above expression are zero, or example, when neither * l nor m equals i or j. However, many non-zero terms remain, and in particular * there exists a large number of the terms that are multiplied by 2Ri^3. But * fortunately these particular terms may be computed by simply multiplying * the derivative of the Ri with respect to all cartesian coordinates by itself. * * As can be appreciated from the above analysis, differentiation gives rise to * factors of -Ri^2, Ri^3, -Rj^2 and Rj^3. A factor of Ri^2 or Rj^2 is included * equations 5-16, as well as equations 17-18 (below). * * Square brackets [ ] have been added to many of equations 5-18 * to indicate the order of evaluation. The derivative inside of the * brackets will be called the inner derivative. The derivative outside * of the brackets will be called the outer derivative. The inner * derivative will be evaluated prior to the outer derivative. The * inner derivative that is multiplied by the accompanying functions * of Dij, e.g., H(Dij) of equation 7, produces a gradient vector. * * Equations 15 and 16 are analogous to equation 1 and are simple to * evaluate because they do not involve derivatives of the effective * radii Ri and Rj and so do not comprise series. Instead, they comprise * the derivatives of Dij with respect to the cartesian coordinates * of atoms i and j only. * * Equations 13 and 14 are analogous to equations 2 and 3 except that * they involve second derivatives of the effective radii Ri and Rj instead * of first derivatives. However, the second derivatives of the effective * radius Ri include terms such as d2(Ri)/d(Xl)d(Xm) which affect many Hessian * elements when neither l nor m equals i. * * Equations 9, 10, 11 and 12 imply two applications of the approach used * to evaluate equations 2 and 3. However, the result of the multiplication * by the inner derivative (which is enclosed in [ ]) is not a single term * but rather is a gradient vector. Therefore, multiplication by the outer * derivative involves multiplying each derivative of Ri or Rj by all of the * terms in the gradient vector. Moreover, this process requires a * two-dimensional array, i.e., an array of gradient vectors which is named * either "sumdeijdg" or "sumdeijdh", and is analogous to the sumdeijda array * that is used to evaluate equations 2, 3 and 4. The order of evaluation of * the derivatives appears to be unimportant for equations 9 and 10, i.e., the * inner derivatives may be swappped with the outer derivatives without * affecting the results. * * However, there is a subtlety involving equations 11 and 12 which requires * that if the inner derivatives are swapped with the outer derivatives in * equation 11, they must be similarly swapped in equation 12. This subtlety * arises because the the sumdeijdg array is used to collect all inner * derivatives with respect to Xl that will subsequently be multiplied by * outer derivatives with respect to Xm An analogous sumdeijdh array is used * to collect all inner derivatives with respect to Xm that will subsequently * be multiplied by outer derivatives with respect to Xl. The inner derivatives * of equations 11 and 12 are stored in the sumdeijdg array, which implies * that for these equations the inner derivatives are taken with respect to Xl. * Swapping the inner and outer derivatives for both of these equations preserves * the symmetry of the Hessian matrix, but swapping the inner and outer derivatives * for only one of these equations breaks the symmetry and leads to erroneous * results. For example, if one were to swap the inner and outer derivatives * in equation 11, but not in equation 12, the following pair of equations would * obtain: * * 11'. d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * I(Dij) ] * * 12. d(Ri)/d(Xm) * [ d(Rj)/d(Xl) * L(Dij) ] * * This pair of equations is not symmetric and is therefore incorrect. * * Equations 5, 6, 7 and 8 are evaluated by first multiplying by * an inner derivative of Dij and then by multiplying by an outer * derivative of Ri or Rj. The inner derivatives of Dij are * collected into either the sumdeijdg array or sumdeijdh array. * The derivatives of a particular Dij produce a gradient vector * that is stored in one row of either sumdeijdg or sumdeijdh. * Subsequent multiplication by the outer derivative of Ri or Rj * involves multiplying each derivative of Ri or Rj by all of the * terms in the gradient vector. This process involves * a subtlety. In equations 5 and 6 the inner derivative is with * respect to Xl whereas in equations 7 and 8 the inner derivative * is with respect to Xm. For this reason, the inner derivatives * of equations 5 and 6 are collected into the sumdeijdg array, * whereas the inner derivatives of equations 7 and 8 are collected * into the sumdeijdh array, following the convention that was * discussed above for equations 11 and 12. The outer derivatives * taken with respect to Xm are computed using the sumdeijdg array, * whereas the outer derivatives taken with respect to Xl are computed * using the sumdeijdh array. These two sets of derivatives are * written into transpose-related elements of the Hessian matrix in * order to preserve the symmetry of the Hessian. * * On another topic, it is possible to generate equations for the second * derivatives of the non-pairwise ("diagonal") terms. One begins by * building up the following equations: * * expmkf = exp(-kappa * KSCALE * Ri) / DIELECTRIC_CONSTANT * * dielfac = 1 - expmkf * * temp18 = Ri * (kappa * KSCALE * expmkf * Ri * (2 + kappa * KSCALE * Ri) * * - 2 * dielfac) * * * Using the above equations it is possible to calculate the second * derivatives of the non-pairwise terms via the two following equations: * * 17. d2(Ri)/d(Xm)d(Xl) * R(Dij) where R(Dij) is defined in equation 4. * * * 18. d(Ri)/d(Xm) * [ d(Ri)/d(Xl) * P(Dij) ] where * * P(Dij) = Qi^2 * temp18 / 2 * * Equations 17 and 18 are handled in a manner analogous to equation 4. * * * Calculate the Born, van der Waals and Coulombic energies. Complete * equations 15 and 16, and set up the remaining equations. For OpenMP * in this collection of loops sum data relative only to atom i or to * pairs ii and ij so that OpenMP can be used to parallelize the outer loop. * For ScaLAPACK sum data relative to atoms i and j, as well as to pairs * ii and jj. When in single-threaded mode sum data relative to atoms * i and j, as well as to pairs ii, ij, ji and jj. * * Initialize the energy by all OpenMP threads or ScaLAPACK processes. */ epol = elec = evdw = 0.0; #pragma omp parallel reduction (+: epol, elec, evdw) \ private (i, i3, in, in3, xi, yi, zi, qi, qj, ri, ri2, riinv, expmkf, dielfac, \ qi2h, qid2h, temp17, temp18, p_dij, r_dij, iaci, j, j3, jn, jn3, \ xij, yij, zij, r2, r2inv, qiqj, rj, rj2, rjinv, rirjinv, rb2, efac, \ fgbi, fgbk, temp4, temp6, a_dij, temp5, b_dij, c_dij, di, dj, d2ii, d2ij, \ k, l, df, dg, rinv, r4inv, eel, ic, r6inv, f6, f12, dedx, dedy, dedz, \ temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15, \ temp16, temp20, temp21, temp22, temp23, d_dij, e_dij, f_dij, \ g_dij, h_dij, i_dij, j_dij, l_dij, m_dij, n_dij, \ npairs, xj, yj, zj, thi, threadnum, numthreads, \ d2jj, ri1i, temp1, dij3i, dx, dy, dz, m, iexw, \ datmp, tmpsd, dumbo, temp2, da2tmp, sj, sj2, dij1i, dij2i, dij, \ dxj, dyj, dzj, sumda, ijterm) { /* * Get the thread number and the number of threads for multi-threaded * execution under OpenMP. For ScaLAPACK, use the process column and * the number of process columns, respectively. These variables are * not used for MPI or single-threaded execution. */ #ifdef OPENMP threadnum = omp_get_thread_num(); numthreads = omp_get_num_threads(); #elif defined(SCALAPACK) threadnum = mycol; numthreads = npcol; #endif /* * For OpenMP, initialize the sumdeijda, sumdeijdg, sumdeijdh, * sumdeijdn and dreff arrays inside of the parallel region. * * The "first touch" memory allocation strategy will locate * the elements of these arrays that are initialized by * a particular CPU in memory that is local to that CPU. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. * * It is not necessary to synchronize OpenMP threads following * this loop because the next loops use identical loop index to * thread mapping. Hence the sumdeijda array is guaranteed to * be initialized above before it is updated below. * * For ScaLAPACK, each process maintains a full copy of the sumdeijda * and grad arrays, so initialize the entirety of these arrays for each * task. Note that because each process maintains a full copy of these * arrays they may be updated in i,j loops only and not in j,i loops. * * For ScaLAPACK initialize as well the gg, sg, sh, sr and sn vectors. * * Because at present MPI_Allreduce uses the default MPI_COMM_WORLD, * initialization must be performed for all tasks, not just the tasks * on the process grid. * * Each process on the process grid maintains a submatrix of the * distributed sumdeijdg, sumdeijdh, sumdeijdn and dreff matrices, * so initialize the submatrix that is maintained by each process. */ #ifdef SCALAPACK for (i = 0; i < n; i++) { sumdeijda[i] = sn[i] = 0.0; } for (i = 0; i < n3; i++) { grad[i] = sg[i] = sh[i] = sr[i] = 0.0; } for (i = 0; i < 3*n3; i++) { gg[i] = 0.0; } if (context_PxQ >= 0) { for (i = 0; i < super_NxN->size; i++) { sumdeijdn[i] = 0.0; } for (i = 0; i < super_Nx3N->size; i++) { sumdeijdg[i] = 0.0; } for (i = 0; i < super_Nx3N->size; i++) { sumdeijdh[i] = 0.0; } for (i = 0; i < super_Nx3N->size; i++) { dreff[i] = 0.0; } } if (gb2_debug) { printf("arrays initialized\n"); fflush(stdout); } #else #ifndef ROW_CYCLIC #pragma omp for #endif for (i = 0; i < n; i++) { #if defined(ROW_CYCLIC) && defined(OPENMP) if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue; #endif in = n * i; in3 = 3 * in; sumdeijda[i] = 0.0; for (j = 0; j < n; j++) { sumdeijdn[in + j] = 0.0; } for (j = 0; j < n3; j++) { sumdeijdg[in3 + j] = sumdeijdh[in3 + j] = dreff[in3 + j] = 0.0; } } #endif /* * Allocate and initialize the iexw array used for skipping excluded * atoms. Note that because of the manner in which iexw is used, it * is necessary to initialize it before only the first iteration of * the following loop. */ iexw = ivector(-1, n); for (i = -1; i < n; i++) { iexw[i] = -1; } /* * Loop over all atoms i. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. * * For OpenMP the threads are arranged in a row cyclic manner so each * iteration of the following loops is assigned to a different thread. * * For ScaLAPACK the processes are arranged in a block cyclic manner * so each iteration of the following loops is assigned to all process * columns of a given process row. The mapping of iterations onto * process rows is accomplished such that the submatrices maintained * by the processes of the row are indexed by the global matrix row * index that is specified by the loop index. Before a matrix element * is updated, a check is performed to select the correct process * column as well. * * The Hessian matrix g has dimensionality 3Nx3N. The gradient * vector f has dimensionality 3Nx1. The sumdeijdn array has * dimensionality NxN. The sumdeijdg, sumdeijdh and dreff arrays * have dimensionality Nx3N. To facilitate selection of the correct * process row and column for all of these matrices, the block * dimensions have been chosen to match the dimensionality, that is, * a dimensionality of 3N requires a block dimension of 3*BLOCKSIZE, * whereas a dimensionality of N requires a block dimension of * BLOCKSIZE. This approach permits the use of the bs variable * instead of the bs3 variable in the calls to the myroc function * below. * * It is not necessary to synchronize OpenMP threads following * this loop because the next loop uses identical loop index to * thread mapping. Hence there will be no race condition associated * with updates of the sumdeijda, sumdeijdn, sumdeijdg, sumdeijdh, * gradient and Hessian arrays. In particlar, this (i,j) loop * updates i elements of the sumdeijda and f gradient vectors, and * (i,i) and (i,j) elements of the sumdeijdg, sumdeijdh and Hessian * matrices, whereas the next (j,i) loop updates j elements of the * sumdeijda and f gradient vectors,and (j,j) and (j,i) elements * of the sumdeijdg, sumdeijdh and Hessian matrices, so each loop * updates the same array elements. * * For ScaLAPACK this loop updates i and j elements of the sumdeijda * and (grad) gradient vectors, so there is no need to update these * vectors in the next (j,i) loop. For ScaLAPACK this loop updates * matrices in the same manner as OpenMP, so the following (j,i) loop * is necessary. * * For MPI and single-threaded execution this loop updates all elements * of all vectors and matrices, so the following (j,i) loop is not * necessary. */ for (i = 0; i < n; i++) { #if defined(OPENMP) || defined(SCALAPACK) if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue; #endif in = n * i; qi = q[i]; ri = reff[i]; /* * If atom i is not frozen, compute the "diagonal" energy that * is a function of only the effective radius Ri but not of the * interatomic distance Dij. Compute also the contribution of * the diagonal energy term to the sum by which the derivatives * of Ri will be multiplied. */ if (!frozen[i]) { expmkf = exp(-KSCALE * (*kappa) * ri) / (*diel_ext); dielfac = 1.0 - expmkf; qi2h = 0.5 * qi * qi; qid2h = qi2h * dielfac; /* * For ScaLAPACK, update scalars such as epol by only one * process row (such as row 0) of a given process column * in this i-loop, but update scalars by all process rows * in the i,j and j,i nested loops. */ #ifdef SCALAPACK if (myrow == 0) { epol += -qid2h / ri; } #else epol += -qid2h / ri; #endif temp18 = ( (*kappa) * KSCALE * expmkf * ri * (2.0 + (*kappa) * KSCALE * ri) - 2.0 * dielfac ) * ri; p_dij = qi2h * temp18; r_dij = qid2h - KSCALE * (*kappa) * qi2h * expmkf * ri; /* * Each of the ScaLAPACK process rows performs the following * update to sumdeijda[i] because it occurs in the i-loop not * the j-loop. Therefore, disallow all but one update. */ #ifdef SCALAPACK if (myrow == 0) { sumdeijda[i] += r_dij; } #else sumdeijda[i] += r_dij; #endif /* * For ScaLAPACK, the diagonal elements of matrices * such as sumdeijdn are not updated; rather, an * associated vector such as sn is updated instead. * This approach is necessary because the diagonal * elements (i,i) and (j,j) receive contributions * from all (i,j) atom pairs; however, because the * pair list is distributed across process columns, * a process that calculates the contribution from * atom pair (i,j) will be able to access element * (i,j) of the matrix but that process may not be * able to access element (i,i) or (j,j). Hence * a vector such as sn exists in each process for * the purpose of updating the diagonal elements of * these matrices. * * Also, as discussed above for sumdeijda[i], disallow * all but one update of sn[i] in the i-loop. */ #ifdef SCALAPACK if (myrow == 0) { sn[i] += p_dij; } #else sumdeijdn[in + i] += p_dij; #endif } /* * Skip the pair calculations if there are no atoms j on the * pair list of atom i. */ npairs = upears[i]; if ( npairs <= 0 ) continue; i3 = 3 * i; in3 = 3 * in; xi = x[i3 ]; yi = x[i3 + 1]; zi = x[i3 + 2]; iaci = prm->Ntypes * (prm->Iac[i] - 1); ri2 = ri * ri; riinv = 1.0 / ri; /* Initialize the first derivative accumulators. */ dx = dy = dz = 0.0; /* * Expand the excluded atom list into the iexw array by storing i * at array address j. */ for (j = 0; j < prm->Iblo[i]; j++) { iexw[IexclAt[i][j] - 1] = i; } /* Select atom j from the pair list. Non-graceful error handling. */ for (m = lpears[i]; m < lpears[i] + npairs; m++) { if (pearlist[i] == NULL) { printf("NULL pair list entry in egb2 loop 3, taskid = %d\n", mytaskid); fflush(stdout); } j = pearlist[i][m]; j3 = 3 * j; jn = n * j; jn3 = 3 * jn; /* Continue computing the non-diagonal energy term. */ xij = xi - x[j3 ]; yij = yi - x[j3 + 1]; zij = zi - x[j3 + 2]; r2 = xij * xij + yij * yij + zij * zij; /* * Because index j is retrieved from the pairlist array it is * not constrained to a particular range of values; therefore, * the threads that have loaded the reff array must be * synchronized prior to the use of reff below. */ r2inv = 1.0 / r2; qiqj = qi * q[j]; rj = reff[j]; rj2 = rj * rj; rjinv = 1.0 / rj; rirjinv = riinv * rjinv; rb2 = ri * rj; efac = exp(-r2 / (4.0 * rb2)); fgbi = 1.0 / sqrt(r2 + rb2 * efac); fgbk = -(*kappa) * KSCALE / fgbi; /* * Calculate the "non-diagonal" energy term, i.e., the term that is a * function of the effective radius Ri and the interatomic distance Dij. */ expmkf = exp(fgbk) / (*diel_ext); dielfac = 1.0 - expmkf; epol += -qiqj * dielfac * fgbi; temp4 = fgbi * fgbi * fgbi; temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf); /* The following term is required by equation 1. */ a_dij = temp6 * (1.0 - 0.25 * efac); /* * Prepare to compute the 1st derivatives of the effective radii. * The following code performs the sums indicated by equations 2 and 3. * Compute the contribution of the non-diagonal energy term to the sum * by which the derivatives of Ri and Rj will be multiplied. For OpenMP * sum the contribution for the derivative of Ri only; otherwise, sum * the contribution for the derivatives of Ri and Rj. */ temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2); b_dij = ri * temp5; c_dij = rj * temp5; sumdeijda[i] += b_dij; #if !defined(OPENMP) && !defined(SCALAPACK) sumdeijda[j] += c_dij; #endif /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into five arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2ii[] for the second derivatives with respect to atom i * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Some useful symmetry obtains. The d2ii and d2ij arrays * are symmetric in that their lower and upper triangles are * the transposes of one another. Also, d2jj equals d2ii, and * d2ij is the negative of d2ii. * * Note: the additional factor of the interatomic distance Dij that * ought to be included in the denominator of the following equations * is provided by the absence of Dij in equations 15 and 16, as well as * by one less power of Dij in the denominator of the equations for the * derivatives of the Van der Waals and Coulombic energy terms. */ di[0] = xij; di[1] = yij; di[2] = zij; dj[0] = -xij; dj[1] = -yij; dj[2] = -zij; /* Load the upper triangle of d2ii. */ d2ii[0][0] = 1.0 - xij * xij * r2inv; d2ii[0][1] = -xij * yij * r2inv; d2ii[0][2] = -xij * zij * r2inv; d2ii[1][1] = 1.0 - yij * yij * r2inv; d2ii[1][2] = -yij * zij * r2inv; d2ii[2][2] = 1.0 - zij * zij * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangles of d2jj and d2ij. */ for (l = k; l < 3; l++) { #if !defined(OPENMP) && !defined(SCALAPACK) d2jj[k][l] = d2ii[k][l]; #endif d2ij[k][l] = -d2ii[k][l]; } /* Load the symmetric elements of d2ii, d2jj and d2ij. */ for (l = k+1; l < 3; l++) { d2ii[l][k] = d2ii[k][l]; #if !defined(OPENMP) && !defined(SCALAPACK) d2jj[l][k] = d2jj[k][l]; #endif d2ij[l][k] = d2ij[k][l]; } } /* * Compute the Van der Waals and Coulomb energy here because * if egb2 is called nbond2 is not called. Remember that the * derivatives of Dij with respect to x, y and z supply one more * power of 1/Dij but that power of 1/Dij is included in the * equations below, instead of with the derivatives of Dij. * * Compute the Van der Waals and Coulombic energies for only * those pairs which are not on the excluded atom list. Any * pair on the excluded atom list will have atom i stored at * address j of the iexw array. It is not necessary to reset * the elements of the iexw array to -1 between successive * iterations in i because an i,j pair is uniquely identified * by atom i stored at array address j. Thus for example, the * i+1,j pair would be stored at the same address as the i,j * pair but after the i,j pair were used. * * Initialize df here instead of within the following if statement * so that df is initialized unconditionally. */ df = 0.0; if (iexw[j] != i) { /* Initialize dg. */ dg = 0.0; rinv = 1. / sqrt(r2); r4inv = r2inv * r2inv; /* gas-phase Coulomb energy: */ eel = qiqj * rinv; elec += eel; df -= eel; dg += 2. * eel; /* Lennard-Jones energy: */ ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1; if( ic >= 0 ){ r6inv = r2inv * r2inv * r2inv; f6 = prm->Cn2[ic] * r6inv; f12 = prm->Cn1[ic] * r6inv * r6inv; evdw += f12 - f6; df += 6. * f6 - 12. * f12; dg += 156. * f12 - 42. * f6; } df *= r2inv; dg *= r4inv; /* * Calculate the second derivatives of Dij for the Van der Waals * and Coulombic interactions. This calculation uses df and dg * for these interactions only. * * When in single-threaded mode sum to the Hessian the ii, ij, ji * and jj terms. For OpenMP sum to the ii and ij terms only. * For ScaLAPACK sum to the ji term only and sum to the gg vector * instead of to the ii term. */ for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ijterm = df*d2ij[k][l] + dg*di[k]*dj[l]; #ifdef SCALAPACK gg[3*(i3 + k) + l] += df*d2ii[k][l] + dg*di[k]*di[l]; ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info); if (ptr != NULL) *ptr += ijterm; #else g[n3*(i3 + k) + i3 + l] += df*d2ii[k][l] + dg*di[k]*di[l]; g[n3*(i3 + k) + j3 + l] += ijterm; #endif #if !defined(OPENMP) && !defined(SCALAPACK) g[n3*(j3 + l) + i3 + k] += ijterm; g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #endif } } } /* * Add to df the contribution to the gradient vector from * equation 1 so that it will be added to the gradient vector * along with the contributions from the Van der Waals and * Coulombic terms. */ df += a_dij; /* * Complete the first derivatives of Dij for the Van der Waals * and Coulombic interactions, as well as for the component of * the first derivatives that is contributed by equation 1. * * Sum into the first derivative accumulators (dx, dy, dz) * the derivatives that are computed relative to the Cartesian * coordiantes of atom i. * * For ScaLAPACK sum into the gradient vector grad the derivative * that is computed relative to the Cartesian coordinates of atom * j. For all other cases except OpenMP sum this derivative into * the gradient vector f. */ dedx = df * xij; dedy = df * yij; dedz = df * zij; dx += dedx; dy += dedy; dz += dedz; #ifdef SCALAPACK grad[j3 ] -= dedx; grad[j3 + 1] -= dedy; grad[j3 + 2] -= dedz; #elif !defined(OPENMP) f[j3 ] -= dedx; f[j3 + 1] -= dedy; f[j3 + 2] -= dedz; #endif /* The following code sets up the sums for equations 5-16. */ temp5 = 0.5 * temp6; temp6 = fgbi * fgbi * temp5; temp7 = qiqj * (*kappa) * (*kappa) * KSCALE * KSCALE * temp4 * expmkf; temp8 = 1 - 0.25 * efac; temp9 = 0.25 * temp7 - 1.5 * temp6; temp10 = 0.25 * r2; temp11 = rb2 + temp10; temp12 = ri2 * rj2; temp13 = ri * temp11; temp14 = rj * temp11; temp15 = temp10 * rjinv; temp16 = temp10 * riinv; temp17 = temp9 * temp13 * efac; temp18 = temp9 * temp14 * efac; temp20 = temp8 * temp9; temp21 = temp20 * temp13; temp22 = temp20 * temp14; temp23 = 2.0 * efac; d_dij = temp23 * (temp21 - 0.25 * temp15 * temp5); e_dij = temp23 * (temp22 - 0.25 * temp16 * temp5); f_dij = 2.0 * temp8 * temp5; g_dij = 4.0 * temp8 * temp8 * temp9 + (2.0 * temp8 + efac * temp10 * rirjinv ) * r2inv * temp5; h_dij = efac * (temp10 * (temp13 * rjinv - ri2) * temp5 + temp13 * temp17); i_dij = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18); j_dij = temp23 * ((ri - rjinv * temp11) * 0.25 * temp5 + temp21); l_dij = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17); m_dij = efac * (temp10 * (temp14 * riinv - rj2) * temp5 + temp14 * temp18); n_dij = temp23 * ((rj - riinv * temp11) * 0.25 * temp5 + temp22); /* * Evaluate equations 15 and 16 and sum the results to the Hessian * matrix. These equations involve the derivatives d(Dij)/d(Xl), * d(Dij)/d(Xm) and d2(Dij)/d(Xl)d(Xm), which derivatives exist * only for values of k==i, k==j, l==i and l==j. * * When in single-threaded mode sum to the Hessian the ii, ij, ji * and jj terms. For OpenMP sum to the ii and ij terms only. * For ScaLAPACK sum to the ji term only and sum to the gg vector * instead of to the ii term. */ df = f_dij; dg = g_dij; for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ijterm = df*d2ij[k][l] + dg*di[k]*dj[l]; #ifdef SCALAPACK gg[3*(i3 + k) + l] += df*d2ii[k][l] + dg*di[k]*di[l]; ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info); if (ptr != NULL) *ptr += ijterm; #else g[n3*(i3 + k) + i3 + l] += df*d2ii[k][l] + dg*di[k]*di[l]; g[n3*(i3 + k) + j3 + l] += ijterm; #endif #if !defined(OPENMP) && !defined(SCALAPACK) g[n3*(j3 + l) + i3 + k] += ijterm; g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #endif } } /* * Setup the inner differentiations indicated by equations 9 and 12. * The column index selects the first derivative to be calculated, * i.e., d(Ri) or d(Rj). The row index is i because we are * setting up the outer multiplication by d(Ri). */ #ifdef SCALAPACK sn[i] += h_dij; ptr = ptr2f(sumdeijdn, super_NxN, j, i, &info); if (ptr != NULL) *ptr += i_dij; #else sumdeijdn[in + i] += h_dij; sumdeijdn[in + j] += l_dij; #endif /* * Setup the inner differentiations indicated by equations 10 and 11. * The column index selects the first derivative to be calculated, * i.e., d(Ri) or d(Rj). The row index is j because we are * setting up the outer multiplication by d(Rj). When in single- * threaded mode sum both terms. When in multi-threaded mode sum * neither term. */ #if !defined(OPENMP) && !defined(SCALAPACK) sumdeijdn[jn + i] += i_dij; sumdeijdn[jn + j] += m_dij; #endif /* * Perform the inner differentiation indicated by equations 5, 6, 7 and 8. * The row or first index is selected by i because we are setting up the * outer differentiation by the derivatives of Ri. The column or second * index is selected by i or j because the Dij term affects the derivatives * of atoms i and j. Both the sumdeijdg and sumdeijdh arrays are used * because equations 5 and 6 perform inner differentiation with respect to * Xl, whereas equations 7 and 8 perform inner differentiation with respect * to Xm. Distinguishing these two cases is accomplished by transposing * the Hessian addresses, as will be done in the final two calls to the * dgemm() function below. * * When in single-threaded mode sum into the sumdeigdg and sumdeijdh arrays * all terms. When in multi-threaded mode sum only those terms that comprise * d(Ri) times d(Dij). * * For ScaLAPACK, update matrices such as sumdeijdg * by the correct process column. The correct process * row has already been selected by the loop index i. * Note that because sumdeijdg has dimensionality Nx3N * both blocksize (bs) and 3*blocksize (bs3) are used in * calculation of the local submatrix address. */ for (k = 0; k < 3; k++) { #ifdef SCALAPACK sg[i3 + k] += d_dij * di[k]; sh[i3 + k] += j_dij * di[k]; ptr = ptr2f(sumdeijdg, super_Nx3N, j, i3+k, &info); if (ptr != NULL) *ptr += e_dij * di[k]; ptr = ptr2f(sumdeijdh, super_Nx3N, j, i3+k, &info); if (ptr != NULL) *ptr += n_dij * di[k]; #else sumdeijdg[in3 + i3 + k] += d_dij * di[k]; sumdeijdg[in3 + j3 + k] += d_dij * dj[k]; sumdeijdh[in3 + i3 + k] += j_dij * di[k]; sumdeijdh[in3 + j3 + k] += j_dij * dj[k]; #endif #if !defined(OPENMP) && !defined(SCALAPACK) sumdeijdg[jn3 + i3 + k] += e_dij * di[k]; sumdeijdg[jn3 + j3 + k] += e_dij * dj[k]; sumdeijdh[jn3 + i3 + k] += n_dij * di[k]; sumdeijdh[jn3 + j3 + k] += n_dij * dj[k]; #endif } } /* Sum the first derivative accumulators into the gradient vector. */ #ifdef SCALAPACK grad[i3 ] += dx; grad[i3 + 1] += dy; grad[i3 + 2] += dz; #else f[i3 ] += dx; f[i3 + 1] += dy; f[i3 + 2] += dz; #endif } if (gb2_debug) { printf("end loop 1\n"); fflush(stdout); } /* * Do not execute the following nest of loops for single-threads or MPI. * For OpenMP update the j elements of the gradient vector f and of the * sumdeijda array. For OpenMP and ScaLAPACK update the j,i and j,j * elements of the sumdeijdg, sumdeijdh and Hessian matrices. */ #if defined(OPENMP) || defined(SCALAPACK) /* * Initialize the iexw array used for skipping excluded atoms. * Note that because of the manner in which iexw is used, it * is necessary to initialize it before only the first iteration * of the following loop. */ for (i = -1; i < n; i++) { iexw[i] = -1; } /* * Loop over all atoms j. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. */ for (j = 0; j < n; j++) { if ( !myroc(j, BLOCKSIZE, numthreads, threadnum) ) continue; /* * Skip the pair calculations if there are no atoms i on the * pair list of atom j. */ npairs = lpears[j]; if ( npairs <= 0 ) continue; j3 = 3 * j; jn = n * j; jn3 = n * j3; xj = x[j3 ]; yj = x[j3 + 1]; zj = x[j3 + 2]; qj = q[j]; rj = reff[j]; rj2 = rj * rj; rjinv = 1.0 / rj; /* Initialize the first derivative accumulators. */ dx = dy = dz = 0.0; /* * Expand the excluded list into the iexw array by storing j * at array address i. */ for (i = 0; i < Jblo[j]; i++) { iexw[JexclAt[j][i] - 1] = j; } /* Select atom i from the pair list. Non-graceful error handling. */ for (m = 0; m < npairs; m++) { if (pearlist[j] == NULL) { printf("NULL pair list entry in egb2 loop 4, taskid = %d\n", mytaskid); fflush(stdout); } i = pearlist[j][m]; in = n * i; i3 = 3 * i; in3 = n * i3; xij = x[i3 ] - xj; yij = x[i3 + 1] - yj; zij = x[i3 + 2] - zj; r2 = xij * xij + yij * yij + zij * zij; r2inv = 1.0 / r2; iaci = prm->Ntypes * (prm->Iac[i] - 1); /* * Because index i is retrieved from the pairlist array it is * not constrained to a particular range of values; therefore, * the threads that have loaded the reff array must be * synchronized prior to the use of reff below. */ qiqj = q[i] * qj; ri = reff[i]; ri2 = ri * ri; riinv = 1.0 / ri; rirjinv = riinv * rjinv; rb2 = ri * rj; efac = exp(-r2 / (4.0 * rb2)); fgbi = 1.0 / sqrt(r2 + rb2 * efac); fgbk = -(*kappa) * KSCALE / fgbi; /* * Calculate the "non-diagonal" energy term, i.e., the term that is a * function of the effective radius Ri and the interatomic distance Dij. */ expmkf = exp(fgbk) / (*diel_ext); dielfac = 1.0 - expmkf; temp4 = fgbi * fgbi * fgbi; temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf); /* The following term is required by equation 1. */ a_dij = temp6 * (1.0 - 0.25 * efac); /* * Prepare to compute the 1st derivatives of the effective radii. * The following code performs the sums indicated by equations 2 and 3. * Compute the contribution of the non-diagonal energy term to the sum * by which the derivative of Rj will be multiplied. */ temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2); c_dij = rj * temp5; /* Update the j element of the sumdeijda array. */ sumdeijda[j] += c_dij; /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into two arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Some useful symmetry obtains. The d2ii and d2ij arrays * are symmetric in that their lower and upper triangles are * the transposes of one another. Also, d2jj equals d2ii, and * d2ij is the negative of d2ii. * * Note: the additional factor of the interatomic distance Dij that * ought to be included in the denominator of the following equations * is provided by the absence of Dij in equations 15 and 16, as well as * by one less power of Dij in the denominator of the equations for the * derivatives of the Van der Waals and Coulombic energy terms. */ di[0] = xij; di[1] = yij; di[2] = zij; dj[0] = -xij; dj[1] = -yij; dj[2] = -zij; /* Load the upper triangle of d2jj. */ d2jj[0][0] = 1.0 - xij * xij * r2inv; d2jj[0][1] = -xij * yij * r2inv; d2jj[0][2] = -xij * zij * r2inv; d2jj[1][1] = 1.0 - yij * yij * r2inv; d2jj[1][2] = -yij * zij * r2inv; d2jj[2][2] = 1.0 - zij * zij * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangle of d2ij. */ for (l = k; l < 3; l++) { d2ij[k][l] = -d2jj[k][l]; } /* Load the symmetric elements of d2jj and d2ij. */ for (l = k+1; l < 3; l++) { d2jj[l][k] = d2jj[k][l]; d2ij[l][k] = d2ij[k][l]; } } /* * Compute the Van der Waals and Coulomb energy here because * if egb2 is called nbond2 is not called. Remember that the * derivatives of Dij with respect to x, y and z supply one more * power of 1/Dij but that power of 1/Dij is included in the * equations below, instead of with the derivatives of Dij. * * Compute the Van der Waals and Coulombic energies for only * those pairs which are not on the excluded atom list. Any * pair on the excluded atom list will have atom i stored at * address j of the iexw array. It is not necessary to reset * the elements of the iexw array to -1 between successive * iterations in i because an i,j pair is uniquely identified * by atom i stored at array address j. Thus for example, the * i,j+1 pair would be stored at the same address as the i,j * pair but after the i,j pair were used. * * Initialize df here instead of within the following if statement * so that df is initialized unconditionally. */ df = 0.0; if (iexw[i] != j) { /* Initialize dg. */ dg = 0.0; rinv = 1. / sqrt(r2); r4inv = r2inv * r2inv; /* gas-phase Coulomb energy: */ eel = qiqj * rinv; df -= eel; dg += 2. * eel; /* Lennard-Jones energy: */ ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1; if( ic >= 0 ){ r6inv = r2inv * r2inv * r2inv; f6 = prm->Cn2[ic] * r6inv; f12 = prm->Cn1[ic] * r6inv * r6inv; df += 6. * f6 - 12. * f12; dg += 156. * f12 - 42. * f6; } df *= r2inv; dg *= r4inv; /* * Calculate the second derivatives of Dij for the Van der Waals * and Coulombic interactions. This calculation uses df and dg * for these interactions only. * * For OpenMP sum the ji and jj terms into the Hessian. * For ScaLAPACK sum the ij term into the Hessian but * sum the jj term into the gg vector. */ for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { #ifdef SCALAPACK ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info); if (ptr != NULL) *ptr += df*d2ij[k][l] + dg*di[k]*dj[l]; gg[3*(j3 + k) + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #else g[n3*(j3 + l) + i3 + k] += df*d2ij[k][l] + dg*di[k]*dj[l]; g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #endif } } } /* * Add to df the contribution to the gradient vector from * equation 1 so that it will be added to the gradient vector * along with the contributions from the Van der Waals and * Coulombic terms. */ df += a_dij; /* * Complete the first derivatives of Dij for the Van der Waals * and Coulombic interactions, as well as for the component of * the first derivatives that is contributed by equation 1. * * Sum the derivatives into the derivative accumulators. */ dedx = df * xij; dedy = df * yij; dedz = df * zij; dx += dedx; dy += dedy; dz += dedz; /* The following code sets up the sums for equations 5-16. */ temp5 = 0.5 * temp6; temp6 = fgbi * fgbi * temp5; temp7 = qiqj * (*kappa) * (*kappa) * KSCALE * KSCALE * temp4 * expmkf; temp8 = 1 - 0.25 * efac; temp9 = 0.25 * temp7 - 1.5 * temp6; temp10 = 0.25 * r2; temp11 = rb2 + temp10; temp12 = ri2 * rj2; temp13 = ri * temp11; temp14 = rj * temp11; temp15 = temp10 * rjinv; temp16 = temp10 * riinv; temp17 = temp9 * temp13 * efac; temp18 = temp9 * temp14 * efac; temp20 = temp8 * temp9; temp21 = temp20 * temp13; temp22 = temp20 * temp14; temp23 = 2.0 * efac; d_dij = temp23 * (temp21 - 0.25 * temp15 * temp5); e_dij = temp23 * (temp22 - 0.25 * temp16 * temp5); f_dij = 2.0 * temp8 * temp5; g_dij = 4.0 * temp8 * temp8 * temp9 + (2.0 * temp8 + efac * temp10 * rirjinv ) * r2inv * temp5; h_dij = efac * (temp10 * (temp13 * rjinv - ri2) * temp5 + temp13 * temp17); i_dij = efac * ((temp12 + temp16 * temp13) * temp5 + temp13 * temp18); j_dij = temp23 * ((ri - rjinv * temp11) * 0.25 * temp5 + temp21); l_dij = efac * ((temp12 + temp15 * temp14) * temp5 + temp14 * temp17); m_dij = efac * (temp10 * (temp14 * riinv - rj2) * temp5 + temp14 * temp18); n_dij = temp23 * ((rj - riinv * temp11) * 0.25 * temp5 + temp22); /* * Evaluate equations 15 and 16 and sum the results to the Hessian * matrix. These equations involve the derivatives d(Dij)/d(Xl), * d(Dij)/d(Xm) and d2(Dij)/d(Xl)d(Xm), which derivatives exist * only for values of k==i, k==j, l==i and l==j. * * For OpenMP sum the ji and jj terms into the Hessian. * For ScaLAPACK sum the ij term into the Hessian but * sum the jj term into the gg vector. */ df = f_dij; dg = g_dij; for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { #ifdef SCALAPACK ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info); if (ptr != NULL) *ptr += df*d2ij[k][l] + dg*di[k]*dj[l]; gg[3*(j3 + k) + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #else g[n3*(j3 + l) + i3 + k] += df*d2ij[k][l] + dg*di[k]*dj[l]; g[n3*(j3 + k) + j3 + l] += df*d2jj[k][l] + dg*dj[k]*dj[l]; #endif } } /* * Setup the inner differentiations indicated by equations 10 and 11. * The column index selects the first derivative to be calculated, * i.e., d(Rj). The row index is j because we are setting up the * outer multiplication by d(Rj). */ #ifdef SCALAPACK ptr = ptr2f(sumdeijdn, super_NxN, i, j, &info); if (ptr != NULL) *ptr += l_dij; sn[j] += m_dij; #else sumdeijdn[jn + i] += i_dij; sumdeijdn[jn + j] += m_dij; #endif /* * Perform the inner differentiation indicated by equations 5, 6, 7 and 8. * The row or first index is selected by j because we are setting up the * outer differentiation by the derivatives of Rj. The column or second * index is selected by i or j because the Dij term affects the derivatives * of atoms i and j. Both the sumdeijdg and sumdeijdh arrays are used * because equations 5 and 6 perform inner differentiation with respect to * Xl, whereas equations 7 and 8 perform inner differentiation with respect * to Xm. Distinguishing these two cases is accomplished by transposing * the Hessian addresses, as will be done in the final two calls to the * dgemm() function below. * * Sum those terms that comprise d(Rj) times d(Dij). */ for (k = 0; k < 3; k++) { #ifdef SCALAPACK sg[j3 + k] += e_dij * dj[k]; sh[j3 + k] += n_dij * dj[k]; ptr = ptr2f(sumdeijdg, super_Nx3N, i, j3+k, &info); if (ptr != NULL) *ptr += d_dij * dj[k]; ptr = ptr2f(sumdeijdh, super_Nx3N, i, j3+k, &info); if (ptr != NULL) *ptr += j_dij * dj[k]; #else sumdeijdg[jn3 + i3 + k] += e_dij * di[k]; sumdeijdg[jn3 + j3 + k] += e_dij * dj[k]; sumdeijdh[jn3 + i3 + k] += n_dij * di[k]; sumdeijdh[jn3 + j3 + k] += n_dij * dj[k]; #endif } } /* * For OpenMP but not for ScaLAPACK, sum the first derivative * accumulators into the gradient vector f. */ #ifdef OPENMP f[j3 ] -= dx; f[j3 + 1] -= dy; f[j3 + 2] -= dz; #endif } if (gb2_debug) { printf("end loop 2\n"); fflush(stdout); } #endif /* if defined(OPENMP) || defined(SCALAPACK) */ /* * There is no pragma directive associated with the above 'for' loop * so explicit thread synchronization is performed here. */ #pragma omp barrier /* * The SCALAPACK synchronization is accomplised via reduction * of the sumdeijda array. Each element of this array should * be non-zero for one process only. * * No reduction is needed for MPI because no parallelization is performed. * * Actually, for both OpenMP and SCALAPACK it is possible to defer * synchronization until after the following (i,j) nest of loops * because the mapping of loop indices to threads or tasks is * identical to that of the two previous nests of loops, and because * only the i elements of the sumdeijda array are read in the following * nest of loops. However, synchronization is possible at this point, * and may serve to better balance the workload in the next two nests * of loops. */ #ifdef SCALAPACK ierror = MPI_Allreduce(sumdeijda, reductarr, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb sumdeijda reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce sumdeijda\n"); fflush(stdout); } for (i = 0; i < n; i++) { sumdeijda[i] = reductarr[i]; } #endif /* Free the iexw array within this potentially parallel region of code. */ free_ivector(iexw, -1, n); /* * Compute the first and second derivatives of the effective radius. * Update the gradient vector using the first derivatives. Complete * equations 2, 3 and 4. Begin equations 13, 14 and 17 which update * the Hessian matrix. * * For OpenMP this loop updates the i elements of the gradient * vector f as well as the (i,i) and (i,j) elements of the dreff and * Hessian matrices. * * For ScaLAPACK this loop updates the i and j elements of the gradient * vector grad as well as the (i,i) and (i,j) elements of the dreff and * Hessian matrices. * * For MPI and single-threaded execution this loop updates all elements * of all vectors and matrices, so the following (j,i) loop is not * necessary. * * Loop over all atoms i. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. * * It is not necessary to synchronize OpenMP threads following * this loop because the next loop uses identical loop index to * thread mapping. Hence there will be no race condition associated * with updates of the dreff, gradient and Hessian arrays. */ for (i = 0; i < n; i++) { #if defined(OPENMP) || defined(SCALAPACK) if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue; #endif /* * Don't calculate derivatives of the effective radius of atom i * if atom i is frozen or if there are no pair atoms j associated * with atom i. */ npairs = lpears[i] + upears[i]; if ( frozen[i] || (npairs <= 0) ) continue; /* Initialize the first derivative accumulators. */ dx = dy = dz = 0.0; i3 = 3 * i; in = n * i; in3 = 3 * in; /* * The following captures all terms that are to be multiplied by the * first derivative of the effective radius (equations 2, 3 and 4). */ sumda = sumdeijda[i]; if ( gb>1 ) { ri = rborn[i] - BOFFSET; thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] ); sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i]) *(1.0 - thi*thi)*ri/rborn[i]; } /* * The following captures all terms that are to be multiplied by the * second derivative of the effective radius (equations 13, 14 and 17). */ sumda = sumdeijda[i]; /* Prepare to compute the derivatives of the effective radius Ri. */ xi = x[i3 ]; yi = x[i3 + 1]; zi = x[i3 + 2]; ri = rborn[i] - BOFFSET; ri1i = 1. / ri; /* Select atom j from the pair list. Non-graceful error handling. */ for (m = 0; m < npairs; m++) { if (pearlist[i] == NULL) { printf("NULL pair list entry in egb2 loop 5, taskid = %d\n", mytaskid); fflush(stdout); } j = pearlist[i][m]; j3 = 3 * j; jn3 = n * j3; xij = xi - x[j3 ]; yij = yi - x[j3 + 1]; zij = zi - x[j3 + 2]; r2 = xij * xij + yij * yij + zij * zij; /* Ignore the ij atom pair if their separation exceeds the GB cutoff. */ if ( r2 > rgbmaxpsmax2 ) continue; dij1i = 1.0 / sqrt(r2); dij2i = dij1i * dij1i; dij = r2 * dij1i; sj = fs[j] * (rborn[j] - BOFFSET); sj2 = sj * sj; /* * The following are the numerator of the first derivatives of the * equations from the Appendix of Schaefer and Froemmel and of the * Taylor series expansion for d>>s by Andreas Svrcek-Seiler. Smooth * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev. The * derivatives are with respect to Dij. The complete derivative is * formed by multiplying the numerator by -Ri*Ri. The factor of Ri*Ri * has been moved to equations 5-18. The negation is deferred until later. * When the chain rule is used to form the first derivatives of the * effective radius with respect to the cartesian coordinates, an * additional factor of Dij appears in the denominator. That factor * is included in the following expressions. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { temp1 = 1./(dij - sj); dij3i = dij1i * dij2i; datmp = 0.125 * dij3i * ((r2 + sj2) * (temp1 * temp1 - rgbmax2i) - 2.0 * log(rgbmax * temp1)); } else if ( dij > 4.0 * sj ) { tmpsd = sj2 * dij2i; dumbo = TE + tmpsd*(TF + tmpsd*(TG + tmpsd*(TH + tmpsd*THH))); datmp = tmpsd * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { temp1 = 1. / (r2 - sj2); datmp = temp1 * sj * (-0.5 * dij2i + temp1) + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj)); } else if ( dij > fabs(ri - sj) ) { temp1 = 1. / (dij + sj); dij3i = dij2i * dij1i; datmp = -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i * ri1i + dij1i * temp1 * (temp1 - dij1i) - dij3i * log(ri * temp1)); } else if ( ri < sj ) { temp1 = 1. / (r2 - sj2); datmp = -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 - 0.5 * dij2i * dij1i * log((sj - dij) / (sj + dij))); } else continue; /* * Negate the first derivative of Ri with respect to the * interatomic distance in order to account for the * minus sign of -Ri*Ri. The factor of Ri*Ri is omitted * because it was included in equations 5-18. These * derivatives contain the appropriate factor of Dij * in the denominator because it is supplied by datmp. */ df = -datmp; /* * The following are the numerator of the second derivatives of the * equations from the Appendix of Schaefer and Froemmel and of the * Taylor series expansion for d>>s by Andreas Svrcek-Seiler. Smooth * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev. The * derivatives are with respect to Dij. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { temp1 = 1./(dij - sj); temp2 = temp1 * temp1; dij3i = dij1i * dij2i; da2tmp = -0.25 * (dij3i * (sj2 * rgbmax2i - 1.0) + dij1i * temp2 + 2.0 * (log(rgbmax * temp1) * dij3i + temp1 * (dij2i - temp2))); } else if ( dij > 4.0 * sj ) { tmpsd = sj2 * dij2i; dumbo = TI + tmpsd*(TJ + tmpsd*(TK + tmpsd*(TL + tmpsd*TLL))); da2tmp = tmpsd * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { temp1 = 2. * dij * sj; temp2 = (dij - sj) * (dij + sj); dij3i = dij2i * dij1i; da2tmp = (0.5 * log((dij - sj) / (dij + sj)) + dij * sj * ((r2 + sj2) * (r2 - sj2) + temp1 * temp1) / (temp2 * temp2 * temp2)) * dij3i; } else if ( dij > fabs(ri - sj) ) { temp1 = dij + sj; temp2 = dij * temp1; ri2 = ri * ri; da2tmp = ((2. * ri2 * log(ri / temp1) + sj2) * temp1 * temp1 * temp1 + ri2 * sj * (2. * dij * dij - dij * sj - sj2)) / (4. * ri2 * temp2 * temp2 * temp2); } else if ( ri < sj ) { temp1 = sj2 - r2; temp2 = 2. * sj * dij; dij3i = dij2i * dij1i; da2tmp = (0.5 * log((sj - dij) / (sj + dij)) + sj * dij * (temp1 * (sj2 + r2) - temp2 * temp2) / (temp1 * temp1 * temp1)) * dij3i; } else continue; /* * Calculate the second derivative of the effective radius * with respect to the interatomic distance. A factor of * the Ri*Ri has been omitted from the second derivative * because it was included in equations 5-18. * * When the chain rule is used to form the second derivatives of the * effective radius with respect to the cartesian coordinates, an * additional factor of Dij*Dij appears in the denominator. That * factor is included in the following expression. */ dg = da2tmp * dij2i; /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into five arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2ii[] for the second derivatives with respect to atom i * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Some useful symmetry obtains. The d2ii and d2ij arrays * are symmetric in that their lower and upper triangles are * the transposes of one another. Also, d2jj equals d2ii, and * d2ij is the negative of d2ii. * * * Note: the additional factor of the interatomic distance Dij that * ought to be included in the denominator of the following equations * is included instead in the first and second derivatives of the * effective radius with respect to Dij (df and dg) that were calculated * above. */ di[0] = xij; di[1] = yij; di[2] = zij; dj[0] = -xij; dj[1] = -yij; dj[2] = -zij; /* Load the upper triangle of d2ii. */ r2inv = 1.0 / r2; d2ii[0][0] = 1.0 - xij * xij * r2inv; d2ii[0][1] = -xij * yij * r2inv; d2ii[0][2] = -xij * zij * r2inv; d2ii[1][1] = 1.0 - yij * yij * r2inv; d2ii[1][2] = -yij * zij * r2inv; d2ii[2][2] = 1.0 - zij * zij * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangles of d2jj and d2ij. */ for (l = k; l < 3; l++) { #if !defined(OPENMP) && !defined(SCALAPACK) d2jj[k][l] = d2ii[k][l]; #endif d2ij[k][l] = -d2ii[k][l]; } /* Load the symmetric elements of d2ii, d2jj and d2ij. */ for (l = k+1; l < 3; l++) { d2ii[l][k] = d2ii[k][l]; #if !defined(OPENMP) && !defined(SCALAPACK) d2jj[l][k] = d2jj[k][l]; #endif d2ij[l][k] = d2ij[k][l]; } } /* * Store the 1st derivatives of Ri with respect to (xj,yj,zj) * if SCALAPACK is not defined. If SCALAPACK is defined, store * these derivatives in the next nest of loops. */ dxj = df * dj[0]; dyj = df * dj[1]; dzj = df * dj[2]; #ifndef SCALAPACK dreff[in3 + j3 ] = dxj; dreff[in3 + j3 + 1] = dyj; dreff[in3 + j3 + 2] = dzj; #endif /* * If OPENMP is not defined, update the gradient vector f * with d(Ri)/d(Xj). If SCALAPACK is defined update the * gradient vector grad instead of the gradient vector f. */ #ifdef SCALAPACK grad[j3 ] += sumda * dxj; grad[j3 + 1] += sumda * dyj; grad[j3 + 2] += sumda * dzj; #elif !defined(OPENMP) f[j3 ] += sumda * dxj; f[j3 + 1] += sumda * dyj; f[j3 + 2] += sumda * dzj; #endif /* * Sum the 1st derivatives of Ri with respect to (xi,yi,zi). * Exploit the fact that d(Dij)/d(Xi) = -d(Dij)/d(Xj). */ dx -= dxj; dy -= dyj; dz -= dzj; /* * Form the second derivatives of the effective radius with * respect to the cartesian coordinates using the chain rule, * multiply the result by sumda and sum to the Hessian matrix. * This step implements equations 13, 14 and 17, except that * terms such as d2(Ri)/d(Dij)d(Dik) are not handled here * but instead are handled in the first call to dgemm() below. * * When in single-threaded mode update the Hessian using the ii, ij * ji and jjterms. For OpenMP update the Hessian using the ii and ij * terms only. For ScaLAPACK update the Hessian using the ji term * only and update the i-element of the gg vector. */ for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ijterm = sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]); #ifdef SCALAPACK gg[3*(i3 + k) + l] += sumda*(df*d2ii[k][l] + dg*di[k]*di[l]); ptr = ptr2f(g, super_3Nx3N, j3+l, i3+k, &info); if (ptr != NULL) *ptr += ijterm; #else g[n3*(i3 + k) + i3 + l] += sumda*(df*d2ii[k][l] + dg*di[k]*di[l]); g[n3*(i3 + k) + j3 + l] += ijterm; #endif #if !defined(OPENMP) && !defined(SCALAPACK) g[n3*(j3 + l) + i3 + k] += ijterm; g[n3*(j3 + k) + j3 + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]); #endif } } } /* * Store the sum of first derivatives of Ri with respect to (xi,yi,zi) * in the diagonal elements of the dreff array. These derivatives * include the appropriate factor of Dij but not a factor of Ri*Ri. * They include the minus sign of -Ri*Ri that was supplied in df. */ #ifdef SCALAPACK sr[i3 ] = dx; sr[i3 + 1] = dy; sr[i3 + 2] = dz; #else dreff[in3 + i3 ] = dx; dreff[in3 + i3 + 1] = dy; dreff[in3 + i3 + 2] = dz; #endif /* Update the gradient vector with d(Ri)/d(Xi). */ #ifdef SCALAPACK grad[i3 ] += sumda * dx; grad[i3 + 1] += sumda * dy; grad[i3 + 2] += sumda * dz; #else f[i3 ] += sumda * dx; f[i3 + 1] += sumda * dy; f[i3 + 2] += sumda * dz; #endif } if (gb2_debug) { printf("end loop 3\n"); fflush(stdout); } /* * This nest of loops updates the (j,i) and (j,j) elements of the * Hessian matrix for ScaLAPACK or OpenMP. For OpenMP update * the j elements of the gradient vector f. For ScaLAPACK do * not update the gradient vector grad. This nest of loop is * not necessary for single-threaded execution or for MPI. * * The control flow in this nest of loops must be identical to * that of the preceeding nest of loops. Perform the minimum * calculation necessary to reconstitute the conditions of * the preceeding nest of loops. * * Because no computation follows the following 'for' loop within * this parallel region, the implied barrier at the end of the * parallel region enforces the necessary thread synchronization * for OpenMP threads. */ #if defined(OPENMP) || defined(SCALAPACK) /* * Loop over all atoms j. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. */ for (j = 0; j < n; j++) { if ( !myroc(j, BLOCKSIZE, numthreads, threadnum) ) continue; /* * Don't calculate derivatives with respect to atom j if there * are no atoms i associated with atom j. */ npairs = lpears[j] + upears[j]; if ( npairs <= 0 ) continue; j3 = 3 * j; jn3 = n * j3; xj = x[j3 ]; yj = x[j3 + 1]; zj = x[j3 + 2]; sj = fs[j] * (rborn[j] - BOFFSET); sj2 = sj * sj; /* Initialize the first derivative accumulators. */ dx = dy = dz = 0.0; /* Select atom i from the pair list. Non-graceful error handling. */ for (m = 0; m < npairs; m++) { if (pearlist[j] == NULL) { printf("NULL pair list entry in egb2 loop 6, taskid = %d\n", mytaskid); fflush(stdout); } i = pearlist[j][m]; /* * Don't calculate derivatives of the effective radius of atom i * if atom i is frozen. */ if ( frozen[i] ) continue; i3 = 3 * i; in = (n * i); xij = x[i3 ] - xj; yij = x[i3 + 1] - yj; zij = x[i3 + 2] - zj; r2 = xij * xij + yij * yij + zij * zij; /* Ignore the ij atom pair if their separation exceeds the GB cutoff. */ if ( r2 > rgbmaxpsmax2 ) continue; dij1i = 1.0 / sqrt(r2); dij2i = dij1i * dij1i; dij = r2 * dij1i; ri = rborn[i] - BOFFSET; ri1i = 1. / ri; /* * The following are the numerator of the first derivatives of the * equations from the Appendix of Schaefer and Froemmel and of the * Taylor series expansion for d>>s by Andreas Svrcek-Seiler. Smooth * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev. The * derivatives are with respect to Dij. The complete derivative is * formed by multiplying the numerator by -Ri*Ri. The factor of Ri*Ri * has been moved to equations 5-18. The negation is deferred until later. * When the chain rule is used to form the first derivatives of the * effective radius with respect to the cartesian coordinates, an * additional factor of Dij appears in the denominator. That factor * is included in the following expressions. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { temp1 = 1./(dij - sj); dij3i = dij1i * dij2i; datmp = 0.125 * dij3i * ((r2 + sj2) * (temp1 * temp1 - rgbmax2i) - 2.0 * log(rgbmax * temp1)); } else if ( dij > 4.0 * sj ) { tmpsd = sj2 * dij2i; dumbo = TE + tmpsd*(TF + tmpsd*(TG + tmpsd*(TH + tmpsd*THH))); datmp = tmpsd * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { temp1 = 1. / (r2 - sj2); datmp = temp1 * sj * (-0.5 * dij2i + temp1) + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj)); } else if ( dij > fabs(ri - sj) ) { temp1 = 1. / (dij + sj); dij3i = dij2i * dij1i; datmp = -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i * ri1i + dij1i * temp1 * (temp1 - dij1i) - dij3i * log(ri * temp1)); } else if ( ri < sj ) { temp1 = 1. / (r2 - sj2); datmp = -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 - 0.5 * dij2i * dij1i * log((sj - dij) / (sj + dij))); } else continue; /* * Negate the first derivative of Ri with respect to the * interatomic distance in order to account for the * minus sign of -Ri*Ri. The factor of Ri*Ri is omitted * because it was included in equations 5-18. These * derivatives contain the appropriate factor of Dij * in the denominator because it is supplied by datmp. */ df = -datmp; /* * The following are the numerator of the second derivatives of the * equations from the Appendix of Schaefer and Froemmel and of the * Taylor series expansion for d>>s by Andreas Svrcek-Seiler. Smooth * rgbmax idea is from Andreas Svrcek-Seiler and Alexey Onufriev. The * derivatives are with respect to Dij. */ if ( dij > rgbmax + sj ) continue; if ( dij > rgbmax - sj ) { temp1 = 1./(dij - sj); temp2 = temp1 * temp1; dij3i = dij1i * dij2i; da2tmp = -0.25 * (dij3i * (sj2 * rgbmax2i - 1.0) + dij1i * temp2 + 2.0 * (log(rgbmax * temp1) * dij3i + temp1 * (dij2i - temp2))); } else if ( dij > 4.0 * sj ) { tmpsd = sj2 * dij2i; dumbo = TI + tmpsd*(TJ + tmpsd*(TK + tmpsd*(TL + tmpsd*TLL))); da2tmp = tmpsd * sj * dij2i * dij2i * dumbo; } else if ( dij > ri + sj ) { temp1 = 2. * dij * sj; temp2 = (dij - sj) * (dij + sj); dij3i = dij2i * dij1i; da2tmp = (0.5 * log((dij - sj) / (dij + sj)) + dij * sj * ((r2 + sj2) * (r2 - sj2) + temp1 * temp1) / (temp2 * temp2 * temp2)) * dij3i; } else if ( dij > fabs(ri - sj) ) { temp1 = dij + sj; temp2 = dij * temp1; ri2 = ri * ri; da2tmp = ((2. * ri2 * log(ri / temp1) + sj2) * temp1 * temp1 * temp1 + ri2 * sj * (2. * dij * dij - dij * sj - sj2)) / (4. * ri2 * temp2 * temp2 * temp2); } else if ( ri < sj ) { temp1 = sj2 - r2; temp2 = 2. * sj * dij; dij3i = dij2i * dij1i; da2tmp = (0.5 * log((sj - dij) / (sj + dij)) + sj * dij * (temp1 * (sj2 + r2) - temp2 * temp2) / (temp1 * temp1 * temp1)) * dij3i; } else continue; /* * Calculate the second derivative of the effective radius * with respect to the interatomic distance. A factor of * the Ri*Ri has been omitted from the second derivative * because it was included in equations 5-18. * * When the chain rule is used to form the second derivatives of the * effective radius with respect to the cartesian coordinates, an * additional factor of Dij*Dij appears in the denominator. That * factor is included in the following expression. */ dg = da2tmp * dij2i; /* * Calculate the first and second derivatives of the interatomic * distance Dij with respect to the cartesian coordinates of atoms * i and j. The results are placed into four arrays: * * di[] for the first derivatives with respect to atom i * dj[] for the first derivatives with respect to atom j * d2jj[] for the second derivatives with respect to atom j * d2ij[] for the second derivatives with respect to atoms i and j * * Note: the additional factor of the interatomic distance Dij that * ought to be included in the denominator of the following equations * is provided by the absence of Dij in equations 15 and 16, as well as * by one less power of Dij in the denominator of the equations for the * derivatives of the Van der Waals and Coulombic energy terms. */ di[0] = xij; di[1] = yij; di[2] = zij; dj[0] = -xij; dj[1] = -yij; dj[2] = -zij; /* Load the upper triangle of d2jj. */ r2inv = 1.0 / r2; d2jj[0][0] = 1.0 - xij * xij * r2inv; d2jj[0][1] = -xij * yij * r2inv; d2jj[0][2] = -xij * zij * r2inv; d2jj[1][1] = 1.0 - yij * yij * r2inv; d2jj[1][2] = -yij * zij * r2inv; d2jj[2][2] = 1.0 - zij * zij * r2inv; /* Finish loading the rest of all of the matrices. */ for (k = 0; k < 3; k++) { /* Load the upper triangle of d2ij. */ for (l = k; l < 3; l++) { d2ij[k][l] = -d2jj[k][l]; } /* Load the symmetric elements of d2ij and d2jj. */ for (l = k+1; l < 3; l++) { d2ij[l][k] = d2ij[k][l]; d2jj[l][k] = d2jj[k][l]; } } /* * The following captures all terms that are to be multiplied by the * second derivative of the effective radius (equations 13, 14 and 17). * * Because index i is retrieved from the pairlist array it is * not constrained to a particular range of values; therefore, * the threads that have loaded the sumdeijda array have been * synchronized above prior to the use of sumdeijda below. */ sumda = sumdeijda[i]; /* * Form the second derivatives of the effective radius with * respect to the cartesian coordinates using the chain rule, * multiply the result by sumda and sum to the Hessian matrix. * This step implements equations 13, 14 and 17, except that * terms such as d2(Ri)/d(Dij)d(Dik) are not handled here * but instead are handled in the first call to dgemm() below. * * For OpenMP update the Hessian using the ji and jj terms only. * For ScaLAPACK update the Hessian using the ij term only, and * update the j element of the ff vector. */ for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { #ifdef SCALAPACK ptr = ptr2f(g, super_3Nx3N, i3+k, j3+l, &info); if (ptr != NULL) *ptr += sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]); gg[3*(j3 + k) + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]); #else g[n3*(j3 + l) + i3 + k] += sumda*(df*d2ij[k][l] + dg*di[k]*dj[l]); g[n3*(j3 + k) + j3 + l] += sumda*(df*d2jj[k][l] + dg*dj[k]*dj[l]); #endif } } /* * The following captures all terms that are to be multiplied by the * first derivative of the effective radius (equations 2, 3 and 4). */ if ( gb>1 ) { ri = rborn[i] - BOFFSET; thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] ); sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i]) *(1.0 - thi*thi)*ri/rborn[i]; } /* Store the 1st derivatives of Ri with respect to (xj,yj,zj). */ dxj = df * dj[0]; dyj = df * dj[1]; dzj = df * dj[2]; #ifdef SCALAPACK ptr = ptr2f(dreff, super_Nx3N, i, j3+0, &info); if (ptr != NULL) *ptr = dxj; ptr = ptr2f(dreff, super_Nx3N, i, j3+1, &info); if (ptr != NULL) *ptr = dyj; ptr = ptr2f(dreff, super_Nx3N, i, j3+2, &info); if (ptr != NULL) *ptr = dzj; #endif /* Update the first derivative accumulators with d(Ri)/d(Xj). */ dx += sumda * dxj; dy += sumda * dyj; dz += sumda * dzj; } /* * Update the gradient vector f from the first derivative accumulators, * but only for OpenMP, not for ScaLAPACK. */ #ifdef OPENMP f[j3 ] += dx; f[j3 + 1] += dy; f[j3 + 2] += dz; #endif } if (gb2_debug) { printf("end loop 4\n"); fflush(stdout); } #endif /* if defined(OPENMP) || defined(SCALAPACK) */ } #ifdef SCALAPACK /* * Add the gradient vector grad to the gradient vector f. * This step is straightforward for a 1x1 process grid * but requires multiple steps for a larger process grid. */ if (gridim == 1) { if (context_PxQ >= 0) { for (i = 0; i < m3; i++) { f[i] += grad[i]; } if (gb2_debug) { printf("f += grad\n"); fflush(stdout); } } } else { /* * Perform a reduction over the gradient vector grad. * Leave the result in reductarr which exists in all processes. */ ierror = MPI_Allreduce(grad, reductarr, m3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb grad reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce grad\n"); fflush(stdout); } /* Gather f from context_PxQ into grad in context_1x1. */ pdgemr2d_(&m3, &one, f, &one, &one, desc_3Nx1, grad, &one, &one, desc_1x1, &context_Nx1); if (gb2_debug) { printf("pgemr2d f\n"); fflush(stdout); } /* Add reductarr to grad. */ if (context_1x1 >= 0) { for (i = 0; i < m3; i++) { grad[i] += reductarr[i]; } if (gb2_debug) { printf("f += reductarr\n"); fflush(stdout); } } /* Distribute grad from context_1x1 into f in context_PxQ. */ pdgemr2d_(&m3, &one, grad, &one, &one, desc_1x1, f, &one, &one, desc_3Nx1, &context_Nx1); if (gb2_debug) { printf("pgemr2d grad\n"); fflush(stdout); } } /* * The diagonal elements of the Hessian, sumdeijdg, sumdeijgh, * sumdeijdn and dreff matrices are stored in the gg, sg, sh, * sn and sr vectors. Perform a reduction across each vector * and write the result into corresponding matrix. * * Here is the code for the Hessian. */ ierror = MPI_Allreduce(gg, reductarr, 3*m3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb gg reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce gg\n"); fflush(stdout); } for (i = 0; i < n; i++) { i3 = 3*i; for (k = 0; k < 3; k++) { for (l = 0; l < 3; l++) { ptr = ptr2f(g, super_3Nx3N, i3+k, i3+l, &info); if (ptr != NULL) *ptr += reductarr[3*(i3 + k) + l]; } } } /* Here is the code for sumdeijdg. */ ierror = MPI_Allreduce(sg, reductarr, m3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb sg reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce sg\n"); fflush(stdout); } for (i = 0; i < n; i++) { i3 = 3*i; for (k = 0; k < 3; k++) { ptr = ptr2f(sumdeijdg, super_Nx3N, i, i3+k, &info); if (ptr != NULL) *ptr += reductarr[i3 + k]; } } /* Here is the code for sumdeijdh. */ ierror = MPI_Allreduce(sh, reductarr, m3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb sh reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce sh\n"); fflush(stdout); } for (i = 0; i < n; i++) { i3 = 3*i; for (k = 0; k < 3; k++) { ptr = ptr2f(sumdeijdh, super_Nx3N, i, i3+k, &info); if (ptr != NULL) *ptr += reductarr[i3 + k]; } } /* Here is the code for dreff. */ ierror = MPI_Allreduce(sr, reductarr, m3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb sr reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce sr\n"); fflush(stdout); } for (i = 0; i < n; i++) { i3 = 3*i; for (k = 0; k < 3; k++) { ptr = ptr2f(dreff, super_Nx3N, i, i3+k, &info); if (ptr != NULL) *ptr += reductarr[i3 + k]; } } /* Here is the code for sumdeijdn. */ ierror = MPI_Allreduce(sn, reductarr, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (ierror != MPI_SUCCESS) { printf("Error in egb sn reduction, error = %d mytaskid = %d\n", ierror, mytaskid); } if (gb2_debug) { printf("MPI_Allreduce sn\n"); fflush(stdout); } for (i = 0; i < n; i++) { ptr = ptr2f(sumdeijdn, super_NxN, i, i, &info); if (ptr != NULL) *ptr += reductarr[i]; } /* * Add to the diagonal elements of sumdeijdn the d2(Ri)/d(Dij)d(Dik) * terms weighted by Ri and sumdeijda[i]. */ for (i = 0; i < n; i++) { ptr = ptr2f(sumdeijdn, super_NxN, i, i, &info); if (ptr != NULL) { *ptr += 2.0 * sumdeijda[i] * reff[i]; } } t2 = seconds(); if (gb2_debug) { printf("\nmatrix preparation time = %10.2f n = %d m3 = %d\n\n", t2 - t1, n, m3); fflush(stdout); } tloop += t2 - t1; t1 = t2; /* * Perform the inner multiplications of equations 9, 10, 11, 12 * and 18 in a manner analogous to the multiplications of * of equations 2, 3 and 4. Also, multiply each element of each * row of the dreff array by the d2(Ri)/d(Dij)d(Dik) terms. * * Next perform the outer multiplication for equations 5, 6, 9, 10, * 11 and 12. * * Finally, perform the outer multiplication for equations 7 and 8. * * Some ScaLAPACK functions appear to quit unexpectedly * for large matrices on a 1x1 process grid, e.g. the pdgemm_ * function in newton.c, so bypass the ScaLAPACK pdgemm_ and * use the LAPACK dgemm_ function instead. * * The correct test is (gridim == 1), not (nprow == 1 && npcol == 1) * since processes that aren't on the 1x1 grid have nprow == npcol == -1, * which would direct control to pdgemr2d_ (below) that would hang because * it would not be called from all processes, specifically not from * the process that is on the 1x1 grid and has nprow == npcol == 1. */ if (gridim == 1) { if (context_PxQ >= 0) { transa = 'T'; transb = 'N'; dgemm_(&transa, &transb, &n, &m3, &n, &dblone, sumdeijdn, &n, dreff, &n, &dblone, sumdeijdg, &n); t2 = seconds(); if (gb2_debug) { printf("dgemm1 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g1 += t2 - t1; t1 = t2; transa = 'T'; transb = 'N'; dgemm_(&transa, &transb, &m3, &m3, &n, &dblone, dreff, &n, sumdeijdg, &n, &dblone, g, &m3); t2 = seconds(); if (gb2_debug) { printf("dgemm2 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g2 += t2 - t1; t1 = t2; dgemm_(&transa, &transb, &m3, &m3, &n, &dblone, sumdeijdh, &n, dreff, &n, &dblone, g, &m3); t2 = seconds(); if (gb2_debug) { printf("dgemm3 time = %10.2f\n\n", t2 -t1); fflush(stdout); } tb2g3 += t2 - t1; } } else { if (context_PxQ >= 0) { transa = 'N'; transb = 'N'; pdgemm_(&transa, &transb, &n, &m3, &n, &dblone, sumdeijdn, &one, &one, desc_NxN, dreff, &one, &one, desc_Nx3N, &dblone, sumdeijdg, &one, &one, desc_Nx3N); t2 = seconds(); if (gb2_debug) { printf("pdgemm1 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g1 += t2 - t1; t1 = t2; transa = 'T'; transb = 'N'; pdgemm_(&transa, &transb, &m3, &m3, &n, &dblone, dreff, &one, &one, desc_Nx3N, sumdeijdg, &one, &one, desc_Nx3N, &dblone, g, &one, &one, desc_3Nx3N); t2 = seconds(); if (gb2_debug) { printf("pdgemm2 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g2 += t2 - t1; t1 = t2; transa = 'T'; transb = 'N'; pdgemm_(&transa, &transb, &m3, &m3, &n, &dblone, sumdeijdh, &one, &one, desc_Nx3N, dreff, &one, &one, desc_Nx3N, &dblone, g, &one, &one, desc_3Nx3N); t2 = seconds(); if (gb2_debug) { printf("pdgemm3 time = %10.2f\n\n", t2 -t1); fflush(stdout); } tb2g3 += t2 - t1; } } #else /* * Here is the non-ScaLAPACK section of the code. Note that the * sumdeijdg, sumdeijdh and dreff matrices are built in a transposed * manner relative to the manner in which they are built for * ScaLAPACK execution because the ScaLAPACK submatrices are built * in column major order. * * Add to the diagonal elements of sumdeijdn the d2(Ri)/d(Dij)d(Dik) * terms weighted by Ri and sumdeijda[i]. */ for (i = 0; i < n; i++) { in = (n * i); sumdeijdn[in + i] += 2.0 * sumdeijda[i] * reff[i]; } t2 = seconds(); if (gb2_debug) { printf("\nmatrix preparation time = %10.2f n = %d m3 = %d\n\n", t2 - t1, n, m3); fflush(stdout); } tloop += t2 - t1; t1 = t2; /* * Perform the inner multiplications of equations 9, 10, 11, 12 * and 18 in a manner analogous to the multiplications of * of equations 2, 3 and 4. Also, multiply each element of each * row of the dreff array by the d2(Ri)/d(Dij)d(Dik) terms. * * Next perform the outer multiplication for equations 5, 6, 9, 10, * 11 and 12. * * Finally, perform the outer multiplication for equations 7 and 8. */ transa = 'N'; transb = 'N'; dgemm_(&transa, &transb, &m3, &n, &n, &dblone, dreff, &m3, sumdeijdn, &n, &dblone, sumdeijdg, &m3); t2 = seconds(); if (gb2_debug) { printf("dgemm1 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g1 += t2 - t1; t1 = t2; transa = 'N'; transb = 'T'; dgemm_(&transa, &transb, &m3, &m3, &n, &dblone, sumdeijdg, &m3, dreff, &m3, &dblone, g, &m3); t2 = seconds(); if (gb2_debug) { printf("dgemm2 time = %10.2f\n\n", t2 - t1); fflush(stdout); } tb2g2 += t2 - t1; t1 = t2; transa = 'N'; transb = 'T'; dgemm_(&transa, &transb, &m3, &m3, &n, &dblone, dreff, &m3, sumdeijdh, &m3, &dblone, g, &m3); t2 = seconds(); if (gb2_debug) { printf("dgemm3 time = %10.2f\n\n", t2 -t1); fflush(stdout); } tb2g3 += t2 - t1; #endif /* Free the static arrays if STATIC_ARRAYS is undefined. */ #ifndef STATIC_ARRAYS #ifndef SCALAPACK free_vector( sumdeijdn, 0, nn ); sumdeijdn = NULL; free_vector( sumdeijdg, 0, nn3 ); sumdeijdg = NULL; free_vector( sumdeijdh, 0, nn3 ); sumdeijdh = NULL; free_vector( dreff, 0, nn3 ); dreff = NULL; #else if (context_PxQ >= 0) { free_superdesc( super_3Nx3N ); super_3Nx3N = NULL; free_vector( sumdeijdn, 0, super_NxN->size ); sumdeijdn = NULL; free_superdesc( super_NxN ); super_NxN = NULL; free_vector( sumdeijdg, 0, super_Nx3N->size ); sumdeijdg = NULL; free_vector( sumdeijdh, 0, super_Nx3N->size ); sumdeijdh = NULL; free_vector( dreff, 0, super_Nx3N->size ); dreff = NULL; free_superdesc( super_Nx3N ); super_Nx3N = NULL; free_vector( reductarr, 0, 3*n3 ); reductarr = NULL; } free_vector( gg, 0, 3*n3 ); gg = NULL; free_vector( sg, 0, n3 ); sg = NULL; free_vector( sh, 0, n3 ); sh = NULL; free_vector( sr, 0, n3 ); sr = NULL; free_vector( sn, 0, n ); sn = NULL; #endif free_vector( reff, 0, n ); reff = NULL; free_vector( sumdeijda, 0, n ); sumdeijda = NULL; free_vector( psi, 0, n ); psi = NULL; #endif if (gb2_debug) { printf("exit egb2\n"); fflush(stdout); } /* Return elec and evdw through the reference parameters eelt and enb. */ *eelt = elec; *enb = evdw; return (epol); } /*********************************************************************** MME2() ************************************************************************/ /* * Compute the energy and first and second derivatives. * * Calling parameters are as follows: * * x - input: the atomic (x,y,z) coordinates * f - updated: the gradient vector * g - updated: the Hessian matrix * m - returned: the atomic masses * grad - updated: the gradient vector in context_1x1 * descF_PxQ - input: the ScaLAPACK descriptor for vector f in context_PxQ * descF_1x1 - input: the ScaLAPACK descriptor for vector grad in context_1x1 * descG_PxQ - input: the ScaLAPACK descriptor for matrix g in context_PxQ * context_PxQ - input: the distributed vector and matrix context for ScaLAPACK * context_1x1 - input: the non-distributed vector context for ScaLAPACK * context_Nx1 - input: the general context for ScaLAPACK * gridim - input: the ScaLAPACK process grid dimension (=1 for single task) * natom - returned: the number of atoms * iter - input: the iteration counter, which if negative selects the following: * -1 print some energy values * -3 call egb to deallocate static arrays, then deallocate grad * -(any other value) normal execution */ REAL_T mme2(REAL_T *x, REAL_T *f, REAL_T *g, REAL_T *m, REAL_T *grad, INT_T *descF_PxQ, INT_T *descF_1x1, INT_T *descG_PxQ, INT_T *context_PxQ, INT_T *context_1x1, INT_T *context_Nx1, INT_T *gridim, INT_T *natom, INT_T *iter) { REAL_T ebh, eba, eth, eta, eph, epa, enb, eel, enb14, eel14, ecn, frms; REAL_T ene[20], t1, t2; extern REAL_T tcholesky; int i, j, k, n, i3, threadnum, numthreads; size_t in9, ii, jj, kk; #ifdef SCALAPACK int myrow, mycol, nprow, npcol, mb, nb; size_t locpF_PxQ, locqF_PxQ, locpG_PxQ, locqG_PxQ; int zero=0, one=1; REAL_T *ptr, reductarr[12]; char ctrl; #endif /* Enforce 3D (and disallow 4D) for second derivatives. */ t1 = seconds(); dim = 3; n = 3*prm->Natom; /* * If the iteration count equals -3, call egb2 to deallocate the * static arrays, then return; otherwise, simply return. */ if (*iter == -3) { egb2(lpairs, upairs, pairlist, x, f, g, grad, prm->Fs, prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel, *context_Nx1, *context_1x1, *context_PxQ, descF_PxQ, descF_1x1, descG_PxQ, *gridim, 1); return (0.0); } /* * If the iteration count is zero or one, print the energy header * for task 0 only. */ if ( (*iter == 0 || *iter == 1) && mytaskid == 0 ) { printf (" iter Total bad vdW elect. cons. genBorn frms\n"); fflush(stdout); } /* * If the iteration count is zero or one, initialize the hi-res * timer variables. */ if (*iter == 0 || *iter == 1) { tnonb2 = tpair = tbond2 = tangl2 = tphi2 = tborn2 = tcons2 = 0.0; tmme2 = tb2g1 = tb2g2 = tb2g3 = tloop = 0.0; tlevel = tdiag = tmmetwo = tcholesky = tnewton = 0.0; } /* * Write the checkpoint file every nchk2 iteration if the chknm * variable is non-NULL, but for task 0 only. */ if (chknm != NULL && (*iter > 0 && *iter % nchk2 == 0) && mytaskid == 0) { checkpoint(chknm, prm->Natom, x, *iter); } /* * Build the pair list if it hasn't already been built; rebuild * it every nsnb iteration. This pair list will be used for * first derivative calculations (for line minimization within * Newton-Raphson) and for non-14 nonbonded interactions (i.e., * when nbond2 is called instead of egb2). * * If SCALAPACK or MPI is defined, build another pair list that * is dedicated for use by egb2. For ScaLAPACK this pair list * will conform to the block cyclic process grid. For MPI this * pair list will be fully populated for each task because the * Born second derivatives are not parallelized under MPI. */ if (nb_pairs < 0 || (*iter > 0 && *iter % nsnb == 0)) { t1 = seconds(); nb_pairs = nblist(lpairs, upairs, pairlist, x, *context_PxQ, 1); t2 = seconds(); tpair += t2 - t1; t1 = t2; /* printf( "nb_pairs = %d\n", nb_pairs ); */ } #if defined(SCALAPACK) || defined(MPI) if (nb_pairs2 < 0 || (*iter > 0 && *iter % nsnb == 0)) { t1 = seconds(); nb_pairs2 = nblist(lpairs2, upairs2, pairlist2, x, *context_PxQ, -1); t2 = seconds(); tpair += t2 - t1; t1 = t2; /* printf( "nb_pairs = %d\n", nb_pairs ); */ } #endif /* Initialize the gradient and Hessian arrays. */ #ifdef SCALAPACK /* * Get the number of rows and columns on the row cyclic (PxQ) process grid, * as well as this task's row and column on the grid. */ blacs_gridinfo_(context_PxQ, &nprow, &npcol, &myrow, &mycol); /* * Only processes that are active on the PxQ grid initialize * the gradient vector and Hessian matrix. */ if (*context_PxQ >= 0) { /* * The numroc_ function is used to calculate the number of vector or * matrix elements that are distributed across a PxQ processor grid * for the gradient vector or the Hessian matrix. Only processes * in grid column zero initialize the gradient vector. */ mb = descF_PxQ[MB_]; nb = descF_PxQ[NB_]; locpF_PxQ = numroc_(&n, &mb, &myrow, &zero, &nprow); locqF_PxQ = numroc_(&one, &nb, &mycol, &zero, &npcol); if (mycol == 0) { for (i = 0; i < locpF_PxQ*locqF_PxQ; i++) { f[i] = 0.0; } } /* Get the block sizes for the Hessian matrix. */ mb = descG_PxQ[MB_]; nb = descG_PxQ[NB_]; locpG_PxQ = numroc_(&n, &mb, &myrow, &zero, &nprow); locqG_PxQ = numroc_(&n, &nb, &mycol, &zero, &npcol); for (ii = 0; ii < locpG_PxQ*locqG_PxQ; ii++) { g[ii] = 0.0; } } #else #pragma omp parallel private (i, j, i3, ii, jj, in9, threadnum, numthreads) { /* * Get the thread number and the number of threads for multi-threaded * execution. */ #ifdef OPENMP threadnum = omp_get_thread_num(); numthreads = omp_get_num_threads(); #endif /* * Initialize the gradient and Hessian arrays inside of the * OpenMP parallel region. * * The "first touch" memory allocation strategy will locate * the elements of these arrays that are initialized by * a particular CPU in memory that is local to that CPU. * * Explicitly assign threads to loop indices for the following loop, * in a manner equivalent to (static, N) scheduling with OpenMP, and * identical to the manner in which threads are assigned in nblist. * * Because no computation follows the following 'for' loop within * this parallel region, the implied barrier at the end of the * parallel region enforces the necessary thread synchronization. */ #ifndef ROW_CYCLIC #pragma omp for #endif for (i = 0; i < prm->Natom; i++) { #if defined(ROW_CYCLIC) && defined(OPENMP) if ( !myroc(i, BLOCKSIZE, numthreads, threadnum) ) continue; #endif i3 = 3 * i; ii = (size_t)i; in9 = (9 * prm->Natom * ii); f[i3 ] = f[i3 + 1] = f[i3 + 2] = 0.0; for (j = 0; j < 9 * prm->Natom; j++) { jj = (size_t)j; g[in9 + jj] = 0.0; } } } #endif t2 = seconds(); tmme2 += t2 - t1; t1 = t2; ebh = ebond2(prm->Nbonh, prm->BondHAt1, prm->BondHAt2, prm->BondHNum, prm->Rk, prm->Req, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); eba = ebond2(prm->Mbona, prm->BondAt1, prm->BondAt2, prm->BondNum, prm->Rk, prm->Req, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); ene[3] = ebh + eba; t2 = seconds(); tbond2 += t2 - t1; t1 = t2; eth = eangl2(prm->Ntheth, prm->AngleHAt1, prm->AngleHAt2, prm->AngleHAt3, prm->AngleHNum, prm->Tk, prm->Teq, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); eta = eangl2(prm->Ntheta, prm->AngleAt1, prm->AngleAt2, prm->AngleAt3, prm->AngleNum, prm->Tk, prm->Teq, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); ene[4] = eth + eta; t2 = seconds(); tangl2 += t2 - t1; t1 = t2; t1 = seconds(); eph = ephi2(prm->Nphih, prm->DihHAt1, prm->DihHAt2, prm->DihHAt3, prm->DihHAt4, prm->DihHNum, prm->Pk, prm->Pn, prm->Phase, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); epa = ephi2(prm->Mphia, prm->DihAt1, prm->DihAt2, prm->DihAt3, prm->DihAt4, prm->DihNum, prm->Pk, prm->Pn, prm->Phase, x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); ene[5] = eph + epa; t2 = seconds(); tphi2 += t2 - t1; t1 = t2; ene[6] = 0.0; /* hbond term not in Amber-94 force field */ nbond2(lpairs, prm->N14pairs, N14pearlist, 1, x, f, g, &enb14, &eel14, scnb, scee, *context_PxQ, descF_PxQ, descG_PxQ); t2 = seconds(); tnonb2 += t2 - t1; t1 = t2; if (e_debug) { EXPR("%9.3f", enb14); EXPR("%9.3f", eel14); } ene[7] = enb14; ene[8] = eel14; if (nconstrained) { ecn = econs2(x, f, g, *context_PxQ, descF_PxQ, descG_PxQ); t2 = seconds(); tcons2 += t2 - t1; t1 = t2; } else ecn = 0.0; ene[9] = ecn; /* Calculate the generalized Born energy and derivatives. */ if (gb) { #if defined(SCALAPACK) || defined(MPI) ene[10] = egb2(lpairs2, upairs2, pairlist2, x, f, g, grad, prm->Fs, prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel, *context_Nx1, *context_1x1, *context_PxQ, descF_PxQ, descF_1x1, descG_PxQ, *gridim, 0); #else ene[10] = egb2(lpairs, upairs, pairlist, x, f, g, grad, prm->Fs, prm->Rborn, prm->Charges, &kappa, &epsext, &enb, &eel, *context_Nx1, *context_1x1, *context_PxQ, descF_PxQ, descF_1x1, descG_PxQ, *gridim, 0); #endif t2 = seconds(); tborn2 += t2 - t1; t1 = t2; ene[1] = enb; ene[2] = eel; } else { #if defined(SCALAPACK) || defined(MPI) nbond2(lpairs2, upairs2, pairlist2, 0, x, f, g, &enb, &eel, 1.0, 1.0, *context_PxQ, descF_PxQ, descG_PxQ); #else nbond2(lpairs, upairs, pairlist, 0, x, f, g, &enb, &eel, 1.0, 1.0, *context_PxQ, descF_PxQ, descG_PxQ); #endif t2 = seconds(); tnonb2 += t2 - t1; t1 = t2; ene[1] = enb; ene[2] = eel; ene[10] = 0.0; } /* * Zero out the frozen forces. Use the local leading dimension of * the distributed Hessian matrix for the ScaLAPACK implementation. */ #pragma omp parallel private (k, j, kk, jj, threadnum, numthreads) { /* * Get the thread number and the number of threads for multi-threaded * execution. Use default values of 0 and 1, respectively, for * single-threaded execution. */ #ifdef SCALAPACK threadnum = myrow; numthreads = nprow; #elif defined(OPENMP) threadnum = omp_get_thread_num(); numthreads = omp_get_num_threads(); #else threadnum = 0; numthreads = 1; #endif /* * Use BLOCKSIZE to assign the loop index i to OpenMP threads * or ScaLAPACK process rows. This approach is identical to * the approach used in the egb2 function. */ #pragma omp for for (k = 0; k < prm->Natom; k++) { #if defined(OPENMP) || defined(SCALAPACK) if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue; #endif kk = (size_t)k; if (frozen[k]) { #ifndef SCALAPACK f[3 * k ] = 0.0; f[3 * k + 1] = 0.0; f[3 * k + 2] = 0.0; #else /* * Only processes in grid column 0 access the gradient vector. * The correct grid row was selected by the above continue statement, * which selects three rows at a time because the block size of * the gradient vector is three times BLOCKSIZE. */ if (mycol == 0) { ptr = ptr1d(f, descF_PxQ, 3*k+0); if (ptr != NULL) *ptr = 0.0; ptr = ptr1d(f, descF_PxQ, 3*k+1); if (ptr != NULL) *ptr = 0.0; ptr = ptr1d(f, descF_PxQ, 3*k+2); if (ptr != NULL) *ptr = 0.0; } #endif } } /* Zero the [k,j] elements of the Hessian matrix. */ for (k = 0; k < prm->Natom; k++) { if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue; kk = (size_t)k; if (frozen[k]) { for (j = 0; j < prm->Natom; j++) { jj = (size_t)j; #ifndef SCALAPACK g[3*jj + 0 + n*(3*kk + 0)] = 0.0; g[3*jj + 1 + n*(3*kk + 1)] = 0.0; g[3*jj + 2 + n*(3*kk + 2)] = 0.0; #else /* * The correct grid row was selected by the above continue statement. * Determine the correct processor grid column for Hessian matrix * access using the ptr2d function. */ ptr = ptr2d(g, descG_PxQ, 3*kk+0, 3*jj+0); if (ptr != NULL) *ptr = 0.0; ptr = ptr2d(g, descG_PxQ, 3*kk+1, 3*jj+1); if (ptr != NULL) *ptr = 0.0; ptr = ptr2d(g, descG_PxQ, 3*kk+2, 3*jj+2); if (ptr != NULL) *ptr = 0.0; #endif } } } #ifdef SCALAPACK /* This step is unnecessary for a square process grid. */ threadnum = mycol; numthreads = npcol; #endif /* Zero the [j,k] elements of the Hessian matrix. */ for (k = 0; k < prm->Natom; k++) { if ( !myroc(k, BLOCKSIZE, numthreads, threadnum) ) continue; kk = (size_t)k; if (frozen[k]) { for (j = 0; j < prm->Natom; j++) { jj = (size_t)j; #ifndef SCALAPACK g[3*kk + 0 + n*(3*jj + 0)] = 0.0; g[3*kk + 1 + n*(3*jj + 1)] = 0.0; g[3*kk + 2 + n*(3*jj + 2)] = 0.0; #else /* * The correct grid column was selected by the above continue statement. * Determine the correct processor grid row for Hessian matrix * access using the ptr2d function. */ ptr = ptr2d(g, descG_PxQ, 3*jj+0, 3*kk+0); if (ptr != NULL) *ptr = 0.0; ptr = ptr2d(g, descG_PxQ, 3*jj+1, 3*kk+1); if (ptr != NULL) *ptr = 0.0; ptr = ptr2d(g, descG_PxQ, 3*jj+2, 3*kk+2); if (ptr != NULL) *ptr = 0.0; #endif } } } } /* * Calculate the RMS error of the gradient vector. * * For non-ScaLAPACK execution, set some variables to values that * will select the proper sections of code below. */ #ifndef SCALAPACK *gridim = 1; *context_PxQ = 0; #endif /* * Some ScaLAPACK functions appear to quit unexpectedly * for large matrices on a 1x1 process grid, e.g. the pdgemm_ * function in newton.c, so bypass the ScaLAPACK pdgemr2d_ * function and calculate the RMS error using f instead of grad. * * The correct test is (gridim == 1), not (nprow == 1 && npcol == 1) * since processes that aren't on the 1x1 grid have nprow == npcol == -1, * which would direct control to pdgemr2d_ (below) that would hang because * it would not be called from all processes, specifically not from * the process that is on the 1x1 grid and has nprow == npcol == 1. * * Non-ScaLAPACK execution will select this section of code because * gridim and context_PxQ were set to 1 and 0, respectively, above. */ if (*gridim == 1) { if (*context_PxQ >= 0) { frms = 0.0; for (i = 0; i < n; i++) frms += f[i] * f[i]; frms = sqrt(frms / ((REAL_T)n)); } } else { #ifdef SCALAPACK /* * For ScaLAPACK, begin by gathering the distributed gradient vector g * into the non-distributed gradient vector grad. The function pdgemr2d_ * will hang unless called from all processes. */ pdgemr2d_(&n, &one, f, &one, &one, descF_PxQ, grad, &one, &one, descF_1x1, context_Nx1); /* * Calculate the RMS error for the gradient vector grad in context_1x1 * where grad exists. */ if (*context_1x1 >= 0) { frms = 0.0; for (i = 0; i < n; i++) { frms += grad[i] * grad[i]; } frms = sqrt(frms / ((REAL_T)n)); } #endif } /* * If SCALAPACK is defined perform a reduction of the ene array, * but only the Van der Waals, Coulombic, and Born energies have * been computed in parallel by egb2, not nbond2. */ #ifdef SCALAPACK if (gb) { MPI_Allreduce(ene, reductarr, 12, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); ene[1] = reductarr[1]; ene[2] = reductarr[2]; ene[10] = reductarr[10]; } #endif /* Calculate the total energy. */ ene[0] = 0.0; for (k = 1; k <= 10; k++) { ene[0] += ene[k]; } /* * Print the energies and rms gradient but only for positive values * of the iteration counter, and if SCALAPACK is defined, only for * context_1x1 where the gradient vector grad is valid. */ #ifdef SCALAPACK if (*context_1x1 >= 0) #elif defined(MPI) if (mytaskid == 0) #endif { if (*iter > -1 && (*iter == 1 || *iter % ntpr == 0)) { printf("ff:%4d %9.2f %9.2f %9.2f %9.2f %9.2f %9.2f %9.2e\n", *iter, ene[0] + esurf, ene[3] + ene[4] + ene[5], ene[1] + ene[7], ene[2] + ene[8], ene[9], ene[10], frms); fflush(stdout); } } /* * Load the atomic masses into the array m because the calling * function (newton) needs these masses to "level shift" to high * frequencies the eigenvalues associated with overall translation * and rotation of the molecule. * * Load the number of atoms as well. */ for (i = 0; i < prm->Natom; i++) { m[i] = prm->Masses[i]; } *natom = prm->Natom; /* A value of -1 for the iteration counter is reserved for printing. */ if(*iter == -1){ printf( "Inside mme2:\n" ); printf( " bond: %15.9f\n", ene[3] ); printf( " angle: %15.9f\n", ene[4] ); printf( " dihedral: %15.9f\n", ene[5] ); printf( " enb14: %15.9f\n", ene[7] ); printf( " eel14: %15.9f\n", ene[8] ); printf( " enb: %15.9f\n", ene[1] ); printf( " eel: %15.9f\n", ene[2] ); printf( " egb: %15.9f\n", ene[10] ); printf( " econs: %15.9f\n", ene[9] ); printf( " esurf: %15.9f\n", ene[11] ); printf( " Total: %15.9f\n", ene[0] ); } t2 = seconds(); tmme2 += t2 - t1; return (ene[0] + esurf); } /*********************************************************************** MME2_TIMER() ************************************************************************/ int mme2_timer( void ) { /* Use the maximum time from all ScaLAPACK processes. */ #ifdef SCALAPACK REAL_T timarr[17], reductarr[17]; timarr[0] = tcons2; timarr[1] = tbond2; timarr[2] = tangl2; timarr[3] = tphi2; timarr[4] = tpair; timarr[5] = tnonb2; timarr[6] = tborn2; timarr[7] = tmme2; timarr[8] = tb2g1; timarr[9] = tb2g2; timarr[10] = tb2g3; timarr[11] = tmmetwo; timarr[12] = tlevel; timarr[13] = tdiag; timarr[14] = tcholesky; timarr[15] = tnewton; timarr[16] = tloop; MPI_Allreduce(timarr, reductarr, 17, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); tcons2 = reductarr[0]; tbond2 = reductarr[1]; tangl2 = reductarr[2]; tphi2 = reductarr[3]; tpair = reductarr[4]; tnonb2 = reductarr[5]; tborn2 = reductarr[6]; tmme2 = reductarr[7]; tb2g1 = reductarr[8]; tb2g2 = reductarr[9]; tb2g3 = reductarr[10]; tmmetwo = reductarr[11]; tlevel = reductarr[12]; tdiag = reductarr[13]; tcholesky = reductarr[14]; tnewton = reductarr[15]; tloop = reductarr[16]; #endif if (mytaskid == 0) { printf("\nSecond derivative timing summary:\n"); printf(" constraints %10.2f\n", tcons2); printf(" bonds %10.2f\n", tbond2); printf(" angles %10.2f\n", tangl2); printf(" torsions %10.2f\n", tphi2); printf(" pairlist %10.2f\n", tpair); printf(" nonbonds %10.2f\n", tnonb2); printf(" gen. Born %10.2f\n", tborn2); printf(" mme2 %10.2f\n", tmme2); printf(" Total deriv %10.2f\n", tbond2 + tangl2 + tphi2 + tpair + tnonb2 + tborn2 + tcons2 + tmme2); printf("\nBorn dgemm timing summary:\n"); printf(" dgemm1 %10.2f\n", tb2g1); printf(" dgemm2 %10.2f\n", tb2g2); printf(" dgemm3 %10.2f\n", tb2g3); printf(" total dgemm %10.2f\n", tb2g1 + tb2g2 + tb2g3); printf(" Born-dgemm %10.2f\n", tloop); printf("\nOverall timing summary:\n"); printf(" derivatives %10.2f\n", tmmetwo); printf(" level %10.2f\n", tlevel); printf(" dsyev %10.2f\n", tdiag); printf(" dposv %10.2f\n", tcholesky); printf(" newton %10.2f\n", tnewton); printf(" Total %10.2f\n\n", tmmetwo + tlevel + tdiag + tcholesky + tnewton); fflush(stdout); } return (0); }