/* 
 * Energy subroutines, to be used in sff.c
 *
 * Parallelization via OpenMP and creation of pair lists using a kd tree
 * were added by Russ Brown (russ.brown@sun.com)
 */

#undef KDTREE_DEBUG

REAL_T econs( REAL_T *x, REAL_T *f )
{
	int i;
	REAL_T e_cons, rx, ry, rz;

	e_cons = 0.0;
	for (i = 0; i < prm->Natom; i++) {
		if (constrained[i]) {
			rx = x[3 * i] - x0[3 * i];
			ry = x[3 * i + 1] - x0[3 * i + 1];
			rz = x[3 * i + 2] - x0[3 * i + 2];
			e_cons += wcons * (rx * rx + ry * ry + rz * rz);
			f[3 * i] += 2. * wcons * rx;
			f[3 * i + 1] += 2. * wcons * ry;
			f[3 * i + 2] += 2. * wcons * rz;
		}
	}
	return (e_cons);
}

REAL_T ebond( int nbond, int *a1, int *a2, int *atype,
	REAL_T *Rk, REAL_T *Req, REAL_T *x, REAL_T *f )
{
	int i, at1, at2, atyp;
	REAL_T e_bond, r, rx, ry, rz, r2, s, db, df, e;

	e_bond = 0.0;
	for (i = 0; i < nbond; i++) {
		at1 = a1[i];
		at2 = a2[i];
		atyp = atype[i] - 1;
		rx = x[at1] - x[at2];
		ry = x[at1 + 1] - x[at2 + 1];
		rz = x[at1 + 2] - x[at2 + 2];
		r2 = rx * rx + ry * ry + rz * rz;
		s = sqrt(r2);
		r = 2.0 / s;
		db = s - Req[atyp];
		df = Rk[atyp] * db;
		e = df * db;
		e_bond += e;
		df *= r;
		f[at1 + 0] += rx * df;
		f[at1 + 1] += ry * df;
		f[at1 + 2] += rz * df;
		f[at2 + 0] -= rx * df;
		f[at2 + 1] -= ry * df;
		f[at2 + 2] -= rz * df;
	}
	if (e_debug)
		EXPR("%9.3f", e_bond);
	return (e_bond);
}

REAL_T eangl( int nang, int *a1, int *a2, int *a3, int *atype,
	REAL_T *Tk, REAL_T *Teq, REAL_T *x, REAL_T *f )
{
	int i, atyp, at1, at2, at3;
	REAL_T dxi, dyi, dzi, dxj, dyj, dzj, ri2, rj2, ri, rj, rir, rjr;
	REAL_T dxir, dyir, dzir, dxjr, dyjr, dzjr, cst, at, da, df, e, e_theta;
	REAL_T xtmp, dxtmp, ytmp, dytmp, ztmp, dztmp;

	e_theta = 0.0;
	for (i = 0; i < nang; i++) {
		at1 = a1[i];
		at2 = a2[i];
		at3 = a3[i];
		atyp = atype[i] - 1;

		dxi = x[at1] - x[at2];
		dyi = x[at1 + 1] - x[at2 + 1];
		dzi = x[at1 + 2] - x[at2 + 2];
		dxj = x[at3] - x[at2];
		dyj = x[at3 + 1] - x[at2 + 1];
		dzj = x[at3 + 2] - x[at2 + 2];

		ri2 = dxi * dxi + dyi * dyi + dzi * dzi;
		rj2 = dxj * dxj + dyj * dyj + dzj * dzj;
		ri = sqrt(ri2);
		rj = sqrt(rj2);
		rir = 1. / ri;
		rjr = 1. / rj;

		dxir = dxi * rir;
		dyir = dyi * rir;
		dzir = dzi * rir;
		dxjr = dxj * rjr;
		dyjr = dyj * rjr;
		dzjr = dzj * rjr;

		cst = dxir * dxjr + dyir * dyjr + dzir * dzjr;
		if (cst > 1.0)
			cst = 1.0;
		if (cst < -1.0)
			cst = -1.0;

		at = acos(cst);
		da = at - Teq[atyp];
		df = da * Tk[atyp];
		e = df * da;
		e_theta = e_theta + e;
		df = df + df;
		at = sin(at);
		if (at > 0 && at < 1.e-3)
			at = 1.e-3;
		else if (at < 0 && at > -1.e-3)
			at = -1.e-3;
		df = -df / at;

		xtmp = df * rir * (dxjr - cst * dxir);
		dxtmp = df * rjr * (dxir - cst * dxjr);
		ytmp = df * rir * (dyjr - cst * dyir);
		dytmp = df * rjr * (dyir - cst * dyjr);
		ztmp = df * rir * (dzjr - cst * dzir);
		dztmp = df * rjr * (dzir - cst * dzjr);

		f[at1 + 0] += xtmp;
		f[at3 + 0] += dxtmp;
		f[at2 + 0] -= xtmp + dxtmp;
		f[at1 + 1] += ytmp;
		f[at3 + 1] += dytmp;
		f[at2 + 1] -= ytmp + dytmp;
		f[at1 + 2] += ztmp;
		f[at3 + 2] += dztmp;
		f[at2 + 2] -= ztmp + dztmp;

	}
	if (e_debug)
		EXPR("%9.3f", e_theta);
	return (e_theta);
}

REAL_T ephi( int nphi, int *a1, int *a2, int *a3, int *a4, int *atype,
	REAL_T *Pk, REAL_T *Pn, REAL_T *Phase, REAL_T *x, REAL_T *f )
{
	REAL_T e, co, den, co1, uu, vv, uv, ax, bx, cx, ay, by, cy, az, bz, cz;
	REAL_T a0x, b0x, c0x, a0y, b0y, c0y, a0z, b0z, c0z, a1x, b1x;
	REAL_T a1y, b1y, a1z, b1z, a2x, b2x, a2y, b2y, a2z, b2z;
	REAL_T dd1x, dd2x, dd3x, dd4x, dd1y, dd2y, dd3y, dd4y, dd1z, dd2z,
		dd3z, dd4z;
	REAL_T df, aa, bb, cc, ab, bc, ac, ktot, cosq;
	REAL_T ktors, phase, e_tors;
	int i, at1, at2, at3, at4, atyp;
	REAL_T ux, uy, uz, vx, vy, vz, delta, phi, dx1, dy1, dz1, yy, pi;
	pi = 3.1415927;

	e_tors = 0.0;
	for (i = 0; i < nphi; i++) {

		at1 = a1[i];
		at2 = a2[i];
		at3 = abs(a3[i]);
		at4 = abs(a4[i]);
		atyp = atype[i] - 1;

		ax = x[at2 + 0] - x[at1 + 0];
		ay = x[at2 + 1] - x[at1 + 1];
		az = x[at2 + 2] - x[at1 + 2];
		bx = x[at3 + 0] - x[at2 + 0];
		by = x[at3 + 1] - x[at2 + 1];
		bz = x[at3 + 2] - x[at2 + 2];
		cx = x[at4 + 0] - x[at3 + 0];
		cy = x[at4 + 1] - x[at3 + 1];
		cz = x[at4 + 2] - x[at3 + 2];

#		define DOT(a,b,c,d,e,f) a*d + b*e + c*f
		ab = DOT(ax, ay, az, bx, by, bz);
		bc = DOT(bx, by, bz, cx, cy, cz);
		ac = DOT(ax, ay, az, cx, cy, cz);
		aa = DOT(ax, ay, az, ax, ay, az);
		bb = DOT(bx, by, bz, bx, by, bz);
		cc = DOT(cx, cy, cz, cx, cy, cz);

		uu = (aa * bb) - (ab * ab);
		vv = (bb * cc) - (bc * bc);
		uv = (ab * bc) - (ac * bb);

		den = 1.0 / sqrt(fabs(uu * vv));
		co = uv * den;
		co1 = 0.5 * co * den;

		a0x = -bc * bx + bb * cx;
		a0y = -bc * by + bb * cy;
		a0z = -bc * bz + bb * cz;

		b0x = ab * cx + bc * ax - 2. * ac * bx;
		b0y = ab * cy + bc * ay - 2. * ac * by;
		b0z = ab * cz + bc * az - 2. * ac * bz;

		c0x = ab * bx - bb * ax;
		c0y = ab * by - bb * ay;
		c0z = ab * bz - bb * az;

		a1x = 2. * uu * (-cc * bx + bc * cx);
		a1y = 2. * uu * (-cc * by + bc * cy);
		a1z = 2. * uu * (-cc * bz + bc * cz);

		b1x = 2. * uu * (bb * cx - bc * bx);
		b1y = 2. * uu * (bb * cy - bc * by);
		b1z = 2. * uu * (bb * cz - bc * bz);

		a2x = -2. * vv * (bb * ax - ab * bx);
		a2y = -2. * vv * (bb * ay - ab * by);
		a2z = -2. * vv * (bb * az - ab * bz);

		b2x = 2. * vv * (aa * bx - ab * ax);
		b2y = 2. * vv * (aa * by - ab * ay);
		b2z = 2. * vv * (aa * bz - ab * az);

		dd1x = (a0x - a2x * co1) * den;
		dd1y = (a0y - a2y * co1) * den;
		dd1z = (a0z - a2z * co1) * den;

		dd2x = (-a0x - b0x - (a1x - a2x - b2x) * co1) * den;
		dd2y = (-a0y - b0y - (a1y - a2y - b2y) * co1) * den;
		dd2z = (-a0z - b0z - (a1z - a2z - b2z) * co1) * den;

		dd3x = (b0x - c0x - (-a1x - b1x + b2x) * co1) * den;
		dd3y = (b0y - c0y - (-a1y - b1y + b2y) * co1) * den;
		dd3z = (b0z - c0z - (-a1z - b1z + b2z) * co1) * den;

		dd4x = (c0x - b1x * co1) * den;
		dd4y = (c0y - b1y * co1) * den;
		dd4z = (c0z - b1z * co1) * den;

		if (prm->Nhparm && a3[i] < 0) {
/*   here we will use a quadratic form for the improper torsion  */

			co = co > 1.0 ? 1.0 : co;
			co = co < -1.0 ? -1.0 : co;

			phi = acos(co);

/*
	now calculate sin(phi) because cos(phi) is symmetric, so
	we can decide between +-phi.
*/

			ux = ay * bz - az * by;
			uy = az * bx - ax * bz;
			uz = ax * by - ay * bx;

			vx = by * cz - bz * cy;
			vy = bz * cx - bx * cz;
			vz = bx * cy - by * cx;

			dx1 = uy * vz - uz * vy;
			dy1 = uz * vx - ux * vz;
			dz1 = ux * vy - uy * vx;

			dx1 = DOT(dx1, dy1, dz1, bx, by, bz);
			if (dx1 < 0.0)
				phi = -phi;

			delta = phi - Phase[atyp];
			delta = delta > pi ? pi : delta;
			delta = delta < -pi ? -pi : delta;

			df = Pk[atyp] * delta;
			e = df * delta;
			e_tors += e;
			yy = sin(phi);

/*
	Decide what expansion to use
	Check first for the "normal" expression, since it will be
	the most used

	the 0.001 value could be lowered for increased precision.
	This insures ~1e-05% error for sin(phi)=0.001
*/

			if (fabs(yy) > 0.001) {
				df = -2.0 * df / yy;
			} else {
				if (fabs(delta) < 0.10) {
					if (Phase[atyp] == 0.0) {
						df = -2.0 * Pk[atyp] * (1 + phi * phi / 6.0);
					} else {
						if (fabs(Phase[atyp]) == pi) {
							df = 2.0 * Pk[atyp] * (1 +
												   delta * delta / 6.0);
						}
					}
				} else {
					if ((phi > 0.0 && phi < (pi / 2.0)) ||
						(phi < 0.0 && phi > -pi / 2.0))
						df = df * 1000.;
					else
						df = -df * 1000.;
				}
			}

		} else {
		  multi_term:

			if (fabs(Phase[atyp] - 3.142) < 0.01)
				phase = -1.0;
			else
				phase = 1.0;

			ktors = Pk[atyp];
			switch ((int) fabs(Pn[atyp])) {

			case 1:
				e = ktors * (1.0 + phase * co);
				df = phase * ktors;
				break;
			case 2:
				e = ktors * (1.0 + phase * (2.*co*co -1.));
				df = phase*ktors*4.*co;
				break;
			case 3:
				cosq = co*co;
				e = ktors * (1.0 + phase * co*(4.*cosq -3.));
				df = phase*ktors*(12.*cosq - 3.);
				break;
			case 4:
				cosq = co*co;
				e = ktors * (1.0 + phase * (8.*cosq*(cosq - 1.) + 1.));
				df = phase*ktors*co*( 32.*cosq -16.);
				break;
			case 6:
				cosq = co*co;
				e = ktors * (1.0 + phase * (32.*cosq*cosq*cosq -
					48.*cosq*cosq + 18.*cosq - 1.));
				df = phase*ktors*co*( 192.*cosq*cosq - 192.*cosq + 36. );
				break;
			default:
				fprintf(stderr, "bad value for Pn: %d %d %d %d %8.3f\n",
						at1, at2, at3, at4, Pn[atyp]);
				exit(1);
			}
			e_tors += e;

		}

		f[at1 + 0] += df * dd1x;
		f[at1 + 1] += df * dd1y;
		f[at1 + 2] += df * dd1z;

		f[at2 + 0] += df * dd2x;
		f[at2 + 1] += df * dd2y;
		f[at2 + 2] += df * dd2z;

		f[at3 + 0] += df * dd3x;
		f[at3 + 1] += df * dd3y;
		f[at3 + 2] += df * dd3z;

		f[at4 + 0] += df * dd4x;
		f[at4 + 1] += df * dd4y;
		f[at4 + 2] += df * dd4z;

#ifdef PRINT_EPHI
		printf("%4d %4d %4d %4d %4d %9.4f\n", i + 1, at1 / 3, at2 / 3,
			   at3 / 3, at4 / 3, e);
		printf("%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f%10.5f\n",
			   -df * dd1x, -df * dd1y, -df * dd1z, -df * dd2x, -df * dd2y,
			   -df * dd2z, -df * dd3x, -df * dd3y);
		printf("%10.5f%10.5f%10.5f%10.5f\n", -df * dd3z, -df * dd4x,
			   -df * dd4y, -df * dd4z);
#endif
		if (Pn[atyp] < 0) {
			atyp++;
			goto multi_term;
		}

	}
	if (e_debug)
		EXPR("%9.3f", e_tors);
	return (e_tors);
}

REAL_T eimphi( int nphi, int *a1, int *a2, int *a3, int *a4, int *atype,
	REAL_T *Pk, REAL_T *Peq, REAL_T *x, REAL_T *f )
{

/*
	calculate improper dihedral energies and forces according
	to T.Schlick, J.Comput.Chem, vol 10, 7 (1989) 
	with local variant (no delta funcion involved)

   If |sin(phi)|.gt.1e-03 use the angle directly
       V = kimp ( phi -impeq ) ^2
	dV/dcos(phi) = -2*k/sin(phi)

   If |sin(phi)|.lt.1e-03 and (phi-Peq).lt.0.1
   and cnseq(phi)=0.
	V = kimp (phi - impeq)^2
	dV/dcos(phi) = -2*k*(1+phi*phi/6.)

   If |sin(phi)|.lt.1e-03 and (phi-Peq).lt.0.1
   and cnseq(phi)=pi
	V = kimp (phi - impeq)^2
	dV/dcos(phi) = 2*k*(1+phi*phi/6.)

   Now, if |sin(phi)|.lt.1e-03 but far from minimum,
   set sin(phi) to 1e-03 arbitrarily

*/

	REAL_T e;
	REAL_T co, phi, den, co1;
	REAL_T uu, vv, uv;
	REAL_T ax, bx, cx;
	REAL_T ay, by, cy;
	REAL_T az, bz, cz;
	REAL_T ux, uy, uz;
	REAL_T vx, vy, vz;
	REAL_T dx1, dy1, dz1;
	REAL_T a0x, b0x, c0x;
	REAL_T a0y, b0y, c0y;
	REAL_T a0z, b0z, c0z;
	REAL_T a1x, b1x;
	REAL_T a1y, b1y;
	REAL_T a1z, b1z;
	REAL_T a2x, b2x;
	REAL_T a2y, b2y;
	REAL_T a2z, b2z;
	REAL_T dd1x, dd2x, dd3x, dd4x;
	REAL_T dd1y, dd2y, dd3y, dd4y;
	REAL_T dd1z, dd2z, dd3z, dd4z;
	REAL_T df, delta, pi, yy;
	REAL_T aa, bb, cc, ab, bc, ac;
	REAL_T e_imp;
	int i;
	int at1, at2, at3, at4, atyp;

	pi = 3.1415927;

	e_imp = 0.0;
	for (i = 0; i < nphi; i++) {

		at1 = a1[i];
		at2 = a2[i];
		at3 = abs(a3[i]);
		at4 = abs(a4[i]);
		atyp = atype[i] - 1;


/*	calculate distances  */

		ax = x[at2 + 0] - x[at1 + 0];
		ay = x[at2 + 1] - x[at1 + 1];
		az = x[at2 + 2] - x[at1 + 2];
		bx = x[at3 + 0] - x[at2 + 0];
		by = x[at3 + 1] - x[at2 + 1];
		bz = x[at3 + 2] - x[at2 + 2];
		cx = x[at4 + 0] - x[at3 + 0];
		cy = x[at4 + 1] - x[at3 + 1];
		cz = x[at4 + 2] - x[at3 + 2];

#		define DOT(a,b,c,d,e,f) a*d + b*e + c*f
		ab = DOT(ax, ay, az, bx, by, bz);
		bc = DOT(bx, by, bz, cx, cy, cz);
		ac = DOT(ax, ay, az, cx, cy, cz);
		aa = DOT(ax, ay, az, ax, ay, az);
		bb = DOT(bx, by, bz, bx, by, bz);
		cc = DOT(cx, cy, cz, cx, cy, cz);


		uu = (aa * bb) - (ab * ab);
		vv = (bb * cc) - (bc * bc);
		uv = (ab * bc) - (ac * bb);

		den = 1.0 / sqrt(fabs(uu * vv));
		co = uv * den;
		co = co > 1.0 ? 1.0 : co;
		co = co < -1.0 ? -1.0 : co;

		phi = acos(co);

/*
	now calculate sin(phi) because cos(phi) is symmetric, so
	we can decide between +-phi.
*/

		ux = ay * bz - az * by;
		uy = az * bx - ax * bz;
		uz = ax * by - ay * bx;

		vx = by * cz - bz * cy;
		vy = bz * cx - bx * cz;
		vz = bx * cy - by * cx;

		dx1 = uy * vz - uz * vy;
		dy1 = uz * vx - ux * vz;
		dz1 = ux * vy - uy * vx;

		dx1 = DOT(dx1, dy1, dz1, bx, by, bz);
		if (dx1 < 0.0)
			phi = -phi;

		delta = phi - Peq[atyp];
		delta = delta > pi ? pi : delta;
		delta = delta < -pi ? -pi : delta;

		df = Pk[atyp] * delta;
		e = df * delta;
		e_imp += e;
		yy = sin(phi);

/*
	Decide what expansion to use
	Check first for the "normal" expression, since it will be
	the most used

	the 0.001 value could be lowered for increased precision.
	This insures ~1e-05% error for sin(phi)=0.001
*/

		if (fabs(yy) > 0.001) {
			df = -2.0 * df / yy;
		} else {
			if (fabs(delta) < 0.10) {
				if (Peq[atyp] == 0.0) {
					df = -2.0 * Pk[atyp] * (1 + phi * phi / 6.0);
				} else {
					if (fabs(Peq[atyp]) == pi)
						df = 2.0 * Pk[atyp] * (1 + delta * delta / 6.0);
				}
			} else {
				if ((phi > 0.0 && phi < (pi / 2.0)) ||
					(phi < 0.0 && phi > -pi / 2.0))
					df = df * 1000.;
				else
					df = -df * 1000.;
			}
		}

		co1 = 0.5 * co * den;

		a0x = -bc * bx + bb * cx;
		a0y = -bc * by + bb * cy;
		a0z = -bc * bz + bb * cz;

		b0x = ab * cx + bc * ax - 2. * ac * bx;
		b0y = ab * cy + bc * ay - 2. * ac * by;
		b0z = ab * cz + bc * az - 2. * ac * bz;

		c0x = ab * bx - bb * ax;
		c0y = ab * by - bb * ay;
		c0z = ab * bz - bb * az;

		a1x = 2. * uu * (-cc * bx + bc * cx);
		a1y = 2. * uu * (-cc * by + bc * cy);
		a1z = 2. * uu * (-cc * bz + bc * cz);

		b1x = 2. * uu * (bb * cx - bc * bx);
		b1y = 2. * uu * (bb * cy - bc * by);
		b1z = 2. * uu * (bb * cz - bc * bz);

		a2x = -2. * vv * (bb * ax - ab * bx);
		a2y = -2. * vv * (bb * ay - ab * by);
		a2z = -2. * vv * (bb * az - ab * bz);

		b2x = 2. * vv * (aa * bx - ab * ax);
		b2y = 2. * vv * (aa * by - ab * ay);
		b2z = 2. * vv * (aa * bz - ab * az);

		dd1x = (a0x - a2x * co1) * den;
		dd1y = (a0y - a2y * co1) * den;
		dd1z = (a0z - a2z * co1) * den;

		dd2x = (-a0x - b0x - (a1x - a2x - b2x) * co1) * den;
		dd2y = (-a0y - b0y - (a1y - a2y - b2y) * co1) * den;
		dd2z = (-a0z - b0z - (a1z - a2z - b2z) * co1) * den;

		dd3x = (b0x - c0x - (-a1x - b1x + b2x) * co1) * den;
		dd3y = (b0y - c0y - (-a1y - b1y + b2y) * co1) * den;
		dd3z = (b0z - c0z - (-a1z - b1z + b2z) * co1) * den;

		dd4x = (c0x - b1x * co1) * den;
		dd4y = (c0y - b1y * co1) * den;
		dd4z = (c0z - b1z * co1) * den;

		f[at1 + 0] += df * dd1x;
		f[at1 + 1] += df * dd1y;
		f[at1 + 2] += df * dd1z;

		f[at2 + 0] += df * dd2x;
		f[at2 + 1] += df * dd2y;
		f[at2 + 2] += df * dd2z;

		f[at3 + 0] += df * dd3x;
		f[at3 + 1] += df * dd3y;
		f[at3 + 2] += df * dd3z;

		f[at4 + 0] += df * dd4x;
		f[at4 + 1] += df * dd4y;
		f[at4 + 2] += df * dd4z;

	}

	return (e_imp);
}

/*
 * The downheap function from Robert Sedgewick's "Algorithms in C++" p. 152,
 * corrected for the fact that Sedgewick indexes the heap array from 1 to n
 * whereas Java indexes the heap array from 0 to n-1. Note, however, that
 * the heap should be indexed conceptually from 1 to n in order that for
 * any node k the two children are found at nodes 2*k and 2*k+1. Move down
 * the heap, exchanging the node at position k with the larger of its two
 * children if necessary and stopping when the node at k is larger than both
 * children or the bottom of the heap is reached. Note that it is possible
 * for the node at k to have only one child: this case is treated properly.
 * A full exchange is not necessary because the variable 'v' is involved in
 * the exchanges.  The 'while' loop has two exits: one for the case that
 * the bottom of the heap is hit, and another for the case that the heap
 * condition (the parent is greater than or equal to both children) is
 * satisfied somewhere in the interior of the heap.
 *
 * Calling parameters are as follows:
 *
 * a - array of indices into the atomic coordinate array x
 * n - the number of items to be sorted
 * k - the exchange node (or element)
 * x - the atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 */

static void downheap(int *a, int n, int k, REAL_T *x, int p) {

  int j, v;

  v = a[k-1];
  while (k <= n/2) {
    j = k + k;
    if ( (j < n) && (x[3 * a[j-1] + p] < x[3 * a[j] + p]) ) j++;
    if (x[3 * v + p] >= x[3 * a[j-1] + p]) break;
    a[k-1] = a[j-1];
    k = j;
  }
  a[k-1] = v;
}

/*
 * The heapsort function from Robert Sedgewick's "Algorithms in C++" p. 156,
 * corrected for the fact that Sedgewick indexes the heap array from 1 to n
 * whereas Java indexes the heap array from 0 to n-1. Note, however, that
 * the heap should be indexed conceptually from 1 to n in order that for
 * any node k the two children are found at nodes 2*k and 2*k+1.  In what
 * follows, the 'for' loop heaporders the array in linear time and the
 * 'while' loop exchanges the largest element with the last element then
 * repairs the heap.
 *
 * Calling parameters are as follows:
 *
 * a - array of indices into the atomic coordinate array x
 * n - the number of items to be sorted
 * x - the atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 */

static void heapsort(int *a, int n, REAL_T *x, int p) {

  int k, v;

  for (k = n/2; k >= 1; k--) downheap(a, n, k, x, p);
  while (n > 1) {
    v = a[0];
    a[0] = a[n-1];
    a[n-1] = v;
    downheap(a, --n, 1, x, p);
  }
}

/*
 * Build the kd tree by recursively subdividing the atom number
 * arrays and adding nodes to the tree.  Note that the arrays
 * are permuted cyclically as control moves down the tree in
 * order that sorting occur on x, y and z.  The temporary array
 * is provided to facilitate the copy and partition operation.
 *
 * Calling parameters are as follows:
 *
 * xn - x sorted array of atom numbers
 * yn - y sorted array of atom numbers
 * zn - z sorted array of atom numbers
 * tn - temporary array for atom numbers
 * start - first element of array 
 * end - last element of array
 * kdpptr - pointer to pointer to kd tree node next available for allocation
 * that - the node currently visited, the equivalent of 'this' in C++
 * x - atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 */

static void buildkdtree(int *xn, int *yn, int *zn, int *tn, int start, int end,
			KDNODE **kdpptr, KDNODE *that, REAL_T *x, int p) {

  int i, middle, lower, upper;
  REAL_T median;

  /* The partition cycles modulo 3. */

  p %= 3;

  /* If only one element is passed to this function, add it to the tree. */

  if (end == start) {
    that->n = xn[start];
  }

  /*
   * Otherwise, if two elements are passed to this function, determine
   * whether the first element is the low child or the high child.
   * Allocate a new KDNODE and make it one or the other of the children.
   */

  else if (end == start + 1) {
    that->n = xn[end];
    (*kdpptr)->n = xn[start];
    (*kdpptr)->lo = NULL;
    (*kdpptr)->hi = NULL;
    if (x[3 * xn[start] + p] < x[3 * xn[end] + p]) {
      that->lo = (*kdpptr)++;
    } else {
      that->hi = (*kdpptr)++;
    }
  }

  /* Otherwise, more than two elements are passed to this function. */

  else {

    /*
     * The middle element of the xn array is taken as the element about
     * which the yn and zn arrays will be partitioned.  However, search
     * lower elements of the xn array to ensure that the x values of the
     * atomic coordinate array that correspond to these elements are indeed
     * less than the median value because they may be equal to it.  This
     * approach is consistent with partitioning between < and >=.
     */

    middle = (start + end)/2;
    median = x[3 * xn[middle] + p];
    for (i = middle - 1; i >= start; i--) {
      if (x[3 * xn[i] + p] < median) {
	break;
      } else {
	middle = i;
      }
    }

    /* Store the middle element at this kd node. */

    that->n = xn[middle];

    /*
     * Scan the yn array in ascending order and compare the x value of
     * each corresponding element of the atomic coordinate array to the
     * median value.  If the x value is less than the median value, copy
     * the element of the yn array into the lower part of the tn array.
     * If the x value is greater than or equal to the median value, copy
     * the element of the yn array into the upper part of the tn array.
     * The lower part of the tn array begins with the start index, and the
     * upper part of the tn array begins one element above the middle index.
     * At the end of this scan and copy operation, the tn array will have
     * been subdivided into three groups: (1) a group of indices beginning
     * with start and continuing up to but not including middle, which indices
     * point to atoms for which the x value is less than the median value;
     * (2) the middle index that has been stored in this node of  the kd tree;
     * and (3) a group of indices beginning one address above middle and
     * continuing up to and including end, which indices point to atoms for
     * which the x value is greater than or equal to the median value.
     *
     * This approach preserves the relative heapsorted order of elements
     * of the atomic coordinate array that correspond to elements of the
     * yn array while those elements are partitioned about the x median.
     *
     * Note: when scanning the yn array, skip the element (i.e., the atom
     * number) that equals the middle element because that atom number has
     * been stored at this node of the kd-tree.
     */

    lower = start - 1;
    upper = middle;
    for (i = start; i <= end; i++) {
      if (yn[i] != xn[middle]) {
	if (x[3 * yn[i] + p] < median) {
	  tn[++lower] = yn[i];
	} else {
	  tn[++upper] = yn[i];
	}
      }
    }

    /* Ensure that all yn array elements have been copied . */

#ifdef KDTREE_DEBUG

    if (lower != middle - 1) {
      printf("yn: lower = %d doesn't equal middle = %d - 1\n", lower, middle);
      exit (1);
    }
    if (upper != end) {
      printf("yn: upper = %d doesn't equal end = %d\n", upper, end);
      exit (1);
    }

#endif

    /*
     * All elements of the yn array between start and end have been copied
     * and partitioned into the tn array, so the yn array is available for
     * elements of the zn array to be copied and partitioned into the yn
     * array, in the same manner in which elements of the yn array were
     * copied and partitioned into the tn array.
     *
     * This approach preserves the relative heapsorted order of elements
     * of the atomic coordinate array that correspond to elements of the
     * zn array while those elements are partitioned about the x median.
     *
     * Note: when scanning the zn array, skip the element (i.e., the atom
     * number) that equals the middle element because that atom number has
     * been stored at this node of the kd-tree.
     */

    lower = start - 1;
    upper = middle;
    for (i = start; i <= end; i++) {
      if (zn[i] != xn[middle]) {
	if (x[3 * zn[i] + p] < median) {
	  yn[++lower] = zn[i];
	} else {
	  yn[++upper] = zn[i];
	}
      }
    }

    /* Ensure that all zn array elements have been copied . */

#ifdef KDTREE_DEBUG

    if (lower != middle - 1)
      printf("zn: lower = %d doesn't equal middle = %d - 1\n", lower, middle);
    if (upper != end)
      printf("zn: upper = %d doesn't equal end = %d\n", upper, end);

#endif

    /*
     * Recurse down the lo branch of the tree if the lower group of
     * the tn array is non-null.  Note permutation of the xn, yn, zn
     * and tn arrays.  In particular, xn was used for partitioning at
     * this level of the tree.  At one level down the tree, yn (which
     * has been copied into tn) will be used for partitioning.  At two
     * levels down the tree, zn (which has been copied into yn) will
     * be used for partitioning.  At three levels down the tree, xn
     * will be used for partitioning.  In this manner, partitioning
     * cycles through xn, yn and zn at successive levels of the tree.
     */

    if (lower >= start) {
      (*kdpptr)->lo = NULL;
      (*kdpptr)->hi = NULL;
      that->lo = (*kdpptr)++;
      buildkdtree(tn, yn, xn, zn, start, lower, kdpptr, that->lo, x, p+1);
    }
      
    /*
     * Recurse down the hi branch of the tree if the upper group of
     * the tn array is non-null.  Note permutation of the xn, yn, zn
     * and tn arrays, as explained above for recursion down the lo
     * branch of the tree.
     */

    if (upper > middle) {
      (*kdpptr)->lo = NULL;
      (*kdpptr)->hi = NULL;
      that->hi = (*kdpptr)++;
      buildkdtree(tn, yn, xn, zn, middle+1, end, kdpptr, that->hi, x, p+1);
    }
  }
}

#ifdef KDTREE_DEBUG

/*
 * Check that all children are < the reference kd node.
 *
 * Calling parameters are as follows:
 *
 * parent - the node against which all children are compared
 * that - the node currently visited, equivalent to 'this' in C++
 * x - atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
*/

static void checkkdlo(KDNODE *parent, KDNODE *that, REAL_T *x, int p) {

  /* Check the current kdnode against its parent. */

  if (x[3 * that->n + p] > x[3 * parent->n + p]) {
    printf("Error in checkkdlo!  x[%d] = %f > x[%d] = %f\n",
	   3 * that->n + p, x[3 * that->n + p],
	   3 * parent->n + p, x[3 * parent->n + p]);
    exit (1);
  }

  /* Check the children against their grandparent. */

  if (that->lo != NULL) {
    checkkdlo(parent, that->lo, x, p);
  }
  if (that->hi != NULL) {
    checkkdlo(parent, that->hi, x, p);
  }
}

/*
 * Check that all children are >= the reference kd node.
 *
 * Calling parameters are as follows:
 *
 * parent - the node against which all children are compared
 * that - the node currently visited, equivalent to 'this' in C++
 * x - atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 */

static void checkkdhi( KDNODE *parent, KDNODE *that, REAL_T *x, int p ) {

  /* Check the current kdnode against its parent. */

  if (x[3 * parent->n + p] > x[3 * that->n + p]) {
    printf("Error in checkkdhi!  x[%d] = %f > x[%d] = %f\n",
	   3 * parent->n + p, 3 * x[parent->n + p],
	   3 * that->n + p, 3 * x[that->n + p]);
    exit (1);
  }

  /* Check the children against their grandparent. */

  if (that->lo != NULL) {
    checkkdhi(parent, that->lo, x, p);
  }
  if (that->hi != NULL) {
    checkkdhi(parent, that->hi, x, p);
  }
}

/*
 * Walk the kd tree and call the check functions.  All children on the
 * lo branch of the tree should be < the reference kd node.  All children
 * on the hi branch of the tree should be >= the reference kd node.
 *
 * Calling parameters are as follows:
 *
 * that - the node currently visited, equivalent to 'this' in C++
 * x - atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 */

static void checkkdtree( KDNODE *that, REAL_T *x, int p ) {

  p %= 3;

  if (that->lo != NULL) {
    checkkdtree(that->lo, x, p+1);
    checkkdlo(that, that->lo, x, p);
  }
  if (that->hi != NULL) {
    checkkdtree(that->hi, x, p+1);
    checkkdhi(that, that->hi, x, p);
  }
}

#endif

/*
 * Walk the kd tree and generate the pair lists for the upper and lower
 * triangles.  The pair lists are not ordered in the atom number.
 *
 * Calling parameters are as follows:
 *
 * that - the node currently visited, equivalent to 'this' in C++
 * x - atomic coordinate array
 * p - the partition (x, y or z) on which sorting occurs
 * q - the query atom number
 * lopairs - pair count array indexed by atom for the lower triangle
 * uppairs - pair count array indexed by atom for the upper triangle
 * lopairlist - the pair list for the lower triangle
 * uppairlist - the pair list for the upper triangle
 */

static void searchkdtree( KDNODE *that, REAL_T *x, int p, int q,
			  int *lopairs, int *uppairs,
			  int *lopairlist, int *uppairlist ) {

  /* The partition cycles modulo 3. */

  p %= 3;

  /*
   * Search the low branch of the tree if the atomic coordinate of the
   * query atom minus the cutoff radius less than the atomic coordinate
   * of the kd node atom.
   */

  if ( (that->lo != NULL) &&
       (x[3 * q + p] - cut < x[3 * that->n + p]) ) {
    searchkdtree(that->lo, x, p+1, q, lopairs, uppairs, lopairlist, uppairlist);
  }

  /*
   * If the query atom number does not equal the kd tree node atom number
   * and at least one of the two atoms is not frozen, calculate the interatomic
   * distance and add the kd tree node atom to one of the pair lists if the
   * distance is less than the cutoff distance.  The atom belongs on the lower
   * triangle pair list if the atom number is less than the query node atom
   * number.  Otherwise, it belongs on the upper triangle pair list.
   */

  if ( ( q != that->n ) && ( !frozen[q] || !frozen[that->n] ) ){
    REAL_T xij = x[3 * q + 0] - x[3 * that->n + 0];
    REAL_T yij = x[3 * q + 1] - x[3 * that->n + 1];
    REAL_T zij = x[3 * q + 2] - x[3 * that->n + 2];
    REAL_T r2 = xij * xij + yij * yij + zij * zij;
    if (r2 < cut2) {
      if (that->n < q) {
	lopairlist[lopairs[q]++] = that->n;
      } else {
	uppairlist[uppairs[q]++] = that->n;
      }
    }
  }

  /*
   * Search the high branch of the tree if the atomic coordinate of the
   * query atom plus the cutoff radius less than than the atomic coordinate
   * of the kd node atom.
   */

  if ( (that->hi != NULL) &&
       (x[3 * q + p] + cut >= x[3 * that->n + p]) ) {
    searchkdtree(that->hi, x, p+1, q, lopairs, uppairs, lopairlist, uppairlist);
  }
}

/*
 * Create the non-bonded pairlist using a kd-tree.  The kd-tree nodes
 * are allocated from the kdtree array.
 *
 * Calling parameters are as follows:
 *
 * x - atomic coordinate array
 * lopairs - pair count array indexed by atom for the lower triangle
 * uppairs - pair count array indexed by atom for the upper triangle
 * lostart - beginning address indexed by atom for the lower triangle
 * upstart - beginning address indexed by atom for the upper triangle
 * pairlist - the pair list for the lower and upper triangles
 *
 * This function returns the total number of pairs.
 */

int nblist( REAL_T *x, int *lopairs, int *uppairs, int* lostart, int* upstart,
	    int *bothpairlist )
{
  int i, j;
  int *xn, *yn, *zn, *tn;
  int totpair = 0;
  KDNODE *kdtree, *kdptr;

  /* Square the cutoff distance for use in searchkdtree. */

  cut2 = cut * cut;

  /* Allocate the kdtree array that must hold one node per atom. */

  if ( (kdtree = (KDNODE *) malloc(prm->Natom*sizeof(KDNODE))) == NULL ) {
    printf("Error allocate kdnode array in nbtree!\n");
    exit (0);
  }

  /*
   * Allocate, initialize and sort the arrays that hold the results of the
   * heapsort on x,y,z.  These arrays are used as pointers (via array indices)
   * into the atomic coordinate array x.  Allocate an additional temp array
   * so that the buildkdtree function can cycle through x,y,z.
   */

  xn = ivector(0, prm->Natom);
  yn = ivector(0, prm->Natom);
  zn = ivector(0, prm->Natom);
  tn = ivector(0, prm->Natom);

  for (i = 0; i < prm->Natom; i++) {
    xn[i] = yn[i] = zn[i] = i;
  }

  heapsort(xn, prm->Natom, x, 0);
  heapsort(yn, prm->Natom, x, 1);
  heapsort(zn, prm->Natom, x, 2);

#ifdef KDTREE_DEBUG

  /* Verify that heapsort functioned correctly. */

  for (i = 1; i < prm->Natom; i++) {
    if (x[3 * xn[i] + 0] < x[3 * xn[i-1] + 0])
      printf("Error in x heapsort!  x[%d] = %f  x[%d] = %f\n",
	     i, x[3 * xn[i] + 0], i, x[3 * xn[i-1] + 0]);

    if (x[3 * yn[i] + 1] < x[3 * yn[i-1] + 1])
      printf("Error in y heapsort!  y[%d] = %f  y[%d] = %f\n",
	     i, x[3 * yn[i] + 1], i, x[3 * yn[i-1] + 1]);

    if (x[3 * zn[i] + 2] < x[3 * zn[i-1] + 2])
      printf("Error in z heapsort!  z[%d] = %f  z[%d] = %f\n",
	     i, x[3 * zn[i] + 2], i, x[3 * zn[i-1] + 2]);
  }

#endif

  /* Build the kd tree. */

  kdptr = kdtree;
  KDNODE *root = kdptr++;
  root->lo = NULL;
  root->hi = NULL;
  buildkdtree(xn, yn, zn, tn, 0, prm->Natom-1, &kdptr, root, x, 0);

  /*
   * Search the kd tree with each atom.  Use xn and yn as temporary
   * arrays for the lower and upper triangle pair lists for one atom.
   */

  for (i = 0; i < prm->Natom; i++) {
    lostart[i] = totpair;
    lopairs[i] = uppairs[i] = 0;
    searchkdtree(root, x, 0, i, lopairs, uppairs, xn, yn);
    for (j = 0; j < lopairs[i]; j++) {
      if (totpair >= maxnb) {
	printf("maximum pairlist size exceeded by lower triangle\n");
	exit(1);
      } else {
	bothpairlist[totpair++] = xn[j];
      }
    }
    upstart[i] = totpair;
    for (j = 0; j < uppairs[i]; j++) {
      if (totpair >= maxnb) {
	printf("maximum pairlist size exceeded by upper triangle\n");
	exit(1);
      } else {
	bothpairlist[totpair++] = yn[j];
      }
    }
  }

#ifdef KDTREE_DEBUG

  /* Check that the kd tree is built correctly. */

  checkkdtree(root, x, 0);

  /*
   * Verify that every atom j on the lower triangle pair list has
   * an atom number less than i.
   */

  for (i = 0; i < prm->Natom; i++) {
    for (j = 0; j < lopairs[i]; j++) {
      if (bothpairlist[lostart[i] + j] > i) {
	printf("Lower triangle j = %d > i = %d\n",
	       bothpairlist[lostart[i] + j], i);
	exit(1);
      }
    }
  }

  /*
   * Verify that every atom j on the upper triangle pair list has
   * an atom number greater than i.
   */

  for (i = 0; i < prm->Natom; i++) {
    for (j = 0; j < uppairs[i]; j++) {
      if (bothpairlist[upstart[i] + j] < i) {
	printf("Upper triangle j = %d < i = %d\n",
	       bothpairlist[upstart[i] + j], i);
	exit(1);
      }
    }
  }

  /* Verify that every atom on the pair list has a valid atom number. */

  for (i = 0; i < prm->Natom; i++) {
    for (j = lostart[i]; j < lostart[i] + lopairs[i] + uppairs[i]; j++) {
      if (bothpairlist[j] < 0 || bothpairlist[j] >= prm->Natom) {
	printf("pairlist[%d] = %d is not a valid atom number!\n",
	       j, bothpairlist[j]);
	exit(1);
      }
    }
  }

  /* Verify that every atom on the pair list is within the cutoff radius. */

  for (i = 0; i < prm->Natom; i++) {
    REAL_T xi = x[3 * i    ];
    REAL_T yi = x[3 * i + 1];
    REAL_T zi = x[3 * i + 2];
    for (j = 0; j < lopairs[i] + uppairs[i]; j++) {
      REAL_T xij = xi - x[3 * bothpairlist[lostart[i] + j]    ];
      REAL_T yij = yi - x[3 * bothpairlist[lostart[i] + j] + 1];
      REAL_T zij = zi - x[3 * bothpairlist[lostart[i] + j] + 2];
      REAL_T r2 = xij * xij + yij * yij + zij * zij;
      if (r2 >= cut2) {
	printf("Atom %d is not within the cutoff radius from atom %d\n",
	       bothpairlist[lostart[i] + j], i);
	exit(1);
      }
    }
  }

  /*
   * Verify that every atom not on the pair lists is not within the
   * cutoff radius.  Set every element of the xn array to -1.  Then
   * for every atom i, mark the atoms j on the pairlist of i by setting
   * xn[j] to i.  Mark also atom i by setting xn[i] to i because atom i
   * is always within its own cutoff radius.  Now the atoms j that are
   * not on the pairlist of atom i may be identified because xn[j] is
   * not equal to i.  For example, xn[j] could be equal to i-1 because
   * it was marked from the pairlist of atom i-1.  Marked atoms remain
   * validly marked until the entire process begins again with the first i.
   */

  for (i = 0; i < prm->Natom; i++) {
    xn[i] = -1;
  }
  for (i = 0; i < prm->Natom; i++) {
    REAL_T xi = x[3 * i    ];
    REAL_T yi = x[3 * i + 1];
    REAL_T zi = x[3 * i + 2];
    for (j = 0; j < lopairs[i] + uppairs[i]; j++) {
      xn[bothpairlist[lostart[i] + j]] = i;
    }
    xn[i] = i;
    for (j = 0; j < prm->Natom; j++) {
      if (xn[j] != i) {
	REAL_T xij = xi - x[3 * j    ];
	REAL_T yij = yi - x[3 * j + 1];
	REAL_T zij = zi - x[3 * j + 2];
	REAL_T r2 = xij * xij + yij * yij + zij * zij;
	if (r2 < cut2) {
	  printf("Atom %d is within the cutoff radius from atom %d\n", xn[j], i);
	  exit(1);
	}
      }
    }
  }

#endif

  /* Free the temporary arrays. */

  free(kdtree);
  free_ivector(xn, 0, prm->Natom);
  free_ivector(yn, 0, prm->Natom);
  free_ivector(zn, 0, prm->Natom);
  free_ivector(tn, 0, prm->Natom);

  return totpair;
}

/* 
 * Calculate the non-bonded energy and first derivatives.
 * This function is complicated by the fact that it must
 * process two forms of pair lists: the 1-4 pair list and
 * the non-bonded pair list.  The non-bonded pair list
 * must be modified by the excluded atom list whereas the
 * 1-4 pair list is used unmodified.  Also, the non-bonded
 * pair list comprises lower and upper triangles whereas
 * the 1-4 pair list comprises an upper triangle only.
 *
 * Calling parameters are as follows:
 *
 * npairs - the number of pairs on either pair list
 * pairlist - either the 1-4 pair list or the non-bonded pair list
 * offset - the starting address of the upper triangle; ignored
 *          for the 1-4 pair list
 * non14 - set to 1 for the non-bonded pair list, 0 for the 1-4 pair list
 * x - the atomic coordinate array
 * f - the gradient vector
 * enb - Van der Waals energy return value, passed by reference
 * eel - Coulombic energy return value, passed by reference
 * enbfac - scale factor for Van der Waals energy
 * eelfac - scale factor for Coulombic energy
 */

int nbond( int *npairs, int *pairlist, int *offset, int non14,
	   REAL_T *x, REAL_T *f, REAL_T *enb, REAL_T *eel,
	   REAL_T enbfac, REAL_T eelfac )
{
  int i, j, jn, ic, npr, lpair, iaci, iexcl, nexcl;
  int *iexw;
  REAL_T dumx, dumy, dumz, cgi, r2inv, df2, r6, r10, f1, f2;
  REAL_T dedx, dedy, dedz, df, enbfaci, eelfaci;
  REAL_T xi, yi, zi, xij, yij, zij, r, r2;
  REAL_T dis, kij, d0, diff, rinv, rs, rssq, eps1, epsi, cgijr, pow;
  int nhbpair, ibig, isml;
#if 0
  REAL_T hbener, nb14;
  nhbpair = 0;
  hbener = 0.0;
  nb14 = 0.0;
#endif

#define SIG 0.3
#define DIW 78.0
#define C1 38.5

  *enb = 0.;
  *eel = 0.;
  enbfaci = 1. / enbfac;
  eelfaci = 1. / eelfac;

  /* Initialize offsets into the pair list and the excluded atom list. */

  lpair = 0;
  iexcl = 0;

  /*
   * Allocate and initialize the iexw array used for skipping excluded
   * atoms.  Note that because of the manner in which iexw is used, it
   * is necessary to initialize it before only the first iteration of
   * the following loop.
   */

  iexw = ivector(-1, prm->Natom);
  for (i = -1; i < prm->Natom; i++) {
    iexw[i] = -1;
  }

  /* Loop over all atoms i except for the final atom i. */

  for (i = 0; i < prm->Natom - 1; i++) {

    /*
     * If the 'non14' calling parameter is set, ignore the offset
     * into the pair list that accumulates with each iteration of
     * the i-loop, and use instead the beginning address of the upper
     * triangle of the pair list.
     */

    if (non14 != 0) {
      lpair = offset[i];
    }

    /* Check whether there are any atoms j on the pair list of atom i. */

    nexcl = prm->Iblo[i];
    npr = npairs[i];
    if (npr > 0) {
      iaci = prm->Ntypes * (prm->Iac[i] - 1);
      dumx = 0.0;
      dumy = 0.0;
      dumz = 0.0;
      xi = x[3 * i + 0];
      yi = x[3 * i + 1];
      zi = x[3 * i + 2];
      cgi = eelfaci * prm->Charges[i];

      /*
       * Expand the excluded list into the iexw array by storing i
       * at array address j.
       */

      for (j = 0; j < nexcl; j++) {
	iexw[prm->ExclAt[iexcl + j] - 1] = i;
      }

      /* Select atoms j from the pair list. */

      for (jn = 0; jn < npr; jn++) {
	j = pairlist[jn + lpair];

	/*
	 * If the 'non14' calling parameter is set, check whether
	 * this i,j pair is exempted by the excluded atom list.
	 */

	if (non14 == 0 || iexw[j] != i) {
	  xij = xi - x[3 * j + 0];
	  yij = yi - x[3 * j + 1];
	  zij = zi - x[3 * j + 2];
	  r2 = xij * xij + yij * yij + zij * zij;
	  r2inv = 1.0 / r2;
	  r = sqrt(r2);
	  rinv = r * r2inv;

	  /* Calculate the energy and derivatives according to dield. */

	  if (dield == -3) {

	    /* special code Ramstein & Lavery dielectric, 94 force field */

	    rs = SIG * r;
	    rssq = rs * rs;
	    pow = exp(-rs);
	    eps1 = rssq + rs + rs + 2.0;
	    epsi = 1.0 / (DIW - C1 * pow * eps1);
	    cgijr = cgi * prm->Charges[j] * rinv * epsi;
	    *eel += cgijr;
	    df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi);
	    ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	    if( ic >= 0 ){
	      r6 = r2inv * r2inv * r2inv;
	      f2 = prm->Cn2[ic] * r6;
	      f1 = prm->Cn1[ic] * r6 * r6;
	      *enb += (f1 - f2) * enbfaci;
	      df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    } else {
	      df = df2 * rinv;
	    }

	  } else if (dield == -4) {

	    /* distance-dependent dielectric code, 94 ff */
	    /* epsilon = r  */

	    rs = cgi * prm->Charges[j] * r2inv;
	    df2 = -2.0 * rs;
	    *eel += rs;
	    ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	    if( ic >= 0 ){
	      r6 = r2inv * r2inv * r2inv;
	      f2 = prm->Cn2[ic] * r6;
	      f1 = prm->Cn1[ic] * r6 * r6;
	      *enb += (f1 - f2) * enbfaci;
	      df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	    } else {
	      df = df2 * rinv;
	    }

	  } else if (dield == -5) {

	    /* non-bonded term from yammp  */

	    dis = r;
	    ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	    d0 = prm->Cn2[ic];
	    if (dis < d0) {
	      kij = prm->Cn1[ic];
	      diff = dis - d0;
	      *enb += kij * diff * diff;
	      df = 2.0 * kij * diff;
	    } else {
	      df = 0.0;
	    }
	  } else {

	    /*
	     * Code for various dielectric models.
	     * The df2 variable should hold r(dV/dr).
	     */

	    if (dield == 0) {

	      /* epsilon = r  */

	      rs = cgi * prm->Charges[j] * r2inv;
	      df2 = -2.0 * rs;
	      *eel += rs;

	    } else if (dield == 1) {
	      
	      /* epsilon = 1  */

	      rs = cgi * prm->Charges[j] * rinv;
	      df2 = -rs;
	      *eel += rs;

	    } else if (dield == -2) {

	      /* Ramstein & Lavery dielectric, PNAS 85, 7231 (1988). */

	      rs = SIG * r;
	      rssq = rs * rs;
	      pow = exp(-rs);
	      eps1 = rssq + rs + rs + 2.0;
	      epsi = 1.0 / (DIW - C1 * pow * eps1);
	      cgijr = cgi * prm->Charges[j] * rinv * epsi;
	      *eel += cgijr;
	      df2 = -cgijr * (1.0 + C1 * pow * rs * rssq * epsi);
	    }

	    /* Calculate either Van der Waals or hydrogen bonded term. */

	    ic = prm->Cno[iaci + prm->Iac[j] - 1];
	    if (ic > 0 || enbfac != 1.0) {
	      if (ic > 0) {
		ic--;
	      } else {
		ibig = prm->Iac[i] > prm->Iac[j] ?
		  prm->Iac[i] : prm->Iac[j];
		isml = prm->Iac[i] > prm->Iac[j] ?
		  prm->Iac[j] : prm->Iac[i];
		ic = ibig * (ibig - 1) / 2 + isml - 1;
	      }
	      r6 = r2inv * r2inv * r2inv;
	      f2 = prm->Cn2[ic] * r6;
	      f1 = prm->Cn1[ic] * r6 * r6;
	      *enb += (f1 - f2) * enbfaci;
	      df = (df2 + (6.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
#if 0
	      if( enbfac != 1.0 ) nb14 += (f1-f2)*enbfaci;
#endif
	    } else {
	      ic = -ic - 1;
	      r10 = r2inv * r2inv * r2inv * r2inv * r2inv;
	      f2 = prm->HB10[ic] * r10;
	      f1 = prm->HB12[ic] * r10 * r2inv;
	      *enb += (f1 - f2) * enbfaci;
	      df = (df2 + (10.0 * f2 - 12.0 * f1) * enbfaci) * rinv;
	      nhbpair++;
#if 0
	      hbener += (f1-f2)*enbfaci;
#endif
	    }
	  }

	  /*
	   * Divide by r here instead dividing the derivatives by r,
	   * and update the gradient for atom j.
	   */

	  df *= rinv;

	  dedx = df * xij;
	  dedy = df * yij;
	  dedz = df * zij;

	  dumx += dedx;
	  dumy += dedy;
	  dumz += dedz;

	  f[3 * j + 0] -= dedx;
	  f[3 * j + 1] -= dedy;
	  f[3 * j + 2] -= dedz;
	}
      }

    /* For atom i, the gradient is updated in the i-loop only. */

      f[3 * i + 0] += dumx;
      f[3 * i + 1] += dumy;
      f[3 * i + 2] += dumz;
    }

    /* Update the offsets into the pair list and the excluded atom list. */

    lpair += npr;
    iexcl += nexcl;
  }

  /* Deallocate the iexw array. */

  free_ivector(iexw, -1, prm->Natom);

  return (0);
}

/*
 * Calculate the generalized Born energy and first derivatives.
 *
 * Calling parameters are as follows:
 *
 * natom - input: the number of atoms
 * x - input: the atomic (x,y,z) coordinates
 * f - updated: the gradient vector
 * fs - input: overlap parameters
 * rborn - input: atomic radii
 * q - input: atomic charges
 * kappa - input: inverse of the Debye-Huckel length
 * diel_ext - input: solvent dielectric constant
 * enb - updated: Lennard-Jones energy
 * eelt - updated: gas-phase electrostatic energy
 */

REAL_T egb( int *natom, REAL_T *x, REAL_T *f, REAL_T *fs, REAL_T *rborn, REAL_T *q,
	    REAL_T *kappa, REAL_T *diel_ext, REAL_T *enb, REAL_T *eelt )

#define BOFFSET 0.09
#define KSCALE 0.73

#define PIx4 12.5663706143591724639918
#define PIx2  6.2831853071795862319959
#define PIx1  3.1415926535897931159979

{

#ifdef MULTIPLE_THREADS
  int iexcl_last;
#endif

  int i, i3, j, j3, k;
  int iexcl, jexcl, jexcl_last, nexcl, npairs, ic, iaci;
  int *iexw;
  REAL_T epol, dielfac, qi, qj, qiqj, fgbi, fgbk, rb2, expmkf;
  REAL_T elec, evdw, sumda, daix, daiy, daiz;
  REAL_T xi, yi, zi, xij, yij, zij;
  REAL_T dedx, dedy, dedz, de;
  REAL_T dij1i, dij3i, temp1;
  REAL_T qi2h, qid2h, datmp;
  REAL_T theta, ri1i, dij2i;

  REAL_T dij, sumi;
  REAL_T eel, f6, f12, rinv, r2inv, r6inv;
  REAL_T r2, ri, rj, sj, sj2, thi;
  REAL_T uij, efac, temp4, temp5, temp6;
  REAL_T *reff, *sumdeijda, *sumepol, *sumelec, *sumevdw, *psi;

  /* LCPO stuff follows */
  int count, count2, icount;
  REAL_T si, sumAij, sumAjk, sumAijAjk, sumdAijddijdxi;
  REAL_T sumdAijddijdyi, sumdAijddijdzi, sumdAijddijdxiAjk;
  REAL_T sumdAijddijdyiAjk, sumdAijddijdziAjk, rij, tmpaij, Aij,
    dAijddij;
  REAL_T dAijddijdxj, dAijddijdyj, dAijddijdzj;
  REAL_T sumdAjkddjkdxj, sumdAjkddjkdyj, sumdAjkddjkdzj, p3p4Aij;
  REAL_T xk, yk, zk, rjk2, djk1i, rjk, vdw2dif, tmpajk, Ajk, sumAjk2,
    dAjkddjk;
  REAL_T dAjkddjkdxj, dAjkddjkdyj, dAjkddjkdzj, lastxj, lastyj, lastzj;
  REAL_T dAidxj, dAidyj, dAidzj, Ai, dAidxi, dAidyi,
    dAidzi;
  REAL_T totsasa;
  REAL_T xj, yj, zj;
  REAL_T dumbo,tmpsd;
  REAL_T rgbmax1i, rgbmax2i, rgbmaxpsmax2;

  /* arrays and vars for use in vectorizable loops */
  int icount2;
  REAL_T *vfgbi, *vefac,*r2x,*vrinv,*vexpmkf,*vectmp1;
  REAL_T kkscale,invdiel_ext;

  /*FGB taylor coefficients follow */
  /* from A to H :                 */
  /* 1/3 , 2/5 , 3/7 , 4/9 , 5/11  */
  /* 4/3 , 12/5 , 24/7 , 40/9 , 60/11 */

#define TA 0.33333333333333333333
#define TB 0.4
#define TC 0.42857142857142857143
#define TD 0.44444444444444444444
#define TDD 0.45454545454545454545

#define TE 1.33333333333333333333
#define TF 2.4
#define TG 3.42857142857142857143
#define TH 4.44444444444444444444
#define THH 5.45454545454545454545

  /*
   * Smooth "cut-off" in calculating GB effective radii.
   * Implementd by Andreas Svrcek-Seiler and Alexey Onufriev.
   * The integration over solute is performed up to rgbmax and includes
   * parts of spheres; that is an atom is not just "in" or "out", as
   * with standard non-bonded cut.  As a result, calclated effective
   * radii are less than rgbmax. This saves time, and there is no
   * discontinuity in dReff/drij.
   *
   * Only the case rgbmax > 5*max(sij) = 5*fsmax ~ 9A is handled; this is
   * enforced in mdread().  Smaller values would not make much physical
   * sense anyway.
   *
   * Note: rgbmax must be less than or equal to cut so that the pairlist
   * generated from cut may be applied to calculation of the effective
   * radius and its derivatives.
   */

  if (rgbmax > cut) {
    printf("Error in egb: rgbmax = %f is greater than cutoff = %f\n",
	   rgbmax, cut);
    exit(1);
  }
  rgbmax1i = 1.0/rgbmax;
  rgbmax2i = rgbmax1i*rgbmax1i;
  rgbmaxpsmax2 = (rgbmax+prm->Fsmax)*(rgbmax+prm->Fsmax);


  kkscale = -(*kappa)*KSCALE;
  invdiel_ext = 1./(*diel_ext);

  /* allocate arrays for use in vectorizable loops */

  vfgbi = vector(0, *natom);
  vefac = vector(0, *natom);
  r2x = vector(0, *natom);
  vrinv = vector(0, *natom);
  vexpmkf = vector(0, *natom);
  vectmp1 = vector(0, *natom);
  
  sumepol = vector(0, *natom);
  sumelec = vector(0, *natom);
  sumevdw = vector(0, *natom);

  /* Allocate and initialize the arrays used for energy and derivatives. */

  if ( gb==2 || gb==5 ) psi = vector(0, *natom);
  reff = vector(0, *natom);
  sumdeijda = vector(0, *natom);
  for (i = 0; i < *natom; i++) {
    sumdeijda[i] = 0.0;
    sumepol[i] = 0.0;
    sumelec[i] = 0.0;
    sumevdw[i] = 0.0;
  }

  count = 0;
  totsasa = 0.0;

  /* 
   * Get the "effective" Born radii via the approximate pairwise method
   * Use Eqs 9-11 of Hawkins, Cramer, Truhlar, J. Phys. Chem. 100:19824
   * (1996).
   *
   * The computation is not executed in parallel for the case of gbsa==1
   * due to the loop-carried dependence of the 'count' variable.
   */

  if (gb_debug)
    printf("Effective Born radii:\n");

#ifdef MULTIPLE_THREADS
#pragma omp parallel for if (gbsa != 1) \
  private (i, xi, yi, zi, ri, ri1i, sumi, j, k, xij, yij, zij, \
	   r2, dij1i, dij, sj, sj2, uij, dij2i, tmpsd, dumbo, theta)
#endif


  /* Loop over all atoms i. */

  for (i = 0; i < *natom; i++) {

    xi = x[3 * i];
    yi = x[3 * i + 1];
    zi = x[3 * i + 2];
    ri = rborn[i] - BOFFSET;
    ri1i = 1. / ri;
    sumi = 0.;

    /* Select atom j from the pair list. */

    for (k = lprstr[i]; k < lprstr[i] + lpairs[i] + upairs[i]; k++) {

      j = pairlist[k];

      xij = xi - x[3 * j];
      yij = yi - x[3 * j + 1];
      zij = zi - x[3 * j + 2];
      r2 = xij * xij + yij * yij + zij * zij;
      if ( r2 > rgbmaxpsmax2 ) continue;
      dij1i = 1.0 / sqrt(r2);
      dij = r2 * dij1i;
      sj = fs[j] * (rborn[j] - BOFFSET);
      sj2 = sj * sj;

      /*
       * ---following are from the Appendix of Schaefer and Froemmel,
       * JMB 216:1045-1066, 1990;  Taylor series expansion for d>>s
       * is by Andreas Svrcek-Seiler; smooth rgbmax idea is from
       * Andreas Svrcek-Seiler and Alexey Onufrie.
       */

      if( dij > rgbmax + sj ) continue;

      if ((dij > rgbmax - sj)) {
	uij = 1./(dij -sj);
	sumi -= 0.125 * dij1i * (1.0 + 2.0 * dij *uij +
				 rgbmax2i * (r2 - 4.0 * rgbmax * dij - sj2) +
				 2.0 * log((dij-sj)*rgbmax1i));

      } else if (dij >4.0*sj) {
	dij2i = dij1i*dij1i;
	tmpsd = sj2*dij2i;
	dumbo = TA+tmpsd*(TB+tmpsd*(TC+tmpsd*(TD+tmpsd*TDD)));
	sumi -= sj*tmpsd*dij2i*dumbo;

      } else if (dij > ri + sj) {
	sumi -= 0.5 * (sj / (r2 - sj2) +
		       0.5 * dij1i * log((dij - sj) / (dij + sj)));

      } else if (dij > fabs(ri - sj)) {
	theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
	uij = 1. / (dij + sj);
	sumi -= 0.25 * (ri1i * (2. - theta) - uij +
			dij1i * log(ri * uij));

      } else if (ri < sj) {
	sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
		       0.5 * dij1i * log((sj - dij) / (sj + dij)));

      }

      if (gbsa == 1) {
	if ((P0[i] + P0[j]) > dij) {
	  if (P0[i] > 2.5 && P0[j] > 2.5) {

	    ineighbor[count] = j + 1;
	    /* this +1 is VERY important */
	    count += 1;
	  }
	}
      }

    }

    if( gb==1 ){

      /* "standard" (HCT) effective radii:  */
      reff[i] = 1.0/(ri1i + sumi);
      if( reff[i] < 0.0 ) reff[i] = 30.0;

    } else {

      /* "gbao" formulas:  */

      psi[i] = -ri*sumi;
      reff[i] = 1.0 / (ri1i - tanh( (gbalpha - gbbeta*psi[i] + 
				     gbgamma*psi[i]*psi[i]) *psi[i])/rborn[i] );
    }

    if (gb_debug)
      printf("%d\t%15.7f\t%15.7f\n", i + 1, rborn[i], reff[i]);

    if (gbsa == 1) {
      ineighbor[count] = 0;
      count = count + 1;
    }
  }

  /*
   * Main LCPO stuff follows.  No parallelization using OpenMP due to
   * the loop-carried dependence of the 'count variable.
   */

  if (gbsa == 1) {
    totsasa = 0.0;
    count = 0;
    for (i = 0; i < *natom; i++) {
      if (ineighbor[count] == 0) {
	count = count + 1;
      } else {

	si = PIx4 * P0[i] * P0[i];
	sumAij = 0.0;
	sumAjk = 0.0;
	sumAjk2 = 0.0;
	sumAijAjk = 0.0;
	sumdAijddijdxi = 0.0;
	sumdAijddijdyi = 0.0;
	sumdAijddijdzi = 0.0;
	sumdAijddijdxiAjk = 0.0;
	sumdAijddijdyiAjk = 0.0;
	sumdAijddijdziAjk = 0.0;

	icount = count;

      L70:j = ineighbor[count] - 1;
				/* mind the -1 , fortran again */
      xi = x[3 * i];
      yi = x[3 * i + 1];
      zi = x[3 * i + 2];

      xj = x[3 * j];
      yj = x[3 * j + 1];
      zj = x[3 * j + 2];


      r2 = (xi - xj) * (xi - xj) + (yi - yj) * (yi - yj) + (zi -
							    zj) *
	(zi - zj);
      dij1i = 1. / sqrt(r2);
      rij = r2 * dij1i;
      tmpaij =
	P0[i] - rij * 0.5 - (P0[i] * P0[i] -
			     P0[j] * P0[j]) * 0.5 * dij1i;

      Aij = PIx2 * P0[i] * tmpaij;

      dAijddij =
	PIx1 * P0[i] * (dij1i * dij1i *
			(P0[i] * P0[i] - P0[j] * P0[j]) - 1.0);

      dAijddijdxj = dAijddij * (xj - xi) * dij1i;
      dAijddijdyj = dAijddij * (yj - yi) * dij1i;
      dAijddijdzj = dAijddij * (zj - zi) * dij1i;

      sumAij = sumAij + Aij;

      count2 = icount;
      sumAjk2 = 0.0;
      sumdAjkddjkdxj = 0.0;
      sumdAjkddjkdyj = 0.0;
      sumdAjkddjkdzj = 0.0;

      p3p4Aij = -surften * (P3[i] + P4[i] * Aij);

      L80:k = ineighbor[count2] - 1;
				/*same as above, -1 ! */
      if (j == k)
	goto L85;

      xk = x[3 * k];
      yk = x[3 * k + 1];
      zk = x[3 * k + 2];

      rjk2 =
	(xj - xk) * (xj - xk) + (yj - yk) * (yj - yk) + (zj -
							 zk) *
	(zj - zk);

      djk1i = 1.0 / sqrt(rjk2);

      rjk = rjk2 * djk1i;

      if (P0[j] + P0[k] > rjk) {
	vdw2dif = P0[j] * P0[j] - P0[k] * P0[k];
	tmpajk = 2.0 * P0[j] - rjk - vdw2dif * djk1i;

	Ajk = PIx1 * P0[j] * tmpajk;

	sumAjk = sumAjk + Ajk;
	sumAjk2 = sumAjk2 + Ajk;

	dAjkddjk =
	  PIx1 * P0[j] * djk1i * (djk1i * djk1i * vdw2dif -
				  1.0);

	dAjkddjkdxj = dAjkddjk * (xj - xk);
	dAjkddjkdyj = dAjkddjk * (yj - yk);
	dAjkddjkdzj = dAjkddjk * (zj - zk);

	f[3 * k] = f[3 * k] + dAjkddjkdxj * p3p4Aij;
	f[3 * k + 1] = f[3 * k + 1] + dAjkddjkdyj * p3p4Aij;
	f[3 * k + 2] = f[3 * k + 2] + dAjkddjkdzj * p3p4Aij;

	sumdAjkddjkdxj = sumdAjkddjkdxj + dAjkddjkdxj;
	sumdAjkddjkdyj = sumdAjkddjkdyj + dAjkddjkdyj;
	sumdAjkddjkdzj = sumdAjkddjkdzj + dAjkddjkdzj;

      }

      L85:count2 = count2 + 1;
      if (ineighbor[count2] != 0) {
	goto L80;
      } else {
	count2 = icount;
      }


      sumAijAjk = sumAijAjk + Aij * sumAjk2;

      sumdAijddijdxi = sumdAijddijdxi - dAijddijdxj;
      sumdAijddijdyi = sumdAijddijdyi - dAijddijdyj;
      sumdAijddijdzi = sumdAijddijdzi - dAijddijdzj;
      sumdAijddijdxiAjk =
	sumdAijddijdxiAjk - dAijddijdxj * sumAjk2;
      sumdAijddijdyiAjk =
	sumdAijddijdyiAjk - dAijddijdyj * sumAjk2;
      sumdAijddijdziAjk =
	sumdAijddijdziAjk - dAijddijdzj * sumAjk2;

      lastxj = dAijddijdxj * sumAjk2 + Aij * sumdAjkddjkdxj;
      lastyj = dAijddijdyj * sumAjk2 + Aij * sumdAjkddjkdyj;
      lastzj = dAijddijdzj * sumAjk2 + Aij * sumdAjkddjkdzj;

      dAidxj =
	surften * (P2[i] * dAijddijdxj +
		   P3[i] * sumdAjkddjkdxj + P4[i] * lastxj);
      dAidyj =
	surften * (P2[i] * dAijddijdyj +
		   P3[i] * sumdAjkddjkdyj + P4[i] * lastyj);
      dAidzj =
	surften * (P2[i] * dAijddijdzj +
		   P3[i] * sumdAjkddjkdzj + P4[i] * lastzj);

      f[3 * j] = f[3 * j] + dAidxj;
      f[3 * j + 1] = f[3 * j + 1] + dAidyj;
      f[3 * j + 2] = f[3 * j + 2] + dAidzj;

      count = count + 1;
      if (ineighbor[count] != 0) {
	goto L70;
      } else {
	count = count + 1;
      }

      Ai = P1[i] * si + P2[i] * sumAij + P3[i] * sumAjk +
	P4[i] * sumAijAjk;

      dAidxi =
	surften * (P2[i] * sumdAijddijdxi +
		   P4[i] * sumdAijddijdxiAjk);
      dAidyi =
	surften * (P2[i] * sumdAijddijdyi +
		   P4[i] * sumdAijddijdyiAjk);
      dAidzi =
	surften * (P2[i] * sumdAijddijdzi +
		   P4[i] * sumdAijddijdziAjk);

      f[3 * i] = f[3 * i] + dAidxi;
      f[3 * i + 1] = f[3 * i + 1] + dAidyi;
      f[3 * i + 2] = f[3 * i + 2] + dAidzi;

      totsasa = totsasa + Ai;
				/*  printf("totsasa up to %d %f\n",i,totsasa); */
      }
    }
    /*      printf("SASA %f , ESURF %f \n",totsasa,totsasa*surften); */
    esurf = totsasa * surften;
  }

  /*
   * Compute the GB, Coulomb and Lennard-Jones energies and derivatives.
   * When in multi-threaded mode in this nest of loops sum data relative only
   * To atom i so that OpenMP can be used to parallelize the outer loop.
   * Perform excluded list management when in single-threaded mode only.
   */

#ifndef MULTIPLE_THREADS
  iexcl = 0;
#endif

#ifdef MULTIPLE_THREADS
#pragma omp parallel \
  private (i, i3, ri, qi, qj, expmkf, dielfac, qi2h, qid2h, \
	   iaci, xi, yi, zi, k, j, j3, xij, yij, zij, r2, qiqj, \
	   rj, rb2, efac, fgbi, fgbk, temp4, temp6, eel, de, temp5, \
	   rinv, r2inv, ic, r6inv, f6, f12, dedx, dedy, dedz, \
	   epol, elec, evdw, jexcl, jexcl_last, iexw, xj, yj, zj, \
	   sumda, thi, ri1i, dij1i, datmp, daix, daiy, daiz, \
	   dij2i, dij, sj, sj2, temp1, dij3i, tmpsd, dumbo, \
	   iexcl, iexcl_last, nexcl, npairs)
#endif
  {

    /*
     * Allocate and initialize the iexw array used for skipping excluded
     * atoms.  Note that because of the manner in which iexw is used, it
     * is necessary to initialize it before only the first iteration of
     * the following loop.
     */

    iexw = ivector(-1, *natom);
    for (i = -1; i < *natom; i++) {
      iexw[i] = -1;
    }

#ifdef MULTIPLE_THREADS
#pragma omp for schedule (dynamic, 8)
#endif

    /* Loop over all atoms i. */

    for (i = 0; i < *natom; i++) {

      ri = reff[i];
      qi = q[i];

      /*
       * If atom i is not frozen, compute the "diagonal" energy that
       * is a function of only the effective radius Ri but not of the
       * interatomic distance Dij.  Compute also the contribution of
       * the diagonal energy term to the sum by which the derivative
       * of Ri will be multiplied.
       */

      if (!frozen[i]) {
	expmkf = exp(-KSCALE * (*kappa) * ri) / (*diel_ext);
	dielfac = 1.0 - expmkf;
	qi2h = 0.5 * qi * qi;
	qid2h = qi2h * dielfac;
	sumepol[i] += -qid2h / ri;
	sumdeijda[i] += qid2h - KSCALE * (*kappa) * qi2h * expmkf *ri;
      }

      /*
       * Skip the pair calculations if there are no atoms j on the
       * pair list of atom i.
       */

      nexcl = prm->Iblo[i];
      npairs = upairs[i];
      if ( npairs > 0 ) {

	i3 = 3 * i;

	xi = x[i3    ];
	yi = x[i3 + 1];
	zi = x[i3 + 2];

	iaci = prm->Ntypes * (prm->Iac[i] - 1);

	/*
	 * Expand the excluded atom list into the iexw array by storing i
	 * at array address j.  The starting address is either iexcl that
	 * gets updated with each iteration of the i-loop, or Istr[i] that
	 * has been created in the mme_init function to permit parallelization.
	 */

#ifndef MULTIPLE_THREADS
	jexcl = iexcl;
#else
	jexcl = Istr[i];
#endif

	jexcl_last = jexcl + nexcl;
	for (j = jexcl; j < jexcl_last; j++) {
	  iexw[prm->ExclAt[j] - 1] = i;
	}

	epol = elec = evdw = 0.0;

	/* Select atoms j from the pair list. */
	icount2 = 0;
	for (k = uprstr[i]; k < uprstr[i] + npairs; k++) {

	  j = pairlist[k];

	  j3 = 3 * j;

	  xij = xi - x[j3    ];
	  yij = yi - x[j3 + 1];
	  zij = zi - x[j3 + 2];
	  r2 = xij * xij + yij * yij + zij * zij;
	  rj = reff[j];
	  rb2 = ri*rj;
	  vfgbi[icount2] = rb2;
	  vefac[icount2] = -0.25*r2/rb2;
	  r2x[icount2] = r2;
	  icount2++;
	}

        	for (k=0; k < icount2 ; k++) vefac[k] = exp(vefac[k]); /* efac */
		for (k=0; k < icount2 ; k++) vrinv[k] = 1./sqrt(r2x[k]); /* inverse distances */
		for (k=0; k < icount2 ; k++) vfgbi[k] = vfgbi[k]*vefac[k] + r2x[k]; 
		for (k=0; k < icount2 ; k++) vfgbi[k] = 1./sqrt(vfgbi[k]); /* vfgbi complete */
		for (k=0; k < icount2 ; k++) vexpmkf[k] = kkscale/vfgbi[k];
		for (k=0; k < icount2 ; k++) vectmp1[k] = 1.0 - vexpmkf[k];
		for (k=0; k < icount2 ; k++) vexpmkf[k] = exp(vexpmkf[k]) * invdiel_ext; /* expmkf complete */
		for (k=0; k < icount2 ; k++) vectmp1[k] = 1.0 - vexpmkf[k]* vectmp1[k];
		for (k=0; k < icount2 ; k++) vectmp1[k] = vectmp1[k]*vfgbi[k]*vfgbi[k]*vfgbi[k]; /* now contains temp6 without 'qiqj' */
	  
	  
	icount2 = 0;		
	for (k = uprstr[i]; k < uprstr[i] + npairs; k++) {

	  j = pairlist[k];

	  j3 = 3 * j;

	  /* Continue computing the non-diagonal energy term. */

	  xij = xi - x[j3    ];
	  yij = yi - x[j3 + 1];
	  zij = zi - x[j3 + 2];
	  r2 = r2x[icount2];

	  qiqj = qi * q[j];
	  rj = reff[j];
	  rb2 = ri * rj;

	  efac = vefac[icount2];
	  fgbi = vfgbi[icount2];
	  
	  fgbk = kkscale / fgbi;
	  
	  expmkf = vexpmkf[icount2];
	  dielfac = 1.0 - expmkf;

	  epol += -qiqj * dielfac * fgbi;

	  temp6 = qiqj*vectmp1[icount2];
	  de = temp6 * (1.0 - 0.25 * efac);

	  temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2);

	  /*
	   * Compute the contribution of the non-diagonal energy term to the sum
	   * by which the derivatives of Ri and Rj will be multiplied.  When in
	   * single-threaded mode sum the contribution for the derivatives of Ri
	   * and Rj.  When in multi-threaded mode sum the contribution for the
	   * derivative of R.
	   */

	  sumdeijda[i] += ri * temp5;

#ifndef MULTIPLE_THREADS
	  sumdeijda[j] += rj * temp5;
#endif

	  /*
	   * Compute the Van der Waals and Coulombic energies for only
	   * those pairs that are not on the excluded atom list.  Any
	   * pair on the excluded atom list will have atom i stored at
	   * address j of the iexw array.  It is not necessary to reset
	   * the elements of the iexw array to -1 between successive
	   * iterations in i because an i,j pair is uniquely identified
	   * by atom i stored at array address j.  Thus for example, the
	   * i+1,j pair would be stored at the same address as the i,j
	   * pair but after the i,j pair were used.
	   */

	  if (iexw[j] != i) {

	    rinv = vrinv[icount2];
	    r2inv = rinv * rinv;

	    /*  gas-phase Coulomb energy:  */

	    eel = qiqj * rinv;
	    elec += eel;
	    de -= eel * r2inv;

	    /* Lennard-Jones energy:   */

	    ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	    if( ic >= 0 ){
	      r6inv = r2inv * r2inv * r2inv;
	      f6 = prm->Cn2[ic] * r6inv;
	      f12 = prm->Cn1[ic] * r6inv * r6inv;
	      evdw += f12 - f6;
	      de -= (12. * f12 - 6. * f6) * r2inv;
	    }
	  }
	  icount2 ++;
	  /*
	   * When in single-threaded mode sum to the gradient vector the
	   * derivatives of Dij that are computed relative to the cartesian
	   * coordinates of atoms i and j.  When in multi-threaded mode sum
	   * to the gradient vector the derivatives that are computed relative
	   * to the cartesian coordinates of atom i.
	   */

	  dedx = de * xij;
	  dedy = de * yij;
	  dedz = de * zij;

	  f[i3    ] += dedx;
	  f[i3 + 1] += dedy;
	  f[i3 + 2] += dedz;

#ifndef MULTIPLE_THREADS
	  f[j3    ] -= dedx;
	  f[j3 + 1] -= dedy;
	  f[j3 + 2] -= dedz;
#endif
	}

	/* Sum the energy terms. */

	sumepol[i] += epol;
	sumelec[i] += elec;
	sumevdw[i] += evdw;
      }

      /*
       * When in single-threaded mode update iexcl.  This update is unnecessary
       * in multi-threaded mode because jexcl is loaded from Istr[i] not iexcl.
       */

#ifndef MULTIPLE_THREADS
      iexcl += nexcl;
#endif
    }

#ifdef MULTIPLE_THREADS

    /*
     * Initialize the iexw array used for skipping excluded atoms.
     * Note that because of the manner in which iexw is used, it
     * is necessary to initialize it before only the first iteration
     * of the following loop.
     */

    for (i = -1; i < *natom; i++) {
      iexw[i] = -1;
    }

    /*
     * When in multi-threaded mode in this nest of loops sum data
     * relative only to atom j so that OpenMP can be used to parallelize
     * the outer loop.
     * 
     * The control flow in this nest of loops must be identical to 
     * that of the preceeding nest of loops.  Perform the minimum
     * computation required to replicate the control flow of the
     * preceeding nest of loops.
     */

#pragma omp for schedule (dynamic, 8)

    /* Loop over all atoms j. */

    for (j = 0; j < *natom; j++) {

      /*
       * Skip the pair calculations if there are no atoms i on the
       * pair list of atom j.
       */

      nexcl = Jblo[j];
      npairs = lpairs[j];
      if ( npairs > 0 ) {

	j3 = 3 * j;

	rj = reff[j];
	qj = q[j];
	xj = x[j3    ];
	yj = x[j3 + 1];
	zj = x[j3 + 2];

	/*
	 * Expand the excluded list into the iexw array by storing j
	 * at array address i.
	 */

	iexcl = Jstr[j];
	iexcl_last = iexcl + nexcl;
	for (i = iexcl; i < iexcl_last; i++) {
	  iexw[JExclAt[i] - 1] = j;
	}

	/* Select atoms i from the pair list. */

	for (k = lprstr[j]; k < lprstr[j] + npairs; k++) {

	  i = pairlist[k];

	  i3 = 3 * i;

	  iaci = prm->Ntypes * (prm->Iac[i] - 1);

	  xij = x[i3    ] - xj;
	  yij = x[i3 + 1] - yj;
	  zij = x[i3 + 2] - zj;
	  r2 = xij * xij + yij * yij + zij * zij;

	  qiqj = q[i] * qj;
	  ri = reff[i];
	  rb2 = ri * rj;
	  efac = exp(-r2 / (4.0 * rb2));
	  fgbi = 1.0 / sqrt(r2 + rb2 * efac);
	  fgbk = -(*kappa) * KSCALE / fgbi;

	  expmkf = exp(fgbk) / (*diel_ext);
	  dielfac = 1.0 - expmkf;

	  epol += -qiqj * dielfac * fgbi;

	  temp4 = fgbi * fgbi * fgbi;
	  temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf);
	  de = temp6 * (1.0 - 0.25 * efac);

	  temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2);

	  /*
	   * Compute the contribution of the non-diagonal energy term to the sum
	   * by which the derivatives of Ri and Rj will be multiplied.  When in
	   * single-threaded mode sum the contribution for the derivatives of Ri
	   * and Rj.  When in multi-threaded mode sum the contribution for the
	   * derivative of Ri.
	   */

	  sumdeijda[j] += rj * temp5;

	  /*
	   * Compute the Van der Waals and Coulombic energies for only
	   * those pairs that are not on the excluded atom list.  Any
	   * pair on the excluded atom list will have atom j stored at
	   * address i of the iexw array.  It is not necessary to reset
	   * the elements of the iexw array to -1 between successive
	   * iterations in j because an i,j pair is uniquely identified
	   * by atom j stored at array address i.  Thus for example, the
	   * i,j+1 pair would be stored at the same address as the i,j
	   * pair but after the i,j pair were used.
	   */

	  if (iexw[i] != j) {

	    rinv = 1. / sqrt(r2);
	    r2inv = rinv * rinv;

	    /*  gas-phase Coulomb energy:  */

	    eel = qiqj * rinv;
	    elec += eel;
	    de -= eel * r2inv;

	    /* Lennard-Jones energy:   */

	    ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
	    if( ic >= 0 ){
	      r6inv = r2inv * r2inv * r2inv;
	      f6 = prm->Cn2[ic] * r6inv;
	      f12 = prm->Cn1[ic] * r6inv * r6inv;
	      evdw += f12 - f6;
	      de -= (12. * f12 - 6. * f6) * r2inv;
	    }
	  }

	  /*
	   * Sum to the gradient vector the derivatives that are computed
	   * relative to the cartesian coordinates of atom j.
	   */

	  f[j3    ] -= de * xij;
	  f[j3 + 1] -= de * yij;
	  f[j3 + 2] -= de * zij;
	}
      }
    }
#endif

    /* Free the iexw array within this potentially parallel region of code. */

    free_ivector(iexw, -1, *natom);

    /*
     * Compute the derivatives of the effective radius Ri of atom i
     * with respect to the cartesian coordinates of each atom j.
     *
     * When in single-threaded mode sum all of these derivatives into the
     * gradient vector.  When in multi-threaded mode sum into the gradient
     * vector only the derivatives with respect to the cartesian coordinates
     * of atom i (which are in fact the sums of all other derivatives).
     */

#ifdef MULTIPLE_THREADS
#pragma omp for
#endif

    /* Loop over all atoms i. */

    for (i = 0; i < *natom; i++) {

      /*
       * Don't calculate derivatives of the effective radius of atom i
       * if atom i is frozen or if there are no pair atoms j associated
       * with atom i.
       */

      npairs = lpairs[i] + upairs[i];
      if ( frozen[i] || !npairs ) continue;

      i3 = 3 * i;

      xi = x[i3    ];
      yi = x[i3 + 1];
      zi = x[i3 + 2];
      ri = rborn[i] - BOFFSET;
      ri1i = 1. / ri;

      sumda = sumdeijda[i];

      if ( gb>1 ) {

	ri = rborn[i] - BOFFSET;
	thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] );
	sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i])
	  *(1.0 - thi*thi)*ri/rborn[i];
      }

      /* Initialize the derivative accumulators. */

      daix = daiy = daiz = 0.0;

      /* Select atom j from the pair list. */

      for (k = lprstr[i]; k < lprstr[i] + npairs; k++) {

	j = pairlist[k];

	j3 = 3 * j;

	xij = xi - x[j3    ];
	yij = yi - x[j3 + 1];
	zij = zi - x[j3 + 2];
	r2 = xij * xij + yij * yij + zij * zij;

	if ( r2 > rgbmaxpsmax2 ) continue;

	dij1i = 1.0 / sqrt(r2);
	dij2i = dij1i * dij1i;
	dij = r2 * dij1i;
	sj = fs[j] * (rborn[j] - BOFFSET);
	sj2 = sj * sj;

	/*
	 * The following are the numerator of the first derivatives of the
	 * effective radius Ri with respect to the interatomic distance Dij.
	 * They are derived from the equations from the Appendix of Schaefer
	 * and Froemmel as well as from the Taylor series expansion for d>>s
	 * by Andreas Svrcek-Seiler.  The smooth rgbmax idea is from Andreas
	 * Svrcek-Seiler and Alexey Onufriev.  The complete derivative is
	 * formed by multiplying the numerator by -Ri*Ri.  The factor of Ri*Ri
	 * has been moved to the terms that are multiplied by the derivative.
	 * The negation is deferred until later.  When the chain rule is used
	 * to form the first derivatives of the effective radius with respect
	 * to the cartesian coordinates, an additional factor of Dij appears
	 * in the denominator.  That factor is included in the following
	 * expressions.
	 */

	if ( dij > rgbmax + sj ) continue;

	if( dij > rgbmax - sj ){
	  temp1 = 1./(dij-sj);
	  dij3i = dij1i * dij2i;
	  datmp = 0.125 * dij3i * ((r2 + sj2) *
				   (temp1*temp1 - rgbmax2i) -
				   2.0 * log(rgbmax*temp1));

	} else if(dij >4.0*sj) {
	  tmpsd = sj2*dij2i;
	  dumbo = TE + tmpsd*(TF+tmpsd*(TG+tmpsd*(TH+tmpsd*THH)));
	  datmp = tmpsd*sj*dij2i*dij2i*dumbo;

	} else if (dij > ri + sj) {

	  temp1 = 1. / (r2 - sj2);
	  datmp = temp1 * sj * (-0.5 * dij2i + temp1)
	    + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj));

	} else if (dij > fabs(ri - sj)) {

	  temp1 = 1. / (dij + sj);
	  dij3i = dij1i * dij1i * dij1i;
	  datmp =
	    -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i *
		     ri1i + dij1i * temp1 * (temp1 - dij1i)
		     - dij3i * log(ri * temp1));

	} else if (ri < sj) {

	  temp1 = 1. / (r2 - sj2);
	  datmp =
	    -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 -
		    0.5 * dij2i * dij1i * log((sj - dij) /
					      (sj + dij)));

	} else {
	  datmp = 0.;
	}

	/* Sum the derivatives into daix, daiy and daiz. */

	daix += xij * datmp;
	daiy += yij * datmp;
	daiz += zij * datmp;

	/*
	 * When in single-threaded mode sum the derivatives relative to
	 * atom j (weighted by -sumdeijda[i]) into the gradient vector.
	 * For example, f[j3 + 2] contains the derivatives of Ri with
	 * respect to the z-coordinate of atom j.
	 */

#ifndef MULTIPLE_THREADS
	datmp *= sumda;
	f[j3    ] += xij * datmp;
	f[j3 + 1] += yij * datmp;
	f[j3 + 2] += zij * datmp;
#endif
      }

      /*
       * Update the gradient vector with the sums of derivatives of the
       * effective radius Ri with respect to the cartesian coordinates.
       * For example, f[i3 + 1] contains the sum of derivatives of Ri
       * with respect to the y-coordinate of each atom.  Multiply by
       * -sumdeijda[i] here (instead of merely using datmp multiplied by
       * -sumdeijda) in order to distribute the product across the sum of
       * derivatives in an attempt to obtain greater numeric stability.
       */

      f[i3    ] -= sumda * daix;
      f[i3 + 1] -= sumda * daiy;
      f[i3 + 2] -= sumda * daiz;
    }

    /*
     * Compute the derivatives of the effective radius Ri of atom i
     * with respect to the cartesian coordinates of each atom j, and
     * sum all of these derivatives into the gradient vector.
     *
     * Use the nowait clause for the following 'for' loop because
     * there are implied barriers at the end of the loop and at the
     * end of the parallel region; hence, the barrier at the end of
     * the loop is redundant.
     */

#ifdef MULTIPLE_THREADS
#pragma omp for nowait

    /* Loop over all atoms j. */

    for (j = 0; j < *natom; j++) {

      /*
       * Don't calculate derivatives with respect to atom j if there
       * are no atoms i associated with atom j.
       */

      npairs = lpairs[j] + upairs[j];
      if ( !npairs ) continue;

      j3 = 3 * j;

      xj = x[j3 + 0];
      yj = x[j3 + 1];
      zj = x[j3 + 2];

      sj = fs[j] * (rborn[j] - BOFFSET);
      sj2 = sj * sj;

      /* Select atom i from the pair list. */

      for (k = lprstr[j]; k < lprstr[j] + npairs; k++) {

	i = pairlist[k];

	/*
	 * Don't calculate derivatives of the effective radius of atom i
	 * if atom i is frozen.
	 */

	if ( frozen[i] ) continue;

	i3 = 3 * i;

	xij = x[i3 + 0] - xj;
	yij = x[i3 + 1] - yj;
	zij = x[i3 + 2] - zj;
	r2 = xij * xij + yij * yij + zij * zij;

	/* Ignore the ij atom pair if their separation exceeds the GB cutoff. */

	if ( r2 > rgbmaxpsmax2 ) continue;

	dij1i = 1.0 / sqrt(r2);
	dij2i = dij1i * dij1i;
	dij = r2 * dij1i;

	ri = rborn[i] - BOFFSET;
	ri1i = 1. / ri;

	/*
	 * The following are the numerator of the first derivatives of the
	 * effective radius Ri with respect to the interatomic distance Dij.
	 * They are derived from the equations from the Appendix of Schaefer
	 * and Froemmel as well as from the Taylor series expansion for d>>s
	 * by Andreas Svrcek-Seiler.  The smooth rgbmax idea is from Andreas
	 * Svrcek-Seiler and Alexey Onufriev.  The complete derivative is
	 * formed by multiplying the numerator by -Ri*Ri.  The factor of Ri*Ri
	 * has been moved to the terms that are multiplied by the derivative.
	 * The negation is deferred until later.  When the chain rule is used
	 * to form the first derivatives of the effective radius with respect
	 * to the cartesian coordinates, an additional factor of Dij appears
	 * in the denominator.  That factor is included in the following
	 * expressions.
	 */

	if ( dij > rgbmax + sj ) continue;

	if( dij > rgbmax - sj ){
	  temp1 = 1./(dij-sj);
	  dij3i = dij1i * dij2i;
	  datmp = 0.125 * dij3i * ((r2 + sj2) *
				   (temp1*temp1 - rgbmax2i) -
				   2.0 * log(rgbmax*temp1));

	} else if(dij >4.0*sj) {
	  tmpsd = sj2*dij2i;
	  dumbo = TE + tmpsd*(TF+tmpsd*(TG+tmpsd*(TH+tmpsd*THH)));
	  datmp = tmpsd*sj*dij2i*dij2i*dumbo;

	} else if (dij > ri + sj) {

	  temp1 = 1. / (r2 - sj2);
	  datmp = temp1 * sj * (-0.5 * dij2i + temp1)
	    + 0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj));

	} else if (dij > fabs(ri - sj)) {

	  temp1 = 1. / (dij + sj);
	  dij3i = dij1i * dij1i * dij1i;
	  datmp =
	    -0.25 * (-0.5 * (r2 - ri * ri + sj2) * dij3i * ri1i *
		     ri1i + dij1i * temp1 * (temp1 - dij1i)
		     - dij3i * log(ri * temp1));

	} else if (ri < sj) {

	  temp1 = 1. / (r2 - sj2);
	  datmp =
	    -0.5 * (sj * dij2i * temp1 - 2. * sj * temp1 * temp1 -
		    0.5 * dij2i * dij1i * log((sj - dij) /
					      (sj + dij)));

	} else {
	  datmp = 0.;
	}

	/*
	 * Compute the derivatives of the effective radius Ri of atom i
	 * with respect to the cartesian coordinates of atom j.  Sum
	 * these derivatives into the gradient vector.
	 *
	 * The derivative of atom i is multiplied by sumdeijda[i] at
	 * this point to minimize the number of multiplies.  The
	 * negation from -Ri*Ri is applied at this point.
	 */

	sumda = sumdeijda[i];

	if ( gb>1 ) {

	  ri = rborn[i] - BOFFSET;
	  thi = tanh( (gbalpha - gbbeta*psi[i] + gbgamma*psi[i]*psi[i])*psi[i] );
	  sumda *= (gbalpha -2.0*gbbeta*psi[i] + 3.0*gbgamma*psi[i]*psi[i])
	    *(1.0 - thi*thi)*ri/rborn[i];
	}

	datmp *= -sumda;

	f[j3 + 0] -= xij * datmp;
	f[j3 + 1] -= yij * datmp;
	f[j3 + 2] -= zij * datmp;
      }
    }
#endif
  }

  /* Perform non-logarithmic reduction of the energy terms. */

  epol = elec = evdw = 0.0;

  for (i = 0; i < *natom; i++) {
    epol += sumepol[i];
    elec += sumelec[i];
    evdw += sumevdw[i];
  }

  /* Return elec and evdw through the reference parameters eelt and enb. */

  *eelt = elec;
  *enb = evdw;

  if (gb_debug > 1) {
    printf("gb    epol= %10.3f\n", epol);
    for (k = 0; k < *natom; k++) {
      printf("%15.7f\n", sumdeijda[k]);
    }
    for (k = 0; k < *natom * 3; k += 3) {
      printf("%15.7f%15.7f%15.7f\n", f[k], f[k + 1], f[k + 2]);
    }
  }

  free_vector(reff, 0, *natom);
  free_vector(sumepol, 0, *natom);
  free_vector(sumelec, 0, *natom);
  free_vector(sumevdw, 0, *natom);
  free_vector(sumdeijda, 0, *natom);
  if( gb == 2 || gb==5 ) free_vector(psi, 0, *natom);

  free_vector (vfgbi, 0, *natom);
  free_vector (vefac, 0,*natom);
  free_vector (r2x, 0, *natom);
  free_vector  (vrinv, 0, *natom);
  free_vector (vexpmkf, 0, *natom);
  free_vector  (vectmp1, 0, *natom);
  
  
  return (epol);
}