#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include "nab.h"
#include "memutil.h"

#ifdef SCALAPACK
#include "mpi.h"
#endif

/* Here are offsets into the ScaLAPACK descriptor array. */

#define DTYPE_ (0)
#define CTXT_ (1)
#define M_ (2)
#define N_ (3)
#define MB_ (4)
#define NB_ (5)
#define RSRC_ (6)
#define CSRC_ (7)
#define LLD_ (8)
#define DLEN_ (9)

extern REAL_T tdiag, tmmetwo;

REAL_T seconds( void );

INT_T get_blocksize( void );

INT_T get_mytaskid( void );

INT_T get_numtasks( void );

INT_T dspev_(char *, char *, int *, REAL_T *,
	     REAL_T *, REAL_T *, int *, REAL_T *, int *);

INT_T dsyev_(char *, char *, INT_T *, REAL_T *, INT_T *,
	     REAL_T *, REAL_T *, INT_T *, INT_T *);

void blacs_pinfo_(INT_T *, INT_T *);

void sl_init_(INT_T *, INT_T *, INT_T *);

INT_T sl_gridreshape_(INT_T *, INT_T *, INT_T *, INT_T *, INT_T *, INT_T *);

void blacs_gridinfo_(INT_T *, INT_T *, INT_T *, INT_T *, INT_T *);

void blacs_barrier_(INT_T *, char *);

void blacs_gridexit_(INT_T *);

INT_T numroc_(INT_T *, INT_T *, INT_T *, INT_T *, INT_T *);

void descinit_(INT_T *, INT_T *, INT_T *, INT_T *, INT_T *,
	       INT_T *, INT_T *, INT_T *, INT_T *, INT_T *);

void pdsyev_(char *, char *, INT_T *,
	     REAL_T *, INT_T *, INT_T *, INT_T *, REAL_T *,
	     REAL_T *, INT_T *, INT_T *, INT_T *,
	     REAL_T *, INT_T *, INT_T *);

void pdgemr2d_(INT_T *, INT_T *,
	       REAL_T *, INT_T *, INT_T *, INT_T *, 
               REAL_T *, INT_T *, INT_T *, INT_T *,
	       INT_T *);

INT_T myroc(int, int, int, int);

REAL_T *ptr2d(REAL_T *, int *, int, int);


FILE	*safe_fopen( char *, char * );

static int c__3 = 3;

/***********************************************************************
                            SYMNUM()
************************************************************************/

/*     ----- routine to give the symmetry number. only for linear */
/*           molecules. for others symmetry number is unity ----- */

static int symnum(int natoms, REAL_T * amass, REAL_T * sn, int *linear)
{


	/* Parameter adjustments */
	--amass;

	/* Function Body */
	*sn = 1.;
	if (natoms <= 2) {
		*linear = 1;
		if (amass[1] == amass[2]) {
			*sn = 2.;
		}
	}
	return 0;
}								/* symnum */

/* ----------------------------------------------------------------------- */

static int mofi(int natoms, REAL_T * c, REAL_T * amass, REAL_T * pmom)
{
	/* Initialized data */

	static char uplo[1] = "U";
	static char jobz[1] = "V";
	static REAL_T zero = 0.;

	/* System generated locals */

	/* Local variables */
	static REAL_T t[9], x, y, z, e2[30], wt;
	static int i, iat;
	static REAL_T com[3];
	static int ier, iaind;
	static REAL_T totwt, eigvec[9];


/*     compute the principal moments of inertia. */
/*     units are amu-bohr**2 */

	/* Parameter adjustments */
	--pmom;
	--amass;
	--c;

	/* Function Body */


/*     compute the position of the center of mass and translate */
/*     it to the origin. */

	com[0] = zero;
	com[1] = zero;
	com[2] = zero;

	totwt = zero;
	for (iat = 1; iat <= natoms; ++iat) {
		iaind = (iat - 1) * 3;
		wt = amass[iat];
		totwt += wt;
		com[0] += wt * c[iaind + 1];
		com[1] += wt * c[iaind + 2];
		com[2] += wt * c[iaind + 3];
	}

	com[0] /= totwt;
	com[1] /= totwt;
	com[2] /= totwt;

/*     compute the principal moments. */

	for (i = 1; i <= 9; ++i) {
		t[i - 1] = zero;
	}

	for (iat = 1; iat <= natoms; ++iat) {
		wt = amass[iat];
		x = c[1 + 3 * (iat - 1)] - com[1 - 1];
		y = c[2 + 3 * (iat - 1)] - com[2 - 1];
		z = c[3 + 3 * (iat - 1)] - com[3 - 1];
		t[0] += wt * (y * y + z * z);
		t[2] += wt * (x * x + z * z);
		t[5] += wt * (x * x + y * y);
		t[1] -= wt * x * y;
		t[3] -= wt * x * z;
		t[4] -= wt * y * z;
	}
	ier = 0;
	dspev_(jobz, uplo, &c__3, t, &pmom[1], eigvec, &c__3, e2, &ier);
	return 0;
}								/* mofi */

/***********************************************************************
                            THERMO()
************************************************************************/

static int thermo(int natoms, int n, int ilevel, REAL_T *c, REAL_T *amass,
		   REAL_T *freq, REAL_T t, REAL_T patm, REAL_T energy )
{
        REAL_T *vtemp=NULL, *evibn=NULL, *cvibn=NULL, *svibn=NULL;

	/* Initialized data */

	static REAL_T zero = 0.;
	static REAL_T akilo = 1e3;
	static REAL_T pstd = 101325.;
	static REAL_T pt2 = .2;
	static REAL_T half = .5;
	static REAL_T one = 1.;
	static REAL_T onept5 = 1.5;
	static REAL_T two = 2.;
	static REAL_T twopt5 = 2.5;
	static REAL_T four = 4.;
	static REAL_T eight = 8.;

	/* System generated locals */
	REAL_T d__1;

	/* Local variables */
	REAL_T e;
	int i;
	REAL_T p, s, pi, cv, sn, rt, em1;
	int iff;
	REAL_T arg, gas;
	int iat;
	REAL_T con, dum, dum1, dum2, argd, cvib, evib;
	int ndof;
	REAL_T avog, ezkc, pipi, ezpe, tokg, svib, crot, pmom[10],
		erot, etot, ctot, srot, stot, tovt, jpcal, tocal, ccont, ctran,
		econt, etran, scont, stran, tomet, rtemp, boltz, etovt, rtemp1,
		rtemp2, rtemp3, planck;
	int linear;
	REAL_T tokcal, hartre, weight;
	int nimag;
	int mytaskid;

	/* Allocate temporary vectors. */

	vtemp = vector( 1, n );
	evibn = vector( 1, n );
	cvibn = vector( 1, n );
	svibn = vector( 1, n );

	/* Get the MPI task id. */

	mytaskid = get_mytaskid();

/*     given the structure of a molecule and its normal mode vibrational */
/*     frequencies this routine uses standard statistical mechanical */
/*     formulas for an ideal gas (in the canonical ensemble, see, */
/*     for example, d. a. mcquarrie, "statistical thermodynamics", */
/*     harper & row, new york, 1973, chapters 5, 6, and 8) to compute */
/*     the entropy, heat capacity, and internal energy. */

/*     the si system of units is used internally.  conversion to units */
/*     more familiar to most chemists is made for output. */


/*     amass:   atomic weights, in amu. */
/*     pmom:    principal moments of inertia, in amu-bohr**2 and */
/*              in ascending order. */
/*     freq:    vibrational frequencies, in cm**-1 and in ascending */
/*              order */
/*     c    :   coordinates in Angstroms */
/*     vtemp:   vibrational temperatures, in kelvin. */
/*     evibn:   contribution to e from the vibration n. */
/*     cvibn:   contribution to cv from the vibration n. */
/*     svibn:   contribution to s from the vibration n. */
/*     t:       temperature */
/*     patm:    pressure, in atmospheres */


	/* Parameter adjustments */
	--freq;
	--amass;
	--c;


/*     tokg:    kilograms per amu. */
/*     boltz:   boltzman constant, in joules per kelvin. */
/*     planck:  planck constant, in joule-seconds. */
/*     avog:    avogadro constant, in mol**(-1). */
/*     jpcal:   joules per calorie. */
/*     tomet:   metres per Angstrom. */
/*     hartre:  joules per hartree. */

	tokg = 1.660531e-27;
	boltz = 1.380622e-23;
	planck = 6.626196e-34;
	avog = 6.022169e23;
	jpcal = 4.18674;
	tomet = 1e-10;
	hartre = 4.35981e-18;

/*     compute the gas constant, pi, pi**2, and e. */
/*     compute the conversion factors cal per joule and kcal per joule. */

	gas = avog * boltz;
	pi = four * atan(one);
	pipi = pi * pi;
	e = exp(one);
	tocal = one / jpcal;
	tokcal = tocal / akilo;

/*     print the temperature and pressure. */

	p = pstd * patm;
	if (mytaskid == 0) {
	  fprintf(stdout, "\n                - Thermochemistry -\n\n");
	  fprintf(stdout, "Temperature:  %8.3f\n   Pressure:  %8.3f\n", t, patm);
	}
	rt = gas * t;

/*     compute and print the molecular mass in amu, then convert to */
/*     kilograms. */

	weight = zero;
	for (iat = 1; iat <= natoms; ++iat) {
		weight += amass[iat];
	}
	if (mytaskid == 0) {
	  fprintf(stdout, "       Mass:  %8.3f\n", weight);
	}
	weight *= tokg;

/*     compute contributions due to translation: */
/*        etran-- internal energy */
/*        ctran-- constant v heat capacity */
/*        stran-- entropy */

	dum1 = boltz * t;
	d__1 = two * pi;
	dum2 = pow(d__1, onept5);
	arg = pow(dum1, onept5) / planck;
	arg = arg / p * (dum1 / planck);
	arg = arg * dum2 * (weight / planck);
	arg = arg * sqrt(weight) * pow(e, twopt5);
	stran = gas * log(arg);
	etran = onept5 * rt;
	ctran = onept5 * gas;

/*     Compute contributions due to electronic motion: */
/*        It is assumed that the first electronic excitation energy */
/*        is much greater than kt and that the ground state has a */
/*        degeneracy of one.  Under these conditions the electronic */
/*        partition function can be considered to be unity.  The */
/*        ground electronic state is taken to be the zero of */
/*        electronic energy. */

/*     for monatomics print and return. */

	if (natoms <= 1) {
		s = stran * tocal;
		e = etran * tokcal;
		cv = ctran * tocal;
		/* need print statement here */
		free_vector( vtemp, 1, n );
		free_vector( evibn, 1, n );
		free_vector( cvibn, 1, n );
		free_vector( svibn, 1, n );
		return 0;
	}

/*     compute contributions due to rotation. */

/*     Compute the principal moments of inertia, get the rotational */
/*     symmetry number, see if the molecule is linear, and compute */
/*     the rotational temperatures.  Note the imbedded conversion */
/*     of the moments to SI units. */

	mofi(natoms, &c[1], &amass[1], pmom);
	if (mytaskid == 0) {
	  fprintf(stdout, "Principal moments of inertia in amu-A**2:\n");
	  fprintf(stdout, "     %12.2f%12.2f%12.2f\n", pmom[0], pmom[1],
		  pmom[2]);
	}
	linear = 0;
	symnum(natoms, &amass[1], &sn, &linear);
	if (mytaskid == 0) {
	  fprintf(stdout, "Rotational symmetry number is %2.0f\n", sn);
	}
	con = planck / (boltz * eight * pipi);
	con = con / tokg * (planck / (tomet * tomet));
	if (linear) {
		rtemp = con / pmom[2];
		if (mytaskid == 0) {
		  if (rtemp < pt2)
		    fprintf(stdout, "Assuming classical behavior for rotation\n");
		  fprintf(stdout, "Rotational temperature: %8.3f\n", rtemp);
		}
	} else {
		rtemp1 = con / pmom[0];
		rtemp2 = con / pmom[1];
		rtemp3 = con / pmom[2];
		if (mytaskid == 0) {
		  if (rtemp1 < pt2) {
		    fprintf(stdout, "Assuming classical behavior for rotation\n");
		  }
		  fprintf(stdout, "Rotational temperatures: %8.3f  %8.3f  %8.3f\n",
			  rtemp1, rtemp2, rtemp3);
		}
	}

/*         erot-- rotational contribution to internal energy. */
/*         crot-- rotational contribution to cv. */
/*         srot-- rotational contribution to entropy. */

	if (linear) {
		erot = rt;
		crot = gas;
		arg = t / rtemp * (e / sn);
		srot = gas * log(arg);
	} else {
		erot = onept5 * rt;
		crot = onept5 * gas;
		arg = sqrt(pi * e * e * e) / sn;
		dum = t / rtemp1 * (t / rtemp2) * (t / rtemp3);
		arg *= sqrt(dum);
		srot = gas * log(arg);
	}

/*     compute contributions due to vibration. */

/*     compute vibrational temperatures and zero point vibrational */
/*     energy.  only real frequencies are included in the analysis. */

	nimag = 0;
	for( i=1; i<=3*natoms; i++ ) if( freq[i] < -0.5 ) nimag++;
	if (mytaskid == 0) {
	  if( nimag > 0 ) fprintf( stdout, "found %d imaginary frequencies\n", nimag);
	}

/*       (---iff is the first frequency to include in thermo:) */

	if (ilevel != 0) {
		iff = 0 + nimag;
	} else if (linear) {
		iff = 5 + nimag;
	} else {
		iff = 6 + nimag;
	}
	ndof = 3 * natoms - iff;
	con = planck / boltz;
	ezpe = zero;
	for (i = 1; i <= ndof; ++i) {
		vtemp[i] = freq[i + iff] * con * 3e10;
		ezpe += freq[i + iff] * 3e10;
	}
	ezpe = half * planck * ezpe;
	ezkc = ezpe * tokcal * avog;
	if (mytaskid == 0) {
	  fprintf(stdout, "Zero-point vibrational energy: %10.3f\n", ezkc);
	}


/*     compute: */
/*        evib-- the vibrational component of the internal energy. */
/*        cvib-- the vibrational component of the heat capacity. */
/*        svib-- the vibrational component of the entropy. */

	evib = zero;
	cvib = zero;
	svib = zero;
	for (i = 1; i <= ndof; ++i) {

/*       compute some common factors. */

		tovt = vtemp[i] / t;
		etovt = exp(tovt);
		em1 = etovt - one;

/*       compute contributions due to the i'th vibration. */

		econt = tovt * (half + one / em1);
		d__1 = tovt / em1;
		ccont = etovt * (d__1 * d__1);
		argd = one - one / etovt;
		if (argd > 1e-7) {
			scont = tovt / em1 - log(argd);
		} else {
			scont = 0.f;
			if (mytaskid == 0) {
			  fprintf(stdout,
				  "Warning: setting vibrational entropy to zero for mode %d with vtemp = %8.3f\n",
				  i, vtemp[i]);
			}
		}
		evibn[i] = econt * rt;
		cvibn[i] = ccont * gas;
		svibn[i] = scont * gas;

		evib += econt;
		cvib += ccont;
		svib += scont;
	}
	evib *= rt;
	cvib *= gas;
	svib *= gas;

/*     the units are now: */
/*         e-- joules/mol */
/*         c-- joules/mol-kelvin */
/*         s-- joules/mol-kelvin */

	etot = etran + erot + evib;
	ctot = ctran + crot + cvib;
	stot = stran + srot + svib;


/*     convert to the following and print */
/*         e-- kcal/mol */
/*         c-- cal/mol-kelvin */
/*         s-- cal/mol-kelvin */

	etran *= tokcal;
	ctran *= tocal;
	stran *= tocal;
	erot *= tokcal;
	crot *= tocal;
	srot *= tocal;
	evib *= tokcal;
	cvib *= tocal;
	svib *= tocal;
	etot = etran + erot + evib + energy;
	ctot = ctran + crot + cvib;
	stot = stran + srot + svib;
	for (i = 1; i <= ndof; ++i) {
		evibn[i] *= tokcal;
		cvibn[i] *= tocal;
		svibn[i] *= tocal;
	}

	if (mytaskid == 0) {
	  fprintf(stdout,
		  "\n             freq.         E               Cv              S\n");
	  fprintf(stdout,
		  "            cm**-1       kcal/mol       cal/mol-K     cal/mol-K\n");
	  fprintf(stdout, "Total:               %11.3f    %11.3f    %11.3f\n",
		  etot, ctot, stot);
	  fprintf(stdout, "translational:       %11.3f    %11.3f    %11.3f\n",
		  etran, ctran, stran);
	  fprintf(stdout, "rotational:          %11.3f    %11.3f    %11.3f\n",
		  erot, crot, srot);
	  fprintf(stdout, "vibrational:         %11.3f    %11.3f    %11.3f\n",
		  evib, cvib, svib);
	  fprintf(stdout, "ff   energy:         %11.3f\n", energy );
	  
	  for (i = 1; i <= iff; ++i) {
	    fprintf(stdout, " %5d   %10.3f\n", i, freq[i]);
	  }
	  for (i = 1; i <= ndof; ++i) {
	    fprintf(stdout, " %5d   %10.3f  %11.3f    %11.3f    %11.3f\n",
		    i + iff, freq[i + iff], evibn[i], cvibn[i], svibn[i]);
	  }
	}
	free_vector( vtemp, 1, n );
	free_vector( evibn, 1, n );
	free_vector( cvibn, 1, n );
	free_vector( svibn, 1, n );

	return 0;
}								/* thermo */

/***********************************************************************
                            NMODE()
************************************************************************/

/* 
 * Compute normal modes
 *
 * Calling parameters are as follows:
 *
 * x[n]    contains the coordinates
 * n       dimension of variables
 * func    pointer to function that computes the energy, gradient and hessian
 *           (probably only works now with mme2())
 * eigp    number of modes to post to file "vecs"
 *
 */

int nmode( REAL_T *x, int *n, 
	   REAL_T ( *func)( REAL_T*, REAL_T*, REAL_T*, REAL_T*,
			    REAL_T*, int*, int*, int*,
			    int*, int*, int*, int*,
			    int*, int* ),
	   int *eigp )

{
  REAL_T *g=NULL, *m=NULL, *v=NULL, *h=NULL, *work=NULL, *grad=NULL;
  REAL_T sumg, energy, t1, t2;
  int niter, i, j, k, mcopy, natom, ret_val, lwork, info;
  int mytaskid, numtasks, gridim;
  int context_PxQ=-1, context_1x1=-1, context_Nx1=-1;
  int descH_PxQ[DLEN_], descG_PxQ[DLEN_], descG_1x1[DLEN_];
  FILE *vfp;
  REAL_T pressure, temperature;
  char uplo, jobz;
  size_t ncopy;

#ifdef SCALAPACK
  int zero=0, one=1;
  int myrow, mycol, nprow, npcol;
  int myrowC, mycolC, nprowC, npcolC;
  int bs3, ierror, lld;
  int np, nq, nb, sizesytrd, nprocs, contextC, nrc, ldc, qrmem;
  int lldH_PxQ, lldG_PxQ, lldZ_PxQ, lldG_1x1;
  size_t locpH_PxQ, locpG_PxQ, locpG_1x1;
  size_t locqH_PxQ, locqG_PxQ, locqG_1x1;
  size_t sizeH_PxQ, sizeG_PxQ, sizeG_1x1;
  size_t adr, sizemqrleft;
  REAL_T *ptr, *reductarr=NULL, *eigvecrow=NULL;
#endif

  /* If PRINT_NM_TIMES is defined print some calculation times. */

#undef PRINT_NM_TIMES

  /* Get mytaskid and, if SCALAPACK is defined, numtasks. */

  mytaskid = get_mytaskid();

#ifdef SCALAPACK
  numtasks = get_numtasks();
#endif

  /* Allocate some dynamic vectors and matrices. */

  mcopy = *n;
  ncopy = *n;
  m = vector( 1, ncopy );
  v = vector( 1, ncopy );

#ifndef SCALAPACK

  /* If SCALAPACK is not defined, allocate full copies of g and h. */

  g = vector( 1, ncopy );
  h = vector( 0, ncopy*ncopy );

#else

  /*
   * If SCALAPACK is defined, allocate distributed copies of g and h,
   * as well as a single copy of grad.
   *
   * Create a general context.  Although context_PxQ does comprise all
   * of the processes, it appears that the general context must be
   * distinct from the context(s) of the matrices that participate
   * in the redistribution via pdgemr2d.
   *
   * The topologic layout of the general context does not appear
   * to be important, so this general context is row cyclic.
   *
   * It appears to be important that the most general context be created
   * first, followed by successively less general contexts.  For this
   * code the correct order of creation is Nx1 then PxQ then 1x1.  Each
   * context is subsumed by the earlier created contexts.  I don't know
   * what to do about overlapping contexts where one does not sumsume
   * the other.  Failure to the most general context first leads to
   * synchronization errors.
   */

  sl_init_(&context_Nx1, &numtasks, &one);

  /* Calculate the dimensions of the largest possible square process grid. */

  gridim = (int)(sqrt((REAL_T)numtasks)+0.5);
  if (gridim*gridim > numtasks) {
    gridim -= 1;
  }
  if (gridim == 0) {
    gridim = 1;
  }

  /*
   * Initialize the process grid for block cyclic distribution of matrices.
   * Note that a returned context of -1 indicates that the task is not
   * active on the process grid.
   */

  sl_init_(&context_PxQ, &gridim, &gridim);

  /* Initialize the process grid for a single (1x1) process. */

  sl_init_(&context_1x1, &one, &one);

  /*
   * Get the number of rows and columns on the block cyclic (PxQ) process grid,
   * as well as this task's row and column on the grid.
   */

  blacs_gridinfo_(&context_PxQ, &nprow, &npcol, &myrow, &mycol);

  /*
   * Get the blocksize for a square block.  Because in egb2 the
   * loop index i selects three rows of the Hessian, multiply
   * the blocksize by three.
   */

  bs3 = 3*get_blocksize();

  /*
   * If this task is on the process grid, set up the array descriptors.
   * If this task isn't on the process grid, set descZ_PxQ[CTXT_],
   * descG_PxQ[CTXT_] and descH_PxQ[CTXT_] to -1.  These values will
   * be used by pdgemr2d to determine activity on the grid.
   */

  if (context_PxQ >= 0) {

    /*
     * Allocate then distribute the Hessian matrix h, the eigenvector
     * matrix z and the gradient vector g on the block cyclic process grid.
     * The gradient vector g does not need to be allocated for columns other
     * than column 0, but the descinit_ function must be called for all columns.
     *
     * The numroc_ function is used to calculate the number of matrix
     * elements that are distributed across a PxQ processor grid.
     */

    locpG_PxQ = numroc_(&mcopy, &bs3, &myrow, &zero, &nprow);
    locqG_PxQ = numroc_(&one, &bs3, &mycol, &zero, &npcol);
    sizeG_PxQ = locpG_PxQ * locqG_PxQ;
    lldG_PxQ = locpG_PxQ;
    descinit_(descG_PxQ, &mcopy, &one, &bs3, &bs3,
	      &zero, &zero, &context_PxQ, &lldG_PxQ, &info);
    g = vector( 1, sizeG_PxQ );

    locpH_PxQ = numroc_(&mcopy, &bs3, &myrow, &zero, &nprow);
    locqH_PxQ = numroc_(&mcopy, &bs3, &mycol, &zero, &npcol);
    sizeH_PxQ = locpH_PxQ * locqH_PxQ;
    lldH_PxQ = locpH_PxQ;
    descinit_(descH_PxQ, &mcopy, &mcopy, &bs3, &bs3,
	      &zero, &zero, &context_PxQ, &lldH_PxQ, &info);
    h = vector( 0, sizeH_PxQ );
  
  } else {
    descG_PxQ[CTXT_] = -1;
    descH_PxQ[CTXT_] = -1;
  }

  /*
   * Get the number of rows and columns on the single process grid,
   * as well as this task's row and column on the grid.
   */

  blacs_gridinfo_(&context_1x1, &nprow, &npcol, &myrow, &mycol);

  /*
   * If this task is on the process grid, set up the array descriptors.
   * If this task isn't on the process grid, set descG_1x1[CTXT_] ] to -1.
   * This value will be used by pdgemr2d to determine activity on the grid.
   */

  if (context_1x1 >= 0) {

    /*
     * Allocate then distribute the gradient vector grad on the single
     * process grid.  The descinit_ function is called for this array
     * for only the task that is active on the grid.
     *
     * Also, for the other tasks that are not active, the vector grad
     * is allocated because the copy of grad from the single process
     * grid is broadcast to the other copies.  The size of this vector
     * for these other tasks is determined by the ncopy variable.
     *
     * The numroc_ function is used to calculate the number of matrix
     * elements that are distributed across a 1x1 processor grid.
     */

    locpG_1x1 = numroc_(&mcopy, &bs3, &myrow, &zero, &nprow);
    locqG_1x1 = numroc_(&one, &bs3, &mycol, &zero, &npcol);
    sizeG_1x1 = locpG_1x1 * locqG_1x1;
    lldG_1x1 = locpG_1x1;
    descinit_(descG_1x1, &mcopy, &one, &bs3, &bs3,
	      &zero, &zero, &context_1x1, &lldG_1x1, &info);
    grad = vector( 1, sizeG_1x1 );

  } else {
    descG_1x1[CTXT_] = -1;
    grad = vector( 1, ncopy );
  }

  /* Allocate the eigenvector row and reduction arrays. */

  eigvecrow = vector( 0, ncopy );
  reductarr = vector( 0, ncopy );
  
#endif /* ifndef SCALAPACK */

  niter = 1;

  t1 = seconds();

  /*
   * For non-ScaLAPACK execution, set some variables to values that
   * will select the proper sections of code below.
   */

#ifndef SCALAPACK

  gridim = 1;
  context_PxQ = 0;

#endif

  /* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */

  /*
   * Compute the function value in "f",
   * its gradient (with respect to x) in "g", and
   * its hessian (with respect to x) in "h".
   * The atomic masses are returned in "m".
   * The number of atoms is returned in "natom".
   *
   * The grad, descG_PxQ, descG_1x1, descH_PxQ,
   * gridim, context_PxQ, context_1x1 and context_Nx1
   * calling parameters supply ScaLAPACK information,
   * or are dummy arguments for non-ScaLAPACK execution.
   *
   * The &g[1] and &m[1] calling parameters
   * map from 1..n indexing in this newton function
   * to 0..n-1 indexing in *func (which defaults to mme2).
   * This technique is not used for h because it must be
   * indexed from 1 to n.
   */

  energy = ( *func )( x, &g[ 1 ], h, &m[ 1 ],
		      &grad[ 1 ], descG_PxQ, descG_1x1, descH_PxQ,
		      &context_PxQ, &context_1x1, &context_Nx1,
		      &gridim, &natom, &niter );

  /* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% */

  t2 = seconds();

#ifdef PRINT_NM_TIMES
  if (mytaskid == 0) {
    printf("\nmme2 time = %10.2f\n\n", t2 - t1);
    fflush(stdout);
  }
#endif

  tmmetwo += t2 - t1;

  /*
   * Some ScaLAPACK functions appear to quit unexpectedly
   * for large matrices on a 1x1 process grid, e.g. the pdgemm_
   * function in newton.c, so bypass the ScaLAPACK pdsyev_ and
   * use the LAPACK dsyev_ function instead.
   *
   * The correct test is (gridim == 1), not (nprow == 1 && npcol == 1)
   * since processes that aren't on the 1x1 grid have nprow == npcol == -1,
   * which would direct control to pdgemr2d_ (below) that would hang because
   * it would not be called from all processes, specifically not from
   * the process that is on the 1x1 grid and has nprow == npcol == 1.
   *
   * Non-ScaLAPACK execution will select this section of code because
   * gridim and context_PxQ were set to 1 and 0, respectively, above.
   */

  if (gridim == 1) {
    if (context_PxQ >= 0) {

      sumg = 0.0;
      for (i = 1; i <= ncopy; i++) {
	sumg += g[i] * g[i];
      }
      sumg = sqrt(sumg/((REAL_T)ncopy));
      if (mytaskid == 0) {
	printf( "RMS gradient = %12.5e\n", sumg );
	fflush( stdout );
      }

      /*
       * Mass weight the Hessian:
       */

      j = 1;
      for(i=1; i<=natom; i++) {
	g[j+2] = g[j+1] = g[j] = 1.0/sqrt(m[i]);
	j += 3;
      }

      k = 0;
      for(i=1; i<=ncopy; i++){
	for(j=1; j<=ncopy; j++){
	  h[k] = g[i]*h[k]*g[j];
	  k++;
	}
      }

      /*
       * Allocate the work array, call dsyev_ and deallocate the work array.
       * This step requires that LAPACK and BLAS support be available in your
       * NAB installation.
       */

      if( *eigp ) jobz = 'V'; else jobz = 'N';
      uplo = 'U';
      lwork = 3*ncopy;
      work = vector( 0, lwork );
      k=0;

      t1 = seconds();

      dsyev_( &jobz, &uplo, n, h, n, &v[1], work, &lwork, &info );

      t2 = seconds();

#ifdef PRINT_NR_TIMES
      if (mytaskid == 0) {
	printf("dysev time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
#endif

      tdiag += t2 - t1;

      free_vector( work, 0, lwork );

      /* un-mass-weight the eigenvectors:  */

      k = 0;
      for( i=1; i<=ncopy; i++ ){
	for( j=1; j<=ncopy; j++ ){
	  h[k++] *= g[j];
	}
      }

      /*  get the eigenvalues into cm**-1 units:   */

      for(i=1; i<=ncopy; i++){
	v[i] = v[i]>=0.0 ? 108.587*sqrt(v[i]) : -108.587*sqrt(-v[i]);
      }

      /* Make an Amber-compatible formatted vecs file but only via task 0. */

      if( *eigp && mytaskid == 0){
	vfp = safe_fopen( "vecs", "w" );
	fprintf( vfp, " modes from NAB\n" );
	fprintf( vfp, "%5d\n", ncopy );
	for( i=0; i<ncopy; i++ ){
	  fprintf( vfp, "%11.5f", x[i] );
	  if( i%7 == 6 ) fprintf( vfp, "\n" );
	}
	if( ncopy%7 != 0 ) fprintf( vfp, "\n" );
	k = 0;
	for( j=1; j<=*eigp; j++ ){
	  fprintf( vfp, "****\n%5d%12.5f\n", j, v[j] );
	  for( i=0; i<ncopy; i++ ){
	    fprintf( vfp, "%11.5f", h[k+i] );
	    if( i%7 == 6 ) fprintf( vfp, "\n" );
	  }
	  if( ncopy%7 != 0 ) fprintf( vfp, "\n" );
	  k += ncopy;
	}
	fclose( vfp );
      }

      /*  get the thermodynamic parameters  */

      j = 0;
      pressure = 1.0;
      temperature = 298.15;
      thermo( natom, mcopy, j, x, &m[1], &v[1], temperature, pressure, energy);
    }
  } else {

#ifdef SCALAPACK

    /*
     * Here is the ScaLAPACK code that will execute if gridim != 1,
     * i.e., for more than one process on the process grid.
     *
     * Gather the distributed g vectors to the non-distributed grad vector.
     * Call the pdgemr2d_ function from all processes so that it doesn't hang.
     */

    pdgemr2d_(&mcopy, &one,
	      &g[1], &one, &one, descG_PxQ,
	      &grad[1], &one, &one, descG_1x1, &context_Nx1);

    /*
     * Calculate the RMS error for the gradient and print via the
     * process of the 1x1 grid.
     */

    if (context_1x1 >= 0) {
      sumg = 0.0;
      for (i = 1; i <= ncopy; i++) {
	sumg += grad[i] * grad[i];
      }
      sumg = sqrt(sumg/((REAL_T)ncopy));
      printf( "RMS gradient = %12.5e\n", sumg );
      fflush( stdout );
    }

    /*
     * Mass weight the Hessian:
     *
     * Get the processor row and column in context_PxQ, and  modify the
     * diagonal elements of the Hessian.  If the task is not active
     * in context_PxQ, the myroc function returns 0.
     */

    blacs_gridinfo_(&context_PxQ, &nprow, &npcol, &myrow, &mycol);

    if (context_PxQ >= 0) {
      j = 1;
      for(i=1; i<=natom; i++) {
	grad[j+2] = grad[j+1] = grad[j] = 1.0/sqrt(m[i]);
	j += 3;
      }

      for (i=1; i<=ncopy; i++) {
	for (j=1; j<=ncopy; j++) {
	  ptr = ptr2d(h, descH_PxQ, i-1, j-1);
	  if (ptr != NULL) {
	    *ptr = grad[i] * (*ptr) * grad[j];
	  }
	}
      }
    
      /*
       *  Call the ScaLAPACK routines; requires that ScaLAPACK and BLAS support
       *  be available in your NAB installation.
       *
       * Calculate minimum size for the work array with or without calculation
       * of the eigenvectors.  See the pdsyev_ source code for details on how
       * to calculate LWMIN.
       */

      if ( *eigp ) {
	jobz = 'V';
	nprocs = nprow * npcol;
	contextC = sl_gridreshape_(&context_PxQ, &zero, &one,
				   &one, &nprocs, &one);
	blacs_gridinfo_(&contextC, &nprowC, &npcolC, &myrowC, &mycolC);
	blacs_gridexit_(&contextC);
	nb = descH_PxQ[NB_];
	nrc = numroc_(&mcopy, &nb, &myrowC, &zero, &nprocs);
	ldc = (nrc > 1) ? nrc : 1;
	np = numroc_(&mcopy, &nb, &myrow, &zero, &nprow);
	nq = numroc_(&mcopy, &nb, &mycol, &zero, &npcol);
	sizemqrleft = nb*nb +
	  ( (nb*(nb-1)/2 > (np+nq)*nb) ? nb*(nb-1)/2 : (np+nq)*nb );
	qrmem = 2*ncopy-2;
	lwork = 5*ncopy + ncopy*ldc + 1 +
	  ( (sizemqrleft > qrmem) ? sizemqrleft : qrmem );
      } else {
	jobz = 'N';
	nb = descH_PxQ[NB_];
	np = numroc_(&mcopy, &nb, &myrow, &zero, &nprow);
	sizesytrd = ( 3*nb > nb*(np+1) ) ? 3*nb : nb*(np+1);
	lwork = 5*ncopy + sizesytrd + 1;
      }

      /*
       * Allocate the work array, call pdsyev_ and deallocate the work array.
       * This step requires that ScaLAPACK and BLACS support be available in
       * your NAB installation.
       */

      work = vector( 0, lwork );

      t1 = seconds();

      uplo = 'U';
      pdsyev_( &jobz, &uplo, n,
	       h, &one, &one, descH_PxQ, &v[1],
	       h, &one, &one, descH_PxQ,
	       work, &lwork, &info );

      t2 = seconds();

#ifdef PRINT_NR_TIMES
      if (mytaskid == 0) {
	printf("pdsyev time = %10.2f\n\n", t2 - t1);
	fflush(stdout);
      }
#endif

      tdiag += t2 - t1;

      free_vector( work, 0, lwork );

      /* un-mass-weight the eigenvectors:  */

      for (i=1; i<=ncopy; i++) {
	for (j=1; j<=ncopy; j++) {
	  ptr = ptr2d(h, descH_PxQ, i-1, j-1);
	  if (ptr != NULL) {
	    *ptr *= grad[j];
	  }
	}
      }

      /* get the eigenvalues into cm**-1 units:   */

      for(i=1; i<=ncopy; i++){
	v[i] = v[i]>=0.0 ? 108.587*sqrt(v[i]) : -108.587*sqrt(-v[i]);
      }
    }
    
    /*
     * Make an Amber-compatible formatted vecs file.  Write only by task 0.
     * Because the elements of the eigenvector matrix are distributed across
     * all of the processes of context_PxQ, it is necessary to gather them
     * for writing by task 0.  This gather is accomplished by copying the
     * matrix elements one row at a time into the eigvecrow array in the following
     * manner.  If a matrix element exists in the submatrix associated with
     * a particular process, that element is copied to the eigvecrow array,  If
     * that element does not exist in the submatrix, a zero is placed in the
     * eigvecrow array.  Once a row of the eigenvector matrix has been processed
     * in this manner, a reduction is performed involving all of the eigvecrow
     * arrays, including those that aren't active on the process grid so that
     * pdgemr2d_ doesn't hang.
     */

    if ( *eigp ) {
      if (mytaskid == 0) {
	vfp = safe_fopen( "vecs", "w" );
	fprintf( vfp, " modes from NAB\n" );
	fprintf( vfp, "%5d\n", ncopy );
	for( i=0; i<ncopy; i++ ){
	  fprintf( vfp, "%11.5f", x[i] );
	  if( i%7 == 6 ) fprintf( vfp, "\n" );
	}
	if( ncopy%7 != 0 ) fprintf( vfp, "\n" );
      }
      for( j=1; j<=*eigp; j++ ){
	if (mytaskid == 0) {
	  fprintf( vfp, "****\n%5d%12.5f\n", j, v[j] );
	}
	for( i=0; i<ncopy; i++ ){
	  ptr = ptr2d(h, descH_PxQ, j-1, i);
	  if (ptr != NULL) {
	    eigvecrow[i] = *ptr;
	  } else {
	    eigvecrow[i] = 0.0;
	  }
	}
	ierror = MPI_Allreduce(eigvecrow, reductarr, ncopy,
			       MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	if (ierror != MPI_SUCCESS) {
	  printf("Error in nmode work reduction, error = %d  mytaskid = %d\n",
		 ierror, mytaskid);
	}
	for( i=0; i<ncopy; i++ ){
	  if (mytaskid == 0) {
	    fprintf( vfp, "%11.5f", reductarr[i] );
	    if( i%7 == 6 ) fprintf( vfp, "\n" );
	  }
	}
	if (mytaskid == 0) {
	  if( ncopy%7 != 0 ) fprintf( vfp, "\n" );
	}
      }
      if (mytaskid == 0) {
	fclose( vfp );
      }
    }

    /*  get the thermodynamic parameters */

    if (context_PxQ >= 0) {
      j = 0;
      pressure = 1.0;
      temperature = 298.15;
      thermo( natom, mcopy, j, x, &m[1], &v[1], temperature, pressure, energy );
    }

#endif /* ifdef SCALAPACK */

  }

#ifndef SCALAPACK

  /* Free the dynamic vectors and matrices. */

  if (g != NULL) free_vector( g, 1, ncopy );
  if (m != NULL) free_vector( m, 1, ncopy );
  if (v != NULL) free_vector( v, 1, ncopy );
  if (h != NULL) free_vector( h, 0, ncopy*ncopy );

#else

  /* Free the dynamic vectors and matrices. */

  if (context_PxQ >= 0) {
    if (g != NULL) free_vector( g, 1, sizeG_PxQ );
    if (h != NULL) free_vector( h, 0, sizeH_PxQ );
  }
  if (context_1x1 >= 0) {
    if (grad != NULL) free_vector( grad, 1, sizeG_1x1 );
  } else {
    if (grad != NULL) free_vector( grad, 1, ncopy );
  }
  if (eigvecrow != NULL) free_vector( eigvecrow, 0, ncopy );
  if (reductarr != NULL) free_vector( reductarr, 0, ncopy );
  if (m != NULL) free_vector( m, 1, ncopy );
  if (v != NULL) free_vector( v, 1, ncopy );

  /* Exit the process grid only for tasks that are on the process grid. */

  if (context_PxQ >= 0) {
    blacs_gridexit_(&context_PxQ);
  }
  if (context_1x1 >= 0) {
    blacs_gridexit_(&context_1x1);
  }

  if (context_Nx1 >= 0) {
    blacs_gridexit_(&context_Nx1);
  }

#endif /* ifndef SCALAPACK */

  /* Call egb2 with an iteration count of -3 to free static vectors. */

  niter = -3;
  ( *func )( x, &g[ 1 ], h, &m[ 1 ],
	     &grad[ 1 ], descG_PxQ, descG_1x1, descH_PxQ,
	     &context_PxQ, &context_1x1, &context_Nx1,
	     &gridim, &natom, &niter );

  ret_val = 0;
  return ret_val;
}