/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

/*
 * Routines that do various matrix multiplications.  These routines
 * are the computational Kernels for all the matrix multiplies in the code,
 * so optimizing them is the way to improve matrix multiplication
 * performance.
 */

#include "header.h"

//
// This routine does the multiplication
//
//       Bres[i][j] += sum_k { B1[i][k]*B2[j][k] }
//
// B1   is s1 x cs in memory layout
// B2   is s2 x cs in memory layout
// Bres is s1 x s2 in memory layout
//
// The matrix multiply is done for the sublocks in memory of size:
// B1   n1 x nc
// B2   n2 x nc
// Bres n1 x n2
//

// extern "C" void esvzgemul(void *,int,char *,
// 			  void *,int,char *,
// 			  void *,int,int,int,int);

void
small_block_matrix_mult(int n1, int n2, int nc,
			int s1, int s2, int cs,
			scalar *B1, scalar *B2, scalar *Bres)
{
// This uses the essl esvzgemul routine to do the work.
// using it turned out to not give ANY performance benefit.
//
//   static scalar Btemp[10000];
//   esvzgemul(B2,cs,"T",B1,cs,"N",Btemp,s2,n2,nc,n1);
//   for (int i=0; i < n1; i++)
//     {
//       register int is2 = i*s2;
//       for (int j=0; j < n2; j++)
// 	{
// 	  Bres[is2+j].x += Btemp[is2+j].x;
// 	  Bres[is2+j].y += Btemp[is2+j].y;
// 	}
//     }
//   return;

  // The code blocks below does this:
  //    scalar sum = 0.0;
  //    for (register int k=0; k<nc; k++)
  //       sum += B1[r1][k] * B2[r2][k];
  //    Bres[r1][r2] += sum;
  //
  // If we have even sized blocks, do register level 2 x 2 blocks
  if ( (n1%2==0) && (n2%2==0) && (nc%2==0) )
    {
      for (int r1 = 0; r1 < n1; r1+=2)		
	for (int r2 = 0; r2 < n2; r2+=2)
	  {
	    int r1cs = r1*cs;
	    int r1s2 = r1*s2;
	    int r2cs = r2*cs;
	    double ax, bx, cx, dx, ex, fx, gx, hx;
	    double ay, by, cy, dy, ey, fy, gy, hy;
	    double wx, xx, yx, zx, wy, xy, yy, zy;
	    wx = xx = yx = zx = wy = xy = yy = zy = 0.0;
	    for (int k=0; k < nc; k+=2)
	      {
		// a = B1[r1][k]    b = B1[r1][k+1],
		// c = B1[r1+1][k]  d = B1[r1+1][k+1]
		ax = B1[r1cs+k].x;
		ay = B1[r1cs+k].y;
		bx = B1[r1cs+k+1].x;
		by = B1[r1cs+k+1].y;
		cx = B1[r1cs+cs+k].x;
		cy = B1[r1cs+cs+k].y;
		dx = B1[r1cs+cs+k+1].x;
		dy = B1[r1cs+cs+k+1].y;
		// e = B2[r2][k]    f = B2[r2][k+1]
		// g = B2[r2+1][k]  h = B2[r2+1][k+1]
		ex = B2[r2cs+k].x;
		ey = B2[r2cs+k].y;
		fx = B2[r2cs+k+1].x;
		fy = B2[r2cs+k+1].y;
		gx = B2[r2cs+cs+k].x;
		gy = B2[r2cs+cs+k].y;
		hx = B2[r2cs+cs+k+1].x;
		hy = B2[r2cs+cs+k+1].y;
		// w = a*e + b*f    x = a*g + b*h
		// y = c*e + d*f    z = c*g + d*h
		wx += ax*ex - ay*ey + bx*fx - by*fy;
		wy += ax*ey + ay*ex + bx*fy + by*fx;
		xx += ax*gx - ay*gy + bx*hx - by*hy;
		xy += ax*gy + ay*gx + bx*hy + by*hx;
		yx += cx*ex - cy*ey + dx*fx - dy*fy;
		yy += cx*ey + cy*ex + dx*fy + dy*fx;
		zx += cx*gx - cy*gy + dx*hx - dy*hy;
		zy += cx*gy + cy*gx + dx*hy + dy*hx;
	      }
	    // Bres[r1][r2]   += w    Bres[r1][r2+1]   += x
	    // Bres[r1+1][r2] += y    Bres[r1+1][r2+1] += z
	    Bres[r1s2+r2].x += wx;
	    Bres[r1s2+r2].y += wy;
	    Bres[r1s2+r2+1].x += xx;
	    Bres[r1s2+r2+1].y += xy;
	    Bres[r1s2+s2+r2].x += yx;
	    Bres[r1s2+s2+r2].y += yy;
	    Bres[r1s2+s2+r2+1].x += zx;
	    Bres[r1s2+s2+r2+1].y += zy;	
	  }
    }
  // The blocks aren't even sized; so do the simpler matrix multiply
  else
    {
      for (int r1 = 0; r1 < n1; r1++)
	for (int r2 = 0; r2 < n2; r2++)
	  {
	    int r1cs = r1*cs;
	    int r1s2 = r1*s2;
	    int r2cs = r2*cs;
	    double sx = 0.0;
	    double sy = 0.0;
	    for (int k=0; k < nc; k++)
	      {
		double ax;
		double ay;
		double bx;
		double by;
		ax = B1[r1cs+k].x;
		ay = B1[r1cs+k].y;
		bx = B2[r2cs+k].x;
		by = B2[r2cs+k].y;
		sx += ax * bx - ay * by;
		sy += ay * bx + ax * by;
	      }
	    Bres[r1s2+r2].x += sx;
	    Bres[r1s2+r2].y += sy;
	  }
    }
}

// define block sizes used for block matrix multiplies
#define BL1 32
#define BL2 32
#define BLK 32

//
// This routine does the operation M = Y1^Y2 via blocked matrix
// multiplies that call the routine above for the actual work.
// So all that is done below is to loop over blocks, load the
// blocks from memory, and then to output the result blocks to M.
//
// n1 and n2 are the number of columns of Y1 and Y2 resp.
// N is the length of the columns of Y1 and Y2.
//
// offsetMrow/col are offsets to the output M (see the formulae below).
//
// transpose==1 means Y1 and Y2 are actually input in transpose format.
// transpose==0 means they are in "normal" format
//
// i.e. in transpose==0 mode, the routine does
//
// M(i+offsetMrow,j+offsetMcol) =
//         sum_k { conj(Y1.col[i].c[k])*Y2.col[j].c[k] }
//
// offsetY2rowtranspose is an offset added to the row accessing of Y2
// when in transpose mode (specially needed when doing distributed
// case when data comes in transposed... see dist_multiply.c for more
// juicy details).
//
// The computation is split into three routines in order for threading
// and serial implementations to all interface most easily:
//
// The routine Y1dagY2_partial_mult() does the actual work of multiplying
//    for indices ib going from start_ib to start_ib+n_ib_todo-1.  The
//    Entire multiply would have ib going from 0 to n1.
//
// The routine Y1dagY2_thread() is a wrapper routine that a thread
//    executes:  it gets the input data given to it, decodes it,
//    and then calls Y1dagY2_partial_mult() with the correct start_ib
//    and n_ib_todo indices.
//
// The master routine Y1dagY2_block_matrix_mult() is the actual routine
// called by the outside, whose job is to compuute M=Y1^Y2.  It either
// calls Y1dagY2_partial_mult() with one big multiply (if running serial),
// or spawns threads tha will run Y1dagY2_thread() (if running threads).
//
static void
Y1dagY2_partial_mult(const ColumnBundle &Y1,
		     const ColumnBundle &Y2,
		     Matrix &M,
		     int start_ib,int n_ib_todo, int n2, int N,
		     int offsetMrow,int offsetMcol,
		     int transpose,int offsetY2rowtranspose)
{
  // loop over blocks of size BL1xBL2 of the output matrix M
  // staring ib at start_ib and ending at start_ib+n_ib_todo-1
  int ib,jb;
  for (ib=start_ib; ib < start_ib+n_ib_todo; ib+=BL1)
    for (jb=0; jb < n2; jb+=BL2)
      {
	// calculate sizes of output block
	int si = (n_ib_todo-(ib-start_ib)) >= BL1 ? BL1 : n_ib_todo%BL1;
	int sj = (n2-jb)                   >= BL2 ? BL2 : n2%BL2;

	// zero output block
	scalar out[BL1][BL2];
	int i,j;
	for (i=0; i < si; i++)
	  for (j=0; j < sj; j++)
	    out[i][j] = 0.0;

	// input blocks for multiply loop below
	scalar b1[BL1][BLK],b2[BL2][BLK];

	// loop over long direction k for the sum
	int kb;
	for (kb=0; kb < N; kb+=BLK)
	  {
	    // size of k-block
	    int sk = (N-kb) >= BLK ? BLK : N%BLK;

	    // get data from Y1 and Y2 into input blocks:
	    // if in transpose mode...
	    int k;
	    if (transpose)
	      {
		for (k=0; k < sk; k++)
		  for (i=0; i < si; i++)
		    {
#if defined SCALAR_IS_COMPLEX
		      b1[i][k] = conj(Y1.col[kb+k]->data.d[ib+i]);
#elif defined SCALAR_IS_REAL
		      b1[i][k] = Y1.col[kb+k]->data.d[ib+i];
#else
#error scalar is neither real nor complex!
#endif
		    }
		for (k=0; k < sk; k++)
		  for (j=0; j < sj; j++)
		    b2[j][k] = Y2.col[kb+k]->data.d[jb+j+offsetY2rowtranspose];
	      } 
	    // non-transpose mode
	    else
	      {
		for (i=0; i < si; i++)
		  for (k=0; k < sk; k++)
		    {
#if defined SCALAR_IS_COMPLEX
		      b1[i][k] = conj(Y1.col[ib+i]->data.d[kb+k]);
#elif defined SCALAR_IS_REAL
		      b1[i][k] = Y1.col[ib+i]->data.d[kb+k];
#else
#error scalar is neither real nor complex!
#endif
		    }
		for (j=0; j < sj; j++)
		  for (k=0; k < sk; k++)
		    b2[j][k] = Y2.col[jb+j]->data.d[kb+k];
	      }
	    // multiply blocks: out[i][j] += sum_k { b1[i][k]*b2[j][k] }
	    small_block_matrix_mult(si,sj,sk,BL1,BL2,BLK,
				    (scalar *)b1,(scalar *)b2,(scalar *)out);
	  } // over kb

	// now write out the output block to the matrix M
	for (i=0; i < si; i++)
	  for (j=0; j < sj; j++)
	    M(ib+i+offsetMrow,jb+j+offsetMcol) = out[i][j];
      } // over ib,jb
}

#ifdef DFT_THREAD
static void *
Y1dagY2_thread(void *arg)
{
  // Decode what the data the thread is given
  dft_thread_data *data = (dft_thread_data *)arg;

  ColumnBundle *Y1 = (ColumnBundle *)data->p1;
  ColumnBundle *Y2 = (ColumnBundle *)data->p2;
  Matrix *M = (Matrix *)data->p3;
  int ib_start = data->start;
  int ib_n_todo = data->n;
  int n2 = data->i1;
  int N = data->i2;
  int offsetMrow = data->i3;
  int offsetMcol = data->i4;
  int transpose = data->i5;
  int offsetY2rowtranspose = data->i6;

  // Call the multiplier above with the ranges this thread works on
  Y1dagY2_partial_mult(*Y1,*Y2,*M,
		       ib_start,ib_n_todo,n2,N,offsetMrow,offsetMcol,
		       transpose,offsetY2rowtranspose);

  // Free the data-passing intermediary and end the thread
  myfree(arg);
  return NULL;
}
#endif // DFT_THREAD

void
Y1dagY2_block_matrix_mult(const ColumnBundle &Y1,
                          const ColumnBundle &Y2,
                          Matrix &M,
                          int n1, int n2, int N,
                          int offsetMrow,int offsetMcol,
                          int transpose,int offsetY2rowtranspose)
{
#ifdef DFT_THREAD
  // Threads!  Distribute the work
  dft_call_threads(n1,
		   (void *)&Y1,(void *)&Y2,(void *)&M,NULL,NULL,
		   n2,N,offsetMrow,offsetMcol,transpose,offsetY2rowtranspose,
		   Y1dagY2_thread);
#else
  // No threads... just do all the work in one call
  Y1dagY2_partial_mult(Y1,Y2,M,0,n1,n2,N,offsetMrow,offsetMcol,
		       transpose,offsetY2rowtranspose);
#endif
}


//
// This routine does the operation YM = Y*M or YM += Y*M via blocked matrix
// multiplies that call the routine small_block_matrix_mult()
// to do the actual FLOP work. So all that is done below is to loop over
// blocks, load the blocks from memory, and then to output the result
// blocks to YM.
//
// N is the length of the columns of YM and Y.
// nrM is the number of rows of M.
// ncM is the number of cols of M.
//
// offsetMrow/col are offsets of the input M (see the formulae below) used
// to read information out of it.
//
// accum==0 does YM =  Y*M
// accum==1 does YM += Y*M
//
// transpose==0 means they are in normal format
// transpose==1 means Y and YM are actually input in transpose format.
//
// The overall routine calculation does
//
//        YM(i,j) = sum_k { Y(i,k)*M(k+offsetMrow,j+offsetMcol) }
//
// In transpose mode, often the routine is called with Y and YM being
// the same column_bundle, so the routine is written so as to be able to do
// the multiplication in place:  i.e. YM and Y being the same will not
// affect things.  This is the role of the temporary matrix temp
// below.
//
// Like the multiplication routine above, this is also broken
// into three pieces to ease threading:
//
// Y_M_partial_mult() does the multiplication for rYb going from
// rYb_start to rYb_start+n_rYb_todo-1 (the entire multiplication has
// rYb going from 0 to N-1)
//
// Y_M_thread() is run by a thread and it calls Y_M_partial_mult() with
// appropriate ranges of rYb
//
// Y_M_block_matrix_mult() is the routine called from the outside and
// it either calls Y_M_partial_mult() once with rYb going from 0 to N-1
// (if serial), or launches threads to do the job.
//
static void
Y_M_partial_mult(const ColumnBundle &Y,
		 const Matrix &M,
		 ColumnBundle &YM,
		 int rYb_start,int n_rYb_todo, int nrM, int ncM,
		 int offsetMrow, int offsetMcol,
		 int transpose,
		 int accum)
{
  // temporary work space
  Matrix temp(BL1,ncM);

  // loop over blocks of size BL1 on the rows of Y (or cols of Ytranspose)
  // rYb goes from rYb_start to rYb_start+n_rYb_todo-1
  int rYb;
  for (rYb=rYb_start; rYb < rYb_start+n_rYb_todo; rYb+=BL1)
    {
      // compute size of block in the rY direction
      int srY = (n_rYb_todo-(rYb-rYb_start)) >= BL1 ? BL1 : n_rYb_todo%BL1;

      // loop over blocks of size BL2 on the columns of M
      int cMb;
      for (cMb=0; cMb < ncM; cMb+=BL2)
	{
	  // compute actual size of block in the cM direction
	  int scM = (ncM-cMb) >= BL2 ? BL2 : ncM%BL2;
	  
	  // zero out output block
	  scalar out[BL1][BL2];
	  int rY,cM;
	  for (rY=0; rY < srY; rY++)
	    for (cM=0; cM < scM; cM++)
	      out[rY][cM] = 0.0;
	  
	  // input blocks for multiply loop below
	  scalar by[BL1][BLK],bm[BL2][BLK];
	  
	  // loop over blocks of rows of M
	  int kb;
	  for (kb=0; kb < nrM; kb+=BLK)
	    {
	      // size of block in k direction
	      int sk = (nrM-kb) >= BLK ? BLK : nrM%BLK;
	      
	      // read in data of Y and M into input blocks
	      int k;
	      if (transpose)
		for (rY=0; rY < srY; rY++)
		  for (k=0; k < sk; k++)
		    by[rY][k] = Y.col[rYb+rY]->data.d[kb+k];
	      else
		for (k=0; k < sk; k++)
		  for (rY=0; rY < srY; rY++)
		    by[rY][k] = Y.col[kb+k]->data.d[rYb+rY];
	      for (k=0; k < sk; k++)
		for (cM=0; cM < scM; cM++)
		  bm[cM][k] = M(kb+k+offsetMrow,cMb+cM+offsetMcol);
	      
	      // do the multiply
	      small_block_matrix_mult(srY,scM,sk,BL1,BL2,BLK,
				      (scalar *)by,(scalar *)bm,(scalar *)out);
	    } // over kb
	  
	  // write out to the output block to the temporary space
	  for (rY=0; rY < srY; rY++)
	    for (cM=0; cM < scM; cM++)
	      temp(rY,cMb+cM) = out[rY][cM];
	  
	} // over cMb

      // write out temporary block to main memory (depending on
      // accumulate and transpose flags)
      int rY,cM;
      if (transpose)
	{
	  if (accum)
	    for (rY=0; rY < srY; rY++)
	      for (cM=0; cM < M.nc; cM++)
		YM.col[rYb+rY]->data.d[cM] += temp(rY,cM);
	  else
	    for (rY=0; rY < srY; rY++)
	      for (cM=0; cM < M.nc; cM++)
		YM.col[rYb+rY]->data.d[cM] = temp(rY,cM);
	}
      else
	{
	  if (accum)
	    for (cM=0; cM < M.nc; cM++)
	      for (rY=0; rY < srY; rY++)
		YM.col[cM]->data.d[rYb+rY] += temp(rY,cM);
	  else
	    for (cM=0; cM < M.nc; cM++)
	      for (rY=0; rY < srY; rY++)
		YM.col[cM]->data.d[rYb+rY] = temp(rY,cM);
	}

    } // over rYb
}  

#ifdef DFT_THREAD
static void *
Y_M_thread(void *arg)
{
  // Decode what the thread is given
  dft_thread_data *data = (dft_thread_data *)arg;

  ColumnBundle *Y = (ColumnBundle *)data->p1;
  Matrix *M = (Matrix *)data->p2;
  ColumnBundle *YM = (ColumnBundle *)data->p3;
  int rYb_start = data->start;
  int rYb_n_todo = data->n;
  int nrM = data->i1;
  int ncM = data->i2;
  int offsetMrow = data->i3;
  int offsetMcol = data->i4;
  int transpose = data->i5;
  int accum = data->i6;

  // Call the multiply on the part belong to this thread
  Y_M_partial_mult(*Y,*M,*YM,
		   rYb_start,rYb_n_todo,
		   nrM,ncM,offsetMrow,offsetMcol,transpose,accum);
  
  // Free the data-passing structure and exit the thread
  myfree(arg);
  return NULL;
}
#endif  // DFT_THREAD


void
Y_M_block_matrix_mult(const ColumnBundle &Y,
		      const Matrix &M,
		      ColumnBundle &YM,
		      int N, int nrM, int ncM,
		      int offsetMrow, int offsetMcol,
		      int transpose,
		      int accum)
{
#ifdef DFT_THREAD
  // Threads!  Distribute the work
  dft_call_threads(N,
		   (void *)&Y,(void *)&M,(void *)&YM,NULL,NULL,
		   nrM,ncM,offsetMrow,offsetMcol,transpose,accum,
		   Y_M_thread);
#else
  // No threads... do all the work in one big call
  Y_M_partial_mult(Y,M,YM,0,N,nrM,ncM,offsetMrow,offsetMcol,transpose,accum);
#endif
} 

//
// Does the matrix multiply mprod = m1*m2
// i.e. mprod(i,j) = sum_k { m1(i,k)*m2(k,j) }
// by doing block matrix multiplies.
//
void
matrix_matrix_block_matrix_mult(const Matrix &m1,
                                const Matrix &m2,
                                Matrix &mprod)
{
  // input blocks
  static scalar ini[BL1][BLK],inj[BL2][BLK];
  
  // output blocks
  static scalar out[BL1][BL2];

  // loop over blocks of output mprod
  int ib,jb;
  for (ib=0; ib < m1.nr; ib+=BL1)
    for (jb=0; jb < m2.nc; jb+=BL2)
      {
	// compute size of blocks
	int si = (m1.nr-ib) >= BL1 ? BL1 : m1.nr%BL1;
	int sj = (m2.nc-jb) >= BL2 ? BL2 : m2.nc%BL2;

	// zero output block 
	int i,j;
	for (i=0; i < si; i++)
	  for (j=0; j < sj; j++)
	    out[i][j].x = out[i][j].y = 0.0;

	// loop over blocks of columns of m1 (i.e. rows of m2)
	int kb;
	for (kb=0; kb < m1.nc; kb+=BLK)
	  {
	    // Size of block in k-direction
	    int sk = (m1.nc-kb) >= BLK ? BLK : m1.nc%BLK;

	    // read in input blocks 
	    int k;
	    for (k=0; k < sk; k++)
	      for (i=0; i < si; i++)
		{
		  ini[i][k].x = m1.c[(ib+i)*m1.nc+kb+k].x;
		  ini[i][k].y = m1.c[(ib+i)*m1.nc+kb+k].y;
		}
	    for (j=0; j < sj; j++)
	      for (k=0; k < sk; k++)
		{
		  inj[j][k].x = m2.c[(kb+k)*m2.nc+jb+j].x;
		  inj[j][k].y = m2.c[(kb+k)*m2.nc+jb+j].y;
		}

	    // Do block mult
	    small_block_matrix_mult(si,sj,sk,BL1,BL2,BLK,
				    (scalar *)ini,(scalar *)inj,(scalar *)out);
	  } // kb blocks

	// write out block
	for (j=0; j < sj; j++)
	  for (i=0; i < si; i++)
	    {
	      mprod.c[(ib+i)*mprod.nc+jb+j].x = out[i][j].x;
	      mprod.c[(ib+i)*mprod.nc+jb+j].y = out[i][j].y;
	    }
      } // (ib,jb) blocks
}