/* Copyright (C) 1999, 2000, 2001, 2002, Massachusetts Institute of Technology.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include "../config.h"
#include <check.h>

#include "matrices.h"
#include "blasglue.h"

/* Simple operations on sqmatrices.  Also, some not-so-simple operations,
   like inversion and eigenvalue decomposition. */

/* A = B */
void sqmatrix_copy(sqmatrix A, sqmatrix B)
{
     CHECK(A.p == B.p, "arrays not conformant");

     blasglue_copy(A.p * A.p, B.data, 1, A.data, 1);
}

/* Resize A from its current size to a pxp matrix, assuming that
   A was initially allocated to hold at least this big a matrix.
   If preserve_data is nonzero, copies the existing data in A (or
   a subset of it, if the matrix is shrinking) to the corresponding
   entries of the resized matrix. */
void sqmatrix_resize(sqmatrix *A, int p, short preserve_data)
{
     CHECK(p <= A->alloc_p, "tried to resize beyond allocated limit");

     if (preserve_data) {
	  int i, j;
	  
	  if (p < A->p) {
	       for (i = 0; i < p; ++i)
		    for (j = 0; j < p; ++j)
			 A->data[i*p + j] = A->data[i*A->p + j];
	  }
	  else {
	       for (i = A->p-1; i >= 0; --i)
		    for (j = A->p-1; j >= 0; --j)
			 A->data[i*p + j] = A->data[i*A->p + j];
	  }
     }

     A->p = p;
}

/* U contains the upper triangle of a Hermitian matrix; we copy this 
   to F and also fill in the lower triangle with the adjoint of the upper. */
void sqmatrix_copy_upper2full(sqmatrix F, sqmatrix U)
{
     int i, j;

     CHECK(F.p == U.p, "arrays not conformant");
     for (i = 0; i < U.p; ++i) {
	  for (j = 0; j < i; ++j) {
	       ASSIGN_CONJ(F.data[i*U.p + j], U.data[j*U.p + i]);
	  }
	  for (; j < U.p; ++j)
	       F.data[i*U.p + j] = U.data[i*U.p + j];
     }
}

/* Asym = (A + adjoint(A)) / 2.  Asym is thus Hermitian. */
void sqmatrix_symmetrize(sqmatrix Asym, sqmatrix A)
{
     int i, j;

     CHECK(Asym.p == A.p, "arrays not conformant");

     for (i = 0; i < A.p; ++i) 
	  for (j = 0; j < A.p; ++j) {
	       int ij = i * A.p + j, ji = j * A.p + i;
	       ASSIGN_SCALAR(Asym.data[ij], 
			     0.5 * (SCALAR_RE(A.data[ij]) +
				    SCALAR_RE(A.data[ji])),
			     0.5 * (SCALAR_IM(A.data[ij]) -
				    SCALAR_IM(A.data[ji])));
	  }
}

/* trace(U) */
scalar sqmatrix_trace(sqmatrix U)
{
     int i;
     scalar trace = SCALAR_INIT_ZERO;

     for (i = 0; i < U.p; ++i)
	  ACCUMULATE_SUM(trace, U.data[i*U.p + i]);

     return trace;
}

/* compute trace(adjoint(A) * B) */
scalar sqmatrix_traceAtB(sqmatrix A, sqmatrix B)
{
     scalar trace;

     CHECK(A.p == B.p, "matrices not conformant");

     trace = blasglue_dotc(A.p * A.p, A.data, 1, B.data, 1);

     return trace;
}

/* A = B * C.  If bdagger != 0, then adjoint(B) is used; similarly for C. 
   A must be distinct from B and C.   Note that since the matrices
   are stored in row-major order, the most efficient operation should
   be B * adjoint(C), assuming the BLAS is sane.  i.e. if C is hermitian,
   you should use cdagger = 1.  Conversely, the worst operation is
   probably adjoint(B) * C. */
void sqmatrix_AeBC(sqmatrix A, sqmatrix B, short bdagger,
		   sqmatrix C, short cdagger)
{
     CHECK(A.p == B.p && A.p == C.p, "matrices not conformant");

     blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p,
                   1.0, B.data, B.p, C.data, C.p, 0.0, A.data, A.p);
}

/* A += a B * C.  bdagger, cdagger are as for sqmatrix_AeBC, above. */
void sqmatrix_ApaBC(sqmatrix A, real a, sqmatrix B, short bdagger,
		    sqmatrix C, short cdagger)
{
     CHECK(A.p == B.p && A.p == C.p, "matrices not conformant");

     blasglue_gemm(bdagger ? 'C' : 'N', cdagger ? 'C' : 'N', A.p, A.p, A.p,
                   a, B.data, B.p, C.data, C.p, 1.0, A.data, A.p);
}

/* A += a B */
void sqmatrix_ApaB(sqmatrix A, real a, sqmatrix B)
{
     CHECK(A.p == B.p, "matrices not conformant");

     blasglue_axpy(A.p * A.p, a, B.data, 1, A.data, 1);
}

/* compute A = a*A + b*B; A and B may be equal. */
void sqmatrix_aApbB(real a, sqmatrix A, real b, sqmatrix B)
{
     CHECK(A.p == B.p, "arrays not conformant");

     if (a != 1.0)
          blasglue_rscal(A.p * A.p, a, A.data, 1);

     blasglue_axpy(A.p * A.p, b, B.data, 1, A.data, 1);
}

/* U <- 1/U.  U must be Hermitian and, if positive_definite != 0,
   positive-definite (e.g. U = Yt*Y).  Work is a scratch matrix. */
void sqmatrix_invert(sqmatrix U, short positive_definite,
		     sqmatrix Work)
{
     int i, j;

     if (positive_definite) {
	  /* factorize U: */
	  lapackglue_potrf('U', U.p, U.data, U.p);
	  
	  /* QUESTION: would it be more efficient to stop here,
	     returning the Cholesky factorization of U?  This
	     could then be used to multiply by 1/U without
	     ever calculating the inverse explicitely.  It
	     would probably be more numerically stable, but
	     how do the computational costs compare? */
	  
	  /* Compute 1/U (upper half only) */
	  lapackglue_potri('U', U.p, U.data, U.p);
     }
     else {
	  int *ipiv;
	  CHK_MALLOC(ipiv, int, U.p);

	  CHECK(Work.p * Work.p >= U.p, "scratch matrix is too small");

	  /* factorize U: */
	  lapackglue_hetrf('U', U.p, U.data, U.p,
			   ipiv, Work.data, Work.p * Work.p);

	  /* Compute 1/U (upper half only) */
	  lapackglue_hetri('U', U.p, U.data, U.p, ipiv, Work.data);
	  
	  free(ipiv);
     }
	  
     /* Now, copy the conjugate of the upper half
	onto the lower half of U */
     for (i = 0; i < U.p; ++i)
	  for (j = i + 1; j < U.p; ++j) {
	       ASSIGN_CONJ(U.data[j * U.p + i], U.data[i * U.p + j]);
	  }
}

/* U <- eigenvectors of U.  U must be Hermitian. eigenvals <- eigenvalues.
   W is a work array.  The columns of adjoint(U') are the eigenvectors, so that
   U == adjoint(U') D U', with D = diag(eigenvals). 

   The eigenvalues are returned in ascending order. */
void sqmatrix_eigensolve(sqmatrix U, real *eigenvals, sqmatrix W)
{
     real *work;

     CHK_MALLOC(work, real, 3*U.p - 2);

     if (W.p * W.p >= 3 * U.p - 1)
	  lapackglue_heev('V', 'U', U.p, U.data, U.p, eigenvals,
			  W.data, W.p * W.p, work);
     else {
	  scalar *morework;
	  CHK_MALLOC(morework, scalar, 3 * U.p - 1);
	  lapackglue_heev('V', 'U', U.p, U.data, U.p, eigenvals,
			  morework, 3 * U.p - 1, work);
	  free(morework);
     }

     free(work);
}

/* Compute Usqrt <- sqrt(U), where U must be Hermitian positive-definite. 
   W is a work array, and must be the same size as U.  Both U and
   W are overwritten. */
void sqmatrix_sqrt(sqmatrix Usqrt, sqmatrix U, sqmatrix W)
{
     real *eigenvals;

     CHECK(Usqrt.p == U.p && U.p == W.p, "matrices not conformant");

     CHK_MALLOC(eigenvals, real, U.p);

     sqmatrix_eigensolve(U, eigenvals, W);

     {
	  int i;
	  
	  /* Compute W = diag(sqrt(eigenvals)) * U; i.e. the rows of W
	     become the rows of U times sqrt(corresponding eigenvalue) */
	  for (i = 0; i < U.p; ++i) {
	       CHECK(eigenvals[i] > 0, "non-positive eigenvalue");
	       
	       blasglue_copy(U.p, U.data + i*U.p, 1, W.data + i*U.p, 1);
	       blasglue_rscal(U.p, sqrt(eigenvals[i]), W.data + i*U.p, 1);
	  }
     }

     free(eigenvals);

     /* compute Usqrt = Ut * W */
     sqmatrix_AeBC(Usqrt, U, 1, W, 0);
}


syntax highlighted by Code2HTML, v. 0.9.1