/*    __author__ = "Richard Everson (R.M.Everson@exeter.ac.uk)"
    __revision__ = "$Revision: 1.7 $"
    __version__  = "1.0"

    MIT License
*/
    
static char module_doc[] =
"This module provides a BLAS optimized\nmatrix multiply, inner product and dot for Numeric arrays";


#include "Python.h"
#include "libnumarray.h"
#include "arrayobject.h"
#include <cblas.h>

#include <stdio.h>

/* Defined to be the same as MAX_DIMS in multiarray */
#define MAX_DIMS 40


static void FLOAT_dot(void *a, int stridea, void *b, int strideb, void *res, int n)
{
  *((float *)res) = cblas_sdot(n, (float *)a, stridea, (float *)b, strideb);
}

static void DOUBLE_dot(void *a, int stridea, void *b, int strideb, void *res, int n)
{
  *((double *)res) = cblas_ddot(n, (double *)a, stridea, (double *)b, strideb);
}

static void CFLOAT_dot(void *a, int stridea, void *b, int strideb, void *res, int n)
{
  cblas_cdotu_sub(n, (double *)a, stridea, (double *)b, strideb,
                  (double *)res);
}

static void CDOUBLE_dot(void *a, int stridea, void *b, int strideb, void *res, int n)
{
  cblas_zdotu_sub(n, (double *)a, stridea, (double *)b, strideb,
                  (double *)res);
}


typedef void (DotFunction)(void *, int, void *, int, void *, int);

static DotFunction *dotFunctions[PyArray_NTYPES];


static char doc_dot[] = "dot(a,b)\nReturns the dot product of a and b for arrays of floating point types.\nLike the generic Numeric equivalent the product sum is over\nthe last dimension of a and the second-to-last dimension of b.\nNB: The first argument is not conjugated.";

static PyObject *dotblas_dot(PyObject *dummy, PyObject *args) {
  PyObject *op1, *op2;
  PyArrayObject *ap1, *ap2, *ret;
  int i, j, l, lda, ldb, matchDim = -1, otherDim = -1;
  int typenum;
  int dimensions[MAX_DIMS], nd;
  static const float oneF[2] = {1.0, 0.0};
  static const float zeroF[2] = {0.0, 0.0};
  static const double oneD[2] = {1.0, 0.0};
  static const double zeroD[2] = {0.0, 0.0};


  if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
	
  /* 
   * "Matrix product" using the BLAS.  
   * Only works for float double and complex types.
   */


  typenum = PyArray_ObjectType(op1, 0);  
  typenum = PyArray_ObjectType(op2, typenum);

  ret = NULL;
  ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
  if (ap1 == NULL) return NULL;
  ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
  if (ap2 == NULL) goto fail;

  if (typenum != PyArray_FLOAT && typenum != PyArray_DOUBLE &&
      typenum != PyArray_CFLOAT && typenum != PyArray_CDOUBLE) {
   PyErr_SetString(PyExc_TypeError, "at least one argument must be (possibly complex) float or double");
    goto fail;
  }

  
  if (ap1->nd < 0 || ap2->nd < 0) {
    PyErr_SetString(PyExc_TypeError, "negative dimensioned arrays!");
    goto fail;
  }

  if (ap1->nd == 0 || ap2->nd == 0) {
    /* One of ap1 or ap2 is a scalar */
    if (ap1->nd == 0) {		/* Make ap2 the scalar */
      PyArrayObject *t = ap1;
      ap1 = ap2;
      ap2 = t;
    }
    for (l = 1, j = 0; j < ap1->nd; j++) {
      dimensions[j] = ap1->dimensions[j];
      l *= dimensions[j];
    }
    nd = ap1->nd;
  }
  else if (ap1->nd <= 2 && ap2->nd <= 2) {
    /*  Both ap1 and ap2 are vectors or matrices */
    l = ap1->dimensions[ap1->nd-1];
	
    if (ap2->dimensions[0] != l) {
      PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
      goto fail;
    }
    nd = ap1->nd+ap2->nd-2;
 
    if (nd == 1) 
      dimensions[0] = (ap1->nd == 2) ? ap1->dimensions[0] : ap2->dimensions[1];
    else if (nd == 2) {
      dimensions[0] = ap1->dimensions[0];
      dimensions[1] = ap2->dimensions[1];
    }
  }
  else {
    /* At least one of ap1 or ap2 has dimension > 2. */
    l = ap1->dimensions[ap1->nd-1];
    matchDim = ap2->nd - 2;
    otherDim = ap2->nd - 1;
    
    if (ap2->dimensions[matchDim] != l) {
	PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
	goto fail;
    }

    nd = ap1->nd+ap2->nd-2;
    j = 0;
    for(i=0; i<ap1->nd-1; i++) {
	dimensions[j++] = ap1->dimensions[i];
    }
    for(i=0; i<ap2->nd-2; i++) {
	dimensions[j++] = ap2->dimensions[i];
    }
    if(ap2->nd > 1) {
      dimensions[j++] = ap2->dimensions[ap2->nd-1];
    }
  }

  ret = (PyArrayObject *)PyArray_FromDims(nd, dimensions, typenum);
  if (ret == NULL) goto fail;

  NA_clearFPErrors();

  if (ap2->nd == 0) {
    /* Multiplication by a scalar -- Level 1 BLAS */
    if (typenum == PyArray_DOUBLE) {
      cblas_daxpy(l, *((double *)ap2->data), (double *)ap1->data, 1,
		  (double *)ret->data, 1);
    } 
    else  if (typenum == PyArray_FLOAT) {
      cblas_saxpy(l, *((float *)ap2->data), (float *)ap1->data, 1,
		  (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zaxpy(l, (double *)ap2->data, (double *)ap1->data, 1,
		  (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_caxpy(l, (float *)ap2->data, (float *)ap1->data, 1,
		  (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 1 && ap2->nd == 1) {
    /* Dot product between two vectors -- Level 1 BLAS */
    if (typenum == PyArray_DOUBLE) {
      double result = cblas_ddot(l, (double *)ap1->data, 1, 
				 (double *)ap2->data, 1);
      *((double *)ret->data) = result;
    }
    else if (typenum == PyArray_FLOAT) {
      float result = cblas_sdot(l, (float *)ap1->data, 1, 
				 (float *)ap2->data, 1);
      *((float *)ret->data) = result;
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zdotu_sub(l, (double *)ap1->data, 1, 
		      (double *)ap2->data, 1, (double *)ret->data);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cdotu_sub(l, (float *)ap1->data, 1, 
		      (float *)ap2->data, 1, (float *)ret->data);
    }
  }
  else if (ap1->nd == 2 && ap2->nd == 1) {
    /* Matrix vector multiplication -- Level 2 BLAS */
    /* lda must be MAX(M,1) */
    lda = (ap1->dimensions[1] > 1 ? ap1->dimensions[1] : 1);
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  1.0, (double *)ap1->data, lda, 
		  (double *)ap2->data, 1, 0.0, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  1.0, (float *)ap1->data, lda, 
		  (float *)ap2->data, 1, 0.0, (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  oneD, (double *)ap1->data, lda, 
		  (double *)ap2->data, 1, zeroD, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  oneF, (float *)ap1->data, lda, 
		  (float *)ap2->data, 1, zeroF, (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 1 && ap2->nd == 2) {
    /* Vector matrix multiplication -- Level 2 BLAS */
    lda = (ap2->dimensions[1] > 1 ? ap2->dimensions[1] : 1);
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemv(CblasRowMajor, 
		  CblasTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  1.0, (double *)ap2->data, lda,
		  (double *)ap1->data, 1, 0.0, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemv(CblasRowMajor, 
		  CblasTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  1.0, (float *)ap2->data, lda,
		  (float *)ap1->data, 1, 0.0, (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemv(CblasRowMajor, 
		  CblasTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  oneD, (double *)ap2->data, lda,
		  (double *)ap1->data, 1, zeroD, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemv(CblasRowMajor, 
		  CblasTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  oneF, (float *)ap2->data, lda,
		  (float *)ap1->data, 1, zeroF, (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 2 && ap2->nd == 2) {
    /* Matrix matrix multiplication -- Level 3 BLAS */  
    lda = (ap1->dimensions[1] > 1 ? ap1->dimensions[1] : 1);
    ldb = (ap2->dimensions[1] > 1 ? ap2->dimensions[1] : 1);
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
		  ap1->dimensions[0], ap2->dimensions[1], ap2->dimensions[0],
		  1.0, (double *)ap1->data, lda,
		  (double *)ap2->data, ldb,
		  0.0, (double *)ret->data, ldb);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
		  ap1->dimensions[0], ap2->dimensions[1], ap2->dimensions[0],
		  1.0, (float *)ap1->data, lda,
		  (float *)ap2->data, ldb,
		  0.0, (float *)ret->data, ldb);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
		  ap1->dimensions[0], ap2->dimensions[1], ap2->dimensions[0],
		  oneD, (double *)ap1->data, lda,
		  (double *)ap2->data, ldb,
		  zeroD, (double *)ret->data, ldb);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
		  ap1->dimensions[0], ap2->dimensions[1], ap2->dimensions[0],
		  oneF, (float *)ap1->data, lda,
		  (float *)ap2->data, ldb,
		  zeroF, (float *)ret->data, ldb);
    }
  }
  else {
    /* Deal with arrays with dimension greater than two.  All we do is copy
     * the original multiarraymodule.c/PyArray_InnerProduct but use the BLAS
     * one dimensional functions instead of the macro versions in
     * multiarraymodule
     */

    int is1, is2, is1r, is2r, n1, n2, os, i1, i2;
    char *ip1, *ip2, *op;
    DotFunction *dot = dotFunctions[(int)(ret->descr->type_num)];
    if (dot == NULL) {
      PyErr_SetString(PyExc_ValueError, 
                      "dotblas matrixMultiply not available for this type");
      goto fail;
    }

    n1 = PyArray_SIZE(ap1)/l;
    n2 = PyArray_SIZE(ap2)/l;
    is1 = ap1->strides[ap1->nd-1]/ap1->descr->elsize;
    is2 = ap2->strides[matchDim]/ap2->descr->elsize;
    if(ap1->nd > 1)
      is1r = ap1->strides[ap1->nd-2];
    else
      is1r = ap1->strides[ap1->nd-1];
    is2r = ap2->strides[otherDim];
    op = ret->data;
    os = ret->descr->elsize;

    ip1 = ap1->data;
    for(i1=0; i1<n1; i1++) {
      ip2 = ap2->data;
      for(i2=0; i2<n2; i2++) {
        dot(ip1, is1, ip2, is2, op, l);
        ip2 += is2r;
        op += os;
      }
      ip1 += is1r;
    }
  }

  if (NA_checkAndReportFPErrors("dot") < 0)
	  goto fail;

  Py_DECREF(ap1);
  Py_DECREF(ap2);

  return PyArray_Return(ret);
	
 fail:
  Py_XDECREF(ap1);
  Py_XDECREF(ap2);
  Py_XDECREF(ret);
  return NULL;
}


static char doc_innerproduct[] = "innerproduct(a,b)\nReturns the inner product of a and b for arrays of floating point types.\nLike the generic Numeric equivalent the product sum is over\nthe last dimension of a and b.\nNB: The first argument is not conjugated.";

static PyObject *dotblas_innerproduct(PyObject *dummy, PyObject *args) {
  PyObject *op1, *op2;
  PyArrayObject *ap1, *ap2, *ret;
  int i, j, l, lda, ldb, ldc;
  int typenum;
  int dimensions[MAX_DIMS], nd;
  static const float oneF[2] = {1.0, 0.0};
  static const float zeroF[2] = {0.0, 0.0};
  static const double oneD[2] = {1.0, 0.0};
  static const double zeroD[2] = {0.0, 0.0};


  if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
	
  /* 
   * Inner product using the BLAS.  The product sum is taken along the last
   * dimensions of the two arrays.
   * Only works for float double and complex types.
   */


  typenum = PyArray_ObjectType(op1, 0);  
  typenum = PyArray_ObjectType(op2, typenum);

  ret = NULL;
  ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
  if (ap1 == NULL) return NULL;
  ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
  if (ap2 == NULL) goto fail;

  if (typenum != PyArray_FLOAT && typenum != PyArray_DOUBLE &&
      typenum != PyArray_CFLOAT && typenum != PyArray_CDOUBLE) {
    PyErr_SetString(PyExc_TypeError, "at least one argument must be (possibly complex) float or double");
    goto fail;
  }

  
  if (ap1->nd < 0 || ap2->nd < 0) {
    PyErr_SetString(PyExc_TypeError, "negative dimensioned arrays!");
    goto fail;
  }

  if (ap1->nd == 0 || ap2->nd == 0) {
    /* One of ap1 or ap2 is a scalar */
    if (ap1->nd == 0) {		/* Make ap2 the scalar */
      PyArrayObject *t = ap1;
      ap1 = ap2;
      ap2 = t;
    }
    for (l = 1, j = 0; j < ap1->nd; j++) {
      dimensions[j] = ap1->dimensions[j];
      l *= dimensions[j];
    }
    nd = ap1->nd;
  }
  else if (ap1->nd <= 2 && ap2->nd <= 2) {
    /*  Both ap1 and ap2 are vectors or matrices */
    l = ap1->dimensions[ap1->nd-1];
	
    if (ap2->dimensions[ap2->nd-1] != l) {
      PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
      goto fail;
    }
    nd = ap1->nd+ap2->nd-2;
 
    if (nd == 1) 
      dimensions[0] = (ap1->nd == 2) ? ap1->dimensions[0] : ap2->dimensions[0];
    else if (nd == 2) {
      dimensions[0] = ap1->dimensions[0];
      dimensions[1] = ap2->dimensions[0];
    }
  }
  else {
    /* At least one of ap1 or ap2 has dimension > 2. */
    l = ap1->dimensions[ap1->nd-1];
    
    if (ap2->dimensions[ap2->nd-1] != l) {
      PyErr_SetString(PyExc_ValueError, "matrices are not aligned");
      goto fail;
    }

    nd = ap1->nd+ap2->nd-2;
    j = 0;
    for(i=0; i<ap1->nd-1; i++) {
	dimensions[j++] = ap1->dimensions[i];
    }
    for(i=0; i<ap2->nd-1; i++) {
	dimensions[j++] = ap2->dimensions[i];
    }
  }

  ret = (PyArrayObject *)PyArray_FromDims(nd, dimensions, typenum);
  if (ret == NULL) goto fail;

  NA_clearFPErrors();

  if (ap2->nd == 0) {
    /* Multiplication by a scalar -- Level 1 BLAS */
    if (typenum == PyArray_DOUBLE) {
      cblas_daxpy(l, *((double *)ap2->data), (double *)ap1->data, 1,
		  (double *)ret->data, 1);
    } 
    else  if (typenum == PyArray_FLOAT) {
      cblas_saxpy(l, *((float *)ap2->data), (float *)ap1->data, 1,
		  (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zaxpy(l, (double *)ap2->data, (double *)ap1->data, 1,
		  (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_caxpy(l, (float *)ap2->data, (float *)ap1->data, 1,
		  (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 1 && ap2->nd == 1) {
    /* Dot product between two vectors -- Level 1 BLAS */
    if (typenum == PyArray_DOUBLE) {
      double result = cblas_ddot(l, (double *)ap1->data, 1, 
				 (double *)ap2->data, 1);
      *((double *)ret->data) = result;
    }
    else if (typenum == PyArray_FLOAT) {
      float result = cblas_sdot(l, (float *)ap1->data, 1, 
                                (float *)ap2->data, 1);
      *((float *)ret->data) = result;
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zdotu_sub(l, (double *)ap1->data, 1, 
		      (double *)ap2->data, 1, (double *)ret->data);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cdotu_sub(l, (float *)ap1->data, 1, 
		      (float *)ap2->data, 1, (float *)ret->data);
    }
  }
  else if (ap1->nd == 2 && ap2->nd == 1) {
    /* Matrix-vector multiplication -- Level 2 BLAS */
    lda = (ap1->dimensions[1] > 1 ? ap1->dimensions[1] : 1);
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  1.0, (double *)ap1->data, lda,
		  (double *)ap2->data, 1, 0.0, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  1.0, (float *)ap1->data, lda,
		  (float *)ap2->data, 1, 0.0, (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  oneD, (double *)ap1->data, lda,
		  (double *)ap2->data, 1, zeroD, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemv(CblasRowMajor, 
		  CblasNoTrans,  ap1->dimensions[0], ap1->dimensions[1], 
		  oneF, (float *)ap1->data, lda,
		  (float *)ap2->data, 1, zeroF, (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 1 && ap2->nd == 2) {
    /* Vector matrix multiplication -- Level 2 BLAS */
    lda = (ap2->dimensions[1] > 1 ? ap2->dimensions[1] : 1);
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemv(CblasRowMajor, 
		  CblasNoTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  1.0, (double *)ap2->data, lda,
		  (double *)ap1->data, 1, 0.0, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemv(CblasRowMajor, 
		  CblasNoTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  1.0, (float *)ap2->data, lda,
		  (float *)ap1->data, 1, 0.0, (float *)ret->data, 1);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemv(CblasRowMajor, 
		  CblasNoTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  oneD, (double *)ap2->data, lda,
		  (double *)ap1->data, 1, zeroD, (double *)ret->data, 1);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemv(CblasRowMajor, 
		  CblasNoTrans,  ap2->dimensions[0], ap2->dimensions[1], 
		  oneF, (float *)ap2->data, lda,
		  (float *)ap1->data, 1, zeroF, (float *)ret->data, 1);
    }
  }
  else if (ap1->nd == 2 && ap2->nd == 2) {
    /* Matrix matrix multiplication -- Level 3 BLAS */  
    lda = (ap1->dimensions[1] > 1 ? ap1->dimensions[1] : 1);
    ldb = (ap2->dimensions[1] > 1 ? ap2->dimensions[1] : 1);
    ldc = ap2->dimensions[0];
    if (typenum == PyArray_DOUBLE) {
      cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                 ap1->dimensions[0], ap2->dimensions[0], ap1->dimensions[1],
                 1.0, (double *)ap1->data, lda,
                 (double *)ap2->data, ldb,
                 0.0, (double *)ret->data, ldc);
    }
    else if (typenum == PyArray_FLOAT) {
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
		  ap1->dimensions[0], ap2->dimensions[0], ap1->dimensions[1],
		  1.0, (float *)ap1->data, lda,
		  (float *)ap2->data, ldb,
		  0.0, (float *)ret->data, ldc);
    }
    else if (typenum == PyArray_CDOUBLE) {
      cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
		  ap1->dimensions[0], ap2->dimensions[0], ap1->dimensions[1],
		  oneD, (double *)ap1->data, lda,
		  (double *)ap2->data, ldb,
		  zeroD, (double *)ret->data, ldc);
    }
    else if (typenum == PyArray_CFLOAT) {
      cblas_cgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
		  ap1->dimensions[0], ap2->dimensions[0], ap1->dimensions[1],
		  oneF, (float *)ap1->data, lda,
		  (float *)ap2->data, ldb,
		  zeroF, (float *)ret->data, ldc);
    }
  }
  else {
    /* Deal with arrays with dimension greater than two.  All we do is copy
     * the original multiarraymodule.c/PyArray_InnerProduct but use the BLAS
     * one dimensional functions instead of the macro versions in
     * multiarraymodule
     */

    int is1, is2, n1, n2, os, i1, i2, s1, s2;
    char *ip1, *ip2, *op;
    DotFunction *dot = dotFunctions[(int)(ret->descr->type_num)];
    if (dot == NULL) {
      PyErr_SetString(PyExc_ValueError, 
                      "dotblas inner product not available for this type");
      goto fail;
    }

    n1 = PyArray_SIZE(ap1)/l;
    n2 = PyArray_SIZE(ap2)/l;
    is1 = ap1->strides[ap1->nd-1];
    s1 = is1/ap1->descr->elsize;
    is2 = ap2->strides[ap2->nd-1];
    s2 = is2/ap2->descr->elsize;
    op = ret->data;
    os = ret->descr->elsize;
	
    ip1 = ap1->data;
    for(i1=0; i1<n1; i1++) {
      ip2 = ap2->data;
      for(i2=0; i2<n2; i2++) {
        dot(ip1, s1, ip2, s2, op, l);
        ip2 += is2*l;
        op += os;
      }
      ip1 += is1*l;
    }
  }

  if (NA_checkAndReportFPErrors("innerproduct") < 0)
	  goto fail;

  Py_DECREF(ap1);
  Py_DECREF(ap2);
  return PyArray_Return(ret);
	
 fail:
  Py_XDECREF(ap1);
  Py_XDECREF(ap2);
  Py_XDECREF(ret);
  return NULL;
}


static char doc_vdot[] = "vdot(a,b)\nReturns the dot product of a and b for scalars and vectors\nof floating point and complex types.  The first argument, a, is conjugated.";


static PyObject *dotblas_vdot(PyObject *dummy, PyObject *args) {
  PyObject *op1, *op2;
  PyArrayObject *ap1, *ap2, *ret;
  int l;
  int typenum;
  int dimensions[MAX_DIMS];

  if (!PyArg_ParseTuple(args, "OO", &op1, &op2)) return NULL;
	
  /* 
   * Conjugating dot product using the BLAS for vectors.
   * Multiplies op1 and op2, each of which must be vector.
   */

  typenum = PyArray_ObjectType(op1, 0);  
  typenum = PyArray_ObjectType(op2, typenum);


  ret = NULL;
  ap1 = (PyArrayObject *)PyArray_ContiguousFromObject(op1, typenum, 0, 0);
  if (ap1 == NULL) return NULL;
  ap2 = (PyArrayObject *)PyArray_ContiguousFromObject(op2, typenum, 0, 0);
  if (ap2 == NULL) goto fail;

  if (typenum != PyArray_FLOAT && typenum != PyArray_DOUBLE &&
      typenum != PyArray_CFLOAT && typenum != PyArray_CDOUBLE) {
    PyErr_SetString(PyExc_TypeError, "at least one argument must be (possibly complex) float or double");
    goto fail;
  }

  if (ap1->nd != 1 || ap2->nd != 1) {
    PyErr_SetString(PyExc_TypeError, "arguments must be vectors");
    goto fail;
  }

  if (ap2->dimensions[0] != ap1->dimensions[ap1->nd-1]) {
    PyErr_SetString(PyExc_ValueError, "vectors have different lengths");
    goto fail;
  }
  l = ap1->dimensions[ap1->nd-1];
  
  ret = (PyArrayObject *)PyArray_FromDims(0, dimensions, typenum);
  if (ret == NULL) goto fail;

  NA_clearFPErrors();

  /* Dot product between two vectors -- Level 1 BLAS */
  if (typenum == PyArray_DOUBLE) {
    *((double *)ret->data) = cblas_ddot(l, (double *)ap1->data, 1, 
                                        (double *)ap2->data, 1);
  }
  else if (typenum == PyArray_FLOAT) {
    *((float *)ret->data) = cblas_sdot(l, (float *)ap1->data, 1, 
                                       (float *)ap2->data, 1);
  }
  else if (typenum == PyArray_CDOUBLE) {
    cblas_zdotc_sub(l, (double *)ap1->data, 1, 
                    (double *)ap2->data, 1, (double *)ret->data);
  }
  else if (typenum == PyArray_CFLOAT) {
    cblas_cdotc_sub(l, (float *)ap1->data, 1, 
                    (float *)ap2->data, 1, (float *)ret->data);
  }

  if (NA_checkAndReportFPErrors("vdot") < 0)
	  goto fail;

  Py_DECREF(ap1);
  Py_DECREF(ap2);
  return PyArray_Return(ret);
	
 fail:
  Py_XDECREF(ap1);
  Py_XDECREF(ap2);
  Py_XDECREF(ret);
  return NULL;
}



static struct PyMethodDef dotblas_module_methods[] = {
  {"dot",  (PyCFunction)dotblas_dot, 1, doc_dot},
  {"innerproduct",   (PyCFunction)dotblas_innerproduct,  1, doc_innerproduct},
  {"vdot", (PyCFunction)dotblas_vdot, 1, doc_vdot},
    {NULL,		NULL, 0}		/* sentinel */
};

/* Initialization function for the module */
DL_EXPORT(void) init_dotblas(void) {
  int i;
    PyObject *m;
	
    /* Create the module and add the functions */
    m = Py_InitModule3("_dotblas", dotblas_module_methods, module_doc);

    /* Import the array object */
    import_array();
    import_libnumarray();

    ADD_VERSION(m);

    /* Initialise the array of dot functions */
    for (i = 0; i < PyArray_NTYPES; i++)
      dotFunctions[i] = NULL;
    dotFunctions[PyArray_FLOAT] = FLOAT_dot;
    dotFunctions[PyArray_DOUBLE] = DOUBLE_dot;
    dotFunctions[PyArray_CFLOAT] = CFLOAT_dot;
    dotFunctions[PyArray_CDOUBLE] = CDOUBLE_dot;

    
    /* Check for errors */
    if (PyErr_Occurred())
	Py_FatalError("can't initialize module _dotblas");
}


syntax highlighted by Code2HTML, v. 0.9.1