/*********************************************************/
/* TAUCS                                                 */
/* Author: Sivan Toledo                                  */
/*                                                       */
/*********************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <assert.h>

#include "taucs.h"

#define HEADER_NROWS   0
#define HEADER_NCOLS   1
#define HEADER_FLAGS   2
#define HEADER_COLPERM 3
#define HEADER_IPIVOTS 4
#define HEADER_LCLEN   5
#define HEADER_UCLEN   6
#define HEADER 7

#ifndef TAUCS_CORE_GENERAL

#ifdef OSTYPE_win32
#include <io.h>
#else
#include <unistd.h>
#include <sys/uio.h>
#endif

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

#define iabs(x) ((x) > 0 ? (x) : (-(x)))

static double remaining_memory;

/*********************************************************/
/* NEW IO ROUTINES                                       */
/*********************************************************/

/*
  Layout of data structures in the file:
  m       (number of rows)
  n       (number of cols)
  flags
  colperm
  ipivots
  Lclen      (nonzeros per column in L)
  Lclen      (nonzeros per column in U)
  Lrowind[0]
  Lvalues[0]
  Urowind[0]
  Uvalues[0]
  Lrowind[1]
  Lvalues[1]
  Urowind[1]
  Uvalues[1]
  ...
*/
  
static
int Lappendcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  taucs_io_append(LU,
		  0 + HEADER+j*4,
		  nnz,1,
		  TAUCS_INT,
		  ind);
  taucs_io_append(LU,
		  1 + HEADER+j*4,
		  nnz,1,
		  TAUCS_CORE_DATATYPE,
		  re);
  return 0;
}

static
int Uappendcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  taucs_io_append(LU,
		  2 + HEADER+j*4,
		  nnz,1,
		  TAUCS_INT,
		  ind);
  taucs_io_append(LU,
		  3 + HEADER+j*4,
		  nnz,1,
		  TAUCS_CORE_DATATYPE,
		  re);
  return 0;
}

static
int Lreadcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  taucs_io_read(LU,
		0 + HEADER+j*4,
		nnz,1,
		TAUCS_INT,
		ind);
  taucs_io_read(LU,
		1 + HEADER+j*4,
		nnz,1,
		TAUCS_CORE_DATATYPE,
		re);
  return 0;
}

static
int Ureadcol(taucs_io_handle* LU, int j, int nnz, int* ind, taucs_datatype* re)
{
  taucs_io_read(LU,
		2 + HEADER+j*4,
		nnz,1,
		TAUCS_INT,
		ind);
  taucs_io_read(LU,
		3 + HEADER+j*4,
		nnz,1,
		TAUCS_CORE_DATATYPE,
		re);
  return 0;
}

/*********************************************************/
/* ANALYSIS PHASE                                        */
/*********************************************************/

/* BUFFER SIZES AND MEMORY MANAGEMENT */

static int get_iobufsize()
{
  return 65536; /* minimal size of I/O buffers for good performance */
}

/* COLUMN-HEAP OPERATIONS */

/* The last entry in each input run must be extracted last among
   all the entries from that run, otherwise the heap might
   overflow */

static int heap_compare(int* heap, int i, int j) 
{
  /* compare columns */

  if (heap[3*i+1] < heap[3*j+1]) return -1;
  if (heap[3*i+1] > heap[3*j+1]) return  1;

  /* same column, compare runs (and order within runs) */

  if (heap[3*i+2] < heap[3*j+2]) return -1;
  if (heap[3*i+2] > heap[3*j+2]) return  1;

  return 0;
}

static void heap_heapify(int* heap, int* heapsize,int p) 
{
  int r,l,smallest;
  int i,j,k;

  l = p * 2;
  r = l + 1;

  if (l != 2*p || r != 2*p+1)
    taucs_printf("heap: left-right computation failed\n");

  if (l < *heapsize && heap_compare(heap,l,p) < 0)
    smallest = l;
  else
    smallest = p;

  if (r < *heapsize && heap_compare(heap,r,smallest) < 0)
    smallest = r;
  
  if (smallest != p) {
    i = heap[3*p]; 
    j = heap[3*p+1]; 
    k = heap[3*p+2]; 

    heap[3*p]   = heap[3*smallest];
    heap[3*p+1] = heap[3*smallest+1];
    heap[3*p+2] = heap[3*smallest+2];

    heap[3*smallest]   = i;
    heap[3*smallest+1] = j;
    heap[3*smallest+2] = k;
    
    heap_heapify(heap,heapsize,smallest);
  }
}

#if 0
static void x_heap_insert(int* heap, int* heapsize, int i, int j, int k)
{
  int l = *heapsize;

  heap[3*l]   = i;
  heap[3*l+1] = j;
  heap[3*l+2] = k;

  (*heapsize)++;
}  

static void x_heap_extract_min(int* heap, int* heapsize, int* i, int* j, int* k) 
{
  int l,m,mv;

  if (*heapsize <= 0)
    taucs_printf("heap: Trying to extract from an expty heap\n");

  mv = INT_MAX;
  for (l=0; l<(*heapsize); l++) {
    if (heap[3*l+1] < mv) {
      m = l;
      mv = heap[3*l+1];
    }
  }
  
  *i = heap[3*m];
  *j = heap[3*m+1];
  *k = heap[3*m+2];

  (*heapsize)--;
  for (l=m; l<(*heapsize); l++) {
    heap[3*l]   = heap[3*(l+1)];
    heap[3*l+1] = heap[3*(l+1) + 1];
    heap[3*l+2] = heap[3*(l+1) + 2];
  }
}
#endif

static void heap_insert(int* heap, int* heapsize, int i, int j, int k)
{
  int child, parent;

  (*heapsize)++;

  child = (*heapsize-1);
  parent = child / 2;
  while (child > 0 && 
	 ((heap[3*parent+1] > j) || 
	  ((heap[3*parent+1] == j) && (heap[3*parent+2] > k)))) {
    heap[3*child]   = heap[3*parent];
    heap[3*child+1] = heap[3*parent+1];
    heap[3*child+2] = heap[3*parent+2];
    child = parent;
    parent = child / 2;
  }

  heap[3*child]   = i;
  heap[3*child+1] = j;
  heap[3*child+2] = k;
}

static void heap_extract_min(int* heap, int* heapsize, int* i, int* j, int* k) 
{
  if (*heapsize <= 0)
    taucs_printf("heap: Trying to extract from an expty heap\n");
  
  *i = heap[0];
  *j = heap[1];
  *k = heap[2];

  heap[0] = heap[3 * ((*heapsize)-1)];
  heap[1] = heap[3 * ((*heapsize)-1) + 1];
  heap[2] = heap[3 * ((*heapsize)-1) + 2];

  (*heapsize) --;

  heap_heapify(heap,heapsize,0);
}

/* SYMMETRIC SKELETON GRAPH OPERATIONS */

static char skel_basename[256];
static int* skel_buffer;
static int  skel_buffer_size;
static int  skel_buffer_ptr;
static int  skel_outfiles;
static int  skel_infiles;
static int  skel_outfile;

static char skel_inphase;
static char skel_outphase;

static int skel_get_lastcol;
static int skel_next;

static int skel_compare(const void* e1, const void* e2) 
{
  /* we sort according to columns */
  int j1, j2;
  const int* jp1;
  const int* jp2;

  jp1 = (const int*) e1;
  jp2 = (const int*) e2;

  j1 = jp1[1];
  j2 = jp2[1];
  if (j1 < j2) return -1;
  if (j1 > j2) return 1;

  /* same column, compare rows */

  j1 = jp1[0];
  j2 = jp2[0];
  if (j1 < j2) return -1;
  if (j1 > j2) return 1;
  
  return 0;
}

static void skel_init(char* basename)
{
  sprintf(skel_basename,"%s.ssort",basename);

  /* adjust remaining memory.
     We allocate 2 io buffers for the sort phase, plus an array
     of file pointers that is guarantreed to be smaller.
     We then free them and allocate another io buffer for the
     stack phase, but we do not free the skel buffer first,
     so 3 io buffers is a conservative estimate. */

  remaining_memory -= (double) (3*get_iobufsize());

  skel_buffer_size = (int)(remaining_memory) / (2*sizeof(int));

  skel_buffer      = (int*)taucs_malloc(skel_buffer_size * 2 * sizeof(int));
  skel_buffer_ptr  = 0;
  skel_outfiles    = 0;

  skel_outphase = 'e';
  skel_inphase  = 'o';

  skel_get_lastcol = -1;
  skel_next = 0;
  skel_outfile = -1;
}

static void skel_finalize()
{
  taucs_free(skel_buffer);

  remaining_memory += (double) (3*get_iobufsize());
}

static void skel_add(int i,int j) 
{
  if (skel_buffer_ptr < skel_buffer_size) {
    skel_buffer[2*skel_buffer_ptr]   = i;
    skel_buffer[2*skel_buffer_ptr+1] = j;
    skel_buffer_ptr++;
  } else {
    int     file;
    ssize_t io_size;
    char    fname[256];

    /* SORT THIS BUFFER */

    qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int), &skel_compare);


    /* WRITE OUT */
    sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,skel_outfiles);
    taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (3)\n",fname);
    file = open(fname,O_WRONLY | O_CREAT,0644);
    if (file == -1)
      taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
    io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
    if (io_size != skel_buffer_ptr * 2 * sizeof(int))
      taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
    close(file);

    skel_outfiles++;
    skel_buffer_ptr = 0;

    taucs_printf("oocsp_colanalyze: done (3)\n",fname);
  }
}

static void skel_sort_incore(int* postorder, int ncols, int* inv_postorder)
{
  int natural,i,col;

  natural = 1;
  for (i=0; i<ncols; i++) {
    if (postorder[i] != i) {
      natural = 0;
      break;
    }
  }

  if (natural == 0) {
    for (i=0; i<ncols; i++)
      inv_postorder[postorder[i]] = i;

    for (i=0; i<skel_buffer_ptr; i++) {
      col = skel_buffer[2*i+1];
      skel_buffer[ 2*i+1 ] = inv_postorder[col];
    }
  }

  qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int),
	&skel_compare);
}

static void skel_sort_outofcore(int* postorder, int ncols, int* inv_postorder)
{
  int  natural,i,j,k,e,f;
  int  heapsize, iobufsize, openruns, maxopenruns, runstart;
  int* infiles;
  int  outfile;

  int *inbuf;
  int *outbuf;
  int outbuf_ptr;

  int     file;
  ssize_t io_size;
  ssize_t io_count;
  char    fname[256];

  int last_col, last_extracted;

  /* FIRST, WRITE OUT THIS BUFFER */
  if (skel_buffer_ptr > 0) {
    /* SORT THIS BUFFER */

    qsort(skel_buffer, skel_buffer_ptr, 2*sizeof(int), &skel_compare);

    /* WRITE OUT */
    sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,skel_outfiles);
    taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (1)\n",fname);
    file = open(fname,O_WRONLY | O_CREAT,0644);
    if (file == -1)
      taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
    io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
    if (io_size != skel_buffer_ptr * 2 * sizeof(int))
      taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
    close(file);

    skel_outfiles++;
    skel_buffer_ptr = 0;
  }

  /* DO WE NEED TO SORT THE RUNS AGAIN IN POSTORDER? */
  natural = 1;
  for (i=0; i<ncols; i++) {
    if (postorder[i] != i) {
      natural = 0;
      break;
    }
  }

  if (natural == 0) {
    for (i=0; i<ncols; i++)
      inv_postorder[postorder[i]] = i;

    for (f=0; f<skel_outfiles; f++) {
      sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,f);
      taucs_printf("oocsp_colanalyze: Resorting skel sort file <%s>\n",fname);
      file = open(fname,O_RDWR);
      if (file == -1)
	taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
      /* read the run */
      io_size = read(file,skel_buffer,skel_buffer_size*2*sizeof(int));
      if (io_size == -1)
	taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
      io_count = io_size/(2*sizeof(int));
      close(file);

      /* sort again */

      for (i=0; i<(int)io_count; i++) {
	j = skel_buffer[2*i+1];
	skel_buffer[ 2*i+1 ] = inv_postorder[j];
      }

      qsort(skel_buffer, io_count, 2*sizeof(int),
	    &skel_compare);

      /* rewind the file and write back */
      file = open(fname,O_RDWR);
      if (file == -1)
	taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
      /*      lseek(file,0,SEEK_SET);*/
      io_size = write(file,skel_buffer,io_count * 2 * sizeof(int));
      if (io_size != io_count * 2 * sizeof(int))
	taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
      close(file);
    }
  }

  /* WE NOW USE THE BUFFER AS A HEAP */
  
  heapsize = 0;

  /* in blocks of 2*sizeof(int), not bytes! */
  iobufsize   = get_iobufsize() / (2*sizeof(int)); 

  /* 
     each element in the skel_buffer is 2 ints, but 
     each element in the heap is 3 ints 
  */
  maxopenruns = (2*skel_buffer_size) / (3*iobufsize);
  if (maxopenruns < 2) {
    maxopenruns = 2;
    iobufsize   = (2*skel_buffer_size) / (3 * 2);
  }
  taucs_printf("oocsp_colanalyze: Using io buffers of %d elements (%d bytes), max runs = %d\n",
	     iobufsize,iobufsize*2,maxopenruns);

  inbuf  = (int*)taucs_malloc(iobufsize*2*sizeof(int));
  outbuf = (int*)taucs_malloc(iobufsize*2*sizeof(int));
  infiles  = (int*)taucs_malloc(skel_outfiles*sizeof(int));
  /*
  inbuf  = taucs_calloc(iobufsize,2*sizeof(int));
  outbuf = taucs_calloc(iobufsize,2*sizeof(int));
  infiles  = taucs_calloc(skel_outfiles,sizeof(int));
  */
  while (skel_outfiles > 1) {
    char phase;
    /*    int  i,j,k,runstart,openruns;*/

    taucs_printf("oocsp_colanalyze: Starting another merge phase with %d input runs\n",
	       skel_outfiles);

    skel_infiles  = skel_outfiles;
    skel_outfiles = 0;

    phase         = skel_inphase;
    skel_inphase  = skel_outphase;
    skel_outphase = phase;

    for (runstart=0; runstart<skel_infiles; runstart += maxopenruns) {

      sprintf(fname,"%s.%c.%d",
	      skel_basename,skel_outphase,runstart/maxopenruns);
      taucs_printf("oocsp_colanalyze: Opening output run <%s>\n",fname);
      outfile = open(fname,O_WRONLY | O_CREAT,0644);
      if (outfile == -1)
	taucs_printf("oocsp_colanalyze: could not open skel sort output file\n");
      skel_outfiles++;

      for (openruns=0; 
	   openruns < maxopenruns && runstart+openruns < skel_infiles;
	   openruns++) {
	
	sprintf(fname,"%s.%c.%d",skel_basename,skel_inphase,runstart+openruns);
	infiles[openruns] = open(fname,O_RDONLY);
	taucs_printf("oocsp_colanalyze: Opening input run <%s> (%d)\n",fname,infiles[openruns]);
	if (infiles[openruns] == -1)
	  taucs_printf("oocsp_colanalyze: could not open skel sort input file\n");
	io_size = read(infiles[openruns],inbuf,iobufsize*2*sizeof(int));
	if (io_size == -1)
	  taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
	io_count = io_size/(2*sizeof(int));
	if (io_count == 0) {
	  close(infiles[openruns]);
	  unlink(fname);
	}

	taucs_printf("oocsp_colanalyze: files %d %d\n",infiles[0],infiles[1]);

	taucs_printf("oocsp_colanalyze: Inserting %d elements from input run into heap\n",io_count);
	last_col = -1;
	for (e=0; e<(int)io_count; e++) {
	  i = inbuf[2*e];
	  j = inbuf[2*e+1];
	  /*printf("ij = %d\t%d\n",i,j);*/
	  if (last_col > j) {
	    taucs_printf("oocsp_colanalyze: > last = %d col = %d, (%d %d)\n",last_col,j,
		      e,io_count);
	    taucs_printf("oocsp_colanalyze: input run not sorted!\n");
	  }
	  last_col = j;
	  if (e == (int)io_count-1) /* end of inbuf marker */
	    heap_insert(skel_buffer,&heapsize,i,j,2*openruns+1);
	  else
	    heap_insert(skel_buffer,&heapsize,i,j,2*openruns);
	}
      } 

      taucs_printf("oocsp_colanalyze: files %d %d\n",infiles[0],infiles[1]);

      taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);

      outbuf_ptr = 0;

      last_extracted = -1;
      while (heapsize > 0) {
	int end_of_run,run;

	heap_extract_min(skel_buffer,&heapsize,&i,&j,&k);
	if (last_extracted > j)
	  taucs_printf("oocsp_colanalyze: heap order error!\n");
	last_extracted = j;
	outbuf[2*outbuf_ptr]   = i;
	outbuf[2*outbuf_ptr+1] = j;
	if (k % 2 == 1) end_of_run = 1;
	else end_of_run = 0;
	run = k / 2;
	outbuf_ptr ++;
	/*
	printf("extracted (%d,%d,%d) heapsize = %d\n",i,j,k,heapsize);
	*/


	if (end_of_run) {
	  taucs_printf("oocsp_colanalyze: Reading more from run %d\n",run);
	  io_size = read(infiles[run],inbuf,iobufsize * 2 * sizeof(int));
	  if (io_size == -1) {
	    taucs_printf("oocsp_colanalyze: errno = %d (%d)\n",errno,infiles[run]);
	    taucs_printf("oocsp_colanalyze: read from skel sort file failed\n");
	  }
	  io_count = io_size/(2*sizeof(int));
	  if (io_count == 0) {
	    sprintf(fname,"%s.%c.%d",skel_basename,skel_inphase,runstart+run);
	    taucs_printf("oocsp_colanalyze: Closing input run %d <%s>\n",run,fname);
	    close(infiles[run]);
	    unlink(fname);
	  }

	  taucs_printf("oocsp_colanalyze: Extracted %d,%d,%d\n",i,j,k);
	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	  /*
	  { 
	    int ii; 
	    printf("heap: \n");
	    for (ii=0; ii<heapsize; ii++) printf("(%d,%d,%d) ",
						    skel_buffer[3*ii+0],
						    skel_buffer[3*ii+1],
						    skel_buffer[3*ii+2]);
	    printf("\n");
	  }
	  */
	  taucs_printf("oocsp_colanalyze: Inserting %d elements from input run into heap\n",io_count);

	  last_col = -1;
	  for (e=0; e<(int)io_count; e++) {
	    i = inbuf[2*e];
	    j = inbuf[2*e+1];

	    if (last_col > j) 
	      taucs_printf("oocsp_colanalyze: input run not sorted!\n");
	    last_col = j;

	    if (e == (int)io_count-1) /* end of inbuf marker */
	      heap_insert(skel_buffer,&heapsize,i,j,2*run+1);
	    else
	      heap_insert(skel_buffer,&heapsize,i,j,2*run);

	    if (3*heapsize >= 2*skel_buffer_size) {
	      taucs_printf("oocsp_colanalyze: heapsize = %d, buffer_size = %d\n",
			heapsize,2*skel_buffer_size);
	      taucs_printf("oocsp_colanalyze: merge-heap overflow\n");
	    }
	  }

	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	}

	if (outbuf_ptr >= iobufsize) {
	  taucs_printf("oocsp_colanalyze: Writing to output run\n");
	  taucs_printf("oocsp_colanalyze: heapsize = %d\n",heapsize);
	  io_count = iobufsize;
	  io_size = write(outfile,outbuf,io_count * 2 * sizeof(int));
	  if (io_size != io_count * 2 * sizeof(int))
	    taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	  outbuf_ptr = 0;
	}
      }

      /* write the rest of the output */
      if (outbuf_ptr > 0) {
	taucs_printf("oocsp_colanalyze: Writing to output run and closing\n");
	io_count = outbuf_ptr;
	io_size = write(outfile,outbuf,io_count * 2 * sizeof(int));
	if (io_size != io_count * 2 * sizeof(int))
	  taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
	outbuf_ptr = 0;
      }
      close(outfile);
      skel_buffer_ptr = 0;
    }
  }


  taucs_free(infiles);
  taucs_free(inbuf);
  taucs_free(outbuf);

  /*
  taucs_free(infiles);
  taucs_free(inbuf);
  taucs_free(outbuf);
  */
}

static int  stack_allocated;
static int  stack_buffer_size;
static int* stack_buffer;

static void skel_sort(int* postorder, int ncols,int* tmp)
{
  int  file;
  char fname[256];
  ssize_t io_size;
  int iobufsize;

  iobufsize   = get_iobufsize() / (2*sizeof(int)); 

  if (skel_outfiles == 0) {
    skel_sort_incore(postorder,ncols,tmp);

    if (skel_buffer_ptr <= skel_buffer_size / 2) {
      stack_buffer      = skel_buffer + (2*skel_buffer_ptr);
      stack_buffer_size = 2*(skel_buffer_size - skel_buffer_ptr);
      stack_allocated   = 0;
      taucs_printf("oocsp_colanalyze: Using remainder of skeleton buffer for stack,\n");
      taucs_printf("oocsp_colanalyze: size = %d ints\n",stack_buffer_size);
    } else {
      sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
      taucs_printf("oocsp_colanalyze: Writing out skel sort buffer <%s> (2)\n",fname);
      file = open(fname,O_WRONLY | O_CREAT,0644);
      if (file == -1)
	taucs_printf("oocsp_colanalyze: could not create skel sort file\n");
      io_size = write(file,skel_buffer,skel_buffer_ptr * 2 * sizeof(int));
      if (io_size != skel_buffer_ptr * 2 * sizeof(int))
	taucs_printf("oocsp_colanalyze: write to skel sort file failed\n");
      close(file);

      skel_outfiles++;
      skel_buffer_ptr = 0;

      stack_buffer      = skel_buffer;
      stack_buffer_size = 2*skel_buffer_size;
      stack_allocated   = 1; /* we need to free it */

      skel_buffer       = (int*)taucs_malloc(iobufsize*2*sizeof(int));
      skel_buffer_size  = iobufsize;
      taucs_printf("oocsp_colanalyze: Using skeleton buffer for stack, allocating \n");
      taucs_printf("oocsp_colanalyze: new skeleton buffer\n\n");
    }
  }
  else {
    skel_sort_outofcore(postorder,ncols,tmp);

    stack_buffer      = skel_buffer;
    stack_buffer_size = 2*skel_buffer_size;
    skel_buffer       = (int*)taucs_malloc(iobufsize*2*sizeof(int));
    skel_buffer_size  = iobufsize;
    taucs_printf("oocsp_colanalyze: Using skeleton buffer for stack, allocating \n");
    taucs_printf("oocsp_colanalyze: new skeleton buffer\n");
  }
}

static int skel_get_next(int j)
{
  int row, col;
  char fname[256];
  ssize_t io_size;

  if (skel_next >= skel_buffer_ptr) {
    if (skel_outfiles > 0) {
      if (skel_outfile == -1) {
	sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
	taucs_printf("oocsp_colanalyze: Opening skel sort buffer <%s> (2)\n",fname);
	skel_outfile = open(fname,O_RDONLY);
	if (skel_outfile == -1)
	  taucs_printf("oocsp_colanalyze: could not open skel sort file\n");
      }
      io_size = read(skel_outfile,
		     skel_buffer,skel_buffer_size * 2 * sizeof(int));
      if (io_size == -1) 
	taucs_printf("oocsp_colanalyze: I/O error while trying to read skel sort file\n");
      if (io_size == 0) { /* end of file */
	taucs_printf("oocsp_colanalyze: Closing and removing skel file, col=%d\n",j);
	sprintf(fname,"%s.%c.%d",skel_basename,skel_outphase,0);
	close(skel_outfile);
	unlink(fname);
	skel_outfiles   = 0;
	skel_next       = 0;
	skel_buffer_ptr = 0;
	return -1;
      } else {
	skel_next       = 0;
	skel_buffer_ptr = io_size / (2*sizeof(int));
	taucs_printf("oocsp_colanalyze: read %d elements from skel sort file\n",skel_buffer_ptr);
      }
    } else
      return -1;
  }

  if (skel_next >= skel_buffer_ptr)
    return -1;

  row = skel_buffer[2*skel_next];
  col = skel_buffer[2*skel_next+1];
  if (col == j) {
    skel_next++;
    return row;
  }
  else
    return -1;
}
    

static void skel_get_postordercol(int* found, int flag,
				  int j,
				  int* nnz, int* rowind)
{
  int row;

  *nnz = 0;
  while ((row = skel_get_next(j)) != -1) {
    if (found[row] < flag) {
      found[row] = flag;
      rowind[ *nnz ] = row;
      (*nnz)++;
    }
  }
}


/*
static void skel_get_postordercol(int* found, int flag,
				  int j,
				  int* nnz, int* rowind)
{
  int row;
  int i;
  int next;

  next = skel_get_next;
  while (skel_buffer[2*next + 1] < j && next < skel_buffer_ptr) {
    next++;
  }

  if (skel_buffer[2*next + 1] == j && next < skel_buffer_ptr) {
    if (next != skel_get_lastcol+1)
      taucs_printf("oocsp_colanalyze: internal error in get_postordercol\n");
  }

  *nnz = 0;
  for (i = next;
       skel_buffer[2*i + 1] == j && i < skel_buffer_ptr;
       i++) {
    row = skel_buffer[2*i];
    if (found[row] < flag) {
      found[row] = flag;
      rowind[ *nnz ] = row;
      (*nnz)++;
    }
  }

  skel_get_next = i;
  skel_get_lastcol = i - 1;
}
*/

/* UNION FIND ROUTINES */

static int uf_makeset(int* uf, int i)        { uf[i] = i; return i; }
static int uf_union  (int* uf, int s, int t) { uf[s] = t; return t; }
static int uf_find   (int* uf, int i)        { if (uf[i] != i) 
                                                 uf[i] = uf_find(uf,uf[i]); 
                                               return uf[i]; }

/* FILL STACK ROUTINES */

static int  stack_files;
static char stack_basename[256];

/*
static int  stack_buffer_size;
static int* stack_buffer;
*/
static int  stack_buffer_ptr;

static int  stack_top;

static double stack_size;
static double stack_max_size;

static void stack_init(char* basename,
		       int* colptr, int* colstack, 
		       int ncols)
{
  int j;

  sprintf(stack_basename,"%s.fstack",basename);
  stack_files      = 0;
  stack_buffer_ptr = 0;
  stack_top        = -1;

  for (j=0; j<ncols; j++) colptr[j] = -1;

  /*
  stack_buffer_size = 1048576;
  stack_buffer = mxCalloc(stack_buffer_size,sizeof(int));
  */

  stack_size     = 0.0;
  stack_max_size = 0.0;
}

static void stack_finalize()
{
  if (stack_files != 0 || stack_buffer_ptr != 0)
    taucs_printf("oocsp_colanalyze: fill stack did not get empty\n");

  if (stack_allocated)
    taucs_free(stack_buffer);

  taucs_printf("oocsp_colanalyze: max stack size = %.0lf\n",stack_max_size);
}

static void stack_push(int* colptr, int* colstack, int i, int j)
{
  if (stack_top < 0 || colstack[stack_top] != j) {
    if (colptr[j] != -1) {
      taucs_printf("oocsp_colanalyze: fill stack internal error (push)\n");
    }
    stack_top++;
    colstack[stack_top] = j;
    colptr[j] = (stack_buffer_size*stack_files) + stack_buffer_ptr;
    if (colptr[j] > (INT_MAX/2))
      taucs_printf("oocsp_colanalyze: Warning! Pointers to fill stack may overflow\n");
  }
   
  stack_buffer[stack_buffer_ptr] = i;
  stack_buffer_ptr++;

  if (stack_buffer_ptr >= stack_buffer_size) {
    int     file;
    ssize_t io_size;
    char    fname[256];

    sprintf(fname,"%s.%d",stack_basename,stack_files);
    taucs_printf("oocsp_colanalyze: Writing out fill stack buffer <%s>\n",fname);
    file = open(fname,O_WRONLY | O_CREAT,0644);
    if (file == -1)
      taucs_printf("oocsp_colanalyze: could not create stack file\n");
    io_size = write(file,stack_buffer,stack_buffer_size * sizeof(int));
    if (io_size != stack_buffer_size * sizeof(int))
      taucs_printf("oocsp_colanalyze: write to stack file failed\n");
    close(file);

    stack_files++;
    stack_buffer_ptr = 0;
  }

  /*
  stack_size++;
  if (stack_size > stack_max_size) stack_max_size = stack_size;
  */
}

static void stack_pop(int* colptr, int* colstack, 
		      int* found, int flag,
		      int j, int* nnz, int* rowind)
{
  int row;
  int   i;

  if (stack_top < 0 || colstack[stack_top] != j) { /* empty fill column */
    for (i=0; i<=stack_top; i++)
      if (colstack[i] == j)
	taucs_printf("oocsp_colanalyze: fill stack internal error (pop)\n");
    
    *nnz    = 0;
  } else {
    *nnz = 0;
    while (colptr[j] < (stack_buffer_size*stack_files) +stack_buffer_ptr) {

      if (stack_buffer_ptr == 0) {
	int     file;
	ssize_t io_size;
	char    fname[256];
	
	stack_files--;
	stack_buffer_ptr = stack_buffer_size;
	sprintf(fname,"%s.%d",stack_basename,stack_files);
	taucs_printf("oocsp_colanalyze: Reading a fill stack buffer <%s>\n",fname);
	file = open(fname,O_RDONLY);
	if (file == -1)
	  taucs_printf("oocsp_colanalyze: could not open stack file\n");
	io_size = read(file,stack_buffer,stack_buffer_size * sizeof(int));
	if (io_size != stack_buffer_size * sizeof(int))
	  taucs_printf("oocsp_colanalyze: read from stack file failed\n");
	close(file);
	unlink(fname);
      }

      stack_buffer_ptr --;
      row = stack_buffer[ stack_buffer_ptr ];

      if (found[row] < flag) {
	found[row] = flag;
	rowind[ *nnz ] = row;
	(*nnz)++;
      }
    }

    stack_top--;
    colptr[j] = -1;
  }
  /*
  stack_size -= (double) (*nnz);
  */
}

/* MAIN ROUTINE */

static
void oocsp_colanalyze(taucs_ccs_matrix* matrix,
		      char* basename,
		      int*  colperm,
		      int** ptrparent,
		      int** ptrpostorder,
		      int** ptrlcolcount,
		      int** ptrucolcount)
{
  int i,j,ip,p,jp;
  int nnz,cset,rset,rroot,fcol;
  
  int  postnum, depth;
  int* first_kid;
  int* next_kid;
  int* stack_vertex;
  int* stack_child;

  /*int* colptr;*/
  int* rowind;
  int* firstcol;
  int* uf;
  int* found;
  int* root;
  int* stack_colptr;
  int* stack_colstk;
  int* tmp_col;
  
  int* parent;
  int* postorder;
  int* lcolcount;
  int* ucolcount;  
  
  int*    nrows;
  int*    ncols;

  taucs_printf("oocsp_colanalyze: In colanalyze\n");
  taucs_printf("oocsp_colanalyze: using %.0lf MBytes of memory\n",(remaining_memory)/1048576.0);
  
  nrows     = &matrix->m;
  ncols     = &matrix->n;

  /* START THE ANALYSYS */

  skel_init(basename);

  (remaining_memory) -= (double) ( 4 * (*ncols) * 4 /* sizeof(int32) */);

  /* +1 for stack_vertex, stack_child */
  parent          = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrparent = parent;
  lcolcount       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrlcolcount = lcolcount;
  ucolcount       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrucolcount = ucolcount;
  postorder       = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  *ptrpostorder = postorder;

  (remaining_memory) -= (double) ( 2 * ((*ncols)+1) * sizeof(int));
  (remaining_memory) -= (double) ( 2 * (*nrows) * sizeof(int));

  uf        = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  root      = (int*)taucs_malloc((*ncols+1)*sizeof(int));
  firstcol  = (int*)taucs_malloc((*nrows)*sizeof(int));

  tmp_col   = (int*)taucs_malloc((*nrows)*sizeof(int));

  /* we can reuse the same space */
  first_kid = uf;
  next_kid  = root;

  found        = firstcol;
  stack_colptr = uf;
  stack_colstk = root;

  /* we use the output arrays before they are used. */
  stack_vertex = lcolcount; 
  stack_child  = ucolcount;

  for (i=0; i < (*nrows); i++) {
    firstcol[i] = (*ncols);
  }

  for (j=0; j < (*ncols); j++) {

#ifdef OLD
    {
      int   file;
      off_t offset;
      ssize_t io_size;
	
      nnz = clen[j];
      /* taucs_printf("oocsp_colanalyze: Debug_1: j= %d nnz= %d\n",j,nnz);*/
      file   = dfile_fid[ ind_fid[j] ];
      offset = ind_off[j];
      if (nnz != 0 && file != -1 && offset != -1) {
	if (lseek(file,offset,SEEK_SET) == -1) taucs_printf("oocsp_colanalyze: lseek failed\n");
	io_size = read(file, tmp_col, nnz * sizeof(int));
	if (io_size != nnz * sizeof(int)) taucs_printf("oocsp_colanalyze: Error reading data.\n");
      }
      rowind = tmp_col;

    }
#else
    /*
    nnz = clen[j];
    oocsp_readcol_structure(matrix,j,tmp_col);
    rowind = tmp_col;
    */

    /* new code: Sivan 28 Feb 2002 */
    nnz = (matrix->colptr)[colperm[j]+1] - (matrix->colptr)[colperm[j]];
    rowind = (matrix->rowind) + (matrix->colptr)[colperm[j]];
#endif

    cset       = uf_makeset(uf, j);
    root[cset] = j;
    parent[j]  = (*ncols);
    
    for (ip=0; ip<nnz; ip++) {
      
      i = rowind[ip];
      fcol = firstcol[i];
      if (fcol >= j) {
	firstcol[i] = j;
	fcol        = j;
      } else {
	rset = uf_find(uf,fcol);
	rroot = root[rset]; 
	if (rroot != j) {
	  parent[rroot] = j;
	  cset          = uf_union(uf,cset,rset);
	  root[cset]    = j;
	}
      }
      /* ADD (j,fcol) TO SKELETON */
      skel_add(j,fcol);
    }
    /*
    mxDestroyArray(output_args[0]);
    */
  }

  /* COMPUTE POSTORDER OF ETREE */

  /* create linked lists of children */
  for (j=0; j <= *ncols; j++) first_kid[j] = -1;
  for (j = (*ncols)-1; j >= 0; j--) {
    p              = parent[j];
    next_kid[j]    = first_kid[p];
    first_kid[p] = j;
  }

  
  /* do dfs in a loop */
  postnum = 0;
  depth = 0;
  stack_vertex[depth] = *ncols; /* root */
  stack_child [depth] = first_kid[ stack_vertex[depth] ];
  while (depth >= 0) {
    if ( stack_child[depth] != -1 ) {
      stack_vertex[depth+1] = stack_child[depth];
      stack_child [depth+1] = first_kid[  stack_vertex[depth+1] ];
      depth++;
    } else {
      if ( stack_vertex[depth] != (*ncols) ) { /* not root */
	if (stack_vertex[depth] >= *ncols) 
	  taucs_printf("oocsp_colanalyze: internal error in dfs (0)\n");
	postorder[ postnum ] = stack_vertex[depth];
	postnum++;
      }
      depth--;
      if (depth >= 0) /* sivan June 30, to avoid crash, seems to be right */
	stack_child[depth] = next_kid[  stack_child[depth] ];
    }
    if (depth > *ncols) {
      int i;
      taucs_printf("oocsp_colanalyze: depth=%d, ncols=%d\n",depth,*ncols);
      for (i=0; i<(*ncols); i++) {
	taucs_printf("oocsp_colanalyze: %d: [%d %d]\n",i,first_kid[i],next_kid[i]);
      }
      taucs_printf("oocsp_colanalyze: internal error in dfs (1)\n");
    }
  }

  
  if (postnum != *ncols) taucs_printf("oocsp_colanalyze: internal error in dfs (2)\n");
    
  /* SORT THE SKELETON MATRIX */

  skel_sort(postorder, *ncols, 
	    found /* temporary */);


  
  /* SECOND PHASE, COMPUTE COLCOUNTS */

  /* we reuse the space of uf and root */
  stack_init(basename,stack_colptr,stack_colstk,*ncols); 

  for (i=0; i < (*nrows); i++) {
    found[i] = -1;
  }

  for (j=0; j<(*ncols); j++) {
    lcolcount[j] = 1;
    ucolcount[j] = 1;
  }

  for (jp=0; jp<(*ncols); jp++) {
    j = postorder[jp];
    found[j] = jp;
    p = parent[j];

    if (p < (*ncols)) {
      found[p] = jp;
      lcolcount[j]++;
      ucolcount[p]++;
    }

    stack_pop(stack_colptr,stack_colstk,
	      found, jp,
	      j,
	      &nnz,tmp_col);
    rowind = tmp_col;
    for (ip=0; ip<nnz; ip++) {
      i = rowind[ip];

      lcolcount[j]++;
      ucolcount[i]++;
      if (p < *ncols) stack_push(stack_colptr,stack_colstk,i,p);
    }

    skel_get_postordercol(found,jp,
			  jp, /* use postorder column index */
			  &nnz,tmp_col);
    rowind = tmp_col;
    for (ip=0; ip<nnz; ip++) {
      i = rowind[ip];

      lcolcount[j]++;
      ucolcount[i]++;

      if (lcolcount[j] > *(ncols))
	taucs_printf("oocsp_colanalyze: Internal error while producing lcolcounts\n");
      if (ucolcount[i] > *(ncols))
	taucs_printf("oocsp_colanalyze: Internal error while producing ucolcounts\n");

      if (p < *ncols) stack_push(stack_colptr,stack_colstk,i,p);
    }
  }

  stack_finalize();

  /*
  mxDestroyArray(tmp1_array);
  mxDestroyArray(tmp2_array);
  mxDestroyArray(tmp3_array);
  mxDestroyArray(tmp4_array);
  */

  taucs_free(uf);
  taucs_free(firstcol);
  taucs_free(root);
  taucs_free(tmp_col);

  (remaining_memory) += (double) ( 2 * ((*ncols)+1) * sizeof(int));
  (remaining_memory) += (double) ( 2 * (*nrows) * sizeof(int));

  skel_finalize();

  /* MAKE AND POSTORDER PARENT 1-BASED AND MARK ROOTS WITH A ZERO */

  /*for (j=0; j < (*ncols); j++) {
    postorder[j]++;
    if (parent[j] == (*ncols))
      parent[j] = 0;
    else 
      parent[j]++;
      }  */
  /* fix up by Vladi */

  for (j=0; j < (*ncols); j++) {
    if (parent[j] == (*ncols))
      parent[j] = -1;
    /*lcolcount[j]--;
      ucolcount[i]--;*/
  }

  taucs_printf("oocsp_colanalyze: done\n");
}

/*********************************************************/
/* PANELIZATION                                          */
/*********************************************************/


/* There seems to be a confusion here between spawidth and remaining memory; sivan */

static
void oocsp_panelize_simple(
			   int  nrows,             /* input  */  
			   int  ncols,             /* input  */
			   int* postorder,         /* input  */
			   int* l_colcounts,       /* input  */
			   int* u_colcounts,       /* input  */
			   int* parents,           /* input  */

                           int* spawidth,          /* input  */  
                           int* maxsn,             /* input  */  
			   int** ptrpanels,        /* output */
			   int** ptrschedstart,    /* output */
			   int** ptrschedend,      /* output */
			   int** ptrfetchnext,     /* output */
			   int** ptrejectnext      /* output */
			   )

{
  int i,j,c;
  int panelsize, panelcols, panelnumber;
  int stop,eject,newpanel;
  int colcount;
  int* panels;
  int* schedstart;
  int* schedend;
  int* fetchnext;
  int* ejectnext;  
  double maxpanelsize;
  double memuse, width_multiplier;
  int  maxcolcount;

  maxcolcount = 0;
  for (j=0; j<ncols; j++)
    maxcolcount = max( maxcolcount, l_colcounts[j]+u_colcounts[j] );

  taucs_printf("oocsp_panelize: max col count = %d, nrows = %d\n",
	     maxcolcount, nrows);

  /*
    memory usage in numfact, width > 0, snodes:
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*nrows*I + 1*w*nrows*D + 1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    we also account for the memory required for the in-core representation of A, L, U,
    which is 2*ncols*sizeof(char)+3*ncols*sizeof(int)
  */

  *maxsn = 8 - 4; /* 8 is the minimum value we use here */
  do {
    *maxsn += 4;
    memuse = 
      8.0*nrows*sizeof(int) + 2.0*nrows*sizeof(taucs_datatype) 
      + 2.0*nrows*sizeof(char) + 3.0*nrows*sizeof(void*)
      + 2.0*ncols*sizeof(int)
      + 1.0*(*maxsn)*sizeof(int)
      + 1.0*(*maxsn)*maxcolcount*sizeof(int) + 2.0*(*maxsn)*maxcolcount*sizeof(taucs_datatype);

    memuse +=
      3 * ( 3.0*ncols*sizeof(int) + 2.0*ncols*sizeof(char) );

    width_multiplier = 
      1.0*nrows*sizeof(int) + 1.0*nrows*sizeof(taucs_datatype) + 1.0*sizeof(int)
      + 5.0*maxcolcount*sizeof(int) + 2.0*maxcolcount*sizeof(taucs_datatype);
    *spawidth = (int) floor( (remaining_memory - memuse) / width_multiplier );
  } while (*spawidth > 4*(*maxsn));

  if (*spawidth < 8) *spawidth = 8; /* it might go over the limit */

  maxpanelsize = nrows * *spawidth; 

  taucs_printf("oocsp_panelize: spawidth = %d, max supernode = %d\n",*spawidth,*maxsn);

  panels         = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrpanels)     = panels;
  schedstart     = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrschedstart) = schedstart;
  schedend       = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrschedend)   = schedend;
  fetchnext      = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrfetchnext)  = fetchnext;
  ejectnext      = (int*)taucs_malloc(ncols*sizeof(int)); (*ptrejectnext)  = ejectnext;

  panelnumber = 1;
  j           = 0;
  eject = 0;
  
  while (j<ncols)
  {
    panelsize = 0;
    panelcols = 0;
    stop = 0;

    while (stop == 0) 
      {
	c = postorder[j];
	colcount = l_colcounts[c] + u_colcounts[c]; /* was only L, don't remember why, sivan */
	newpanel = 0;
	if (*spawidth > 0)
	  {
	  if (panelcols >= *spawidth) 
	    newpanel=1; 
	  else
	    if (panelsize + colcount > maxpanelsize) newpanel=1; 
	  }

	if (newpanel == 0)
	  {
	    /* add c to panel */
	    panelsize = panelsize + colcount;
	    panelcols = panelcols + 1;
	    panels[c] = panelnumber;
	    ejectnext[j] = c;
	    schedstart[c] = panelnumber;
	    schedend[c]   = panelnumber;
	    j = j+1;
	    if (j >= ncols) stop = 1; 
	  } 
	else
	  {
	    panelnumber = panelnumber + 1;
	    stop = 1;
	  }
      }
    
    /* now reverse panel, compute fetchnext */
#ifdef VLADIMIR
    if(j<ncols)
      for (i=0;i<j-eject;i++)
	fetchnext[eject+i] = ejectnext[j-i];
    else {
      printf("j >= ncols???\n");
      exit(1);
      for (i=0;i<j-eject;i++)
	fetchnext[eject+i] = ejectnext[j-i-1];
    }
#else
    for (i=0;i<j-eject;i++)
      fetchnext[eject+i] = ejectnext[j-i-1];
#endif

    eject = j;
     
  }
}


/*********************************************************/
/* NUMERICAL PHASE                                       */
/*********************************************************/

#define SNODES
#define SNODE_THRESHOLD 4
#define SNODE_BLOCK 8
#define SIMPLE_COL_COL_no
#define SPA_ONEARRAY
#define USE_BLAS
#define BLAS_THRESHOLD 10
#define BLOCK 16

#ifdef DETAILED_TIMING
static double flops_extra = 0.0;
static double flops_dense = 0.0;
#endif

/*
  Out-of-core sparse LU

  Numerical factorization.

  Memory management
    ipivots              nrows*I                            ! not freed, returned to caller
    rowlists             nrows*I + 3*width*maxlucols*I      ! heads,colind,next,prev
    heap                 nrows*I                            !                        

    spa                  nrows*I + nrows*D                  ! if spawidth < 0
    spa (width>0)        width*nrows*I + width*nrows*D      !

    lindices             nrows*C                            ! L or U 
    nnzmap               nrows*C                            ! ?

    panel                2*ncols*I                          ! id, nnz
                         + 3*ncols*P                        ! ind, inrowlist, re

    panel_compressed     sum_j lucols_j*(D+2*I), j=1:width  ! re, ind, inrowlist
                         <= width*maxlucols*(D+2*I)
                         
    snodes               2*ncols*I                          ! pivots, snode_index
                         +   MAX_SNODE*I                    ! pivrows
			 + 2*nrows*I                        ! ind, map
			 +   nrows*D                        ! re ???
			 +   MAX_SNODE*maxcolcount*I        ! lu_ind
			 +   MAX_SNODE*maxcolcount*D        ! lu_re

    snodes_dense         SNODE_MAX*maxcolcont*D             ! S
                         + width*maxcolcount*D              ! P
                         + width*I                          ! updcols

    no_snodes            nrows*I + nrows*D                  ! lu_ind, lu_re

    total: width > 0, snodes
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*nrows*I + 1*w*nrows*D + 1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    total: width < 0, snodes
           8*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           2*ncols*I + 
           1*w*I +
           5*w*maxc *I + 2*w*maxc *D +
           1*s*I +
           1*s*maxc *I + 2*s*maxc *D 

    total: width > 0, no snodes
           7*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           1*w*nrows*I + 1*w*nrows*D +
           5*w*maxc *I + 1*w*maxc *D +

    total: width < 0, no snodes
           7*nrows*I + 2*nrows*D + 2*nrows*C + 3*nrows*P  +
           5*w*maxc *I + 1*w*maxc *D +

    where:
      I=sizeof(int) D=sizeof(taucs_datatype) C=sizeof(char) P=sizeof(void*)
      maxc=maxcolcount w=spawidth (=active panelwidth) s=max_snode
*/



static double time_total;

#ifdef DETAILED_TIMING
static double time_colcol;
static double time_colcol_1;
static double time_colcol_2;
static double time_factor;
static double time_scatter;
static double time_gather;
static double time_append;
static double time_read;
#ifdef SNODES
static double time_snode_tmp;
static double time_snode_1 = 0;
static double time_snode_2 = 0;
static double time_snode_21 = 0;
static double time_snode_3 = 0;
static double time_snode_4 = 0;
static double time_snode_detect;
static double time_snode_prepare;
static double time_snode_dense;
#endif

static double bytes_read;
static double bytes_appended;
static double col_ooc_updates;
static double col_read;

static double  flops;
static double  scatters;
static double  gathers;
static double  rowlist_ops;
static double  num_heap_ops;
#endif /* DETAILED_TIMING */

/****************************************************/
/*                                                  */
/* Heap operations                                  */
/*                                                  */
/****************************************************/

/* HEAP OPERATIONS */


static void num_heap_heapify(int* heap, int* heapsize, 
			     int* ipivots, int p) 
{
  int r,l,smallest;
  int temp;

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  r = (p+1) * 2;
  l = r - 1;

#ifdef INTERNAL_CHECKS
  if ((l-1)/2 != p || (r-1)/2 != p)
    taucs_printf("oocsp_numfact: left-right computation in heap failed\n");
#endif

  if (l < *heapsize && ipivots[heap[l]] < ipivots[heap[p]])
    smallest = l;
  else
    smallest = p;

  if (r < *heapsize && ipivots[heap[r]] < ipivots[heap[smallest]])
    smallest = r;
  
  if (smallest != p) {
    temp           = heap[p]; 
    heap[p]        = heap[smallest];
    heap[smallest] = temp;
    
    num_heap_heapify(heap, heapsize, ipivots, smallest);
  }
}

static void num_heap_insert(int* heap, int* heapsize, int* ipivots, int i)
{
  int child, parent;

  (*heapsize)++;

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  child = (*heapsize-1);
  parent = (child-1) / 2;
  while (child > 0 && (ipivots[heap[parent]] > ipivots[i])) {
    heap[child]   = heap[parent];
    child = parent;
    parent = (child-1) / 2;

#ifdef DETAILED_TIMING
    num_heap_ops += 1.0;
#endif
  }

  heap[child]   = i;
}

static int num_heap_extractmin(int* heap, int* heapsize, int* ipivots) 
{
  int m; 

#ifdef DETAILED_TIMING
  num_heap_ops += 1.0;
#endif

  if (*heapsize <= 0) return -1;
  
  m = heap[0];
  
  heap[0] = heap[(*heapsize)-1];

  (*heapsize)--;

  num_heap_heapify(heap,heapsize,ipivots,0);

  return m;
}

/****************************************************/
/*                                                  */
/* Row lists                                        */
/*                                                  */
/****************************************************/

static int* rowlists_head;   /* one head per row */
static int* rowlists_colind; 
static int* rowlists_next;
static int* rowlists_prev;
static int  rowlists_size;
static int  rowlists_freehead;
/*static int  rowlists_freenext;*/

static void rowlists_finalize()
{
  taucs_free(rowlists_head);
  taucs_free(rowlists_colind);
  taucs_free(rowlists_next);
  taucs_free(rowlists_prev);
}

static void rowlists_init(int size, int nrows)
{
  int i;

  rowlists_size = size;

  rowlists_head   = (int*)taucs_malloc(nrows*sizeof(int));
  rowlists_colind = (int*)taucs_malloc(rowlists_size*sizeof(int));
  rowlists_next   = (int*)taucs_malloc(rowlists_size*sizeof(int));
  rowlists_prev   = (int*)taucs_malloc(rowlists_size*sizeof(int));
  assert(rowlists_head && rowlists_colind && rowlists_next && rowlists_prev); 

  for (i=0; i<nrows; i++) rowlists_head[i] = -1;

  /* link the entire rowlist as one freelist */

  rowlists_freehead = 0;
  for (i=0; i<rowlists_size; i++) {
    rowlists_next[i] = i+1;
    /* freelist does not need prev pointers */ 
    /* rowlists_prev[i] = i-1; */ 
  }
  rowlists_next[ rowlists_size - 1 ] = -1;
}

static int rowlists_insert(int row, int panelcol)
{
  int new;

#ifdef DETAILED_TIMING
  rowlist_ops += 1.0;
#endif /* DETAILED_TIMING */

  /* get memory from the freelist */

  if ((new = rowlists_freehead) == -1) {
    taucs_printf("oocsp_numfact: Out of rowlist memory\n");
    exit(1);
  }

  /* remove this memory from the freelist; freelist does now use prev */

  rowlists_freehead = rowlists_next[ new ];

  /* link to row list */

  rowlists_next[ new ] = rowlists_head[ row ];
  rowlists_prev[ new ] = -1;
  rowlists_colind[ new ] = panelcol;

  if (rowlists_next[new] != -1)
    rowlists_prev[ rowlists_next[new] ] = new;

  rowlists_head[ row ] = new;

  return new;
}

static void rowlists_delete(int row, int index)
{
#ifdef DETAILED_TIMING
  rowlist_ops += 1.0;
#endif /* DETAILED_TIMING */

  if (rowlists_head[ row ] == index)
    rowlists_head[ row ] = rowlists_next[ index ];

  if (rowlists_next[ index ] != -1)
    rowlists_prev[ rowlists_next[index] ] = rowlists_prev[ index ];

  if (rowlists_prev[ index ] != -1)
    rowlists_next[ rowlists_prev[index] ] = rowlists_next[ index ];

  rowlists_next[ index ] = rowlists_freehead;
  rowlists_freehead = index;
}

static int rowlists_isempty()
{
  int i,count;

  i = rowlists_freehead;
  count = 0;
  while (i != -1) {
    count++;
    i = rowlists_next[i];
  }

  if (count == rowlists_size) return 1;
  else return 0;
}
  
/****************************************************/
/*                                                  */
/* Spa routines                                     */
/*                                                  */
/****************************************************/

#if 0
static
int intcmp(const void* v1, const void* v2)
{
  const int* i1 = (const int*) v1;
  const int* i2 = (const int*) v2;

  if      (*i1 < *i2) { return -1; }
  else if (*i1 > *i2) { return  1; }
  else                  return  0;
}
#endif

static taucs_datatype*  spa;
static char*            spamap;

static void spa_finalize()
{
  taucs_free(spa);
  taucs_free(spamap);
}

static void spa_init(int nrows)
{
  int i;

  spa    = (taucs_datatype*) taucs_malloc(nrows*sizeof(taucs_datatype));
  spamap = (char*)  taucs_malloc(nrows*sizeof(char));
  assert(spa && spamap);

  for (i=0; i<nrows; i++) {spa[i] = taucs_zero; spamap[i] = 0;}
}

static void
gather(int             a_nnz,
       taucs_datatype* a_re,
       int*            a_ind,
       taucs_datatype* spa,
       char*           spamap)

{
  int i,ip;

#ifdef DETAILED_TIMING
  double time_tmp;

  gathers += ((double) a_nnz);
  time_tmp = taucs_wtime();
#endif

  for (ip=0; ip<a_nnz; ip++) {
    i = a_ind[ip];
    a_re[ip] = spa[i];
    spamap[i] = 0;

    spa[i] = taucs_zero;
  }

#ifdef DETAILED_TIMING
  time_gather += (taucs_wtime() - time_tmp);
#endif
}

static void 
scatter(int             a_nnz,
	taucs_datatype* a_re,
	int*            a_ind,
	taucs_datatype* spa,
	char*           spamap)
     
{
  int i,ip;

#ifdef DETAILED_TIMING
  double time_tmp;

  scatters += ((double) a_nnz);
  time_tmp = taucs_wtime();
#endif

  for (ip=0; ip<a_nnz; ip++) {
    i = a_ind[ip];
    spa[i]    = a_re[ip];
    spamap[i] = 1;
  }

#ifdef DETAILED_TIMING
  time_scatter += (taucs_wtime() - time_tmp);
#endif
}

/****************************************************/
/*                                                  */
/* Column updates                                   */
/*                                                  */
/****************************************************/

static
void spcol_spa_update(int pivotindex,
		      taucs_datatype* l_re,
		      int*    l_ind,
		      int     l_nnz,
		      int*    panelcols,
 		      int     panelcols_n,
		      int     nrows,
		      int**   a_inrowlist,
		      taucs_datatype* spa,
		      char*   spamap,
		      int**   a_ind,
		      int*    a_nnz)

{
  int i,ip,j,q;
  taucs_datatype v;

  for(j = 0; j < panelcols_n; j++) {
    q = panelcols[j];
    if (   taucs_re(spa[q*nrows + pivotindex]) == 0.0 
	&& taucs_im(spa[q*nrows + pivotindex]) == 0.0) continue;
    for (ip=0; ip<l_nnz; ip++) {
      i = l_ind[ip];
      v = l_re[ip];
      if (spamap[q*nrows + i] == 0) {
	spamap[q*nrows + i] = 1;
	spa   [q*nrows + i] = taucs_zero;
	a_ind      [q][ a_nnz[q] ] = i;
	a_inrowlist[q][ a_nnz[q] ] = rowlists_insert(i,q);
	(a_nnz[q])++;
      }
      /*spa[q*nrows + i] -= (spa[q*nrows + pivotindex] * v);*/
      spa[q*nrows + i] = taucs_sub(spa[q*nrows + i],
				   taucs_mul(spa[q*nrows + pivotindex] , v));
    }
  }
  return;
}

#ifdef SIMPLE_COL_COL
static void
spcol_spcol_update(int pivotindex,
		   taucs_datatype* l_re,
		   int*    l_ind,
		   int     l_nnz,
		   int     panelcol,
		   int*    a_inrowlist,
		   taucs_datatype* spa,
		   char*  spamap,
		   int*    a_ind,
		   int*    a_nnz,
		   int     lu_colcount)
{
  int i,ip;
  taucs_datatype pv;

  pv = spa[pivotindex];
  
  if (taucs_iszero(pv)) return;

#ifdef DETAILED_TIMING
  flops += 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (ip=0; ip<l_nnz; ip++) {
    i = l_ind[ip];
    if (spamap[i] == 0) {
      spamap[i] = 1;
      spa[i] = taucs_zero;
      a_ind[ *a_nnz ] = i;
      a_inrowlist[ *a_nnz ] = rowlists_insert(i,panelcol);
      (*a_nnz)++;
    }
    /*spa[i] -= (spa[pivotindex] * l_re[ip]);*/
    spa[i] = taucs_sub(spa[i],taucs_mul(spa[pivotindex] , l_re[ip]));
  }

  /*
  if ((*a_nnz) > lu_colcount) {
    taucs_printf("oocsp_numfact: prediction=%d, size now=%d\n",lu_colcount,*a_nnz);
    taucs_printf("oocsp_numfact: Column expands beyond prediction\n");
  }
  */

}
#else /* simple col col */

static int oocsp_spcol_n1 = 0;
static int oocsp_spcol_n2 = 0;

static void
spcol_spcol_update(int pivotindex,
			  taucs_datatype* l_re,
			  int*    l_ind,
			  int     l_nnz,
			  int     panelcol,
			  int*    a_inrowlist,
			  taucs_datatype* spa,
			  char*  spamap,
			  int*    a_ind,
			  int*    a_nnz,
			  int     lu_colcount)
{
  register int i;
  register int ip;
  register int ip_block;
  register int loop_bound;
  register char flag;
  register taucs_datatype pv;

  pv = spa[pivotindex];
  
  if (taucs_iszero(pv)) return;

#ifdef DETAILED_TIMING
  flops += 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (ip_block=0; ip_block<l_nnz; ip_block += BLOCK) {

    loop_bound = min(ip_block + BLOCK,l_nnz);

    flag = 1;
    oocsp_spcol_n1++;
    for (ip=ip_block; ip<loop_bound; ip++) {
      i = l_ind[ip];
      flag &= spamap[i];
      /* spa[i] -= (pv * l_re[ip]); */
      spa[i] = taucs_sub(spa[i], taucs_mul(pv , l_re[ip]));
    }

    if (!flag) {
      oocsp_spcol_n2++;

      for (ip=ip_block; ip<loop_bound; ip++) {
	i = l_ind[ip];
	if (spamap[i] == 0) {
	  spamap[i] = 1;
	  a_ind[ *a_nnz ] = i;
	  a_inrowlist[ *a_nnz ] = rowlists_insert(i,panelcol);
	  (*a_nnz)++;
	  /* we essentially zero and update */
	  /*spa[i] = - (pv * l_re[ip]);*/
	  spa[i] = taucs_neg(taucs_mul(pv , l_re[ip]));
	}
      }
    }
  }
  return;
}
#endif

#if 0
static void spcol_panel_update(int pivotindex,
			       taucs_datatype* l_re,
			       int*    l_ind,
			       int     l_nnz,
			       int*    subpanel, int subpanel_size, 
			       taucs_datatype* subpanel_tmp,
			       int**   a_inrowlist,
			       taucs_datatype* spa,
			       char*   spamap,
			       int**   a_ind,
			       int*    a_nnz,
			       int     nrows)
{
  int i,ii,ip,q,j;
  taucs_datatype x;

  assert(0);

#ifdef DETAILED_TIMING
  flops += ((double) subpanel_size) * 2.0 * ((double) l_nnz);
#endif /* DETAILED_TIMING */

  for (j=0; j<subpanel_size; j++) {
    q = subpanel[j];
    subpanel_tmp[q] = spa[q*nrows+pivotindex];
  }

  for (ip=0; ip<l_nnz; ip++) {
    i = l_ind[ip];
    x = l_re[ip];
    for (j=0; j<subpanel_size; j++) {
      q = subpanel[j];
      ii = (q*nrows) + i;
      if (spamap[ii] == 0) {
	spamap[ii] = 1;
	spa[ii] = taucs_zero;
	a_ind[q][ a_nnz[q] ] = i;
	a_inrowlist[q][ a_nnz[q] ] = rowlists_insert(i,q);
	(a_nnz[q])++;
      }
      /*spa[ii] -= (subpanel_tmp[q] * x);*/
      spa[ii] = taucs_sub(spa[ii], taucs_mul(subpanel_tmp[q] , x));
    }
  }
}
#endif

/****************************************************/
/*                                                  */
/* OLD STUFF                                        */
/*                                                  */
/****************************************************/

#if 0
void x_heap_insert( int* heap, int* heapsize,  int* ipivots, int value) 
{
  heap[ *heapsize ] = value;
  (*heapsize)++;

  taucs_printf("oocsp_numfact: heap insert %d\n",value);
}

int x_heap_extractmin( int* heap, int* heapsize, int* ipivots) 
{
  int i,m,mindex;

  if (*heapsize == 0) return -1;

  m = INT_MAX;

  for (i=0; i<(*heapsize); i++) 
    if (ipivots[heap[i]] < ipivots[m]) {m = heap[i]; mindex=i;}

  if (m == INT_MAX) return -1;

  for (i=mindex; i<(*heapsize)-1; i++) 
    heap[ i ] = heap[i+1];

  (*heapsize)--;
 
  taucs_printf("oocsp_numfact: heap extractmin %d (%d)\n",m,ipivots[m]);

  return m;
}
#endif

/****************************************************/
/*                                                  */
/* NUMERICAL FACTORIZATION MAIN ROUTINE             */
/*                                                  */
/****************************************************/

static
void oocsp_numfact (taucs_ccs_matrix* A, int* colperm,
		    taucs_io_handle* LU,
		    int* panels,
		    int* schedstart,
		    int* schedend,
		    int* fetchnext,
		    int* ejectnext,
		    int* lcolcount,
		    int* ucolcount,
		    int spawidth,
		    int maxsn
		    )
{
  int i,j,k,ip,p,q,qp,ii,ks;/* ip_next,jp_next,jp omer*/

  int nrows, ncols;

  
  /*char    fname[256]; 
  int     file; 
  int     mode; 
  mode_t  perm; omer*/

  /*int     len,status; omer*/

  /*
  double* pr;
  */

  /* NEW VARS (Sivan, for this function) */

  int fn;
  int en;
  int nsteps;
  int step;

  /*int nnz; omer*/
  /*off_t offset; omer*/
  /*ssize_t io_size; omer*/

  int  heapsize;
  int* heap;

  char* nnzmap;   /* bit vector */
  char* lindices; /* bit vector */
  /* char* uindices; */ /* bit vector */

  int* panel_id;
  int* panel_nnz;
  int**    panel_ind;
  int**    panel_inrowlist;
  taucs_datatype** panel_re;

  int* Lclen;
  int* Uclen;

  /*
  taucs_datatype* update_tmp;
  int* update_vec;
  int  update_vec_next;
  */

#ifdef SPA_ONEARRAY
  taucs_datatype*  panel_spa = NULL; /* warning */
  char*    panel_spamap = NULL; /* warning */
#else
  taucs_datatype**  panel_spa;
  char**    panel_spamap;
#endif

  int     unext,lnext;
  int*    lu_ind;
  taucs_datatype* lu_re;

  int* ipivots;

  int    maxind;
  double maxval, absval;
  int    pivotindex;

#ifdef SNODES
  int*    snode_ind;
  taucs_datatype* snode_re;
  /*char*   snode_map;*/ /* sivan changed to int to support m2 */
  int*    snode_map;
  int*    snode_pivrows;
  int*    snode_index;
  int*    pivots;

  int     snode_hash=0; /* warning */
  int     snode_size, snode_flag, snode_nnz, snode_lastcol;
  int     snode_last;
  int     hash;
  int     snode_id;

  taucs_datatype* S;
  taucs_datatype* P;
  int*    spa_updcols;
  int     spa_n;
  int*    m2;

  int*    srows;       /* indices of rows in the supernode */
  int     srows_n;     /* number of rows in the supernode  */
  int     srow_next;

  int     dense_flag;
  int     tmp;
#endif

  int maxcolcount;
  
  /*double time_tmp; omer*/

  /* READ GLOBALS */

  taucs_printf("oocsp_numfact: Using %.0lf MBytes of memory\n",
	     remaining_memory/1048576.0);

    
  /* START THE FACTORIZATION */
  /*
  nrows = A->nrows;
  ncols = A->ncols;
  */
  nrows = A->m;
  ncols = A->n;

#ifdef DETAILED_TIMING
  flops       = 0.0;
  scatters    = 0.0;
  gathers     = 0.0;
  num_heap_ops    = 0.0;
  rowlist_ops = 0.0;
  time_append = 0.0;
  time_read   = 0.0;
  time_colcol = 0.0;
  time_colcol_1 = 0.0;
  time_colcol_2 = 0.0;
  time_factor = 0.0;
  time_scatter= 0.0;
  time_gather = 0.0;
#ifdef SNODES
  time_snode_detect  = 0.0;
  time_snode_prepare = 0.0;
  time_snode_dense   = 0.0;
#endif
  bytes_read  = 0.0;
  bytes_appended = 0.0;
  col_read    = 0.0;
  col_ooc_updates = 0.0;
#endif /* DETAILED_TIMING */

  time_total  = taucs_wtime();
  
  maxcolcount = 0;
  for (j=0; j<ncols; j++)
    maxcolcount = max( maxcolcount, lcolcount[j]+ucolcount[j] );
  taucs_printf("oocsp_numfact: maxcolcount = %d, nrows = %d, spawidth = %d, maxsn = %d\n",
	     maxcolcount, nrows, spawidth, maxsn);

  Lclen            = (int*) taucs_calloc(ncols,sizeof(int));
  Uclen            = (int*) taucs_calloc(ncols,sizeof(int));
  assert(Uclen && Lclen);

  ipivots          = (int*)taucs_malloc(ncols*sizeof(int));
  assert(ipivots);
  for (i=0; i<nrows; i++) ipivots[i] = INT_MAX;

  /* create row lists */

  rowlists_init(maxcolcount * iabs(spawidth),nrows);
  
  fn = 0;
  en = 0;
  
  lindices = (char*)taucs_malloc(nrows*sizeof(char));
  /* uindices = (char*)taucs_malloc(nrows*sizeof(char)); */

  for (i=0; i<nrows; i++) {
    lindices[i] = 1;
    /*uindices[i] = 0;*/
  }

  nnzmap   = (char*)taucs_malloc(nrows*sizeof(char));
  heap     = (int*) taucs_malloc(nrows*sizeof(int));

#ifdef SNODES
  /* the supernodes consists of contiguous columns and we need to know */
  /* the corresponding pivot rows */
  pivots           = (int*)taucs_malloc(ncols*sizeof(int));
  assert(pivots);
  for (i=0; i<ncols; i++) pivots[i] = INT_MAX;

  snode_lastcol = snode_nnz = snode_size = 0;
  snode_pivrows = (int*)   taucs_malloc(maxsn*sizeof(int)); /* size was ncols */
  snode_ind     = (int*)   taucs_malloc(nrows*sizeof(int));
  snode_re      = (taucs_datatype*)taucs_malloc(nrows*sizeof(taucs_datatype));
  snode_map     = (int*)   taucs_malloc(nrows*sizeof(int));

  snode_id      = 0;
  snode_last    = 0;
  snode_index   = (int*)taucs_malloc(ncols*sizeof(int));
  for (i=0; i<ncols; i++) snode_index[i] = -1;
  for (i=0; i<nrows; i++) snode_map[i]   = -1;

  S = (taucs_datatype*) taucs_malloc( maxcolcount * maxsn  * sizeof(taucs_datatype) );
  P = (taucs_datatype*) taucs_malloc( maxcolcount * spawidth   * sizeof(taucs_datatype) );
  srows = (int*) taucs_malloc( maxcolcount * sizeof(int) );
  spa_updcols = (int*) taucs_malloc( spawidth * sizeof(int) );
  assert(spa_updcols);
  assert(srows);
  assert(S);
  assert(P);

  /*
  lu_re    = (taucs_datatype*)taucs_malloc(maxsn * nrows * sizeof(taucs_datatype) );
  lu_ind   = (int*)   taucs_malloc(maxsn * nrows * sizeof(int)    );
  */
  lu_re    = (taucs_datatype*)taucs_malloc(maxsn * maxcolcount * sizeof(taucs_datatype) );
  lu_ind   = (int*)   taucs_malloc(maxsn * maxcolcount * sizeof(int)    );
  taucs_printf("lu_re  = %08x -> %08x\n",lu_re,lu_re+(maxsn*maxcolcount));
  taucs_printf("lu_ind = %08x -> %08x\n",lu_ind,lu_ind+(maxsn*maxcolcount));
#else
  lu_re    = (taucs_datatype*)taucs_malloc(maxcolcount * sizeof(taucs_datatype));
  lu_ind   = (int*)   taucs_malloc(maxcolcount * sizeof(int));
  /*
  lu_re    = (taucs_datatype*)taucs_malloc(nrows*sizeof(taucs_datatype));
  lu_ind   = (int*)taucs_malloc(nrows*sizeof(int));
  */
#endif

  /* These two can be smaller */

  /*
  update_vec = (int*)taucs_malloc(ncols*sizeof(int));
  update_tmp = (taucs_datatype*)taucs_malloc(ncols*sizeof(taucs_datatype));
  */

  panel_id  = (int*)taucs_malloc(ncols*sizeof(int));
  panel_nnz = (int*)taucs_malloc(ncols*sizeof(int));
  panel_ind = (int**)taucs_malloc(ncols*sizeof(int*));
  panel_inrowlist = (int**)taucs_malloc(ncols*sizeof(int*));
  panel_re  = (taucs_datatype**)taucs_malloc(ncols*sizeof(taucs_datatype*));
  for (i=0; i<ncols; i++) panel_id[i] = -1;

  if (spawidth > 0) {
#ifdef SPA_ONEARRAY
    panel_spa = (taucs_datatype*)taucs_malloc(spawidth*nrows*sizeof(taucs_datatype));
    panel_spamap = (char*)taucs_malloc(spawidth*nrows*sizeof(char) );
    assert(panel_spa && panel_spamap);
#else
    panel_spa = (taucs_datatype**) taucs_malloc(spawidth*sizeof(taucs_datatype*));
    panel_spamap = (char**) taucs_malloc(spawidth*sizeof(char*) );
    assert(panel_spa && panel_spamap);
#endif
  } 
    
  spa_init( nrows );

  nsteps=0; 
  for (i=0; i<ncols; i++) {
    if ((int) (schedstart[i]) > nsteps) nsteps = (int) schedstart[i];
  }


  /* INITIALIZE HEAP */

  heapsize = 0;
  for (i=0; i<nrows; i++) nnzmap[i] = 0;

  taucs_printf("oocsp_numfact: Starting numerical factorization (%d steps)\n",nsteps);

  p = 0;

  for (step=1; step<=nsteps; step++) {

    taucs_printf("oocsp_numfact: Starting step %d/%d\r",step,nsteps);

    if (p==0) {
      /*taucs_printf("oocsp_numfact: (new panel)\n");*/
#ifdef SNODES
      snode_last = -1;
#endif

      if (!rowlists_isempty())
	taucs_printf("oocsp_numfact: Internal Error (row lists not empty)\n");
      if (heapsize) 
	taucs_printf("oocsp_numfact: Internal Error (heap not empty; 1)\n");
      for (i=0; i<nrows; i++) 
	if (nnzmap[i]) taucs_printf("oocsp_numfact: Internal Error (heap not empty; 2)\n");
    } else {
      taucs_printf("oocsp_numfact: (same panel)\n");
    }
    
    /* LOAD A PARTIAL PANEL */
    
    while (fn < ncols && 
            (schedstart[fetchnext[fn]]) == step) {
      j = fetchnext[fn];

      /*
      if (A->clen[j] > lcolcount[j]+ucolcount[j]) {
	taucs_printf("oocsp_numfact: Column %d in matrix larger than L+U estimate\n",j);
	taucs_printf("oocsp_numfact: Size %d > %d+%d\n",A->clen[j],lcolcount[j],ucolcount[j]); 
	taucs_printf("oocsp_numfact: Aborting\n");
      }

      panel_nnz[p] = A->clen[j];
      */

      /* New code: Sivan 28 Feb 2002 */

      panel_nnz[p] = (A->colptr)[colperm[j]+1] - (A->colptr)[colperm[j]];
      panel_id[p]  = j;
      if (lcolcount[j] < 1 
	  || lcolcount[j] > nrows
	  || ucolcount[j] < 1 
	  || ucolcount[j] > nrows) {
	taucs_printf("oocsp_numfact: Column %d: l,u colcounts = (%d+%d)\n",
		  j,lcolcount[j],ucolcount[j]);
	taucs_printf("oocsp_numfact: Invalid column nonzero count\n\n");
      }	

      panel_inrowlist[p] = (int*)    taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(int));
      panel_ind[p]       = (int*)    taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(int));
      panel_re[p]        = (taucs_datatype*) taucs_malloc((lcolcount[j] + ucolcount[j]) * sizeof(taucs_datatype));
      if (!panel_inrowlist[p] || 
	  !panel_ind[p] || 
	  !panel_re[p]) {
	fprintf(stderr,"out of memory for panel compressed vector\n");
	fprintf(stderr,"j = %d lcolcount = %d ucolcount = %d\n",
		j,lcolcount[j],ucolcount[j]);
	exit(1);
      }
      assert (panel_inrowlist[p] && panel_ind[p] && panel_re[p]);

      if (spawidth > 0) {
#ifdef SPA_ONEARRAY
#else
	panel_spa[p]     = (taucs_datatype*) taucs_malloc(nrows * sizeof(taucs_datatype));
	panel_spamap[p]  = (char*)   taucs_malloc(nrows * sizeof(char));
	assert(panel_spa[p] && panel_spamap[p]);
#endif
      }

      /*
      time_tmp = taucs_wtime();
      oocsp_readcol(A,j,panel_ind[p],panel_re[p]);
      time_read += (taucs_wtime() - time_tmp);
      bytes_read += (double) (panel_nnz[p] * (sizeof(taucs_datatype)+sizeof(int)));
      */

      /* new code: Sivan 28 Feb 2002 */
      {
	int jp,ip,i;
	jp = colperm[j];
	for (i=0, ip=(A->colptr)[jp]; ip<(A->colptr)[jp+1]; i++, ip++) {
	  (panel_ind[p])[i] = (A->rowind)[ip];
	  (panel_re [p])[i] = (A->taucs_values)[ip];
	  /*
	  printf(">>> reading column number %d (index %d), row %d value %.2e\n",
		 j,jp,(panel_ind[p])[i],(panel_re[p])[i]);
	  */
	}
      }

      if (spawidth > 0) {
	if (p >= spawidth) {
	  taucs_printf("oocsp_numfact: p=%d spawidth=%lg\n",p,spawidth);
	  taucs_printf("oocsp_numfact: Panel wider than spawidth\n");
	} 
	scatter(panel_nnz[p],panel_re[p],panel_ind[p],
#ifdef SPA_ONEARRAY
		panel_spa+(p * nrows),
		panel_spamap+(p * nrows)
#else
		panel_spa[p],
		panel_spamap[p]
#endif
		);
      }

      for (i=0; i<panel_nnz[p]; i++) {
	panel_inrowlist[p][i] = rowlists_insert(panel_ind[p][i],
						p);

	/*if (nnzmap[ panel_ind[p][i] ] == 0 && uindices[ panel_ind[p][i] ]) {*/
	if (nnzmap[ panel_ind[p][i] ] == 0 && !lindices[ panel_ind[p][i] ]) {
	  nnzmap[ panel_ind[p][i] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, panel_ind[p][i] );
	}
      }

      p++;
      fn++;
    }

    /*taucs_printf("oocsp_numfact: Updating (step %d)\n",step);*/

#ifndef SNODES
    /* we need to prevent panel cols from being loaded! */

    while ((i = num_heap_extractmin( heap, &heapsize, ipivots)) != -1) {
      if (i == INT_MAX) continue;

      nnzmap[ i ] = 0;
      
      k = ipivots[i];
      if (k == INT_MAX) continue;
      
      /*taucs_printf("oocsp_numfact: extracted row %d (col %d) from heap. %d\n",i,k,uindices[i]);*/

      if (panels[k] == panels[ panel_id[0] ]) {
	/*taucs_printf("oocsp_numfact: skipping column from this panel (col %d, panel %lg)\n",
	       k,panels[k]);*/
	continue;
      }
      
      /*if (lindices[i]) continue;*/   /*if (!uindices[i]) continue;*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /* Sivan: replaced 2 March 2002 */
      /*oocsp_readcol(L,k,lu_ind,lu_re);*/
      Lreadcol(LU,k,Lclen[k],lu_ind,lu_re);

#ifdef DETAILED_TIMING
      bytes_read += (double) (Lclen[k] * (sizeof(taucs_datatype)+sizeof(int)));
      col_read += 1.0;
#endif /* DETAILED_TIMING */

      /*
      for (ks = k+1; ks<ncols && snode_index[ks]==snode_index[k]; ks++) {
	oocsp_readcol(L,ks,lu_ind+((ks-k)*nrows),lu_re+((ks-k)*nrows));
	bytes_read += (double) ((L.clen[ks]) * (sizeof(taucs_datatype)+sizeof(int)));
#ifdef DETAILED_TIMING
	col_read += 1.0;
#endif
      }
      taucs_printf("oocsp_numfact: Read supernode, %d cols\n",ks-k+1);
      */

#ifdef DETAILED_TIMING
      time_read += (taucs_wtime() - time_tmp);
#endif


      /*
      taucs_printf("oocsp_numfact: col %d (panel %lg, now %lg; read from disk) for row %d, length %d\n",
	     k,panels[k],panels[panel_id[0]],i,(L.clen)[k]);
	     */

      if (Lclen[k] < 0) {
	taucs_printf("oocsp_numfact: k=%d len=%d\n",k,Lclen[k]);
	taucs_printf("oocsp_numfact: Internal Error\n");
      }

      for (ii=0; ii<Lclen[k]; ii++) {
	/*if (nnzmap[ lu_ind[ii] ] == 0 && uindices[ lu_ind[ii] ]*/
	if (nnzmap[ lu_ind[ii] ] == 0 && !lindices[ lu_ind[ii] ]
	    && !taucs_iszero(lu_re[ii])) {
	  nnzmap[ lu_ind[ii] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, lu_ind[ii] );
	  /*taucs_printf("oocsp_numfact: inserting row %d into heap\n",lu_ind[ii]);*/
	}
      }

      for(qp = rowlists_head[i]; qp != -1; qp = rowlists_next[qp]) {
	q = rowlists_colind[qp];
#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif
	if (spawidth <= 0) {
	  scatter(panel_nnz[q],panel_re[q],panel_ind[q],
		  spa,spamap);
	  spcol_spcol_update(i,
			     lu_re,lu_ind,Lclen[k],
			     q,panel_inrowlist[q],
			     spa,
			     spamap,
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	  gather(panel_nnz[q],panel_re[q],panel_ind[q],
		 spa,spamap);
	} else {
	  spcol_spcol_update(i,
			     lu_re,lu_ind,Lclen[k],
			     q,panel_inrowlist[q],
#ifdef SPA_ONEARRAY
			     panel_spa+(q * nrows),
			     panel_spamap+(q * nrows),
#else
			     panel_spa[q],
			     panel_spamap[q],
#endif
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	}
#ifdef DETAILED_TIMING
	time_colcol += (taucs_wtime() - time_tmp);
	col_ooc_updates += 1.0;
#endif
      }

      /*
      update_vec_next = 0;
      for(qp = rowlists_head[i]; qp != -1; qp = rowlists_next[qp]) {
	q = rowlists_colind[qp];
	col_ooc_updates += 1.0;
	update_vec[update_vec_next] = q;
	update_vec_next++;
      }

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif

#ifdef SPA_ONEARRAY
      spcol_panel_update(i,
			 lu_re,lu_ind,Lclen[k],
			 update_vec,update_vec_next,update_tmp,
			 panel_inrowlist,
			 panel_spa,
			 panel_spamap,
			 panel_ind,panel_nnz,
			 nrows);
#endif

#ifdef DETAILED_TIMING
      time_colcol += (taucs_wtime() - time_tmp);
#endif
      */
    }
#else  /* with SNODES */
    /* we need to prevent panel cols from being loaded! */

    while ((i = num_heap_extractmin( heap, &heapsize, ipivots)) != -1) {
      if (i == INT_MAX) continue;

      nnzmap[ i ] = 0;
      
      k = ipivots[i];
      if (k == INT_MAX) continue;
      
      /*printf("last supernode %d this column is %d its supernode %d\n",snode_last,k,snode_index[k]);*/
      if (snode_index[k] == snode_last) continue; /* skip rest of supernode */
      snode_last = snode_index[k]; /* mark for next time */
      
      /*taucs_printf("oocsp_numfact: extracted row %d (col %d) from heap. %d\n",i,k,uindices[i]);*/

      if (panels[k] == panels[ panel_id[0] ]) {
	/*taucs_printf("oocsp_numfact: skipping column from this panel (col %d, panel %lg)\n",
	  k,panels[k]);*/
	continue;
      }
      
      /*if (lindices[i]) continue;*/ /*if (!uindices[i]) continue;*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif

      for (ks = k; ks<ncols && snode_index[ks]==snode_index[k]; ks++) {
	/*printf(">>> %d %d\n",ks,snode_index[ks]);*/
	/*oocsp_readcol(L,ks,lu_ind+((ks-k)*maxcolcount),lu_re+((ks-k)*maxcolcount));*/
	Lreadcol(LU,ks,Lclen[ks],lu_ind+((ks-k)*maxcolcount),lu_re+((ks-k)*maxcolcount));
#ifdef DETAILED_TIMING
	bytes_read += (double) (Lclen[ks] * (sizeof(taucs_datatype)+sizeof(int)));
	col_read += 1.0;
#endif /* DETAILED_TIMING */
      }

#ifdef DETAILED_TIMING
      time_read += (taucs_wtime() - time_tmp);
#endif
      /*taucs_printf("oocsp_numfact: Read supernode, %d cols %d:%d\n",ks-k,k,ks-1);*/
      
      if (Lclen[k] < 0) {
	taucs_printf("oocsp_numfact: k=%d len=%d\n",k,Lclen[k]);
	taucs_printf("oocsp_numfact: Internal Error\n");
      }

      for (ii=0; ii<Lclen[k]; ii++) {
	/*if (nnzmap[ lu_ind[ii] ] == 0 && uindices[ lu_ind[ii] ]*/
	if (nnzmap[ lu_ind[ii] ] == 0 && !lindices[ lu_ind[ii] ]
	    && !taucs_iszero(lu_re[ii])) {
	  nnzmap[ lu_ind[ii] ] = 1;
	  num_heap_insert( heap, &heapsize, ipivots, lu_ind[ii] );
	  /*taucs_printf("oocsp_numfact: inserting row %d into heap\n",lu_ind[ii]);*/
	}
      }

      /* determine which panel cols need to be updates */
      dense_flag = 1;
      for (tmp = 0; tmp<1; tmp++) {
	int jj,ii,kk,iip,jjp;

	/*	printf("snode %d:%d\n",k,ks-1);*/

	if (ks-k < SNODE_THRESHOLD) {dense_flag=0; break;}

	assert(spawidth > 0); /* the code for the other case is not implemented yet; Sivan */

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	srows_n = Lclen[k] + 1; /* the diagonal element in column k is not */
                                /* represented explicitely in L, it's 1    */

	if (srows_n < SNODE_THRESHOLD) {dense_flag=0; break;}

	m2 = snode_map; /* reuse the vector */

	/* create an array of row indices, with pivot rows first, in order */
	/* so that the supernode array will be a trapezoidal matrix        */
	/* we should keep this and restore the -1 invariant                */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif

	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  srows[jj-k] = ii;
	  m2[ii] = 1;
	}
	/* now the rest of the rows */
	srow_next = ks-k;
	for (iip=0; iip<Lclen[k]; iip++) {
	  ii = lu_ind[iip];
	  if (ii == pivots[k]) srows_n--; /* I have no idea why this row shows up; always with 0.0 */
	  if (m2[ii] == -1) {
	    srows[srow_next] = ii;
	    srow_next++;
	  } else {
	    m2[ii] = -1; /* restore the invariant */
	  }
	}
	assert(srow_next == srows_n);
	
	/* we begin by figuring out which columns of the panel are updated    */
	/* by this supernode.                                                 */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif

	spa_n = 0;
	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  for(qp = rowlists_head[ii]; qp != -1; qp = rowlists_next[qp]) {
	    int skip; /* don't add a column twice to spa_updcols */
	    q = rowlists_colind[qp];
	    for (skip=0, jjp=0; jjp<spa_n; jjp++) {
	      if (spa_updcols[jjp] == q) {
		skip = 1;
		break;
	      }
	    }
	    if (!skip) {
	      /*if (jj-k > 4) printf("*** jj-k %d ks-k %d\n",jj-k,ks-k);*/

#ifdef DETAILED_TIMING
	      flops += 2.0 * ( (ks-jj) * (srows_n - (jj-k)) - 0.5*(ks-jj)*(ks-jj) );
	      flops_extra += 2.0 * ( (jj-k) * (srows_n) - 0.5*(jj-k)*(jj-k) );
#endif /* DETAILED_TIMING */

	      spa_updcols[spa_n] = q;
	      /*spa_updptrs[spa_n] = jj-k;*/
	      spa_n++;
	    }
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_1 += (taucs_wtime()-time_snode_tmp);
#endif

	if (spa_n < SNODE_THRESHOLD) {
	  for (jj=k; jj<ks; jj++) {
	    ii = pivots[jj];
	    m2[ii] = -1;
	  }
	  for (iip=0; iip<Lclen[k]; iip++) {
	    ii = lu_ind[iip];
	    m2[ii] = -1; /* restore the invariant */
	  }
	  dense_flag=0; 
	  break;
	}

	/*printf("snode %d:%d (dense)\n",k,ks-1);*/

	/* copy snode columns into a dense array */

	for (jj=k; jj<ks; jj++) {
	  for (iip=0; iip<Lclen[jj]; iip++)
	    /* mark in m2 where each row is */
	    m2[ lu_ind[(jj-k)*maxcolcount + iip] ] = iip;
	  
	  for (iip=0; iip<srows_n; iip++) {
	    if (m2[ srows[iip] ] != -1) {
	      S[ (jj-k)*srows_n + iip ] = lu_re[(jj-k)*maxcolcount + m2[ srows[iip] ]];
	      m2[ srows[iip] ] = -1; /* restore invariant for next column */
	    } else
	      S[ (jj-k)*srows_n + iip ] = taucs_zero;
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_3 += (taucs_wtime()-time_snode_tmp);
#endif

	/* now the snode is stored in a dense array S, with row indices srows */
	/* and with the diagonal block of L on top.                           */

	/* next we copy the columns of the panels that need to be updated     */
	/* into the dense array P.                                            */

	/* we then copy these columns into the dense array P, and if          */
	/* fill occurs, we update the nonzero bitmap and row lists.           */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif
#define OLD_1
#ifdef OLD_1
	for (jjp=0; jjp<spa_n; jjp++) {
	  jj = spa_updcols[jjp];

	  for (iip=0; iip<srows_n; iip++) {
	  /*for (iip=spa_updptrs[jjp]; iip<srows_n; iip++) {*/
	    if (panel_spamap[jj*nrows + srows[iip]] == 0) {
	      /* fill will occur here */
	      panel_spa[jj*nrows + srows[iip]] = taucs_zero;
	      /*P[jjp*srows_n + iip] = taucs_zero;*/
	      
	      panel_spamap   [jj*nrows + srows[iip]] = 1;
	      panel_ind      [jj][panel_nnz[jj] ] = srows[iip];
	      panel_inrowlist[jj][panel_nnz[jj] ] = rowlists_insert(srows[iip],jj);
	      (panel_nnz[jj])++;
	    }
#ifdef JUNK
	    } else { 
	      /* no fill, just copy the element */
  	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
	    }
#endif
	  }
	} 

	{ 
#ifdef DETAILED_TIMING
	  double x = taucs_wtime();
#endif
	  for (jjp=0; jjp<spa_n; jjp++) {
	    jj = spa_updcols[jjp];
	    for (iip=0; iip<srows_n; iip++) {
	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
	    }
	  }
#ifdef DETAILED_TIMING
	  time_snode_21 += (taucs_wtime()-x);
#endif
        }
#else
	for (jjp=0; jjp<spa_n; jjp++) {
	  int iip_block, loop_bound, flag;
	  
	  jj = spa_updcols[jjp];

	  for (iip_block=0; iip_block<srows_n; iip_block += SNODE_BLOCK) {

	    loop_bound = min(iip_block + SNODE_BLOCK,srows_n);
	    
	    flag = 1;
	    for (iip=iip_block; iip<loop_bound; iip++)
	      flag &= panel_spamap[jj*nrows + srows[iip]];
	     
	    if (!flag) {
	      for (iip=iip_block; iip<loop_bound; iip++) {
		if (panel_spamap[jj*nrows + srows[iip]] == 0) {
		  panel_spamap   [jj*nrows + srows[iip]] = 1;
		  panel_spa      [jj*nrows + srows[iip]] = taucs_zero;
		  panel_ind      [jj][panel_nnz[jj] ] = srows[iip];
		  panel_inrowlist[jj][panel_nnz[jj] ] = rowlists_insert(srows[iip],jj);
		  (panel_nnz[jj])++;
		}
	      }
	    }
	  }
	  
	  { 
#ifdef DETAILED_TIMING
	    double x = taucs_wtime();
#endif
	    for (iip=0; iip<srows_n; iip++)
	      P[jjp*srows_n + iip] = panel_spa[jj*nrows + srows[iip]];
#ifdef DETAILED_TIMING
	    time_snode_21 += (taucs_wtime()-x);
#endif
	  }
	}
#endif

#ifdef DETAILED_TIMING
	time_snode_2 += (taucs_wtime()-time_snode_tmp);
#endif

	/*printf("supernode update: col %d pivotrow %d updates col %d\n",jj,ii,panel_id[q]);*/

#ifdef DETAILED_TIMING
	time_snode_prepare += (taucs_wtime() - time_tmp);
#endif

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	/*flops += (2.0 * spa_n * srows_n * (ks-k) - 2.0); */ /* over estimate; sivan. */
	/* we can subract triangle in estimate, skip zero pivots in flops & code */
#ifdef DETAILED_TIMING
	flops_dense += (2.0 * spa_n * srows_n * (ks-k) -
			1.0 * spa_n * (ks-k) * (ks-k)); 
#endif /* DETAILED_TIMING */

#ifdef JUNK
	printf("TRSM's RHS:\n");
	for (kk=k; kk<ks; kk++) {
	  for (jj=0; jj<spa_n; jj++) {
	    printf("%c", P[jj*srows_n + (kk-k)] ? '*' : 'o');
	  }
	  printf("\n");
	}
#endif

#ifdef USE_BLAS
	{ 
	  int m = ks-k;
	  int n = spa_n;
	  int M = srows_n - m;
	  int N = n;
	  int K = m;
	  
	  if (m > BLAS_THRESHOLD && n > BLAS_THRESHOLD) {
	    taucs_trsm("Left",
		       "Lower",
		       "No transpose",
		       "Unit",
		       &m,&n,
		       &taucs_one_const,
		       S,&srows_n,
		       P,&srows_n
		       );
	  } else {
	    /* TRSM */
	    for (jj=0; jj<spa_n; jj++) {
	      for (kk=k; kk<ks; kk++) {
		for (ii=kk-k+1; ii<(ks-k); ii++) {
		  /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
		  P[jj*srows_n + ii] = 
		  taucs_sub(P[jj*srows_n + ii],
			    taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));
		}
	      }
	    }
	  }

	  if (M > BLAS_THRESHOLD && N > BLAS_THRESHOLD && K > BLAS_THRESHOLD) {
	    taucs_gemm("No transpose",
		       "No transpose",
		       &M,&N,&K,
		       &taucs_minusone_const,
		       S+m, &srows_n,
		       P  , &srows_n,
		       &taucs_one_const,
		       P+m, &srows_n);
	  } else {
	    /* GEMM */
	    for (jj=0; jj<spa_n; jj++) {
	      for (kk=k; kk<ks; kk++) {
		for (ii=(ks-k); ii<srows_n; ii++) {
		  /*P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);*/
		  P[jj*srows_n + ii] =
		    taucs_sub(P[jj*srows_n + ii],
			      taucs_mul(P[jj*srows_n + (kk-k)] , S[(kk-k)*srows_n + ii]));
		}
	      }
	    }
	  }
	}
#else	   
	/* TRSM */
	for (jj=0; jj<spa_n; jj++) {
	  for (kk=k; kk<ks; kk++) {
	    for (ii=kk-k+1; ii<(ks-k); ii++) {
	      P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);
	    }
	  }
	}

	/* GEMM */
	for (jj=0; jj<spa_n; jj++) {
	  for (kk=k; kk<ks; kk++) {
	    for (ii=(ks-k); ii<srows_n; ii++) {
	      P[jj*srows_n + ii] -= (P[jj*srows_n + (kk-k)] * S[(kk-k)*srows_n + ii]);
	    }
	  }
	}
#endif
#ifdef DETAILED_TIMING
	time_snode_dense += (taucs_wtime() - time_tmp);
	col_ooc_updates += 1.0;
#endif

#ifdef DETAILED_TIMING
	time_tmp = taucs_wtime();
#endif

	/* now copy panel columns out of the dense P */

#ifdef DETAILED_TIMING
	time_snode_tmp = taucs_wtime();
#endif
	for (jjp=0; jjp<spa_n; jjp++) {
	  jj = spa_updcols[jjp];
	  for (iip=0; iip<srows_n; iip++) {
	    /*for (iip=spa_updptrs[jjp]; iip<srows_n; iip++) {*/
	    panel_spa[jj*nrows + srows[iip] ] = P[jjp*srows_n + iip];
	  }
	}
#ifdef DETAILED_TIMING
	time_snode_4 += (taucs_wtime()-time_snode_tmp);
        time_snode_prepare += (taucs_wtime() - time_tmp);
#endif
      }
      if (!dense_flag) { /* we didn't do it using the blas since m,n, or k were too small */
	/* not worth copying into dense arrays etc */
	/* determine which panel cols need to be updates */
	int jj,ii;
	for (jj=k; jj<ks; jj++) {
	  ii = pivots[jj];
	  if (ii == INT_MAX) {
	    taucs_printf("oocsp_numfact: internal error (supernode update)\n");
	    exit(1);
	  }
	  for(qp = rowlists_head[ii]; qp != -1; qp = rowlists_next[qp]) {
	    q = rowlists_colind[qp];
	    /*printf("supernode update: col %d pivotrow %d updates col %d\n",jj,ii,panel_id[q]);*/

#ifdef DETAILED_TIMING
	    time_tmp = taucs_wtime();
#endif
	    if (spawidth <= 0) {
	      taucs_printf("oocsp_numfact: internal error (supernode without a spa)\n");
	      exit(1);
	    } else {
	      spcol_spcol_update(ii,
				 lu_re+((jj-k)*maxcolcount),lu_ind+((jj-k)*maxcolcount),Lclen[jj],
				 q,panel_inrowlist[q],
#ifdef SPA_ONEARRAY
				 panel_spa+(q * nrows),
				 panel_spamap+(q * nrows),
#else
				 panel_spa[q],
				 panel_spamap[q],
#endif
				 panel_ind[q],&(panel_nnz[q]),
				 lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	    }
#ifdef DETAILED_TIMING
	    time_colcol   += (taucs_wtime() - time_tmp);
	    time_colcol_1 += (taucs_wtime() - time_tmp);
	    col_ooc_updates += 1.0;
#endif
	  }
	}
      }
    }

#endif /* else SNODES */

    
    /*taucs_printf("oocsp_numfact: Factoring and Writing (step %d)\n",step);*/

    while (en < ncols && 
            (schedend[ejectnext[en]]) == step) {
      p--;
      j = ejectnext[en];

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      
      if (panel_id[p] < 0)  taucs_printf("oocsp_numfact: internal error (panel stack)\n");
      if (panel_id[p] != j) {
	taucs_printf("oocsp_numfact: en=%d p=%d, j=%d, panel_id[p]=%d\n",
	       en,p,j,panel_id[p]);
	taucs_printf("oocsp_numfact: internal error (panel id's)\n");
      }

      /* gather */

      if (spawidth > 0) {
	gather(panel_nnz[p],panel_re[p],panel_ind[p],
#ifdef SPA_ONEARRAY
	       panel_spa + (p * nrows),
	       panel_spamap + (p * nrows));
#else
	       panel_spa[p],
	       panel_spamap[p]);
#endif
      }

      /* Find pivot */
      
      maxval = 0;
      maxind = -1;
      for (ip = 0; ip<panel_nnz[p]; ip++) {
	absval = (double) taucs_abs(panel_re[p][ip]);
	if (!lindices[panel_ind[p][ip]]) continue;
	if (absval > maxval) {
	  maxval = absval;
	  maxind = ip;
	}
      }
      
      if ( maxind == -1 ) {
	taucs_printf("oocsp_numfact: Zero Column!\n");
      }
      
      if ( taucs_iszero(panel_re[p][maxind]) ) {
	taucs_printf("oocsp_numfact: Zero Pivot!\n");
      }
      
      pivotindex = panel_ind[p][maxind];

      /*
      taucs_printf("oocsp_numfact: pivot for column %d is %d (%lg)\n",j,pivotindex,
	     panel_re[p][maxind]);
	     */

      if ( ipivots[ pivotindex ] != INT_MAX ) 
	taucs_printf("oocsp_numfact: Pivoting twice on the same row\n");
      
      ipivots[ pivotindex ] = j;
#ifdef SNODES
      pivots[ j ] = pivotindex;
#endif
      /*printf("### pivot row for column %d is %d\n",j,pivotindex);*/

      lindices[ pivotindex ] = 0;
      /*uindices[ pivotindex ] = 1;*/
      
      /* copy to L, U */
      
      /*taucs_printf("oocsp_numfact: copying to l,u\n");*/
      
      lnext = 0;
      /* unext = nrows-1; */
      unext = maxcolcount-1;
      for (ip = 0; ip<panel_nnz[p]; ip++) {
	i = panel_ind[p][ip];
	/*if (uindices[i]) {*/
	if (!lindices[i]) {
	  lu_re[unext]  = panel_re[p][ip];
	  lu_ind[unext] = i;
	  /*assert( unext > 0 && unext < maxsn*maxcolcount);*/
	  unext--;
	} else {
	  /*lu_re[lnext]  = (panel_re[p][ip])/(panel_re[p][maxind]);*/
	  lu_re[lnext]  = taucs_div(panel_re[p][ip] , panel_re[p][maxind]);
	  lu_ind[lnext] = i;
	  lnext++;
	}	  
      }

#ifdef DETAILED_TIMING
      flops += (double) lnext;
#endif /* DETAILED_TIMING */
      
#ifdef DETAILED_TIMING
      time_factor += (taucs_wtime() - time_tmp);
#endif

      /* Write out column of L, U */
      /*
      taucs_printf("oocsp_numfact: writing column %d of l,u (sizes %d %d)\n",
	     j,lnext,nrows - 1 - unext);
	     */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /*
      oocsp_appendcol(U,j,nrows - 1 - unext,
		      lu_ind + (unext+1),
		      lu_re  + (unext+1));
      */

      /* Sivan: replaced this code on 2 March 2002 */
      /*
      oocsp_appendcol(U,j,maxcolcount - 1 - unext,
		      lu_ind + (unext+1),
		      lu_re  + (unext+1));
      */
      assert(Uclen[j] == 0);
      Uclen[j] = maxcolcount - 1 - unext;
      Uappendcol(LU,j,maxcolcount - 1 - unext,
		 lu_ind + (unext+1),
		 lu_re  + (unext+1));
#ifdef DETAILED_TIMING
      time_append += (taucs_wtime() - time_tmp);
#endif

#ifdef DETAILED_TIMING
      bytes_appended += (double) ((maxcolcount - 1 - unext) 
				  * (sizeof(taucs_datatype)+sizeof(int)));
#endif /* DETAILED_TIMING */

#ifdef SNODES
      /* detect supernodes */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      
      snode_flag = 1; /* we assume so for now */

      /* a quick check using a hash function to quickly rule out columns */

      if (j>0 && panels[j] != panels[j-1]) snode_flag = 0;

      hash = pivotindex;
      for (ii=0; ii<snode_size; ii++)
	hash ^= snode_pivrows[ii];
      for (ii=0; ii<lnext; ii++)
	hash ^= lu_ind[ii];

      if (hash == snode_hash) {
	/* The hash is identical, but is it in the same supernode? */

	/* first, mark this column's structure in the snode_map    */
	/* then make sure all the supernode's nonzeros are marked  */

	/* for (ii=0; ii<nrows; ii++) assert( snode_map[ii] == -1 ); */

	                                snode_map[pivotindex       ] = 1;
	for (ii=0; ii<lnext; ii++)      snode_map[lu_ind[ii]       ] = 1;
	for (ii=0; ii<snode_size; ii++) snode_map[snode_pivrows[ii]] = 1;
	for (ii=0; ii<snode_nnz; ii++)
	  if (snode_map[snode_ind[ii]] != 1) snode_flag = 0;

	/* next, zero this column's structure in the snode_map     */
	/* then make sure all the supernode's nonzeros are marked  */


    	                                snode_map[pivotindex]        = -1;
	for (ii=0; ii<lnext; ii++)	snode_map[lu_ind[ii]       ] = -1;
	for (ii=0; ii<snode_size; ii++) snode_map[snode_pivrows[ii]] = -1;
	
	/* mark this supernodes nonzeros in the map                */
	
	for (ii=0; ii<snode_nnz; ii++) snode_map[snode_ind[ii]] = 1;

	/* check that all the column's nonzeros are in the snode   */

	for (ii=0; ii<lnext; ii++) if (snode_map[lu_ind[ii]] != 1) snode_flag = 0;

	/* zero the bitmap for next time                           */

	for (ii=0; ii<snode_nnz; ii++) snode_map[snode_ind[ii]] = -1;

	/* for (ii=0; ii<nrows; ii++) assert( snode_map[ii] == -1 ); */

      } else
	snode_flag = 0;

      if (snode_size >= maxsn) snode_flag = 0;

      if (j != snode_lastcol+1) snode_flag = 0;

      if (snode_flag) {

	/* THIS COLUMN BELONGS TO AN EXISTING SUPERNODE */
	
	snode_pivrows[snode_size] = pivotindex;
	snode_size++;

	for (ii=0; ii<snode_size; ii++)
	  spa[snode_pivrows[ii]] = taucs_zero;
	for (ii=0; ii<lnext; ii++)
	  spa[lu_ind[ii]] = lu_re[ii];

	for (ii=0; ii<snode_nnz; ii++)
	  snode_re[ii] = spa[snode_ind[ii]];

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
	time_tmp = taucs_wtime();
#endif

	/* Sivan: replaced 2 March 2002 */
	/* oocsp_appendcol(L,j,snode_nnz,snode_ind,snode_re);*/
	assert(Lclen[j] == 0);
	Lclen[j] = snode_nnz;
	Lappendcol(LU,j,snode_nnz,snode_ind,snode_re);

#ifdef DETAILED_TIMING
	time_append += (taucs_wtime() - time_tmp);
	bytes_appended += (double) (snode_nnz
				    * (sizeof(taucs_datatype)+sizeof(int)));
#endif

	/*taucs_printf("oocsp_numfact: supernode size = %d (column %d)\n",snode_size,j);*/
      } else {
	/* THIS COLUMN BELONGS TO A NEW SUPERNODE */

	snode_id ++;

	/*taucs_printf("oocsp_numfact: new supernode, column %d row %d\n",j,pivotindex);*/

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
	time_tmp = taucs_wtime();
#endif
      
	/*oocsp_appendcol(L,j,lnext,lu_ind,lu_re);*/
	assert(Lclen[j] == 0);
	Lclen[j] = lnext;
	Lappendcol(LU,j,lnext,lu_ind,lu_re);

#ifdef DETAILED_TIMING
	time_append += (taucs_wtime() - time_tmp);
	bytes_appended += (double) (lnext 
				    * (sizeof(taucs_datatype)+sizeof(int)));
	time_tmp = taucs_wtime();
#endif

	snode_hash = pivotindex;
	for (ii=0; ii<lnext; ii++) {
	  snode_hash ^= lu_ind[ii];
	  snode_ind[ii] = lu_ind[ii];
	}
	snode_pivrows[0] = pivotindex;
	snode_nnz = lnext;
	snode_size = 1;

#ifdef DETAILED_TIMING
	time_snode_detect += (taucs_wtime() - time_tmp);
#endif
      }

      snode_index[j] = snode_id;

      snode_lastcol = j;

#else /* SNODES */

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      /*oocsp_appendcol(L,j,lnext,lu_ind,lu_re);*/
      assert(Lclen[j] == 0);
      Lclen[j] = lnext;
      Lappendcol(LU,j,lnext,lu_ind,lu_re);
#ifdef DETAILED_TIMING
      time_append += (taucs_wtime() - time_tmp);
      bytes_appended += (double) (lnext 
				  * (sizeof(taucs_datatype)+sizeof(int)));
#endif /* DETAILED_TIMING */

#endif /* SNODES */
     
      /* Update rest of panel */

      /*taucs_printf("oocsp_numfact: updating\n");*/

#ifdef DETAILED_TIMING
      time_tmp = taucs_wtime();
#endif
      if (spawidth <= 0) {
	for(qp = rowlists_head[pivotindex]; qp != -1; qp = rowlists_next[qp]) {
	  q = rowlists_colind[qp];
	  scatter(panel_nnz[q],panel_re[q],panel_ind[q],
		  spa,spamap);
	  spcol_spcol_update(pivotindex,
			     lu_re,lu_ind,lnext,
			     q,panel_inrowlist[q],
			     spa,
			     spamap,
			     panel_ind[q],&(panel_nnz[q]),
			     lcolcount[panel_id[q]]+ucolcount[panel_id[q]]);
	  gather(panel_nnz[q],panel_re[q],panel_ind[q],
		 spa,spamap);
	}
      } else {
	int spa_n = 0;
	for(qp = rowlists_head[pivotindex]; qp != -1; qp = rowlists_next[qp]) {
	  q = rowlists_colind[qp];
	  spa_updcols[spa_n] = q;
	  spa_n++;
	}
	/*
	for(qp = 0; qp < spa_n; qp++) {
	  q = spa_updcols[qp];
	  spcol_spcol_update(pivotindex,
			     lu_re,lu_ind,lnext,
			     q,
			     panel_inrowlist[q],
			     panel_spa    + (q*nrows),
			     panel_spamap + (q*nrows),
			     panel_ind[q],&(panel_nnz[q]),
			     0);
	}
	*/

	spcol_spa_update(pivotindex,
			 lu_re,lu_ind,lnext,
			 spa_updcols,spa_n,nrows,
			 panel_inrowlist,
			 panel_spa,
			 panel_spamap,
			 panel_ind,panel_nnz);
      }
#ifdef DETAILED_TIMING
      time_colcol   += (taucs_wtime() - time_tmp);
      time_colcol_2 += (taucs_wtime() - time_tmp);
#endif

      /*taucs_printf("oocsp_numfact: done updating\n");*/

      /* Eject from panel */

      for (ip = 0; ip<panel_nnz[p]; ip++) {
	i = panel_ind[p][ip];
	rowlists_delete(i,panel_inrowlist[p][ip]);
      }

      taucs_free(panel_inrowlist[p]); panel_inrowlist[p] = NULL;
      taucs_free(panel_ind[p]); panel_ind[p] = NULL;
      taucs_free(panel_re[p]);  panel_re[p]  = NULL;
      if (spawidth > 0) {
#ifndef SPA_ONEARRAY
      taucs_free(panel_spa[p]); panel_spa[p] = NULL;
      taucs_free(panel_spamap[p]);  panel_spamap[p]  = NULL;
#endif
      }
      panel_id[p] = -1;

      /*taucs_printf("oocsp_numfact: done with col\n");*/

      en++;
    }    
  }

#ifdef SNODESHIST
#define HIST_SIZE 32
#define HIST_INC  4
  {
    int i,j,k,s,last;
    int hist[HIST_SIZE];
    for (i=0; i<HIST_SIZE; i++) hist[i] = 0;

    last = 0;
    s = 0;
    for (j=0; j<ncols; j++) {
      if (snode_index[j] == last) {
	s++;
      } else {
	i = s / HIST_INC;
	if (i > HIST_SIZE-1) i=HIST_SIZE-1;
	(hist[i])++;
	/*printf("last snode index %d size %d hist %d\n",last,s,i);*/
	last = snode_index[j];
	s = 1;
      }
    }
    /*printf("last snode index %d size %d\n",last,s);*/
    i = s / HIST_INC;
    if (i > HIST_SIZE-1) i=HIST_SIZE-1;
    (hist[i])++;
    taucs_printf("oocsp_numfact: snode histogram:\n");
    for (i=0; i<HIST_SIZE-1; i++)
      taucs_printf("oocsp_numfact:   %02d-%02d: %d snodes\n",i*HIST_INC+1,(i+1)*HIST_INC,hist[i]);
    taucs_printf("oocsp_numfact:   %02d&up: %d snodes:\n",
	       (HIST_SIZE-1)*HIST_INC+1,hist[HIST_SIZE-1]);
  }
#endif

  taucs_io_append(LU, HEADER_NROWS  , 1,      1, TAUCS_INT, &(A->m) );
  taucs_io_append(LU, HEADER_NCOLS  , 1,      1, TAUCS_INT, &(A->n) );
  taucs_io_append(LU, HEADER_FLAGS  , 1,      1, TAUCS_INT, &(A->flags));
  taucs_io_append(LU, HEADER_COLPERM, (A->n), 1, TAUCS_INT, colperm    );
  taucs_io_append(LU, HEADER_IPIVOTS, (A->n), 1, TAUCS_INT, ipivots    );
  taucs_io_append(LU, HEADER_LCLEN  , (A->n), 1, TAUCS_INT, Lclen      );
  taucs_io_append(LU, HEADER_UCLEN  , (A->n), 1, TAUCS_INT, Uclen      );

  (remaining_memory) += (double) ( 2 * (ncols+1) * sizeof(int));

  {
    double nnzL = 0;
    double nnzU = 0;
    for (j=0; j < (A->n); j++) nnzL += (double) (Lclen[j]);
    for (j=0; j < (A->n); j++) nnzU += (double) (Uclen[j]);
    taucs_printf("oocsp_numfact: nnz(L) = %.2le nnz(U) = %.2le nnz(L+U) = %.2leM\n",
	       nnzL,nnzU,(nnzL+nnzU)/1e6);
  }
  
  if (spawidth > 0) {
    taucs_free(panel_spa);
    taucs_free(panel_spamap);
  }

  spa_finalize();

  rowlists_finalize();

  taucs_free(ipivots);

  taucs_free(Uclen);
  taucs_free(Lclen);

  taucs_free(panel_re);
  taucs_free(panel_ind);
  taucs_free(panel_inrowlist);
  taucs_free(panel_nnz);
  taucs_free(panel_id);

/*
  taucs_free(update_vec);
  taucs_free(update_tmp);
*/

#ifdef SNODES
  taucs_free(snode_index);
  taucs_free(snode_pivrows);
  taucs_free(snode_ind);
  taucs_free(snode_re);
  taucs_free(snode_map);

  taucs_free(pivots);

  taucs_free(spa_updcols);
  taucs_free(P);
  taucs_free(S);
  taucs_free(srows);
#endif

  taucs_free(heap);
  taucs_free(nnzmap);
  taucs_free(lindices);
  /*taucs_free(uindices);*/
  taucs_free(lu_re);
  taucs_free(lu_ind);

  /*taucs_free(pivots);
  taucs_free(ipivots);
  */

  /* OLD */

  time_total = taucs_wtime() - time_total;
  taucs_printf("oocsp_numfact: %lg sec total\n",time_total);

#ifdef DETAILED_TIMING
  taucs_printf("oocsp_numfact: %lg extra flops, %2.0lf %\n",flops_extra,100.0*flops_extra/flops);
  taucs_printf("oocsp_numfact: %lg dense flops, %2.0lf %\n",flops_dense,100.0*flops_dense/flops);
  taucs_printf("oocsp_numfact: %lg Mflop dense/s\n",(flops_dense*1e-6)/(time_snode_dense));
 
  taucs_printf("oocsp_numfact: %lg flops\n",flops);
  taucs_printf("oocsp_numfact: %lg scatter ops\n",scatters);
  taucs_printf("oocsp_numfact: %lg gather  ops\n",gathers);
  taucs_printf("oocsp_numfact: %lg heap    ops\n",num_heap_ops);
  taucs_printf("oocsp_numfact: %lg rowlist ops\n",rowlist_ops);
  taucs_printf("oocsp_numfact: %lg sec col/col ops\n",time_colcol);
  taucs_printf("oocsp_numfact: %lg sec col/col ops (%lg+%lg=%lg)\n",time_colcol,
	     time_colcol_1,time_colcol_2,(time_colcol_1+time_colcol_2));
  taucs_printf("oocsp_numfact: %lg sec column factor ops\n",time_factor);
  taucs_printf("oocsp_numfact: %lg sec scatter ops\n",time_scatter);
  taucs_printf("oocsp_numfact: %lg sec gather  ops\n",time_gather);
  taucs_printf("oocsp_numfact: %lg sec io read\n",time_read);
  taucs_printf("oocsp_numfact: %lg sec io write\n",time_append);
#ifdef SNODES
  taucs_printf("oocsp_numfact: %lg sec snode preparations for dense ops\n",time_snode_prepare);
  taucs_printf("oocsp_numfact: %lg sec snode dense ops\n",time_snode_dense);
  taucs_printf("oocsp_numfact: %lg sec snode detection\n",time_snode_detect);
  taucs_printf("oocsp_numfact: %lg sec snode 1\n",time_snode_1);
  taucs_printf("oocsp_numfact: %lg sec snode 2 (%lg)\n",time_snode_2,
		                                      time_snode_21);
  taucs_printf("oocsp_numfact: %lg sec snode 3\n",time_snode_3);
  taucs_printf("oocsp_numfact: %lg sec snode 4\n",time_snode_4);
#endif
  taucs_printf("oocsp_numfact: \n");
  taucs_printf("oocsp_numfact: %lg Mflop/s\n",(flops*1e-6)/(time_total));
  taucs_printf("oocsp_numfact: %lg MB/s IO read\n",(bytes_read*1e-6)/(time_read));
  taucs_printf("oocsp_numfact: %lg MB/s IO write\n",(bytes_appended*1e-6)/(time_append));
  taucs_printf("oocsp_numfact: %lg col reuse\n",col_ooc_updates/col_read);
  taucs_printf("oocsp_numfact: %lg percent IO\n",(time_read+time_append)/time_total);
  taucs_printf("oocsp_numfact: %lg percent col/col\n",(time_colcol)/time_total);
#endif


#ifndef SIMPLE_COL_COL
  {
    extern int oocsp_spcol_n1,oocsp_spcol_n2;
    taucs_printf("oocsp_numfact: spcol counts %d %d\n",oocsp_spcol_n1,oocsp_spcol_n2);
  }
#endif
}


/*********************************************************/
/* FACTOR                                                */
/*********************************************************/

static 
int
oocsp_factor(taucs_ccs_matrix* A_in,
	     taucs_io_handle* LU,
             int*  colperm)
{
  /*int i,j; omer*/
  
  int* etree=NULL;
  int* postorder=NULL;
  int* l_colcounts=NULL;
  int* u_colcounts=NULL;
  int* panels=NULL;
  int* schedstart=NULL;
  int* schedend=NULL;
  int* fetchnext=NULL;
  int* ejectnext=NULL;
  int  spawidth;
  int  maxsn;

  taucs_printf("taucs_ooc_lu: starting\n");

  taucs_printf("taucs_ooc_lu: calling colanalyze\n");
  oocsp_colanalyze(A_in,
		taucs_io_get_basename(LU),
		colperm,&etree,&postorder,&l_colcounts,&u_colcounts);
  
  taucs_printf("taucs_ooc_lu: calling panelize\n");
  oocsp_panelize_simple(A_in->m,A_in->n,
		        postorder,
			l_colcounts,u_colcounts,etree,
			&spawidth,&maxsn,
			&panels,&schedstart,&schedend,&fetchnext,&ejectnext);

  taucs_printf("taucs_ooc_lu: calling numfact\n");
  oocsp_numfact(A_in,colperm,
		/*L,U,*/
		LU,
		panels,
		schedstart,
		schedend,
		fetchnext,
		ejectnext,
		l_colcounts,
		u_colcounts,
		spawidth,maxsn);

  taucs_printf("taucs_ooc_lu: done, returning\n");

  return 0;
}

void
taucs_dtl(ooc_factor_lu)(taucs_ccs_matrix* A_in,
		         int    colperm[],
                         taucs_io_handle* LU,
		         double memory)
{
  remaining_memory = memory;
  taucs_printf("taucs_ooc_factor_lu: using %.0lf MBytes of in-core memory\n",
	     (remaining_memory)/1048576.0);
  oocsp_factor(A_in,LU,colperm);
}

/*********************************************************/
/* SOLVE                                                 */
/*********************************************************/

static int
oocsp_solve(taucs_io_handle* LU,
	    taucs_datatype* X,
	    taucs_datatype* B)
{
  int i,ip,j,n;
  taucs_datatype* Y;
  taucs_datatype  Aij;
  taucs_datatype* values;
  int*    indices;
  int*    irowperm;
  int*    Lclen;
  int*    Uclen;
  int*    colperm;
  int*    ipivots;

  int     found;
  
  double  time_solve = taucs_wtime();
  double  bytes_read = 0;

  taucs_printf("oocsp_solve: starting\n");

  /* n = L->nrows; */
  taucs_io_read(LU, HEADER_NROWS, 1, 1, TAUCS_INT, &n);

  Y       = (taucs_datatype*) taucs_malloc(n * sizeof(taucs_datatype));
  values  = (taucs_datatype*) taucs_malloc(n * sizeof(taucs_datatype));
  indices = (int*)    taucs_malloc(n * sizeof(int));
  irowperm= (int*)    taucs_malloc(n * sizeof(int));

  Lclen   = (int*)    taucs_malloc(n * sizeof(int));
  Uclen   = (int*)    taucs_malloc(n * sizeof(int));

  colperm = (int*)    taucs_malloc(n * sizeof(int));
  ipivots = (int*)    taucs_malloc(n * sizeof(int));
  assert(Y && values && indices && irowperm && Lclen && Uclen && colperm && ipivots);

  taucs_io_read(LU, HEADER_LCLEN, n, 1, TAUCS_INT, Lclen);
  taucs_io_read(LU, HEADER_UCLEN, n, 1, TAUCS_INT, Uclen);

  taucs_io_read(LU, HEADER_COLPERM, n, 1, TAUCS_INT, colperm);
  taucs_io_read(LU, HEADER_IPIVOTS, n, 1, TAUCS_INT, ipivots);

  for(i=0; i<n; i++)
     irowperm[ipivots[i]]=i;

  /*
  taucs_printf("colperm = [");
  for(i=0; i<n; i++)
    taucs_printf("%d, ",colperm[i]);
  taucs_printf("\b\b]\n");

  taucs_printf("ipivots = [");
  for(i=0; i<n; i++)
    taucs_printf("%d, ",ipivots[i]);
  taucs_printf("\b\b]\n");
  */
  

  /* start by permuting B, X=PB */

  /*
  for (i=0; i<n; i++)
    PB[i] = B[ ipivots[i] ];
  */

  for (i=0; i<n; i++)
    Y[i] = B[i];

  for (j=0; j<n; j++) {
    /*oocsp_readcol(L,j,indices,values);*/
    Lreadcol(LU,j,Lclen[j],indices,values);
    bytes_read += Lclen[j] * (sizeof(int) + sizeof(taucs_datatype));
    for (ip=0; ip < Lclen[j]; ip++) {
      i = indices[ip];
      Aij = values[ip];
      /*Y[i] = Y[i] - Aij*Y[irowperm[j]];*/
      Y[i] = taucs_sub(Y[i] , taucs_mul( Aij , Y[irowperm[j]] ));
    }
  }

  for (i=0; i<n; i++) X[i] = Y[i];

  for (j=n-1; j>=0; j--) {
    /*oocsp_readcol(U,j,indices,values);*/
    Ureadcol(LU,j,Uclen[j],indices,values);
    bytes_read += Uclen[j] * (sizeof(int) + sizeof(taucs_datatype));

    found = 0;
    for (ip=0; ip < Uclen[j]; ip++) {
      i = indices[ip];
      if (i == irowperm[j]) {
	found = 1;
	Aij = values[ip];
	/*X[i] = X[i] / Aij;*/
	X[i] = taucs_div( X[i] , Aij );
	values[ip] = taucs_zero; /* so we don't multiply in the daxpy below */
      }
    }
    assert( found );

    for (ip=0; ip < Uclen[j]; ip++) {
      i = indices[ip];
      Aij = values[ip];
      /*X[i] = X[i] - Aij*X[irowperm[j]];*/
      X[i] = taucs_sub( X[i] , taucs_mul( Aij , X[irowperm[j]] ));
    }
  }


  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ ipivots[i] ] = Y[ i ];


  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ colperm[i] ] = Y[ i ];

  /*
    X[ colperm[i] ] = Y[ i ];

  for (i=0; i<n; i++) Y[i] = X[i];
  for (i=0; i<n; i++)
    X[ ipivots[i] ] = Y[ i ];

  */

  /*
  for (i=0; i<n; i++) if (colperm[i] == 0) j=i;
  printf("rowperm[0]=%d irowperm[0]=%d colperm[0]=%d icp[0]=%d\n",ipivots[0], irowperm[0], colperm[0],j);
  for (i=0; i<n; i++) if (colperm[i] == 1) j=i;
  printf("rowperm[1]=%d irowperm[1]=%d colperm[1]=%d icp[1]=%d\n",ipivots[1], irowperm[1], colperm[1],j);
  */

  taucs_free(Y);
  taucs_free(values);
  taucs_free(indices);
  taucs_free(irowperm);
  taucs_free(Uclen);
  taucs_free(Lclen);
  taucs_free(ipivots);
  taucs_free(colperm);

  time_solve = (taucs_wtime() - time_solve);
  taucs_printf("oocsp_solve: done in %.0lf seconds, read %.0lf bytes (%.0lf MBytes)\n",
	     time_solve,bytes_read,bytes_read/1048576.0);

  return 0; /* success */
  
} /* main */

int taucs_dtl(ooc_solve_lu)(taucs_io_handle*   LU,
			    taucs_datatype* x, taucs_datatype* b)
{
  /*oocsp_solve(LU,x,b); omer*/
	return oocsp_solve(LU,x,b);
}

#endif /*#ifndef TAUCS_CORE_GENERAL*/

/*********************************************************/
/* USER CALLABLE ROUTINES                                */
/*********************************************************/

#ifdef TAUCS_CORE_GENERAL

int taucs_ooc_factor_lu(taucs_ccs_matrix* A,
			int*              colperm,
                        taucs_io_handle*  LU,                       
			double            memory)
{
#ifdef TAUCS_CONFIG_DREAL
  if (A->flags & TAUCS_DOUBLE) {
    taucs_dooc_factor_lu(A,colperm,LU,memory);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_DCOMPLEX
  if (A->flags & TAUCS_DCOMPLEX) {
    taucs_zooc_factor_lu(A,colperm,LU,memory);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_SREAL
  if (A->flags & TAUCS_SINGLE) {
    taucs_sooc_factor_lu(A,colperm,LU,memory);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_SCOMPLEX
  if (A->flags & TAUCS_SCOMPLEX) {
    taucs_cooc_factor_lu(A,colperm,LU,memory);
    return 0;
  }
#endif

  assert(0);
  return -1;
}

int taucs_ooc_solve_lu (taucs_io_handle* LU,
			void* x, void* b)
{
  int flags;

  taucs_io_read(LU, HEADER_FLAGS, 1, 1, TAUCS_INT, &flags);

  printf("taucs_ooc_solve_lu: starting, DZSC=%d%d%d%d\n",
	 (flags & TAUCS_DOUBLE  ) != 0,
	 (flags & TAUCS_DCOMPLEX) != 0,
	 (flags & TAUCS_SINGLE  ) != 0,
	 (flags & TAUCS_SCOMPLEX) != 0);
  
#ifdef TAUCS_CONFIG_DREAL
  if (flags & TAUCS_DOUBLE) {
    taucs_dooc_solve_lu(LU,x,b);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_DCOMPLEX
  if (flags & TAUCS_DCOMPLEX) {
    taucs_zooc_solve_lu(LU,x,b);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_SREAL
  if (flags & TAUCS_SINGLE) {
    taucs_sooc_solve_lu(LU,x,b);
    return 0;
  }
#endif

#ifdef TAUCS_CONFIG_SCOMPLEX
  if (flags & TAUCS_SCOMPLEX) {
    taucs_cooc_solve_lu(LU,x,b);
    return 0;
  }
#endif

  assert(0);
  return -1;
}

#endif /*#ifdef TAUCS_CORE_GENERAL*/

/*********************************************************/
/* END OF FILE                                           */
/*********************************************************/




syntax highlighted by Code2HTML, v. 0.9.1