/* ========================================================================== */
/* === Modify/t_cholmod_updown_numkr ======================================== */
/* ========================================================================== */

/* -----------------------------------------------------------------------------
 * CHOLMOD/Modify Module.  Copyright (C) 2005-2006,
 * Timothy A. Davis and William W. Hager.
 * The CHOLMOD/Modify Module is licensed under Version 2.0 of the GNU
 * General Public License.  See gpl.txt for a text of the license.
 * CHOLMOD is also available under other licenses; contact authors for details.
 * http://www.cise.ufl.edu/research/sparse
 * -------------------------------------------------------------------------- */

/* Supernodal numerical update/downdate of rank K = RANK, along a single path.
 * This routine operates on a simplicial factor, but operates on adjacent
 * columns of L that would fit within a single supernode.  "Adjacent" means
 * along a single path in the elimination tree; they may or may not be
 * adjacent in the matrix L.
 *
 * external defines:  NUMERIC, WDIM, RANK.
 *
 * WDIM is 1, 2, 4, or 8.  RANK can be 1 to WDIM.
 *
 * A simple method is included (#define SIMPLE).  The code works, but is slow.
 * It is meant only to illustrate what this routine is doing.
 *
 * A rank-K update proceeds along a single path, using single-column, dual-
 * column, or quad-column updates of L.  If a column j and the next column
 * in the path (its parent) do not have the same nonzero pattern, a single-
 * column update is used.  If they do, but the 3rd and 4th column from j do
 * not have the same pattern, a dual-column update is used, in which the two
 * columns are treated as if they were a single supernode of two columns.  If
 * there are 4 columns in the path that all have the same nonzero pattern, then
 * a quad-column update is used.  All three kinds of updates can be used along
 * a single path, in a single call to this function.
 *
 * Single-column update:
 *
 *	When updating a single column of L, each iteration of the for loop,
 *	below, processes four rows of W (all columns involved) and one column
 *	of L.  Suppose we have a rank-5 update, and columns 2 through 6 of W
 *	are involved.  In this case, W in this routine is a pointer to column
 *	2 of the matrix W in the caller.  W (in the caller, shown as 'W') is
 *	held in row-major order, and is 8-by-n (a dense matrix storage format),
 *	but shown below in column form to match the column of L.  Suppose there
 *	are 13 nonzero entries in column 27 of L, with row indices 27 (the
 *	diagonal, D), 28, 30, 31, 42, 43, 44, 50, 51, 67, 81, 83, and 84.  This
 *	pattern is held in Li [Lp [27] ... Lp [27 + Lnz [27] - 1], where
 *	Lnz [27] = 13.   The modification of the current column j of L is done
 *	in the following order.  A dot (.) means the entry of W is not accessed.
 *
 *	W0 points to row 27 of W, and G is a 1-by-8 temporary vector.
 *
 *		     G[0]        G[4]
 *	G	      x  x  x  x  x  .  .  .
 *
 *		      W0
 *		      |
 *		      v
 *	27      .  .  x  x  x  x  x  .   W0 points to W (27,2)
 *
 *
 *	row    'W'    W                      column j = 27
 *	|       |     |                      of L
 *	v       v     v                      |
 *	first iteration of for loop:         v
 *
 *	28      .  .  1  5  9 13 17  .       x
 *	30      .  .  2  6 10 14 18  .       x
 *	31      .  .  3  7 11 15 19  .       x
 *	42      .  .  4  8 12 16 20  .       x
 *
 *	second iteration of for loop:
 *
 *	43      .  .  1  5  9 13 17  .       x
 *	44      .  .  2  6 10 14 18  .       x
 *	50      .  .  3  7 11 15 19  .       x
 *	51      .  .  4  8 12 16 20  .       x
 *
 *	third iteration of for loop:
 *
 *	67      .  .  1  5  9 13 17  .       x
 *	81      .  .  2  6 10 14 18  .       x
 *	83      .  .  3  7 11 15 19  .       x
 *	84      .  .  4  8 12 16 20  .       x
 *
 *	If the number of offdiagonal nonzeros in column j of L is not divisible
 *	by 4, then the switch-statement does the work for the first nz % 4 rows.
 *
 * Dual-column update:
 *
 *	In this case, two columns of L that are adjacent in the path are being
 *	updated, by 1 to 8 columns of W.  Suppose columns j=27 and j=28 are
 *	adjacent columns in the path (they need not be j and j+1).  Two rows
 *	of G and W are used as coefficients during the update: (G0, G1) and
 *	(W0, W1).
 *
 *	G0	      x  x  x  x  x  .  .  .
 *	G1	      x  x  x  x  x  .  .  .
 *
 *	27      .  .  x  x  x  x  x  .   W0 points to W (27,2)
 *	28      .  .  x  x  x  x  x  .   W1 points to W (28,2)
 *
 *
 *	row    'W'    W0,W1                  column j = 27
 *	|       |     |                      of L
 *	v       v     v                      |
 *					     | |-- column j = 28 of L
 *					     v v
 *	update L (j1,j):
 *
 *	28      .  .  1  2  3  4  5  .       x -    ("-" is not stored in L)
 *
 *	cleanup iteration since length is odd:
 *
 *	30      .  .  1  2  3  4  5  .       x x
 *
 *	then each iteration does two rows of both columns of L:
 *
 *	31      .  .  1  3  5  7  9  .       x x
 *	42      .  .  2  4  6  8 10  .       x x
 *
 *	43      .  .  1  3  5  7  9  .       x x
 *	44      .  .  2  4  6  8 10  .       x x
 *
 *	50      .  .  1  3  5  7  9  .       x x
 *	51      .  .  2  4  6  8 10  .       x x
 *
 *	67      .  .  1  3  5  7  9  .       x x
 *	81      .  .  2  4  6  8 10  .       x x
 *
 *	83      .  .  1  3  5  7  9  .       x x
 *	84      .  .  2  4  6  8 10  .       x x
 *
 *	If the number of offdiagonal nonzeros in column j of L is not even,
 *	then the cleanup iteration does the work for the first row.
 *
 * Quad-column update:
 *
 *	In this case, four columns of L that are adjacent in the path are being
 *	updated, by 1 to 8 columns of W.  Suppose columns j=27, 28, 30, and 31
 *	are adjacent columns in the path (they need not be j, j+1, ...).  Four
 *	rows of G and W are used as coefficients during the update: (G0 through
 *	G3) and (W0 through W3).  j=27, j1=28, j2=30, and j3=31.
 *
 *	G0	      x  x  x  x  x  .  .  .
 *	G1	      x  x  x  x  x  .  .  .
 *	G3	      x  x  x  x  x  .  .  .
 *	G4	      x  x  x  x  x  .  .  .
 *
 *	27      .  .  x  x  x  x  x  .   W0 points to W (27,2)
 *	28      .  .  x  x  x  x  x  .   W1 points to W (28,2)
 *	30      .  .  x  x  x  x  x  .   W2 points to W (30,2)
 *	31      .  .  x  x  x  x  x  .   W3 points to W (31,2)
 *
 *
 *	row    'W'    W0,W1,..               column j = 27
 *	|       |     |                      of L
 *	v       v     v                      |
 *					     | |-- column j = 28 of L
 *					     | | |-- column j = 30 of L
 *					     | | | |-- column j =  31 of L
 *					     v v v v
 *	update L (j1,j):
 *	28      .  .  1  2  3  4  5  .       x - - -
 *
 *	update L (j2,j):
 *	30      .  .  1  2  3  4  5  .       # x - -	 (# denotes modified)
 *
 *	update L (j2,j1)
 *	30      .  .  1  2  3  4  5  .       x # - -
 *
 *	update L (j3,j)
 *	31      .  .  1  2  3  4  5  .       # x x -
 *
 *	update L (j3,j1)
 *	31      .  .  1  2  3  4  5  .       x # x -
 *
 *	update L (j3,j2)
 *	31      .  .  1  2  3  4  5  .       x x # -
 *
 *	cleanup iteration since length is odd:
 *	42      .  .  1  2  3  4  5  .       x x x x
 *
 *
 * ----- CHOLMOD v1.1.1 did the following --------------------------------------
 *	then each iteration does two rows of all four colummns of L:
 *
 *	43      .  .  1  3  5  7  9  .       x x x x
 *	44      .  .  2  4  6  8 10  .       x x x x
 *
 *	50      .  .  1  3  5  7  9  .       x x x x
 *	51      .  .  2  4  6  8 10  .       x x x x
 *
 *	67      .  .  1  3  5  7  9  .       x x x x
 *	81      .  .  2  4  6  8 10  .       x x x x
 *
 *	83      .  .  1  3  5  7  9  .       x x x x
 *	84      .  .  2  4  6  8 10  .       x x x x
 *
 * ----- CHOLMOD v1.2.0 does the following -------------------------------------
 *	then each iteration does one rows of all four colummns of L:
 *
 *	43      .  .  1  2  3  4  5  .       x x x x
 *	44      .  .  1  2  3  4  5  .       x x x x
 *	50      .  .  1  3  5  4  5  .       x x x x
 *	51      .  .  1  2  3  4  5  .       x x x x
 *	67      .  .  1  3  5  4  5  .       x x x x
 *	81      .  .  1  2  3  4  5  .       x x x x
 *	83      .  .  1  3  5  4  5  .       x x x x
 *	84      .  .  1  2  3  4  5  .       x x x x
 *
 * This file is included in t_cholmod_updown.c, only.
 * It is not compiled separately.  It contains no user-callable routines.
 *
 * workspace: Xwork (WDIM*nrow)
 */

/* ========================================================================== */
/* === loop unrolling macros ================================================ */
/* ========================================================================== */

#undef RANK1
#undef RANK2
#undef RANK3
#undef RANK4
#undef RANK5
#undef RANK6
#undef RANK7
#undef RANK8

#define RANK1(statement) statement

#if RANK < 2
#define RANK2(statement)
#else
#define RANK2(statement) statement
#endif

#if RANK < 3
#define RANK3(statement)
#else
#define RANK3(statement) statement
#endif

#if RANK < 4
#define RANK4(statement)
#else
#define RANK4(statement) statement
#endif

#if RANK < 5
#define RANK5(statement)
#else
#define RANK5(statement) statement
#endif

#if RANK < 6
#define RANK6(statement)
#else
#define RANK6(statement) statement
#endif

#if RANK < 7
#define RANK7(statement)
#else
#define RANK7(statement) statement
#endif

#if RANK < 8
#define RANK8(statement)
#else
#define RANK8(statement) statement
#endif

#define FOR_ALL_K \
    RANK1 (DO (0)) \
    RANK2 (DO (1)) \
    RANK3 (DO (2)) \
    RANK4 (DO (3)) \
    RANK5 (DO (4)) \
    RANK6 (DO (5)) \
    RANK7 (DO (6)) \
    RANK8 (DO (7))

/* ========================================================================== */
/* === alpha/gamma ========================================================== */
/* ========================================================================== */

#undef ALPHA_GAMMA

#define ALPHA_GAMMA(Dj,Alpha,Gamma,W) \
{ \
    double dj = Dj ; \
    if (update) \
    { \
	for (k = 0 ; k < RANK ; k++) \
	{ \
	    double w = W [k] ; \
	    double alpha = Alpha [k] ; \
	    double a = alpha + (w * w) / dj ; \
	    dj *= a ; \
	    Alpha [k] = a ; \
	    Gamma [k] = (- w / dj) ; \
	    dj /= alpha ; \
	} \
    } \
    else \
    { \
	for (k = 0 ; k < RANK ; k++) \
	{ \
	    double w = W [k] ; \
	    double alpha = Alpha [k] ; \
	    double a = alpha - (w * w) / dj ; \
	    dj *= a ; \
	    Alpha [k] = a ; \
	    Gamma [k] = w / dj ; \
	    dj /= alpha ; \
	} \
    } \
    Dj = ((use_dbound) ? (CHOLMOD(dbound) (dj, Common)) : (dj)) ; \
}

/* ========================================================================== */
/* === numeric update/downdate along one path =============================== */
/* ========================================================================== */

static void NUMERIC (WDIM, RANK)
(
    int update,		/* TRUE for update, FALSE for downdate */
    Int j,		/* first column in the path */
    Int e,		/* last column in the path */
    double Alpha [ ],	/* alpha, for each column of W */
    double W [ ],	/* W is an n-by-WDIM array, stored in row-major order */
    cholmod_factor *L,	/* with unit diagonal (diagonal not stored) */
    cholmod_common *Common
)
{

#ifdef SIMPLE
#define w(row,col) W [WDIM*(row) + (col)]

    /* ---------------------------------------------------------------------- */
    /* concise but slow version for illustration only */
    /* ---------------------------------------------------------------------- */

    double Gamma [WDIM] ;
    double *Lx ;
    Int *Li, *Lp, *Lnz ;
    Int p, k ;
    Int use_dbound = IS_GT_ZERO (Common->dbound) ;

    Li = L->i ;
    Lx = L->x ;
    Lp = L->p ;
    Lnz = L->nz ;

    /* walk up the etree from node j to its ancestor e */
    for ( ; j <= e ; j = (Lnz [j] > 1) ? (Li [Lp [j] + 1]) : Int_max)
    {
	/* update the diagonal entry D (j,j) with each column of W */
	ALPHA_GAMMA (Lx [Lp [j]], Alpha, Gamma, (&(w (j,0)))) ;
	/* update column j of L */
	for (p = Lp [j] + 1 ; p < Lp [j] + Lnz [j] ; p++)
	{
	    /* update row Li [p] of column j of L with each column of W */
	    Int i = Li [p] ;
	    for (k = 0 ; k < RANK ; k++)
	    {
		w (i,k) -= w (j,k) * Lx [p] ;
		Lx [p] -= Gamma [k] * w (i,k) ;
	    }
	}
	/* clear workspace W */
	for (k = 0 ; k < RANK ; k++)
	{
	    w (j,k) = 0 ;
	}
    }

#else

    /* ---------------------------------------------------------------------- */
    /* dynamic supernodal version: supernodes detected dynamically */
    /* ---------------------------------------------------------------------- */

    double G0 [RANK], G1 [RANK], G2 [RANK], G3 [RANK] ;
    double Z0 [RANK], Z1 [RANK], Z2 [RANK], Z3 [RANK] ;
    double *W0, *W1, *W2, *W3, *Lx ;
    Int *Li, *Lp, *Lnz ;
    Int j1, j2, j3, p0, p1, p2, p3, parent, lnz, pend, k ;
    Int use_dbound = IS_GT_ZERO (Common->dbound) ;

    Li = L->i ;
    Lx = L->x ;
    Lp = L->p ;
    Lnz = L->nz ;

    /* walk up the etree from node j to its ancestor e */
    for ( ; j <= e ; j = parent)
    {

	p0 = Lp [j] ;		/* col j is Li,Lx [p0 ... p0+lnz-1] */
	lnz = Lnz [j] ;

	W0 = W + WDIM * j ;	/* pointer to row j of W */
	pend = p0 + lnz ;

	/* for k = 0 to RANK-1 do: */
	#define DO(k) Z0 [k] = W0 [k] ;
	FOR_ALL_K
	#undef DO

	/* for k = 0 to RANK-1 do: */
	#define DO(k) W0 [k] = 0 ;
	FOR_ALL_K
	#undef DO

	/* update D (j,j) */
	ALPHA_GAMMA (Lx [p0], Alpha, G0, Z0) ;
	p0++ ;

	/* determine how many columns of L to update at the same time */
	parent = (lnz > 1) ? (Li [p0]) : Int_max ;
	if (parent <= e && lnz == Lnz [parent] + 1)
	{

	    /* -------------------------------------------------------------- */
	    /* node j and its parent j1 can be updated at the same time */
	    /* -------------------------------------------------------------- */

	    j1 = parent ;
	    j2 = (lnz > 2) ? (Li [p0+1]) : Int_max ;
	    j3 = (lnz > 3) ? (Li [p0+2]) : Int_max ;
	    W1 = W + WDIM * j1 ;	/* pointer to row j1 of W */
	    p1 = Lp [j1] ;

	    /* for k = 0 to RANK-1 do: */
	    #define DO(k) Z1 [k] = W1 [k] ;
	    FOR_ALL_K
	    #undef DO

	    /* for k = 0 to RANK-1 do: */
	    #define DO(k) W1 [k] = 0 ;
	    FOR_ALL_K
	    #undef DO

	    /* update L (j1,j) */
	    {
		double lx = Lx [p0] ;

		/* for k = 0 to RANK-1 do: */
		#define DO(k) \
		    Z1 [k] -= Z0 [k] * lx ; \
		    lx -= G0 [k] * Z1 [k] ;
		FOR_ALL_K
		#undef DO

		Lx [p0++] = lx ;
	    }

	    /* update D (j1,j1) */
	    ALPHA_GAMMA (Lx [p1], Alpha, G1, Z1) ;
	    p1++ ;

	    /* -------------------------------------------------------------- */
	    /* update 2 or 4 columns of L */
	    /* -------------------------------------------------------------- */

	    if ((j2 <= e) &&			/* j2 in the current path */
		(j3 <= e) &&			/* j3 in the current path */
		(lnz == Lnz [j2] + 2) &&	/* column j2 matches */
		(lnz == Lnz [j3] + 3))		/* column j3 matches */
	    {

		/* ---------------------------------------------------------- */
		/* update 4 columns of L */
		/* ---------------------------------------------------------- */

		/* p0 and p1 currently point to row j2 in cols j and j1 of L */

		parent = (lnz > 4) ? (Li [p0+2]) : Int_max ;
		W2 = W + WDIM * j2 ;	    /* pointer to row j2 of W */
		W3 = W + WDIM * j3 ;	    /* pointer to row j3 of W */
		p2 = Lp [j2] ;
		p3 = Lp [j3] ;

		/* for k = 0 to RANK-1 do: */
		#define DO(k) Z2 [k] = W2 [k] ;
		FOR_ALL_K
		#undef DO

		/* for k = 0 to RANK-1 do: */
		#define DO(k) Z3 [k] = W3 [k] ;
		FOR_ALL_K
		#undef DO

		/* for k = 0 to RANK-1 do: */
		#define DO(k) W2 [k] = 0 ;
		FOR_ALL_K
		#undef DO

		/* for k = 0 to RANK-1 do: */
		#define DO(k) W3 [k] = 0 ;
		FOR_ALL_K
		#undef DO

		/* update L (j2,j) and update L (j2,j1) */
		{
		    double lx [2] ;
		    lx [0] = Lx [p0] ;
		    lx [1] = Lx [p1] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    Z2 [k] -= Z0 [k] * lx [0] ; lx [0] -= G0 [k] * Z2 [k] ; \
		    Z2 [k] -= Z1 [k] * lx [1] ; lx [1] -= G1 [k] * Z2 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx [0] ;
		    Lx [p1++] = lx [1] ;
		}

		/* update D (j2,j2) */
		ALPHA_GAMMA (Lx [p2], Alpha, G2, Z2) ;
		p2++ ;

		/* update L (j3,j), L (j3,j1), and L (j3,j2) */
		{
		    double lx [3] ;
		    lx [0] = Lx [p0] ;
		    lx [1] = Lx [p1] ;
		    lx [2] = Lx [p2] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    Z3 [k] -= Z0 [k] * lx [0] ; lx [0] -= G0 [k] * Z3 [k] ; \
		    Z3 [k] -= Z1 [k] * lx [1] ; lx [1] -= G1 [k] * Z3 [k] ; \
		    Z3 [k] -= Z2 [k] * lx [2] ; lx [2] -= G2 [k] * Z3 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx [0] ;
		    Lx [p1++] = lx [1] ;
		    Lx [p2++] = lx [2] ;
		}

		/* update D (j3,j3) */
		ALPHA_GAMMA (Lx [p3], Alpha, G3, Z3) ;
		p3++ ;

		/* each iteration updates L (i, [j j1 j2 j3]) */
		for ( ; p0 < pend ; p0++, p1++, p2++, p3++)
		{
		    double lx [4], *w0 ;
		    lx [0] = Lx [p0] ;
		    lx [1] = Lx [p1] ;
		    lx [2] = Lx [p2] ;
		    lx [3] = Lx [p3] ;
		    w0 = W + WDIM * Li [p0] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w0 [k] -= Z0 [k] * lx [0] ; lx [0] -= G0 [k] * w0 [k] ; \
		    w0 [k] -= Z1 [k] * lx [1] ; lx [1] -= G1 [k] * w0 [k] ; \
		    w0 [k] -= Z2 [k] * lx [2] ; lx [2] -= G2 [k] * w0 [k] ; \
		    w0 [k] -= Z3 [k] * lx [3] ; lx [3] -= G3 [k] * w0 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0] = lx [0] ;
		    Lx [p1] = lx [1] ;
		    Lx [p2] = lx [2] ;
		    Lx [p3] = lx [3] ;
		}
	    }
	    else
	    {

		/* ---------------------------------------------------------- */
		/* update 2 columns of L */
		/* ---------------------------------------------------------- */

		parent = j2 ;

		/* cleanup iteration if length is odd */
		if ((lnz - 2) % 2)
		{
		    double lx [2] , *w0 ;
		    lx [0] = Lx [p0] ;
		    lx [1] = Lx [p1] ;
		    w0 = W + WDIM * Li [p0] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w0 [k] -= Z0 [k] * lx [0] ; lx [0] -= G0 [k] * w0 [k] ; \
		    w0 [k] -= Z1 [k] * lx [1] ; lx [1] -= G1 [k] * w0 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx [0] ;
		    Lx [p1++] = lx [1] ;
		}

		for ( ; p0 < pend ; p0 += 2, p1 += 2)
		{
		    double lx [2][2], w [2], *w0, *w1 ;
		    lx [0][0] = Lx [p0  ] ;
		    lx [1][0] = Lx [p0+1] ;
		    lx [0][1] = Lx [p1  ] ;
		    lx [1][1] = Lx [p1+1] ;
		    w0 = W + WDIM * Li [p0  ] ;
		    w1 = W + WDIM * Li [p0+1] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w [0] = w0 [k] - Z0 [k] * lx [0][0] ; \
		    w [1] = w1 [k] - Z0 [k] * lx [1][0] ; \
		    lx [0][0] -= G0 [k] * w [0] ; \
		    lx [1][0] -= G0 [k] * w [1] ; \
		    w0 [k] = w [0] -= Z1 [k] * lx [0][1] ; \
		    w1 [k] = w [1] -= Z1 [k] * lx [1][1] ; \
		    lx [0][1] -= G1 [k] * w [0] ; \
		    lx [1][1] -= G1 [k] * w [1] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0  ] = lx [0][0] ;
		    Lx [p0+1] = lx [1][0] ;
		    Lx [p1  ] = lx [0][1] ;
		    Lx [p1+1] = lx [1][1] ;
		}
	    }
	}
	else
	{

	    /* -------------------------------------------------------------- */
	    /* update one column of L */
	    /* -------------------------------------------------------------- */

	    /* cleanup iteration if length is not a multiple of 4 */
	    switch ((lnz - 1) % 4)
	    {
		case 1:
		{
		    double lx , *w0 ;
		    lx = Lx [p0] ;
		    w0 = W + WDIM * Li [p0] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w0 [k] -= Z0 [k] * lx ; lx -= G0 [k] * w0 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx ;
		}
		break ;

		case 2:
		{
		    double lx [2], *w0, *w1 ;
		    lx [0] = Lx [p0  ] ;
		    lx [1] = Lx [p0+1] ;
		    w0 = W + WDIM * Li [p0  ] ;
		    w1 = W + WDIM * Li [p0+1] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w0 [k] -= Z0 [k] * lx [0] ; \
		    w1 [k] -= Z0 [k] * lx [1] ; \
		    lx [0] -= G0 [k] * w0 [k] ; \
		    lx [1] -= G0 [k] * w1 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx [0] ;
		    Lx [p0++] = lx [1] ;
		}
		break ;

		case 3:
		{
		    double lx [3], *w0, *w1, *w2 ;
		    lx [0] = Lx [p0  ] ;
		    lx [1] = Lx [p0+1] ;
		    lx [2] = Lx [p0+2] ;
		    w0 = W + WDIM * Li [p0  ] ;
		    w1 = W + WDIM * Li [p0+1] ;
		    w2 = W + WDIM * Li [p0+2] ;

		    /* for k = 0 to RANK-1 do: */
		    #define DO(k) \
		    w0 [k] -= Z0 [k] * lx [0] ; \
		    w1 [k] -= Z0 [k] * lx [1] ; \
		    w2 [k] -= Z0 [k] * lx [2] ; \
		    lx [0] -= G0 [k] * w0 [k] ; \
		    lx [1] -= G0 [k] * w1 [k] ; \
		    lx [2] -= G0 [k] * w2 [k] ;
		    FOR_ALL_K
		    #undef DO

		    Lx [p0++] = lx [0] ;
		    Lx [p0++] = lx [1] ;
		    Lx [p0++] = lx [2] ;
		}
	    }

	    for ( ; p0 < pend ; p0 += 4)
	    {
		double lx [4], *w0, *w1, *w2, *w3 ;
		lx [0] = Lx [p0  ] ;
		lx [1] = Lx [p0+1] ;
		lx [2] = Lx [p0+2] ;
		lx [3] = Lx [p0+3] ;
		w0 = W + WDIM * Li [p0  ] ;
		w1 = W + WDIM * Li [p0+1] ;
		w2 = W + WDIM * Li [p0+2] ;
		w3 = W + WDIM * Li [p0+3] ;

		/* for k = 0 to RANK-1 do: */
		#define DO(k) \
		w0 [k] -= Z0 [k] * lx [0] ; \
		w1 [k] -= Z0 [k] * lx [1] ; \
		w2 [k] -= Z0 [k] * lx [2] ; \
		w3 [k] -= Z0 [k] * lx [3] ; \
		lx [0] -= G0 [k] * w0 [k] ; \
		lx [1] -= G0 [k] * w1 [k] ; \
		lx [2] -= G0 [k] * w2 [k] ; \
		lx [3] -= G0 [k] * w3 [k] ;
		FOR_ALL_K
		#undef DO

		Lx [p0  ] = lx [0] ;
		Lx [p0+1] = lx [1] ;
		Lx [p0+2] = lx [2] ;
		Lx [p0+3] = lx [3] ;
	    }
	}
    }
#endif
}
/* prepare this file for another inclusion in t_cholmod_updown.c: */
#undef RANK


syntax highlighted by Code2HTML, v. 0.9.1