/*--------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Jason Papadopoulos. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.

Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may 
benefit from your work.	
       				   --jasonp@boo.net 6/3/07
--------------------------------------------------------------------*/

#include "lanczos.h"

#define BIT(x) ((uint64)(1) << (x))

static const uint64 bitmask[64] = {
	BIT( 0), BIT( 1), BIT( 2), BIT( 3), BIT( 4), BIT( 5), BIT( 6), BIT( 7),
	BIT( 8), BIT( 9), BIT(10), BIT(11), BIT(12), BIT(13), BIT(14), BIT(15),
	BIT(16), BIT(17), BIT(18), BIT(19), BIT(20), BIT(21), BIT(22), BIT(23),
	BIT(24), BIT(25), BIT(26), BIT(27), BIT(28), BIT(29), BIT(30), BIT(31),
	BIT(32), BIT(33), BIT(34), BIT(35), BIT(36), BIT(37), BIT(38), BIT(39),
	BIT(40), BIT(41), BIT(42), BIT(43), BIT(44), BIT(45), BIT(46), BIT(47),
	BIT(48), BIT(49), BIT(50), BIT(51), BIT(52), BIT(53), BIT(54), BIT(55),
	BIT(56), BIT(57), BIT(58), BIT(59), BIT(60), BIT(61), BIT(62), BIT(63),
};

/* for matrices of dimension exceeding MIN_POST_LANCZOS_DIM,
   the first POST_LANCZOS_ROWS rows are handled in a separate
   Gauss elimination phase after the Lanczos iteration
   completes. This means the lanczos code will produce about
   64 - POST_LANCZOS_ROWS dependencies on average. 
   
   The code will still work if POST_LANCZOS_ROWS is 0, but I 
   don't know why you would want to do that. The first rows are 
   essentially completely dense, and removing them from the main 
   Lanczos iteration greatly reduces the amount of arithmetic 
   in a matrix multiply, as well as the memory footprint of 
   the matrix */

#define POST_LANCZOS_ROWS 48
#define MIN_POST_LANCZOS_DIM 10000

/*-------------------------------------------------------------------*/
static uint64 * form_post_lanczos_matrix(msieve_obj *obj, uint32 *nrows, 
				uint32 *dense_rows_out, uint32 ncols, 
				la_col_t *cols) {

	uint32 i, j, k;
	uint32 num_dense_rows = *dense_rows_out;
	uint32 dense_row_words;
	uint32 new_dense_rows;
	uint32 new_dense_row_words;
	uint32 final_dense_row_words;
	uint64 mask;
	uint64 *submatrix;
	mp_t tmp;

	/* if the matrix is going to have cache blocking applied,
	   proceed but do not form a post-Lanczos matrix if one
	   is not desired. We have to do this because the block
	   matrix multiply expects the number of dense rows to be
	   a multiple of 64.

	   Otherwise, don't do anything if the Lanczos iteration 
	   would finish quickly */

	submatrix = NULL;
	if (ncols >= MIN_NCOLS_TO_PACK ||
	    (POST_LANCZOS_ROWS > 0 && ncols >= MIN_POST_LANCZOS_DIM)) {

		if (POST_LANCZOS_ROWS > 0) {
			logprintf(obj, "saving the first %u matrix rows "
					"for later\n", POST_LANCZOS_ROWS);
			submatrix = (uint64 *)malloc(ncols * sizeof(uint64));
		}
	}
	else {
		return NULL;
	}

	mask = (uint64)(-1) >> (64 - POST_LANCZOS_ROWS);
	dense_row_words = (num_dense_rows + 31) / 32;
	mp_clear(&tmp);

	/* we will be removing the first POST_LANCZOS_ROWS rows
	   from the matrix entirely, and packing together the
	   next few rows. The matrix may have dense rows already, 
	   or these rows may be partially or completely sparse, 
	   in which case we'll have to pack them manually. After
	   the post-lanczos rows are removed, the number of dense 
	   rows remaining is a multiple of 64 (minimum of 64) */

	new_dense_rows = MAX(num_dense_rows, POST_LANCZOS_ROWS);
	new_dense_rows += 64 - (new_dense_rows - POST_LANCZOS_ROWS) % 64;
	new_dense_row_words = (new_dense_rows + 31) / 32;
	final_dense_row_words = (new_dense_rows - POST_LANCZOS_ROWS) / 32;

	for (i = 0; i < ncols; i++) {
		uint32 curr_weight = cols[i].weight;
		uint32 *curr_row = cols[i].data;

		/* build up a bitfield of the rows that will be
		   stored in packed format. Start with the rows
		   that are already packed */

		for (j = 0; j < dense_row_words; j++)
			tmp.val[j] = curr_row[curr_weight + j];

		/* add in the rows from the sparse part of the matrix.
		   Entries from these rows are either added to the
		   new dense bitfield, or moved to fill the holes
		   created by packing the first few sparse rows. In 
		   the latter case, the row index must be biased to 
		   reflect the removed rows */

		for (; j < new_dense_row_words; j++)
			tmp.val[j] = 0;

		for (j = k = 0; j < curr_weight; j++) {
			uint32 curr_index = curr_row[j];

			if (curr_index < new_dense_rows)
				tmp.val[curr_index / 32] |= 
						bitmask[curr_index % 32];
			else
				curr_row[k++] = curr_index - POST_LANCZOS_ROWS;
		}

		tmp.nwords = new_dense_row_words;
#if POST_LANCZOS_ROWS > 0
		/* remove the first POST_LANCZOS_ROWS bits from
		   the bitfield */
		submatrix[i] = ((uint64)tmp.val[0] |
				(uint64)tmp.val[1] << 32) & mask;
#endif

		/* move the rest of the bitfield and repack the (hopefully
		   shorter) current column in the heap */
		cols[i].weight = k;
		cols[i].data = (uint32 *)realloc(curr_row, (k + 
						final_dense_row_words) * 
						sizeof(uint32));
		mp_rshift(&tmp, POST_LANCZOS_ROWS, &tmp);
		memcpy(cols[i].data + k, tmp.val, final_dense_row_words * 
						sizeof(uint32));
	}

	*nrows -= POST_LANCZOS_ROWS;
	*dense_rows_out = new_dense_rows - POST_LANCZOS_ROWS;
	count_matrix_nonzero(obj, *nrows, *dense_rows_out, ncols, cols);
	return submatrix;
}

/*-------------------------------------------------------------------*/
static void mul_64x64_64x64(uint64 *a, uint64 *b, uint64 *c ) {

	/* c[][] = x[][] * y[][], where all operands are 64 x 64
	   (i.e. contain 64 words of 64 bits each). The result
	   may overwrite a or b. */

	uint64 ai, bj, accum;
	uint64 tmp[64];
	uint32 i, j;

	for (i = 0; i < 64; i++) {
		j = 0;
		accum = 0;
		ai = a[i];

		while (ai) {
			bj = b[j];
			if( ai & 1 )
				accum ^= bj;
			ai >>= 1;
			j++;
		}

		tmp[i] = accum;
	}
	memcpy(c, tmp, sizeof(tmp));
}

/*-------------------------------------------------------------------*/
void mul_Nx64_64x64_acc(uint64 *v, uint64 *x,
			uint64 *y, uint32 n) {

	/* let v[][] be a n x 64 matrix with elements in GF(2), 
	   represented as an array of n 64-bit words. Let c[][]
	   be an 8 x 256 scratch matrix of 64-bit words.
	   This code multiplies v[][] by the 64x64 matrix 
	   x[][], then XORs the n x 64 result into y[][] */

	uint32 i, j, k;
	uint64 c[8 * 256];

	/* fill c[][] with a bunch of "partial matrix multiplies". 
	   For 0<=i<256, the j_th row of c[][] contains the matrix 
	   product

	   	( i << (8*j) ) * x[][]

	   where the quantity in parentheses is considered a 
	   1 x 64 vector of elements in GF(2). The resulting
	   table will dramatically speed up matrix multiplies
	   by x[][]. */

	for (i = 0; i < 8; i++) {
		uint64 *xtmp = x + 8 * i;
		uint64 *ctmp = c + 256 * i;

		for (j = 0; j < 256; j++) {
			uint64 accum = 0;
			uint32 index = j;

			for (k = 0; k < 8; k++) {
				if (index & ((uint32)1 << k))
					accum ^= xtmp[k];
			}
			ctmp[j] = accum;
		}
	}

#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
	i = 0;
	asm volatile("0:                                   \n\t"
		     "movq (%3,%0,8), %%mm0                \n\t"
		     "movl (%1,%0,8), %%eax                \n\t"
		     "incl %0                              \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "movq (%2,%%ecx,8), %%mm1             \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "pxor 1*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "shrl $16, %%eax                      \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "pxor 2*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "pxor 3*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movl 4-8(%1,%0,8), %%eax             \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "pxor 4*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "shrl $16, %%eax                      \n\t"
		     "cmpl %5, %0                          \n\t"
		     "pxor 5*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "pxor 6*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "pxor 7*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "pxor %%mm0, %%mm1                    \n\t"
		     "movq %%mm1, -8(%3,%0,8)              \n\t"
		     "jne 0b                               \n\t"
		     "emms                                 \n\t"
			:"=r"(i)
			:"r"(v), "r"(c), "r"(y), "0"(i), "g"(n)
			:"%eax", "%ecx", "%mm0", "%mm1", "memory");

#elif defined(_MSC_VER) && !defined(_WIN64)
	i = 0;
	__asm
	{
		push	ebx
		mov	edi,y
		lea	ebx,c
		mov	esi,v
		mov	ecx,i
	L0:	movq	mm0,[edi+ecx*8]
		mov	eax,[esi+ecx*8]
		inc	ecx
		movzx	edx, al
		movq	mm1,[ebx+edx*8]
		movzx	edx,ah
		pxor	mm1,[1*256*8+ebx+edx*8]
		shr	eax,16
		movzx	edx,al
		pxor	mm1,[2*256*8+ebx+edx*8]
		movzx	edx,ah
		pxor	mm1,[3*256*8+ebx+edx*8]
		mov	eax,[4-8+esi+ecx*8]
		movzx	edx,al
		pxor	mm1,[4*256*8+ebx+edx*8]
		movzx	edx,ah
		shr	eax,16
		cmp	ecx,n
		pxor	mm1,[5*256*8+ebx+edx*8]
		movzx	edx,al
		pxor	mm1,[6*256*8+ebx+edx*8]
		movzx	edx,ah
		pxor	mm1,[7*256*8+ebx+edx*8]
		pxor	mm1, mm0
		movq	[-8+edi+ecx*8],mm1
		jne	L0
		pop	ebx
		emms
	}
#else
	for (i = 0; i < n; i++) {
		uint64 word = v[i];
		y[i] ^=  c[ 0*256 + ((uint8)(word >>  0)) ]
		       ^ c[ 1*256 + ((uint8)(word >>  8)) ]
		       ^ c[ 2*256 + ((uint8)(word >> 16)) ]
		       ^ c[ 3*256 + ((uint8)(word >> 24)) ]
		       ^ c[ 4*256 + ((uint8)(word >> 32)) ]
		       ^ c[ 5*256 + ((uint8)(word >> 40)) ]
		       ^ c[ 6*256 + ((uint8)(word >> 48)) ]
		       ^ c[ 7*256 + ((uint8)(word >> 56)       ) ];
	}
#endif
}

/*-------------------------------------------------------------------*/
void mul_64xN_Nx64(uint64 *x, uint64 *y,
		   uint64 *xy, uint32 n) {

	/* Let x and y be n x 64 matrices. This routine computes
	   the 64 x 64 matrix xy[][] given by transpose(x) * y */

	uint32 i;
	uint64 c[8 * 256] = {0};

	memset(xy, 0, 64 * sizeof(uint64));

#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
	i = 0;
	asm volatile("0:                                   \n\t"
		     "movq (%3,%0,8), %%mm0                \n\t"
		     "movl (%1,%0,8), %%eax                \n\t"
		     "incl %0                              \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor (%2,%%ecx,8), %%mm1             \n\t"
		     "movq %%mm1, (%2,%%ecx,8)             \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 1*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 1*256*8(%2,%%ecx,8)      \n\t"
		     "shrl $16, %%eax                      \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 2*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 2*256*8(%2,%%ecx,8)      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 3*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 3*256*8(%2,%%ecx,8)      \n\t"
		     "movl 4-8(%1,%0,8), %%eax             \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 4*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 4*256*8(%2,%%ecx,8)      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "shrl $16, %%eax                      \n\t"
		     "cmpl %5, %0                          \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 5*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 5*256*8(%2,%%ecx,8)      \n\t"
		     "movzbl %%al, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 6*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 6*256*8(%2,%%ecx,8)      \n\t"
		     "movzbl %%ah, %%ecx                   \n\t"
		     "movq %%mm0, %%mm1                    \n\t"
		     "pxor 7*256*8(%2,%%ecx,8), %%mm1      \n\t"
		     "movq %%mm1, 7*256*8(%2,%%ecx,8)      \n\t"
		     "jne 0b                               \n\t"
		     "emms                                 \n\t"
			:"=r"(i)
			:"r"(x), "r"(c), "r"(y), "0"(i), "g"(n)
			:"%eax", "%ecx", "%mm0", "%mm1", "memory");

#elif defined(_MSC_VER) && !defined(_WIN64)
	i = 0;
	__asm
	{
		push	ebx
		mov	edi,y
		lea	ebx,c
		mov	esi,x
		mov	ecx,i
	L0:	movq	mm0,[edi+ecx*8]
		mov	eax,[esi+ecx*8]
		inc	ecx
		movzx	edx,al
		movq	mm1,mm0
		pxor	mm1,[ebx+edx*8]
		movq	[ebx+edx*8],mm1
		movzx	edx,ah
		movq	mm1, mm0
		pxor	mm1,[1*256*8+ebx+edx*8]
		movq	[1*256*8+ebx+edx*8],mm1
		shr	eax,16
		movzx	edx,al
		movq	mm1,mm0
		pxor	mm1,[2*256*8+ebx+edx*8]
		movq	[2*256*8+ebx+edx*8],mm1
		movzx	edx,ah
		movq	mm1,mm0
		pxor	mm1,[3*256*8+ebx+edx*8]
		movq	[3*256*8+ebx+edx*8],mm1
		mov	eax,[4-8+esi+ecx*8]
		movzx	edx,al
		movq	mm1,mm0
		pxor	mm1,[4*256*8+ebx+edx*8]
		movq	[4*256*8+ebx+edx*8],mm1
		movzx	edx,ah
		shr	eax,16
		cmp	ecx,n
		movq	mm1,mm0
		pxor	mm1,[5*256*8+ebx+edx*8]
		movq	[5*256*8+ebx+edx*8],mm1
		movzx	edx,al
		movq	mm1,mm0
		pxor	mm1,[6*256*8+ebx+edx*8]
		movq	[6*256*8+ebx+edx*8],mm1
		movzx	edx,ah
		movq	mm1,mm0
		pxor	mm1,[7*256*8+ebx+edx*8]
		movq	[7*256*8+ebx+edx*8],mm1
		jne	L0
		emms
		pop	ebx
	}
#else

	for (i = 0; i < n; i++) {
		uint64 xi = x[i];
		uint64 yi = y[i];
		c[ 0*256 + ((uint8) xi       ) ] ^= yi;
		c[ 1*256 + ((uint8)(xi >>  8)) ] ^= yi;
		c[ 2*256 + ((uint8)(xi >> 16)) ] ^= yi;
		c[ 3*256 + ((uint8)(xi >> 24)) ] ^= yi;
		c[ 4*256 + ((uint8)(xi >> 32)) ] ^= yi;
		c[ 5*256 + ((uint8)(xi >> 40)) ] ^= yi;
		c[ 6*256 + ((uint8)(xi >> 48)) ] ^= yi;
		c[ 7*256 + ((uint8)(xi >> 56)) ] ^= yi;
	}
#endif

	for(i = 0; i < 8; i++) {

		uint32 j;
		uint64 a0, a1, a2, a3, a4, a5, a6, a7;

		a0 = a1 = a2 = a3 = 0;
		a4 = a5 = a6 = a7 = 0;

		for (j = 0; j < 256; j++) {
			if ((j >> i) & 1) {
				a0 ^= c[0*256 + j];
				a1 ^= c[1*256 + j];
				a2 ^= c[2*256 + j];
				a3 ^= c[3*256 + j];
				a4 ^= c[4*256 + j];
				a5 ^= c[5*256 + j];
				a6 ^= c[6*256 + j];
				a7 ^= c[7*256 + j];
			}
		}

		xy[ 0] = a0; xy[ 8] = a1; xy[16] = a2; xy[24] = a3;
		xy[32] = a4; xy[40] = a5; xy[48] = a6; xy[56] = a7;
		xy++;
	}
}

/*-------------------------------------------------------------------*/
static uint32 find_nonsingular_sub(msieve_obj *obj,
				uint64 *t, uint32 *s, 
				uint32 *last_s, uint32 last_dim, 
				uint64 *w) {

	/* given a 64x64 matrix t[][] (i.e. sixty-four
	   64-bit words) and a list of 'last_dim' column 
	   indices enumerated in last_s[]: 
	   
	     - find a submatrix of t that is invertible 
	     - invert it and copy to w[][]
	     - enumerate in s[] the columns represented in w[][] */

	uint32 i, j;
	uint32 dim;
	uint32 cols[64];
	uint64 M[64][2];
	uint64 mask, *row_i, *row_j;
	uint64 m0, m1;

	/* M = [t | I] for I the 64x64 identity matrix */

	for (i = 0; i < 64; i++) {
		M[i][0] = t[i]; 
		M[i][1] = bitmask[i];
	}

	/* put the column indices from last_s[] into the
	   back of cols[], and copy to the beginning of cols[]
	   any column indices not in last_s[] */

	mask = 0;
	for (i = 0; i < last_dim; i++) {
		cols[63 - i] = last_s[i];
		mask |= bitmask[last_s[i]];
	}
	for (i = j = 0; i < 64; i++) {
		if (!(mask & bitmask[i]))
			cols[j++] = i;
	}

	/* compute the inverse of t[][] */

	for (i = dim = 0; i < 64; i++) {
	
		/* find the next pivot row and put in row i */

		mask = bitmask[cols[i]];
		row_i = M[cols[i]];

		for (j = i; j < 64; j++) {
			row_j = M[cols[j]];
			if (row_j[0] & mask) {
				m0 = row_j[0];
				m1 = row_j[1];
				row_j[0] = row_i[0];
				row_j[1] = row_i[1];
				row_i[0] = m0; 
				row_i[1] = m1;
				break;
			}
		}
				
		/* if a pivot row was found, eliminate the pivot
		   column from all other rows */

		if (j < 64) {
			for (j = 0; j < 64; j++) {
				row_j = M[cols[j]];
				if ((row_i != row_j) && (row_j[0] & mask)) {
					row_j[0] ^= row_i[0];
					row_j[1] ^= row_i[1];
				}
			}

			/* add the pivot column to the list of 
			   accepted columns */

			s[dim++] = cols[i];
			continue;
		}

		/* otherwise, use the right-hand half of M[]
		   to compensate for the absence of a pivot column */

		for (j = i; j < 64; j++) {
			row_j = M[cols[j]];
			if (row_j[1] & mask) {
				m0 = row_j[0];
				m1 = row_j[1];
				row_j[0] = row_i[0];
				row_j[1] = row_i[1];
				row_i[0] = m0; 
				row_i[1] = m1;
				break;
			}
		}
				
		if (j == 64) {
			logprintf(obj, "lanczos error: submatrix "
					"is not invertible\n");
			return 0;
		}
			
		/* eliminate the pivot column from the other rows
		   of the inverse */

		for (j = 0; j < 64; j++) {
			row_j = M[cols[j]];
			if ((row_i != row_j) && (row_j[1] & mask)) {
				row_j[0] ^= row_i[0];
				row_j[1] ^= row_i[1];
			}
		}

		/* wipe out the pivot row */

		row_i[0] = row_i[1] = 0;
	}

	/* the right-hand half of M[] is the desired inverse */
	
	for (i = 0; i < 64; i++) 
		w[i] = M[i][1];

	return dim;
}

/*-----------------------------------------------------------------------*/
static void transpose_vector(uint32 ncols, uint64 *v, uint64 **trans) {

	/* Hideously inefficent routine to transpose a
	   vector v[] of 64-bit words into a 2-D array
	   trans[][] of 64-bit words */

	uint32 i, j;
	uint32 col;
	uint64 mask, word;

	for (i = 0; i < ncols; i++) {
		col = i / 64;
		mask = bitmask[i % 64];
		word = v[i];
		j = 0;
		while (word) {
			if (word & 1)
				trans[j][col] |= mask;
			word = word >> 1;
			j++;
		}
	}
}

/*-----------------------------------------------------------------------*/
static uint32 combine_cols(uint32 ncols, 
			uint64 *x, uint64 *v, 
			uint64 *ax, uint64 *av) {

	/* Once the block Lanczos iteration has finished, 
	   x[] and v[] will contain mostly nullspace vectors
	   between them, as well as possibly some columns
	   that are linear combinations of nullspace vectors.
	   Given vectors ax[] and av[] that are the result of
	   multiplying x[] and v[] by the matrix, this routine 
	   will use Gauss elimination on the columns of [ax | av] 
	   to find all of the linearly dependent columns. The
	   column operations needed to accomplish this are mir-
	   rored in [x | v] and the columns that are independent
	   are skipped. Finally, the dependent columns are copied
	   back into x[] and represent the nullspace vector output
	   of the block Lanczos code. */

	uint32 i, j, k, bitpos, col, col_words;
	uint64 mask;
	uint64 *matrix[128], *amatrix[128], *tmp;

	col_words = (ncols + 63) / 64;

	for (i = 0; i < 128; i++) {
		matrix[i] = (uint64 *)calloc((size_t)col_words, 
					     sizeof(uint64));
		amatrix[i] = (uint64 *)calloc((size_t)col_words, 
					      sizeof(uint64));
	}

	/* operations on columns can more conveniently become 
	   operations on rows if all the vectors are first
	   transposed */

	transpose_vector(ncols, x, matrix);
	transpose_vector(ncols, ax, amatrix);
	transpose_vector(ncols, v, matrix + 64);
	transpose_vector(ncols, av, amatrix + 64);

	/* Keep eliminating rows until the unprocessed part
	   of amatrix[][] is all zero. The rows where this
	   happens correspond to linearly dependent vectors
	   in the nullspace */

	for (i = bitpos = 0; i < 128 && bitpos < ncols; bitpos++) {

		/* find the next pivot row */

		mask = bitmask[bitpos % 64];
		col = bitpos / 64;
		for (j = i; j < 128; j++) {
			if (amatrix[j][col] & mask) {
				tmp = matrix[i];
				matrix[i] = matrix[j];
				matrix[j] = tmp;
				tmp = amatrix[i];
				amatrix[i] = amatrix[j];
				amatrix[j] = tmp;
				break;
			}
		}
		if (j == 128)
			continue;

		/* a pivot was found; eliminate it from the
		   remaining rows */

		for (j++; j < 128; j++) {
			if (amatrix[j][col] & mask) {

				/* Note that the entire row, *not*
				   just the nonzero part of it, must
				   be eliminated; this is because the
				   corresponding (dense) row of matrix[][]
				   must have the same operation applied */

				for (k = 0; k < col_words; k++) {
					amatrix[j][k] ^= amatrix[i][k];
					matrix[j][k] ^= matrix[i][k];
				}
			}
		}
		i++;
	}

	/* transpose rows i to 64 back into x[]. Pack the
	   dependencies into the low-order bits of x[] */

	for (j = 0; j < ncols; j++) {
		uint64 word = 0;

		col = j / 64;
		mask = bitmask[j % 64];

		for (k = i; k < 64; k++) {
			if (matrix[k][col] & mask)
				word |= bitmask[k - i];
		}
		x[j] = word;
	}

	for (j = 0; j < 128; j++) {
		free(matrix[j]);
		free(amatrix[j]);
	}

	if (i > 64)
		return 0;
	return 64 - i;
}

/*-----------------------------------------------------------------------*/
static uint64 * block_lanczos_core(msieve_obj *obj, 
				packed_matrix_t *packed_matrix,
				uint32 *num_deps_found,
				uint64 *post_lanczos_matrix) {
	
	/* Solve Bx = 0 for some nonzero x; the computed
	   solution, containing up to 64 of these nullspace
	   vectors, is returned */

	uint32 n = packed_matrix->ncols;
	uint64 *vnext, *v[3], *x, *v0;
	uint64 *winv[3];
	uint64 *vt_a_v[2], *vt_a2_v[2];
	uint64 *scratch;
	uint64 *d, *e, *f, *f2;
	uint64 *tmp;
	uint32 s[2][64];
	uint32 i, iter;
	uint32 dim0, dim1;
	uint64 mask0, mask1;

	uint32 dim_solved = 0;
	uint32 report_interval = 0;
	uint32 next_report = 0;

	if (packed_matrix->num_threads > 1)
		logprintf(obj, "commencing Lanczos iteration (%u threads)\n",
					packed_matrix->num_threads);
	else
		logprintf(obj, "commencing Lanczos iteration\n");

	/* allocate all of the size-n variables */

	v[0] = (uint64 *)malloc(n * sizeof(uint64));
	v[1] = (uint64 *)malloc(n * sizeof(uint64));
	v[2] = (uint64 *)malloc(n * sizeof(uint64));
	vnext = (uint64 *)malloc(n * sizeof(uint64));
	x = (uint64 *)malloc(n * sizeof(uint64));
	v0 = (uint64 *)malloc(n * sizeof(uint64));
	scratch = (uint64 *)malloc(n * sizeof(uint64));

	/* allocate all the 64x64 variables */

	winv[0] = (uint64 *)malloc(64 * sizeof(uint64));
	winv[1] = (uint64 *)malloc(64 * sizeof(uint64));
	winv[2] = (uint64 *)malloc(64 * sizeof(uint64));
	vt_a_v[0] = (uint64 *)malloc(64 * sizeof(uint64));
	vt_a_v[1] = (uint64 *)malloc(64 * sizeof(uint64));
	vt_a2_v[0] = (uint64 *)malloc(64 * sizeof(uint64));
	vt_a2_v[1] = (uint64 *)malloc(64 * sizeof(uint64));
	d = (uint64 *)malloc(64 * sizeof(uint64));
	e = (uint64 *)malloc(64 * sizeof(uint64));
	f = (uint64 *)malloc(64 * sizeof(uint64));
	f2 = (uint64 *)malloc(64 * sizeof(uint64));

	/* The iterations computes v[0], vt_a_v[0],
	   vt_a2_v[0], s[0] and winv[0]. Subscripts larger
	   than zero represent past versions of these
	   quantities, which start off empty (except for
	   the past version of s[], which contains all
	   the column indices) */
	   
	memset(v[1], 0, n * sizeof(uint64));
	memset(v[2], 0, n * sizeof(uint64));
	for (i = 0; i < 64; i++) {
		s[1][i] = i;
		vt_a_v[1][i] = 0;
		vt_a2_v[1][i] = 0;
		winv[1][i] = 0;
		winv[2][i] = 0;
	}
	dim0 = 0;
	dim1 = 64;
	mask1 = (uint64)(-1);
	iter = 0;

	/* The computed solution 'x' starts off random,
	   and v[0] starts off as B*x. This initial copy
	   of v[0] must be saved off separately */

	for (i = 0; i < n; i++)
		v[0][i] = (uint64)(get_rand(&obj->seed1, &obj->seed2)) << 32 |
		          (uint64)(get_rand(&obj->seed1, &obj->seed2));

	memcpy(x, v[0], n * sizeof(uint64));
	mul_MxN_Nx64(packed_matrix, v[0], scratch);
	mul_trans_MxN_Nx64(packed_matrix, scratch, v[0]);
	memcpy(v0, v[0], n * sizeof(uint64));

	/* determine if the solver will run long enough that
	   it would be worthwhile to report progress */

	if (n > 60000 &&
	    obj->flags & (MSIEVE_FLAG_USE_LOGFILE |
	    		  MSIEVE_FLAG_LOG_TO_STDOUT)) {
		if (n > 1000000)
			report_interval = 200;
		else if (n > 500000)
			report_interval = 500;
		else if (n > 100000)
			report_interval = 2000;
		else
			report_interval = 8000;
		next_report = report_interval;
	}

	/* perform the iteration */

	while (1) {
		iter++;

		/* multiply the current v[0] by a symmetrized
		   version of B, or B'B (apostrophe means 
		   transpose). Use "A" to refer to B'B  */

		mul_MxN_Nx64(packed_matrix, v[0], scratch);
		mul_trans_MxN_Nx64(packed_matrix, scratch, vnext);

		/* compute v0'*A*v0 and (A*v0)'(A*v0) */

		mul_64xN_Nx64(v[0], vnext, vt_a_v[0], n);
		mul_64xN_Nx64(vnext, vnext, vt_a2_v[0], n);

		/* if the former is orthogonal to itself, then
		   the iteration has finished */

		for (i = 0; i < 64; i++) {
			if (vt_a_v[0][i] != 0)
				break;
		}
		if (i == 64) {
			break;
		}

		/* Find the size-'dim0' nonsingular submatrix
		   of v0'*A*v0, invert it, and list the column
		   indices present in the submatrix */

		dim0 = find_nonsingular_sub(obj, vt_a_v[0], s[0], 
					    s[1], dim1, winv[0]);

		/* The block Lanczos recurrence depends on all columns
		   of v'Av appearing in this and/or the last iteration. 
		   Verify that condition here
		  
		   Note that the test only applies if this is not the
		   last Lanczos iteration. I'm not sure that this is right, 
		   but the test fails on the last iteration much more often 
		   than would be expected by chance alone, yet ignoring
		   the failure still produces good dependencies */
	
		if (dim_solved + dim1 < n - 64) {
			mask0 = 0;
			for (i = 0; i < dim0; i++)
				mask0 |= bitmask[s[0][i]];
			for (i = 0; i < dim1; i++)
				mask0 |= bitmask[s[1][i]];
		
			if (mask0 != (uint64)(-1)) {
				logprintf(obj, "lanczos error: "
						"not all columns used\n");
				dim0 = 0;
				break;
			}
		}

		/* mask0 contains one set bit for every column
		   that participates in the inverted submatrix
		   computed above */

		mask0 = 0;
		for (i = 0; i < dim0; i++)
			mask0 |= bitmask[s[0][i]];

		/* compute d */

		for (i = 0; i < 64; i++)
			d[i] = (vt_a2_v[0][i] & mask0) ^ vt_a_v[0][i];

		mul_64x64_64x64(winv[0], d, d);

		for (i = 0; i < 64; i++)
			d[i] = d[i] ^ bitmask[i];

		/* compute e */

		mul_64x64_64x64(winv[1], vt_a_v[0], e);

		for (i = 0; i < 64; i++)
			e[i] = e[i] & mask0;

		/* compute f */

		mul_64x64_64x64(vt_a_v[1], winv[1], f);

		for (i = 0; i < 64; i++)
			f[i] = f[i] ^ bitmask[i];

		mul_64x64_64x64(winv[2], f, f);

		for (i = 0; i < 64; i++)
			f2[i] = ((vt_a2_v[1][i] & mask1) ^ 
				   vt_a_v[1][i]) & mask0;

		mul_64x64_64x64(f, f2, f);

		/* compute the next v */

		for (i = 0; i < n; i++)
			vnext[i] = vnext[i] & mask0;

		mul_Nx64_64x64_acc(v[0], d, vnext, n);
		mul_Nx64_64x64_acc(v[1], e, vnext, n);
		mul_Nx64_64x64_acc(v[2], f, vnext, n);
		
		/* update the computed solution 'x' */

		mul_64xN_Nx64(v[0], v0, d, n);
		mul_64x64_64x64(winv[0], d, d);
		mul_Nx64_64x64_acc(v[0], d, x, n);

		/* rotate all the variables */

		tmp = v[2]; 
		v[2] = v[1]; 
		v[1] = v[0]; 
		v[0] = vnext; 
		vnext = tmp;
		
		tmp = winv[2]; 
		winv[2] = winv[1]; 
		winv[1] = winv[0]; 
		winv[0] = tmp;
		
		tmp = vt_a_v[1]; vt_a_v[1] = vt_a_v[0]; vt_a_v[0] = tmp;
		
		tmp = vt_a2_v[1]; vt_a2_v[1] = vt_a2_v[0]; vt_a2_v[0] = tmp;

		memcpy(s[1], s[0], 64 * sizeof(uint32));
		mask1 = mask0;
		dim1 = dim0;

		if (report_interval) {
			dim_solved += dim0;
			if (dim_solved >= next_report) {
				next_report = dim_solved + report_interval;
				fprintf(stderr, "linear algebra completed %u "
					"out of %u dimensions (%1.1f%%)\r",
					dim_solved, n, 100.0 * dim_solved / n);
				fflush(stderr);
			}
		}
	}

	if (report_interval)
		fprintf(stderr, "\n");

	logprintf(obj, "lanczos halted after %d iterations\n", iter);

	/* free unneeded storage */

	free(vnext);
	free(scratch);
	free(v0);
	free(vt_a_v[0]);
	free(vt_a_v[1]);
	free(vt_a2_v[0]);
	free(vt_a2_v[1]);
	free(winv[0]);
	free(winv[1]);
	free(winv[2]);
	free(d);
	free(e);
	free(f);
	free(f2);

	/* if a recoverable failure occurred, start everything
	   over again */

	if (dim0 == 0) {
		logprintf(obj, "linear algebra failed; retrying...\n");
		free(x);
		free(v[0]);
		free(v[1]);
		free(v[2]);
		return NULL;
	}

	/* convert the output of the iteration to an actual
	   collection of nullspace vectors. Begin by multiplying
	   the output from the iteration by B */

	mul_MxN_Nx64(packed_matrix, x, v[1]);
	mul_MxN_Nx64(packed_matrix, v[0], v[2]);

	if (post_lanczos_matrix) {

		/* if necessary, add in the contribution of the
		   first few rows that were originally in B. We 
		   expect there to be about 64 - POST_LANCZOS_ROWS 
		   bit vectors that are in the nullspace of B and
		   post_lanczos_matrix simultaneously */

		for (i = 0; i < POST_LANCZOS_ROWS; i++) {
			uint64 accum0 = 0;
			uint64 accum1 = 0;
			uint32 j;
			mask0 = bitmask[i];
			for (j = 0; j < n; j++) {
				if (post_lanczos_matrix[j] & mask0) {
					accum0 ^= x[j];
					accum1 ^= v[0][j];
				}
			}
			v[1][i] ^= accum0;
			v[2][i] ^= accum1;
		}
	}

	*num_deps_found = combine_cols(n, x, v[0], v[1], v[2]);

	/* verify that these really are linear dependencies of B */

	mul_MxN_Nx64(packed_matrix, x, v[0]);

	for (i = 0; i < n; i++) {
		if (v[0][i] != 0)
			break;
	}
	if (i < n) {
		logprintf(obj, "lanczos error: dependencies don't work\n");
		exit(-1);
	}
	
	free(v[0]);
	free(v[1]);
	free(v[2]);

	if (*num_deps_found == 0)
		logprintf(obj, "lanczos error: only trivial "
				"dependencies found\n");
	else
		logprintf(obj, "recovered %u nontrivial dependencies\n", 
				*num_deps_found);
	return x;
}

/*-----------------------------------------------------------------------*/
uint64 * block_lanczos(msieve_obj *obj, uint32 nrows, 
			uint32 num_dense_rows, uint32 ncols, 
			la_col_t *B, uint32 *num_deps_found) {
	
	/* Solve Bx = 0 for some nonzero x; the computed
	   solution, containing up to 64 of these nullspace
	   vectors, is returned */

	uint64 *post_lanczos_matrix;
	uint64 *dependencies;
	packed_matrix_t packed_matrix;

	if (ncols <= nrows) {
		logprintf(obj, "matrix must have more columns than rows\n");
		exit(-1);
	}

	/* optionally remove the densest rows of the matrix, and
	   optionally pack a few more rows into dense format */

	post_lanczos_matrix = form_post_lanczos_matrix(obj, &nrows,
					&num_dense_rows, ncols, B);
	if (num_dense_rows) {
		logprintf(obj, "matrix includes %u packed rows\n", 
					num_dense_rows);
	}

	packed_matrix_init(obj, &packed_matrix, B, nrows, 
			   ncols, num_dense_rows);

	do {
		dependencies = block_lanczos_core(obj, &packed_matrix,
						num_deps_found,
						post_lanczos_matrix);
	} while (dependencies == NULL);

	packed_matrix_free(&packed_matrix);
	free(post_lanczos_matrix);
	return dependencies;
}


syntax highlighted by Code2HTML, v. 0.9.1