/*--------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Jason Papadopoulos. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.

Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may 
benefit from your work.	
       				   --jasonp@boo.net 6/3/07
--------------------------------------------------------------------*/

#include "lanczos.h"

/*-------------------------------------------------------------------*/
static void mul_unpacked(packed_matrix_t *matrix,
			  uint64 *x, uint64 *b) {

	uint32 ncols = matrix->ncols;
	uint32 num_dense_rows = matrix->num_dense_rows;
	la_col_t *A = matrix->unpacked_cols;
	uint32 i, j;

	memset(b, 0, ncols * sizeof(uint64));
	
	for (i = 0; i < ncols; i++) {
		la_col_t *col = A + i;
		uint32 *row_entries = col->data;
		uint64 tmp = x[i];

		for (j = 0; j < col->weight; j++) {
			b[row_entries[j]] ^= tmp;
		}
	}

	if (num_dense_rows) {
		for (i = 0; i < ncols; i++) {
			la_col_t *col = A + i;
			uint32 *row_entries = col->data + col->weight;
			uint64 tmp = x[i];
	
			for (j = 0; j < num_dense_rows; j++) {
				if (row_entries[j / 32] & 
						((uint32)1 << (j % 32))) {
					b[j] ^= tmp;
				}
			}
		}
	}
}

/*-------------------------------------------------------------------*/
static void mul_trans_unpacked(packed_matrix_t *matrix,
				uint64 *x, uint64 *b) {

	uint32 ncols = matrix->ncols;
	uint32 num_dense_rows = matrix->num_dense_rows;
	la_col_t *A = matrix->unpacked_cols;
	uint32 i, j;

	for (i = 0; i < ncols; i++) {
		la_col_t *col = A + i;
		uint32 *row_entries = col->data;
		uint64 accum = 0;

		for (j = 0; j < col->weight; j++) {
			accum ^= x[row_entries[j]];
		}
		b[i] = accum;
	}

	if (num_dense_rows) {
		for (i = 0; i < ncols; i++) {
			la_col_t *col = A + i;
			uint32 *row_entries = col->data + col->weight;
			uint64 accum = b[i];
	
			for (j = 0; j < num_dense_rows; j++) {
				if (row_entries[j / 32] &
						((uint32)1 << (j % 32))) {
					accum ^= x[j];
				}
			}
			b[i] = accum;
		}
	}
}

/*-------------------------------------------------------------------*/
#ifdef WIN32
static DWORD WINAPI mul_packed_core(LPVOID thread_data) {
#else
static void *mul_packed_core(void *thread_data) {
#endif

	thread_data_t *t = (thread_data_t *)thread_data;
	uint64 *x = t->x;
	uint64 *b = t->b;
	uint32 i, j;

	/* proceed block by block. We assume that blocks access
	   the matrix in row-major order; when computing b = A*x
	   this will write to the same block of b repeatedly, and
	   will read from all of x. This reduces the number of
	   dirty cache writebacks, improving performance slightly */

	for (i = 0; i < t->num_blocks; i++) {
		packed_block_t *curr_block = t->blocks + i;
		uint32 num_entries = curr_block->num_entries;
		entry_idx_t *entries = curr_block->entries;
		uint64 *curr_col = x + curr_block->start_col;
		uint64 *curr_b = b + curr_block->start_row;
		uint32 k = 0;
		uint32 m;

		/* unroll by 16, i.e. the number of matrix elements
		   in one cache line (usually). For 32-bit x86, we get
		   a huge performance boost by using either SSE or MMX
		   registers; not because they intrinsically are faster,
		   but because using them cuts the number of memory
		   operations in half, allowing the processor to buffer
		   more xor operations */

		for (j = 0; j < (num_entries & (uint32)(~15)); j += 16) {
			#ifdef MANUAL_PREFETCH
			PREFETCH(entries + j + 48);
			#endif

#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
		#define _txor(x)				\
			"movzwl 4*" #x "+2(%1,%4,4), %0  \n\t"	\
			"movq (%3,%0,8), %%mm0      \n\t"	\
			"movzwl 4*" #x "(%1,%4,4), %0    \n\t"	\
			"pxor (%2,%0,8), %%mm0      \n\t"	\
			"movq %%mm0, (%2,%0,8)      \n\t"

		asm volatile (
			_txor( 0) _txor( 1) _txor( 2) _txor( 3)
			_txor( 4) _txor( 5) _txor( 6) _txor( 7)
			_txor( 8) _txor( 9) _txor(10) _txor(11)
			_txor(12) _txor(13) _txor(14) _txor(15)
			:"=r"(k)
			:"r"(entries), "r"(curr_b), "r"(curr_col),
			 "r"(j), "0"(k)
			:"%mm0", "memory");

#elif defined(_MSC_VER) && !defined(_WIN64) && defined(HAS_MMX)
		#define _txor(x)				\
			__asm movzx	eax,word ptr[4*x+2+edx]	\
			__asm movq	mm0,[ecx+eax*8]		\
			__asm movzx	eax,word ptr[4*x+edx]	\
			__asm pxor	mm0,[esi+eax*8]		\
			__asm movq	[esi+eax*8],mm0

		__asm
		{	
			mov		esi,curr_b
			mov		ecx,curr_col
			mov		edx,entries
			mov		eax,j
			lea		edx,[edx+eax*4]
			_txor( 0); _txor( 1); _txor( 2); _txor( 3);
			_txor( 4); _txor( 5); _txor( 6); _txor( 7);
			_txor( 8); _txor( 9); _txor(10); _txor(11);
			_txor(12); _txor(13); _txor(14); _txor(15);
		}
#else
	#define _txor(x) curr_b[entries[j+x].row_off] ^= \
					 curr_col[entries[j+x].col_off]
		_txor( 0); _txor( 1); _txor( 2); _txor( 3);
		_txor( 4); _txor( 5); _txor( 6); _txor( 7);
		_txor( 8); _txor( 9); _txor(10); _txor(11);
		_txor(12); _txor(13); _txor(14); _txor(15);
#endif
	#undef _txor
		}

		for (; j < num_entries; j++) {
			k = entries[j].row_off;
			m = entries[j].col_off;
			curr_b[k] ^= curr_col[m];
		}
	}

#ifdef WIN32
	return 0;
#else
	return NULL;
#endif
}

/*-------------------------------------------------------------------*/
static void mul_packed(packed_matrix_t *matrix,
		  uint64 *x, uint64 *b) {

	uint32 i;
	uint32 ncols = matrix->ncols;
	uint32 dense_row_blocks = (matrix->num_dense_rows + 63) / 64;
	thread_id_t thread_id[MAX_THREADS];

	for (i = 0; i < matrix->num_threads; i++) {
		thread_data_t *t = matrix->thread_data + i;

		/* use each thread's scratch vector, except the
		   first thead, which has no scratch vector but
		   uses b instead */

		t->x = x;
		if (i == 0)
			t->b = b;
		memset(t->b, 0, ncols * sizeof(uint64));

		/* fire off each part of the matrix multiply
		   in a separate thread, except the last part.
		   The current thread does the last partial
		   multiply, and this saves one join operation
		   for a noticeable speedup */

		if (i == matrix->num_threads - 1) {
			mul_packed_core(t);
		}
		else {
#ifdef WIN32
			thread_id[i] = CreateThread(NULL, 0, 
						mul_packed_core,
						t, 0, NULL);
#else
			pthread_create(thread_id + i, NULL, 
						mul_packed_core, t);
#endif
		}
	}

	/* wait for each thread to finish. All the scratch
	   vectors used by threads get xor-ed into the final b
	   vector */

	for (i = 0; i < matrix->num_threads; i++) {
		if (i < matrix->num_threads - 1) {
#ifdef WIN32
			WaitForSingleObject(thread_id[i], INFINITE);
#else
			pthread_join(thread_id[i], NULL);
#endif
		}

		if (i > 0) {
			uint64 *curr_b = matrix->thread_data[i].b;
			uint32 j;

			for (j = 0; j < ncols; j++)
				b[j] ^= curr_b[j];
		}
	}

	/* multiply the densest few rows by x (in batches of 64 rows) */

	for (i = 0; i < dense_row_blocks; i++) {
		mul_64xN_Nx64(matrix->dense_blocks[i], 
				x, b + 64 * i, ncols);
	}

#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
	asm volatile ("emms");
#elif defined(_MSC_VER) && !defined(_WIN64) && defined(HAS_MMX)
	__asm emms
#endif
}

/*-------------------------------------------------------------------*/
#ifdef WIN32
static DWORD WINAPI mul_trans_packed_core(LPVOID thread_data) {
#else
static void *mul_trans_packed_core(void *thread_data) {
#endif

	thread_data_t *t = (thread_data_t *)thread_data;
	uint64 *x = t->x;
	uint64 *b = t->b;
	uint32 i, j;

	/* you would think that doing the matrix multiply
	   in column-major order would be faster, since this would
	   also minimize dirty cache writes. Except that it's slower;
	   it appears that row-major matrix access minimizes some
	   kind of thrashing somewhere */

	for (i = 0; i < t->num_blocks; i++) {
		packed_block_t *curr_block = t->blocks + i;
		uint32 num_entries = curr_block->num_entries;
		entry_idx_t *entries = curr_block->entries;
		uint64 *curr_row = x + curr_block->start_row;
		uint64 *curr_b = b + curr_block->start_col;
		uint32 k = 0;
		uint32 m;

		/* unroll by 16, i.e. the number of matrix elements
		   in one cache line (usually). For 32-bit x86, we get
		   a huge performance boost by using either SSE or MMX
		   registers; not because they intrinsically are faster,
		   but because using them cuts the number of memory
		   operations in half, allowing the processor to buffer
		   more xor operations */

		for (j = 0; j < (num_entries & (uint32)(~15)); j += 16) {
			#ifdef MANUAL_PREFETCH
			PREFETCH(entries + j + 48);
			#endif
#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
		#define _txor(x)				\
			"movzwl 4*" #x "(%1,%4,4), %0    \n\t"	\
			"movq (%3,%0,8), %%mm0      \n\t"	\
			"movzwl 4*" #x "+2(%1,%4,4), %0  \n\t"	\
			"pxor (%2,%0,8), %%mm0      \n\t"	\
			"movq %%mm0, (%2,%0,8)      \n\t"

		asm volatile (
			_txor( 0) _txor( 1) _txor( 2) _txor( 3)
			_txor( 4) _txor( 5) _txor( 6) _txor( 7)
			_txor( 8) _txor( 9) _txor(10) _txor(11)
			_txor(12) _txor(13) _txor(14) _txor(15)
			:"=r"(k)
			:"r"(entries), "r"(curr_b), "r"(curr_row),
			 "r"(j), "0"(k)
			:"%mm0", "memory");

#elif defined(_MSC_VER) && !defined(_WIN64) && defined(HAS_MMX)
		#define _txor(x)				\
			__asm movzx	eax,word ptr[4*x+2+edx]	\
			__asm movq	mm0,[ecx+eax*8]		\
			__asm movzx	eax,word ptr[4*x+edx]	\
			__asm pxor	mm0,[esi+eax*8]		\
			__asm movq	[esi+eax*8],mm0

		__asm
		{	
			mov		esi,curr_b
			mov		ecx,curr_row
			mov		edx,entries
			mov		eax,j
			lea		edx,[edx+eax*4]
			_txor( 0); _txor( 1); _txor( 2); _txor( 3);
			_txor( 4); _txor( 5); _txor( 6); _txor( 7);
			_txor( 8); _txor( 9); _txor(10); _txor(11);
			_txor(12); _txor(13); _txor(14); _txor(15);
		}
#else
	#define _txor(x) curr_b[entries[j+x].col_off] ^= \
					 curr_row[entries[j+x].row_off]	
		_txor( 0); _txor( 1); _txor( 2); _txor( 3);
		_txor( 4); _txor( 5); _txor( 6); _txor( 7);
		_txor( 8); _txor( 9); _txor(10); _txor(11);
		_txor(12); _txor(13); _txor(14); _txor(15);
#endif
	#undef _txor
		}
		for (; j < num_entries; j++) {
			k = entries[j].row_off;
			m = entries[j].col_off;
			curr_b[m] ^= curr_row[k];
		}
	}

#ifdef WIN32
	return 0;
#else
	return NULL;
#endif
}

/*-------------------------------------------------------------------*/
static void mul_trans_packed(packed_matrix_t *matrix,
			uint64 *x, uint64 *b) {

	uint32 i;
	uint32 ncols = matrix->ncols;
	uint32 dense_row_blocks = (matrix->num_dense_rows + 63) / 64;
	thread_id_t thread_id[MAX_THREADS];
	uint64 *tmp_b[MAX_THREADS];

	memset(b, 0, ncols * sizeof(uint64));

	for (i = 0; i < matrix->num_threads; i++) {
		thread_data_t *t = matrix->thread_data + i;

		/* separate threads fill up disjoint portions
		   of a single b vector, and do not need 
		   per-thread scratch space */

		tmp_b[i] = t->b;
		t->x = x;
		t->b = b;

		/* fire off each part of the matrix multiply
		   in a separate thread, except the last part.
		   The current thread does the last partial
		   multiply, and this saves one join operation
		   for a noticeable speedup */

		if (i == matrix->num_threads - 1) {
			mul_trans_packed_core(t);
		}
		else {
#ifdef WIN32
			thread_id[i] = CreateThread(NULL, 0, 
					mul_trans_packed_core,
					t, 0, NULL);
#else
			pthread_create(thread_id + i, NULL, 
					mul_trans_packed_core, t);
#endif
		}
	}

	/* wait for each thread to finish */

	for (i = 0; i < matrix->num_threads; i++) {
		if (i < matrix->num_threads - 1) {
#ifdef WIN32
			WaitForSingleObject(thread_id[i], INFINITE);
#else
			pthread_join(thread_id[i], NULL);
#endif
		}
		matrix->thread_data[i].b = tmp_b[i];
	}

	/* multiply the densest few rows by x (in batches of 64 rows) */

	for (i = 0; i < dense_row_blocks; i++) {
		mul_Nx64_64x64_acc(matrix->dense_blocks[i], 
				   x + 64 * i, b, ncols);
	}

#if defined(__GNUC__) && defined(__i386__) && defined(HAS_MMX)
	asm volatile ("emms");
#elif defined(_MSC_VER) && !defined(_WIN64) && defined(HAS_MMX)
	__asm emms
#endif
}

/*-------------------------------------------------------------------*/
static void matrix_thread_init(thread_data_t *t, la_col_t *A,
			uint32 nrows, uint32 col_min, 
			uint32 col_max, uint32 block_size) {

	uint32 i, j, k;
	uint32 num_row_blocks;
	uint32 num_col_blocks;
	packed_block_t *curr_stripe;

	/* allocate blocks in row-major order; a 'stripe' is
	   a vertical column of blocks */

	num_row_blocks = (nrows + (block_size-1)) / block_size;
	num_col_blocks = ((col_max - col_min + 1) + 
				(block_size-1)) / block_size;
	t->num_blocks = num_row_blocks * num_col_blocks;
	t->blocks = curr_stripe = (packed_block_t *)calloc(
						(size_t)t->num_blocks,
						sizeof(packed_block_t));

	/* we convert the sparse part of the matrix to packed
	   format one stripe at a time. This limits the worst-
	   case memory use of the packing process */

	for (i = 0; i < num_col_blocks; i++, curr_stripe++) {

		uint32 curr_cols = MIN(block_size, (col_max - col_min + 1) - 
						i * block_size);
		packed_block_t *b;

		/* initialize the blocks in stripe i */

		for (j = 0, b = curr_stripe; j < num_row_blocks; j++) {
			b->start_row = j * block_size;
			b->start_col = col_min + i * block_size;
			b += num_col_blocks;
		}

		/* count the number of nonzero entries in each block */

		for (j = 0; j < curr_cols; j++) {
			la_col_t *c = A + col_min + i * block_size + j;

			for (k = 0, b = curr_stripe; k < c->weight; k++) {
				uint32 index = c->data[k];

				while (index >= b->start_row + block_size)
					b += num_col_blocks;
				b->num_entries_alloc++;
			}
		}

		/* concatenate the nonzero elements of the matrix
		   columns corresponding to this stripe. Note that
		   the ordering of elements within a single block
		   is arbitrary, and it's possible that some non-
		   default ordering can enhance performance. However,
		   I've tried a few different ideas and they're much
		   slower than just storing the nonzeros in the order
		   in which they occur in the unpacked matrix 
		   
		   We technically can combine the previous pass through
		   the columns with this pass, but on some versions of
		   libc the number of reallocations causes an incredible
		   slowdown */

		for (j = 0, b = curr_stripe; j < num_row_blocks; j++) {
			b->entries = (entry_idx_t *)malloc(
						b->num_entries_alloc *
						sizeof(entry_idx_t));
			b += num_col_blocks;
		}

		for (j = 0; j < curr_cols; j++) {
			la_col_t *c = A + col_min + i * block_size + j;

			for (k = 0, b = curr_stripe; k < c->weight; k++) {
				uint32 index = c->data[k];
				entry_idx_t *e;

				while (index >= b->start_row + block_size)
					b += num_col_blocks;

				e = b->entries + b->num_entries++;
				e->row_off = index - b->start_row;
				e->col_off = j;
			}

			free(c->data);
			c->data = NULL;
		}
	}
}

/*-------------------------------------------------------------------*/
void packed_matrix_init(msieve_obj *obj,
			packed_matrix_t *p, la_col_t *A,
			uint32 nrows, uint32 ncols,
			uint32 num_dense_rows) {

	uint32 i, j, k;
	uint32 dense_row_blocks;
	uint32 block_size;
	uint32 num_threads;
	uint32 num_nonzero;
	uint32 num_nonzero_per_thread;

	/* initialize */

	memset(p, 0, sizeof(packed_matrix_t));
	p->unpacked_cols = A;
	p->nrows = nrows;
	p->ncols = ncols;
	p->num_dense_rows = num_dense_rows;

	if (ncols <= MIN_NCOLS_TO_PACK)
		return;

	p->unpacked_cols = NULL;

	/* decide on the block size. We want to split the
	   cache into thirds; one third for x, one third for
	   b and the last third for streaming access to 
	   each block. If the matrix is small, adjust the
	   block size so that the matrix is divided 2.5
	   ways in each dimension.
	  
	   Note that the following does not compensate for
	   having multiple threads. If each thread gets a 
	   separate processor, no compensation is needed.
	   This could conceivably cause problems if multiple 
	   cores share the same cache, but multicore processors 
	   typically have pretty big caches anyway */

	block_size = obj->cache_size2 / (3 * sizeof(uint64));
	block_size = MIN(block_size, ncols / 2.5);
	block_size = MIN(block_size, 65536);
	if (block_size == 0)
		block_size = 32768;

	logprintf(obj, "using block size %u for "
			"processor cache size %u kB\n", 
				block_size, obj->cache_size2 / 1024);

	/* pack the dense rows 64 at a time */

	dense_row_blocks = (num_dense_rows + 63) / 64;
	if (dense_row_blocks) {
		p->dense_blocks = (uint64 **)malloc(dense_row_blocks *
						sizeof(uint64 *));
		for (i = 0; i < dense_row_blocks; i++) {
			p->dense_blocks[i] = (uint64 *)malloc(ncols *
							sizeof(uint64));
		}

		for (i = 0; i < ncols; i++) {
			la_col_t *c = A + i;
			uint32 *src = c->data + c->weight;
			for (j = 0; j < dense_row_blocks; j++) {
				p->dense_blocks[j][i] = 
						(uint64)src[2 * j + 1] << 32 |
						(uint64)src[2 * j];
			}
		}
	}

	/* determine the number of threads to use */

	num_threads = obj->num_threads;
	if (num_threads < 2 || ncols < MIN_NCOLS_TO_THREAD)
		num_threads = 1;
	p->num_threads = num_threads = MIN(num_threads, MAX_THREADS);

	/* compute the number of nonzero elements in the submatrix
	   given to each thread */

	for (i = num_nonzero = 0; i < ncols; i++)
		num_nonzero += A[i].weight;
	num_nonzero_per_thread = num_nonzero / num_threads;

	/* divide the matrix into groups of columns, one group
	   per thread, and pack each group separately */

	for (i = j = k = num_nonzero = 0; i < ncols; i++) {

		num_nonzero += A[i].weight;

		if (i == ncols - 1 || num_nonzero >= num_nonzero_per_thread) {
			matrix_thread_init(p->thread_data + k, A,
						nrows, j, i, block_size);

			/* each thread needs scratch space to store
			   matrix products. The first thread doesn't need
			   scratch space, it's provided by calling code */

			if (k > 0) {
				p->thread_data[k].b = (uint64 *)malloc(ncols * 
							sizeof(uint64));
			}
			j = i + 1;
			k++;
			num_nonzero = 0;
		}
	}
}

/*-------------------------------------------------------------------*/
void packed_matrix_free(packed_matrix_t *p) {

	uint32 i, j;

	if (p->unpacked_cols) {
		la_col_t *A = p->unpacked_cols;
		for (i = 0; i < p->ncols; i++) {
			free(A[i].data);
			A[i].data = NULL;
		}
	}
	else {
		for (i = 0; i < p->num_threads; i++) {
			thread_data_t *t = p->thread_data + i;
			for (j = 0; j < t->num_blocks; j++) {
				free(t->blocks[j].entries);
			}
			free(t->blocks);
			if (i > 0) {
				free(t->b);
			}
		}
		for (i = 0; i < (p->num_dense_rows + 63) / 64; i++)
			free(p->dense_blocks[i]);
		free(p->dense_blocks);
	}
}

/*-------------------------------------------------------------------*/
void mul_MxN_Nx64(packed_matrix_t *A, uint64 *x, uint64 *b) {

	/* Multiply the vector x[] by the matrix A (stored
	   columnwise) and put the result in b[]. */

	if (A->unpacked_cols)
		mul_unpacked(A, x, b);
	else
		mul_packed(A, x, b);
}

/*-------------------------------------------------------------------*/
void mul_trans_MxN_Nx64(packed_matrix_t *A, uint64 *x, uint64 *b) {

	/* Multiply the vector x[] by the transpose of the
	   matrix A and put the result in b[]. Since A is stored
	   by columns, this is just a matrix-vector product */

	if (A->unpacked_cols)
		mul_trans_unpacked(A, x, b);
	else
		mul_trans_packed(A, x, b);
}