#include "BSprivate.h"

/*@ BSforward - Forward triangular matrix multiplication on a single vector

    Input Parameters:
.   A - The sparse matrix
.   x - The rhs
.   comm - The communication structure for A
.   procinfo - the usual processor information

    Output Parameters:
.   b - on exit contains A*x

    Returns:
    void

 @*/
void BSforward(BSpar_mat *A, FLOAT *x, FLOAT *b,
		BScomm *comm, BSprocinfo *procinfo)
{
	BMphase *to_phase, *from_phase;
	BMmsg *msg;
	int	i, j, k;
	int	cl_ind, in_ind, symmetric;
	int	count, size, ind, num_cols;
	int *row;
	FLOAT *nz;
	BScl_2_inode *clique2inode;
	BSnumbering *color2clique;
	BSinode *inodes;
	int	*data_ptr, msg_len;
	FLOAT *msg_buf, *matrix;
	FLOAT *work;
	char UP = 'L';
	char TR = 'N';
	char ND = 'N';
	int	ione = 1;
	FLOAT one = 1.0;
	FLOAT zero = 0.0;

	/* Is the symmetric data structure used? */
	symmetric = A->icc_storage;

	color2clique = A->color2clique;
	clique2inode = A->clique2inode;
	inodes = A->inodes->list;

	/* get some work space */
	MY_MALLOC(work,(FLOAT *),sizeof(FLOAT)*A->num_rows,1);

	/* post for all messages */
	BMinit_comp_msg(comm->from_msg,procinfo); CHKERR(0);

	if(symmetric) {
		if (A->save_diag == NULL) {
			/* because we know the diagonal is ones, initialize b to x */
			for (i=0;i<A->num_rows;i++) b[i] = x[i];
		} else {
			for (i=0;i<A->num_rows;i++) b[i] = A->save_diag[i]*x[i];
		}
	} else {
		/* The following part is modified (ILU) for nonsymmetric version */
		for (i=0;i<color2clique->length-1;i++) {
			for (cl_ind=color2clique->numbers[i];
				cl_ind<color2clique->numbers[i+1];cl_ind++) {
				if (procinfo->my_id == clique2inode->proc[cl_ind]) {
					size = clique2inode->d_mats[cl_ind].size;
					ind = clique2inode->d_mats[cl_ind].local_ind;
					matrix = clique2inode->d_mats[cl_ind].matrix;

					/* Much simplied in the case of ILU */
					if (size > 1) {
						DGEMV(&TR,&size,&size,&one,matrix,&size,
						&(x[ind]),&ione,&zero,&(b[ind]),&ione);
					} else if (size == 1) {
						if (A->scale_diag == NULL)
							b[ind] = A->diag[ind]*x[ind];
						else
							if (A->diag[ind] > 0.0)
								b[ind] = x[ind];
							else if (A->diag[ind] < 0.0)
								b[ind] = -x[ind];
							else
								b[ind] = 0.0;
					}
				}
			}
		}
	}

	/* first send my messages */
	for (i=0;i<color2clique->length-1;i++) {
		to_phase = BMget_phase(comm->to_msg,i); CHKERR(0);
		msg = NULL;
		while ((msg = BMnext_msg(to_phase,msg)) != NULL) {
			CHKERR(0);
			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
			for (j=0;j<msg_len;j++) {
				msg_buf[j] = x[data_ptr[j]];
			}
			BMsendf_msg(msg,procinfo); CHKERR(0);
		}
		CHKERR(0);
	}

	/* do some local work */
	for (i=0;i<color2clique->length-1;i++) {
		for (cl_ind=color2clique->numbers[i];
				cl_ind<color2clique->numbers[i+1];cl_ind++) {
			ind = clique2inode->d_mats[cl_ind].local_ind;
			if (procinfo->my_id == clique2inode->proc[cl_ind]) {
				if(symmetric) {
					/* first, multiply the clique */
					/* only do the strictly lower triangular part */
					/* we ASSUME the diagonal is all 1's */
					size = clique2inode->d_mats[cl_ind].size;
					matrix = clique2inode->d_mats[cl_ind].matrix;
					if (size > 1) {
						j = size-1;
						matrix++;
#ifdef MY_BLAS_DTRMV_ON
						MY_DTRMV_N_L(j,matrix,size,&(x[ind]),&(b[ind+1]));
#else
						DCOPY(&j,&(x[ind]),&ione,work,&ione);
						DTRMV(&UP,&TR,&ND,&j,matrix,&size,work,&ione);
						DAXPY(&j,&one,work,&ione,&(b[ind+1]),&ione);
#endif
					}
				}

				/* now, multiply the inodes */
				for (in_ind=clique2inode->inode_index[cl_ind];
					in_ind<clique2inode->inode_index[cl_ind+1];in_ind++) {
					row = inodes[in_ind].row_num;
					nz = inodes[in_ind].nz;
					size = inodes[in_ind].length;
					num_cols = inodes[in_ind].num_cols;
					if (size > 0) {
#ifdef MY_BLAS_DGEMV_ON
						if (num_cols > DGEMV_UNROLL_LVL) {
							DGEMV(&TR,&size,&num_cols,&one,
								nz,&size,&(x[ind]),&ione,&zero,work,&ione);
							for (k=0;k<size;k++) b[row[k]] += work[k];
						} else {
							MY_DGEMV_N_1111(size,num_cols,nz,size,&(x[ind]),b,row);
						}
#else
						DGEMV(&TR,&size,&num_cols,&one,
							nz,&size,&(x[ind]),&ione,&zero,work,&ione);
						for (k=0;k<size;k++) b[row[k]] += work[k];
#endif
					}
					ind += num_cols;
				}
			}
		}
	}

	/* receive my messages and do non-local work */
	for (i=0;i<color2clique->length-1;i++) {
		from_phase = BMget_phase(comm->from_msg,i); CHKERR(0);
		while ((msg = BMrecv_msg(from_phase)) != NULL) {
			CHKERR(0);
			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
			count = 0;
			for (cl_ind=data_ptr[0];cl_ind<=data_ptr[1];cl_ind++) {
				for (in_ind=clique2inode->inode_index[cl_ind];
					in_ind<clique2inode->inode_index[cl_ind+1];in_ind++) {
					row = inodes[in_ind].row_num;
					nz = inodes[in_ind].nz;
					size = inodes[in_ind].length;
					num_cols = inodes[in_ind].num_cols;
					if (size > 0) {
#ifdef MY_BLAS_DGEMV_ON
						if (num_cols > DGEMV_UNROLL_LVL) {
							DGEMV(&TR,&size,&num_cols,&one,
								nz,&size,&(msg_buf[count]),&ione,&zero,work,&ione);
							for (k=0;k<size;k++) b[row[k]] += work[k];
						} else {
							MY_DGEMV_N_1111(size,num_cols,nz,size,&(msg_buf[count]),
								b,row);
						}
#else
						DGEMV(&TR,&size,&num_cols,&one,
							nz,&size,&(msg_buf[count]),&ione,&zero,work,&ione);
						for (k=0;k<size;k++) b[row[k]] += work[k];
#endif
					}
					count += num_cols;
				}
			}
			BMfree_msg(msg); CHKERR(0);
		}
		CHKERR(0);
	}
	MY_FREE(work);
	/* wait for all of the sent messages to finish */
	BMfinish_comp_msg(comm->to_msg,procinfo); CHKERR(0);
	if(symmetric) {
		MLOG_flop((2*A->local_nnz));
	} else {
		MLOG_flop((4*A->local_nnz-2*A->num_rows));
	}
}


syntax highlighted by Code2HTML, v. 0.9.1