#include "BSprivate.h"

/*@ BStri_mult - Multiply the matrix (A - shift*B) by a block of vectors

    Input Parameters:
.   A - a sparse matrix
.   comm_A - the communication structure for A
.   B - a sparse matrix
.   comm_B - the communication structure for B
.   v1 - the block of vectors to multiply by
.   t1 - a block of work vectors
.   t2 - a block of work vectors
.   shift - the shift value in (A-shift*B)
.   BS - the number of vectors in v1
.   procinfo - the usual processor stuff

    Output Parameters:
.   v2 - the resulting block of vectors

    Returns:
    void

    Notes:
    Different code is used to multiply a single vector than is used
    to multiply a block of vectors (this improves efficiency).  Also
    different code is used if shift=0.0.  If B is NULL, then we
    assume that it is the identity matrix.

 @*/
void BStri_mult(BSpar_mat *A, BScomm *comm_A, BSpar_mat *B, BScomm *comm_B,
	FLOAT *v1, FLOAT *v2, FLOAT *t1, FLOAT *t2, FLOAT shift, int BS,
	BSprocinfo *procinfo)
{
	int	i, j, n;
	FLOAT	*t_t1, *t_t2, *t_v1, *t_v2;

	if ((B == NULL) || (shift == 0.0)) {
		if (BS == 1) {
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSforward1(A,v1,v2,comm_A,procinfo); CHKERR(0);
			} else {
				BSforward(A,v1,v2,comm_A,procinfo); CHKERR(0);
			}
			MLOG_ACC(MM_FORWARD);
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSbackward1(A,v1,v2,comm_A,procinfo); CHKERR(0);
			} else {
				BSbackward(A,v1,v2,comm_A,procinfo); CHKERR(0);
			}
			MLOG_ACC(MM_BACKWARD);
			if (shift != 0.0) {
				n = A->num_rows;
				for (i=0;i<n;i++) {
					v2[i] -= shift*(v1[i]/(fabs(A->scale_diag[i])));
				}
			}
		} else {
			MLOG_ELM(procinfo->procset);
			BSb_forward(A,v1,v2,comm_A,BS,procinfo); CHKERR(0);
			MLOG_ACC(BMM_FORWARD);
			MLOG_ELM(procinfo->procset);
			BSb_backward(A,v1,v2,comm_A,BS,procinfo); CHKERR(0);
			MLOG_ACC(BMM_BACKWARD);
			if (shift != 0.0) {
				n = A->num_rows;
				for (i=0;i<BS;i++) {
					t_v2 = &(v2[i*n]);
					/* is tv_1 this correct? */
					t_v1 = &(v1[i*n]);
					for (j=0;j<n;j++) {
						t_v2[j] -= shift*(t_v1[j]/A->scale_diag[j]);
					}
				}
			}
		}
	} else {
		if (BS == 1) {
			n = A->num_rows;
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSforward1(A,v1,v2,comm_A,procinfo); CHKERR(0);
			} else {
				BSforward(A,v1,v2,comm_A,procinfo); CHKERR(0);
			}
			MLOG_ACC(MM_FORWARD);
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSbackward1(A,v1,v2,comm_A,procinfo); CHKERR(0);
			} else {
				BSbackward(A,v1,v2,comm_A,procinfo); CHKERR(0);
			}
			MLOG_ACC(MM_BACKWARD);
			if(A->icc_storage) {
				for (i=0;i<n;i++) {
					t1[i] = v1[i]/sqrt(A->scale_diag[i]);
				}
			} else {
				for (i=0;i<n;i++) {
					t1[i] = v1[i]/sqrt(fabs(A->scale_diag[i]));
				}
			}
			BSiperm_dvec(t1,t2,A->perm); CHKERR(0);
			BSperm_dvec(t2,t1,B->perm); CHKERR(0);
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSforward1(B,t1,t2,comm_B,procinfo); CHKERR(0);
			} else {
				BSforward(B,t1,t2,comm_B,procinfo); CHKERR(0);
			}
			MLOG_ACC(OMM_FORWARD);
			MLOG_ELM(procinfo->procset);
			if (procinfo->single) {
				BSbackward1(B,t1,t2,comm_B,procinfo); CHKERR(0);
			} else {
				BSbackward(B,t1,t2,comm_B,procinfo); CHKERR(0);
			}
			MLOG_ACC(OMM_BACKWARD);
			BSiperm_dvec(t2,t1,B->perm); CHKERR(0);
			BSperm_dvec(t1,t2,A->perm); CHKERR(0);
			if(A->icc_storage) {
				for (i=0;i<n;i++) {
					v2[i] -= shift*(t2[i]/sqrt(A->scale_diag[i]));
				}
			} else {
				for (i=0;i<n;i++) {
					v2[i] -= shift*(t2[i]/sqrt(fabs(A->scale_diag[i])));
				}
			}
		} else {
			n = A->num_rows;
			MLOG_ELM(procinfo->procset);
			BSb_forward(A,v1,v2,comm_A,BS,procinfo); CHKERR(0);
			MLOG_ACC(BMM_FORWARD);
			MLOG_ELM(procinfo->procset);
			BSb_backward(A,v1,v2,comm_A,BS,procinfo); CHKERR(0);
			MLOG_ACC(BMM_BACKWARD);
			if(A->icc_storage) {
				for (i=0;i<BS;i++) {
					t_t1 = &(t1[i*n]);
					t_v1 = &(v1[i*n]);
					for (j=0;j<n;j++) {
						t_t1[j] = t_v1[j]/sqrt(A->scale_diag[j]);
					}
				}
			} else {
				for (i=0;i<BS;i++) {
					t_t1 = &(t1[i*n]);
					t_v1 = &(v1[i*n]);
					for (j=0;j<n;j++) {
						t_t1[j] = t_v1[j]/sqrt(fabs(A->scale_diag[j]));
					}
				}
			}
			for (i=0;i<BS;i++) {
				t_t1 = &(t1[i*n]);
				t_t2 = &(t2[i*n]);
				BSiperm_dvec(t_t1,t_t2,A->perm); CHKERR(0);
				BSperm_dvec(t_t2,t_t1,B->perm); CHKERR(0);
			}
			MLOG_ELM(procinfo->procset);
			BSb_forward(B,t1,t2,comm_B,BS,procinfo); CHKERR(0);
			MLOG_ACC(OBMM_FORWARD);
			MLOG_ELM(procinfo->procset);
			BSb_backward(B,t1,t2,comm_B,BS,procinfo); CHKERR(0);
			MLOG_ACC(OBMM_BACKWARD);
			for (i=0;i<BS;i++) {
				t_t1 = &(t1[i*n]);
				t_t2 = &(t2[i*n]);
				BSiperm_dvec(t_t2,t_t1,B->perm); CHKERR(0);
				BSperm_dvec(t_t1,t_t2,A->perm); CHKERR(0);
			}
			if(A->icc_storage) {
				for (i=0;i<BS;i++) {
					t_t2 = &(t2[i*n]);
					t_v2 = &(v2[i*n]);
					for (j=0;j<n;j++) {
						t_v2[j] -= shift*(t_t2[j]/sqrt(A->scale_diag[j]));
					}
				}
			} else {
				for (i=0;i<BS;i++) {
					t_t2 = &(t2[i*n]);
					t_v2 = &(v2[i*n]);
					for (j=0;j<n;j++) {
						t_v2[j] -= shift*(t_t2[j]/sqrt(fabs(A->scale_diag[j])));
					}
				}
			}
		}
	}
}


syntax highlighted by Code2HTML, v. 0.9.1