#include "BSprivate.h" /*@ BSforward - Forward triangular matrix multiplication on a single vector Input Parameters: . A - The sparse matrix . x - The rhs . comm - The communication structure for A . procinfo - the usual processor information Output Parameters: . b - on exit contains A*x Returns: void @*/ void BSforward(BSpar_mat *A, FLOAT *x, FLOAT *b, BScomm *comm, BSprocinfo *procinfo) { BMphase *to_phase, *from_phase; BMmsg *msg; int i, j, k; int cl_ind, in_ind, symmetric; int count, size, ind, num_cols; int *row; FLOAT *nz; BScl_2_inode *clique2inode; BSnumbering *color2clique; BSinode *inodes; int *data_ptr, msg_len; FLOAT *msg_buf, *matrix; FLOAT *work; char UP = 'L'; char TR = 'N'; char ND = 'N'; int ione = 1; FLOAT one = 1.0; FLOAT zero = 0.0; /* Is the symmetric data structure used? */ symmetric = A->icc_storage; color2clique = A->color2clique; clique2inode = A->clique2inode; inodes = A->inodes->list; /* get some work space */ MY_MALLOC(work,(FLOAT *),sizeof(FLOAT)*A->num_rows,1); /* post for all messages */ BMinit_comp_msg(comm->from_msg,procinfo); CHKERR(0); if(symmetric) { if (A->save_diag == NULL) { /* because we know the diagonal is ones, initialize b to x */ for (i=0;inum_rows;i++) b[i] = x[i]; } else { for (i=0;inum_rows;i++) b[i] = A->save_diag[i]*x[i]; } } else { /* The following part is modified (ILU) for nonsymmetric version */ for (i=0;ilength-1;i++) { for (cl_ind=color2clique->numbers[i]; cl_indnumbers[i+1];cl_ind++) { if (procinfo->my_id == clique2inode->proc[cl_ind]) { size = clique2inode->d_mats[cl_ind].size; ind = clique2inode->d_mats[cl_ind].local_ind; matrix = clique2inode->d_mats[cl_ind].matrix; /* Much simplied in the case of ILU */ if (size > 1) { DGEMV(&TR,&size,&size,&one,matrix,&size, &(x[ind]),&ione,&zero,&(b[ind]),&ione); } else if (size == 1) { if (A->scale_diag == NULL) b[ind] = A->diag[ind]*x[ind]; else if (A->diag[ind] > 0.0) b[ind] = x[ind]; else if (A->diag[ind] < 0.0) b[ind] = -x[ind]; else b[ind] = 0.0; } } } } } /* first send my messages */ for (i=0;ilength-1;i++) { to_phase = BMget_phase(comm->to_msg,i); CHKERR(0); msg = NULL; while ((msg = BMnext_msg(to_phase,msg)) != NULL) { CHKERR(0); msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0); data_ptr = BMget_user(msg,&msg_len); CHKERR(0); for (j=0;jlength-1;i++) { for (cl_ind=color2clique->numbers[i]; cl_indnumbers[i+1];cl_ind++) { ind = clique2inode->d_mats[cl_ind].local_ind; if (procinfo->my_id == clique2inode->proc[cl_ind]) { if(symmetric) { /* first, multiply the clique */ /* only do the strictly lower triangular part */ /* we ASSUME the diagonal is all 1's */ size = clique2inode->d_mats[cl_ind].size; matrix = clique2inode->d_mats[cl_ind].matrix; if (size > 1) { j = size-1; matrix++; #ifdef MY_BLAS_DTRMV_ON MY_DTRMV_N_L(j,matrix,size,&(x[ind]),&(b[ind+1])); #else DCOPY(&j,&(x[ind]),&ione,work,&ione); DTRMV(&UP,&TR,&ND,&j,matrix,&size,work,&ione); DAXPY(&j,&one,work,&ione,&(b[ind+1]),&ione); #endif } } /* now, multiply the inodes */ for (in_ind=clique2inode->inode_index[cl_ind]; in_indinode_index[cl_ind+1];in_ind++) { row = inodes[in_ind].row_num; nz = inodes[in_ind].nz; size = inodes[in_ind].length; num_cols = inodes[in_ind].num_cols; if (size > 0) { #ifdef MY_BLAS_DGEMV_ON if (num_cols > DGEMV_UNROLL_LVL) { DGEMV(&TR,&size,&num_cols,&one, nz,&size,&(x[ind]),&ione,&zero,work,&ione); for (k=0;klength-1;i++) { from_phase = BMget_phase(comm->from_msg,i); CHKERR(0); while ((msg = BMrecv_msg(from_phase)) != NULL) { CHKERR(0); msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0); data_ptr = BMget_user(msg,&msg_len); CHKERR(0); count = 0; for (cl_ind=data_ptr[0];cl_ind<=data_ptr[1];cl_ind++) { for (in_ind=clique2inode->inode_index[cl_ind]; in_indinode_index[cl_ind+1];in_ind++) { row = inodes[in_ind].row_num; nz = inodes[in_ind].nz; size = inodes[in_ind].length; num_cols = inodes[in_ind].num_cols; if (size > 0) { #ifdef MY_BLAS_DGEMV_ON if (num_cols > DGEMV_UNROLL_LVL) { DGEMV(&TR,&size,&num_cols,&one, nz,&size,&(msg_buf[count]),&ione,&zero,work,&ione); for (k=0;kto_msg,procinfo); CHKERR(0); if(symmetric) { MLOG_flop((2*A->local_nnz)); } else { MLOG_flop((4*A->local_nnz-2*A->num_rows)); } }