#include #include #include "pdsp_defs.h" #include "util.h" void dlsolve(int, int, double *, double *); void dmatvec(int, int, int, double *, double *, double *); void dmatvec2(int, int, int, double*, double*, double*, double*, double*); void pdgstrf_bmod2D_mv2( const int pnum, /* process number */ const int n, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int fsupc,/* leading column of the updating supernode */ const int krep, /* last column of the updating s-node */ const int nsupc,/* number of columns in the updating s-node */ int nsupr, /* number of rows in the updating s-node */ int nrow, /* number of rows below the diagonal block of the updating supernode */ int *repfnz, /* in */ int *panel_lsub, /* modified */ int *w_lsub_end, /* modified */ int *spa_marker, /* modified; size n-by-w */ double *dense, /* modified */ double *tempv, /* working array - zeros on entry/exit */ GlobalLU_t *Glu, /* modified */ Gstat_t *Gstat /* modified */ ) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * Performs numeric 2-D block updates (sup-panel) in topological order. * Results are returned in SPA dense[*,w]. * */ double zero = 0.0; #if ( MACH==CRAY_PVP ) _fcd ftcs1 = _cptofcd("L", strlen("L")), ftcs2 = _cptofcd("N", strlen("N")), ftcs3 = _cptofcd("U", strlen("U")); #endif #ifdef USE_VENDOR_BLAS int incx = 1, incy = 1; double alpha = 1.0, beta = zero; #endif double ukj, ukj1, ukj2; int luptr, luptr1, luptr2; int segsze; int block_nrow; /* no of rows in a block row */ register int lptr; /* points to the row subscripts of a supernode */ int kfnz, irow, no_zeros; register int isub, isub1, i, j; register int jj; /* index through each column in the panel */ int krep_ind; int *repfnz_col; /* repfnz[] for a column in the panel */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ double *dense_col; /* dense[] for a column in the panel */ double *TriTmp; register int ldaTmp; register int r_ind, r_hi; static int first = 1, maxsuper, rowblk; register int twocols; int kfnz2[2], jj2[2]; /* detect two identical columns */ double *tri[2], *matvec[2]; int *lsub, *xlsub_end; double *lusup; int *xlusup; register float flopcnt; #ifdef TIMING double *utime = Gstat->utime; double f_time; #endif if ( first ) { maxsuper = sp_ienv(3); rowblk = sp_ienv(4); first = 0; } ldaTmp = maxsuper + rowblk; lsub = Glu->lsub; xlsub_end = Glu->xlsub_end; lusup = Glu->lusup; xlusup = Glu->xlusup; lptr = Glu->xlsub[fsupc]; krep_ind = lptr + nsupc - 1; /* --------------------------------------------------------------- * Sequence through each column in the panel -- triangular solves. * The results of the triangular solves of all columns in the * panel are temporaroly stored in TriTemp[*] array. * For the unrolled small supernodes of size <= 3, we also perform * matrix-vector updates from below the diagonal block. * --------------------------------------------------------------- */ repfnz_col= repfnz; dense_col = dense; TriTmp = tempv; col_marker= spa_marker; col_lsub = panel_lsub; for (jj = jcol; jj < jcol + w; ++jj, col_marker += n, col_lsub += n, repfnz_col += n, dense_col += n, TriTmp += ldaTmp ) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* Skip any zero segment */ segsze = krep - kfnz + 1; luptr = xlusup[fsupc]; flopcnt = segsze * (segsze - 1) + 2 * nrow * segsze; Gstat->procstat[pnum].fcops += flopcnt; #ifdef TIMING f_time = SuperLU_timer_(); #endif /* Case 1: Update U-segment of size 1 -- col-col update */ if ( segsze == 1 ) { ukj = dense_col[lsub[krep_ind]]; luptr += nsupr*(nsupc-1) + nsupc; for (i = lptr + nsupc; i < xlsub_end[fsupc]; i++) { irow = lsub[i]; dense_col[irow] -= ukj * lusup[luptr]; ++luptr; #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else if ( segsze <= 3 ) { ukj = dense_col[lsub[krep_ind]]; ukj1 = dense_col[lsub[krep_ind - 1]]; luptr += nsupr*(nsupc-1) + nsupc-1; luptr1 = luptr - nsupr; if ( segsze == 2 ) { ukj -= ukj1 * lusup[luptr1]; dense_col[lsub[krep_ind]] = ukj; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; dense_col[irow] -= (ukj * lusup[luptr] + ukj1 * lusup[luptr1]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { ukj2 = dense_col[lsub[krep_ind - 2]]; luptr2 = luptr1 - nsupr; ukj1 -= ukj2 * lusup[luptr2-1]; ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2]; dense_col[lsub[krep_ind]] = ukj; dense_col[lsub[krep_ind-1]] = ukj1; for (i = lptr + nsupc; i < xlsub_end[fsupc]; ++i) { irow = lsub[i]; luptr++; luptr1++; luptr2++; dense_col[irow] -= (ukj * lusup[luptr] + ukj1*lusup[luptr1] + ukj2*lusup[luptr2]); #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif } } #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } else { /* segsze >= 4 */ /* Copy A[*,j] segment from dense[*] to TriTmp[*], which holds the result of triangular solve. */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; ++i) { irow = lsub[isub]; TriTmp[i] = dense_col[irow]; /* Gather */ ++isub; } /* start effective triangle */ luptr += nsupr * no_zeros + no_zeros; #ifdef TIMING f_time = SuperLU_timer_(); #endif #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #else dtrsv_( "L", "N", "U", &segsze, &lusup[luptr], &nsupr, TriTmp, &incx ); #endif #else dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp ); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif } /* else ... */ } /* for jj ... end tri-solves */ /* -------------------------------------------------------- * Perform block row updates from below the diagonal block. * Push each block all the way into SPA dense[*]. * -------------------------------------------------------- */ for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) { r_hi = MIN(nrow, r_ind + rowblk); block_nrow = MIN(rowblk, r_hi - r_ind); luptr1 = xlusup[fsupc] + nsupc + r_ind; isub1 = lptr + nsupc + r_ind; repfnz_col = repfnz; twocols = 0; /* Sequence through each column in the panel -- matrix-vector */ for (jj = jcol; jj < jcol + w; ++jj, repfnz_col += n) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* skip zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ /* Now segsze >= 4 ... */ if ( twocols == 1 ) { /* got two columns */ jj2[1] = jj; twocols = 0; for (j = 0; j < 2; ++j) { i = n * (jj2[j] - jcol); kfnz2[j] = repfnz[i + krep]; tri[j] = tempv + ldaTmp * (jj2[j] - jcol); matvec[j] = tri[j] + maxsuper; } if ( kfnz2[0] < kfnz2[1] ) { /* First column is bigger */ no_zeros = kfnz2[0] - fsupc; segsze = kfnz2[1] - kfnz2[0]; luptr = luptr1 + nsupr * no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #else dgemv_( "N", &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #endif #else dmatvec (nsupr, block_nrow, segsze, &lusup[luptr], tri[0], matvec[0]); #endif } else if ( kfnz2[0] > kfnz2[1] ) { no_zeros = kfnz2[1] - fsupc; segsze = kfnz2[0] - kfnz2[1]; luptr = luptr1 + nsupr * no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[1], &incx, &beta, matvec[1], &incy ); #else dgemv_( "N", &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[1], &incx, &beta, matvec[1], &incy ); #endif #else dmatvec (nsupr, block_nrow, segsze, &lusup[luptr], tri[1], matvec[1]); #endif } /* Do matrix-vector multiply with two destinations */ kfnz = MAX( kfnz2[0], kfnz2[1] ); no_zeros = kfnz - fsupc; segsze = krep - kfnz + 1; luptr = luptr1 + nsupr * no_zeros; #if ( MACH==DEC ) dgemv2_ (&nsupr, &block_nrow, &segsze, &lusup[luptr], &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], matvec[0], matvec[1]); /*#elif ( MACH==CRAY_PVP ) DGEMV2(&nsupr, &block_nrow, &segsze, &lusup[luptr], &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], matvec[0], matvec[1]); */ #else dmatvec2 (nsupr, block_nrow, segsze, &lusup[luptr], &tri[0][kfnz-kfnz2[0]], &tri[1][kfnz-kfnz2[1]], matvec[0], matvec[1]); #endif #ifdef TIMING utime[FLOAT] += SuperLU_timer_() - f_time; #endif /* end for two destination update */ } else { /* wait for a second column */ jj2[0] = jj; twocols = 1; } } /* for jj ... */ if ( twocols == 1 ) { /* one more column left */ i = jj2[0] - jcol; tri[0] = tempv + ldaTmp * i; matvec[0] = tri[0] + maxsuper; kfnz = repfnz[i*n + krep]; no_zeros = kfnz - fsupc; segsze = krep - kfnz + 1; luptr = luptr1 + nsupr * no_zeros; #ifdef USE_VENDOR_BLAS #if ( MACH==CRAY_PVP ) SGEMV( ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #else dgemv_( "N", &block_nrow, &segsze, &alpha, &lusup[luptr], &nsupr, tri[0], &incx, &beta, matvec[0], &incy ); #endif #else dmatvec(nsupr, block_nrow, segsze, &lusup[luptr], tri[0], matvec[0]); #endif } /* if twocols == 1 */ /* Scatter matvec[*] into SPA dense[*]. */ repfnz_col = repfnz; dense_col = dense; col_marker = spa_marker; col_lsub = panel_lsub; matvec[0] = tempv + maxsuper; for (jj = jcol; jj < jcol + w; ++jj, repfnz_col += n, dense_col += n, col_marker += n, col_lsub += n, matvec[0] += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* skip zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ isub = isub1; for (i = 0; i < block_nrow; ++i) { irow = lsub[isub]; dense_col[irow] -= matvec[0][i]; /* Scatter-add */ #ifdef SCATTER_FOUND if ( col_marker[irow] != jj ) { col_marker[irow] = jj; col_lsub[w_lsub_end[jj-jcol]++] = irow; } #endif matvec[0][i] = zero; ++isub; } } /* for jj ... */ } /* for each block row ... */ /* ------------------------------------------------ Scatter the triangular solves into SPA dense[*]. ------------------------------------------------ */ repfnz_col = repfnz; dense_col = dense; TriTmp = tempv; for (jj = 0; jj < w; ++jj, repfnz_col += n, dense_col += n, TriTmp += ldaTmp) { kfnz = repfnz_col[krep]; if ( kfnz == EMPTY ) continue; /* skip any zero segment */ segsze = krep - kfnz + 1; if ( segsze <= 3 ) continue; /* skip unrolled cases */ no_zeros = kfnz - fsupc; isub = lptr + no_zeros; for (i = 0; i < segsze; i++) { irow = lsub[isub]; dense_col[irow] = TriTmp[i]; /* Scatter */ TriTmp[i] = zero; ++isub; } } /* for jj ... */ }