#include #include #include "pdsp_defs.h" #define PRINT_SPIN_TIME(where) { \ if ( t2 > 0.001 ) { \ printf("[%d] Panel%6d on P%2d waits s-node%6d for %8.2f msec.\n",\ where, jcol, pnum, kcol, t2*1e3);\ fflush(stdout); } } #ifdef PREDICT_OPT /* comparison function used by qsort() - in decreasing order */ int numcomp(desc_eft_t *a, desc_eft_t *b) { if ( a->eft < b->eft ) return -1; else if ( a->eft > b->eft ) return 1; else return 0; } #endif void pdgstrf_panel_bmod( const int pnum, /* process number */ const int m, /* number of rows in the matrix */ const int w, /* current panel width */ const int jcol, /* leading column of the current panel */ const int bcol, /* first column of the farthest busy snode*/ int *inv_perm_r,/* in; inverse of the row pivoting */ int *etree, /* in */ int *nseg, /* modified */ int *segrep, /* modified */ int *repfnz, /* modified, size n-by-w */ int *panel_lsub,/* modified */ int *w_lsub_end,/* modified */ int *spa_marker,/* modified; size n-by-w */ double *dense, /* modified, size n-by-w */ double *tempv, /* working array - zeros on input/output */ pxgstrf_shared_t *pxgstrf_shared /* modified */ ) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * Performs numeric block updates (sup-panel) in topological order. * It features combined 1D and 2D blocking of the source updating s-node. * It consists of two steps: * (1) accumulates updates from "done" s-nodes. * (2) accumulates updates from "busy" s-nodes. * * Before entering this routine, the nonzeros of the original A in * this panel were already copied into the SPA dense[n,w]. * * Updated/Output arguments * ======================== * L[*,j:j+w-1] and U[*,j:j+w-1] are returned collectively in the * m-by-w vector dense[*,w]. The locations of nonzeros in L[*,j:j+w-1] * are given by lsub[*] and U[*,j:j+w-1] by (nseg,segrep,repfnz). * */ GlobalLU_t *Glu = pxgstrf_shared->Glu; /* modified */ Gstat_t *Gstat = pxgstrf_shared->Gstat; /* modified */ register int j, k, ksub; register int fsupc, nsupc, nsupr, nrow; register int kcol, krep, ksupno, dadsupno; register int jj; /* index through each column in the panel */ int *xsup, *xsup_end, *supno; int *lsub, *xlsub, *xlsub_end; int *repfnz_col; /* repfnz[] for a column in the panel */ double *dense_col; /* dense[] for a column in the panel */ int *col_marker; /* each column of the spa_marker[*,w] */ int *col_lsub; /* each column of the panel_lsub[*,w] */ static int first = 1, rowblk, colblk; #ifdef PROFILE double t1, t2; /* temporary time */ #endif #ifdef PREDICT_OPT register float pmod, max_child_eft = 0, sum_pmod = 0, min_desc_eft = 0; register float pmod_eft; register int kid, ndesc = 0; #endif #if ( DEBUGlevel>=2 ) int dbg_addr = 0*m; #endif if ( first ) { rowblk = sp_ienv(4); colblk = sp_ienv(5); first = 0; } xsup = Glu->xsup; xsup_end = Glu->xsup_end; supno = Glu->supno; lsub = Glu->lsub; xlsub = Glu->xlsub; xlsub_end = Glu->xlsub_end; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) check_panel_dfs_list(pnum, "begin", jcol, *nseg, segrep);*/ if (jcol == BADPAN) printf("(%d) Enter pdgstrf_panel_bmod() jcol %d,BADCOL %d,dense_col[%d] %.10f\n", pnum, jcol, BADCOL, BADROW, dense[dbg_addr+BADROW]); #endif /* -------------------------------------------------------------------- For each non-busy supernode segment of U[*,jcol] in topological order, perform sup-panel update. -------------------------------------------------------------------- */ k = *nseg - 1; for (ksub = 0; ksub < *nseg; ++ksub) { /* * krep = representative of current k-th supernode * fsupc = first supernodal column * nsupc = no of columns in a supernode * nsupr = no of rows in a supernode */ krep = segrep[k--]; fsupc = xsup[supno[krep]]; nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; #ifdef PREDICT_OPT pmod = procstat[pnum].fcops; #endif if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 pdgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else pdgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 pdgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else pdgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = procstat[pnum].fcops - pmod; kid = (Glu->pan_status[krep].size > 0) ? krep : (krep + Glu->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) non-busy update: krep %d, repfnz %d, dense_col[%d] %.10e\n", pnum, krep, repfnz[dbg_addr+krep], BADROW, dense[dbg_addr+BADROW]); #endif } /* for each updating supernode ... */ #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After non-busy update: dense_col[%d] %.10e\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* --------------------------------------------------------------------- * Now wait for the "busy" s-nodes to become "done" -- this amounts to * climbing up the e-tree along the path starting from "bcol". * Several points are worth noting: * * (1) There are two possible relations between supernodes and panels * along the path of the e-tree: * o |s-node| < |panel| * want to climb up the e-tree one column at a time in order * to achieve more concurrency * o |s-node| > |panel| * want to climb up the e-tree one panel at a time; this * processor is stalled anyway while waiting for the panel. * * (2) Need to accommodate new fills, append them in panel_lsub[*,w]. * o use an n-by-w marker array, as part of the SPA (not scalable!) * * (3) Symbolically, need to find out repfnz[S, w], for each (busy) * supernode S. * o use dense[inv_perm_r[kcol]], filter all zeros * o detect the first nonzero in each segment * (at this moment, the boundary of the busy supernode/segment * S has already been identified) * * --------------------------------------------------------------------- */ kcol = bcol; while ( kcol < jcol ) { /* Pointers to each column of the w-wide arrays. */ repfnz_col = repfnz; dense_col = dense; col_marker = spa_marker; col_lsub = panel_lsub; /* Wait for the supernode, and collect wait-time statistics. */ if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); panstat[jcol].pipewaits++; panstat[jcol].spintime += t2; procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(1); #endif #endif } /* Find leading column "fsupc" in the supernode that contains column "kcol" */ ksupno = supno[kcol]; fsupc = kcol; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL) */ if ( jcol==BADCOL ) printf("(%d) pdgstrf_panel_bmod[1] kcol %d, ksupno %d, fsupc %d\n", pnum, kcol, ksupno, fsupc); #endif /* Wait for the whole supernode to become "done" -- climb up e-tree one column at a time */ do { krep = SUPER_REP( ksupno ); kcol = etree[kcol]; if ( kcol >= jcol ) break; if ( pxgstrf_shared->spin_locks[kcol] ) { #ifdef PROFILE TIC(t1); #endif await ( &pxgstrf_shared->spin_locks[kcol] ); #ifdef PROFILE TOC(t2, t1); panstat[jcol].pipewaits++; panstat[jcol].spintime += t2; procstat[pnum].spintime += t2; #ifdef DOPRINT PRINT_SPIN_TIME(2); #endif #endif } dadsupno = supno[kcol]; #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) printf("(%d) pdgstrf_panel_bmod[2] krep %d, dad=kcol %d, dadsupno %d\n", pnum, krep, kcol, dadsupno); #endif } while ( dadsupno == ksupno ); /* Append the new segment into segrep[*]. After column_bmod(), copy_to_ucol() will use them. */ segrep[*nseg] = krep; ++(*nseg); /* Determine repfnz[krep, w] for each column in the panel */ for (jj = jcol; jj < jcol + w; ++jj, dense_col += m, repfnz_col += m, col_marker += m, col_lsub += m) { /* * Note: relaxed supernode may not form a path on the e-tree, * but its column numbers are contiguous. */ #ifdef SCATTER_FOUND for (kcol = fsupc; kcol <= krep; ++kcol) { if ( col_marker[inv_perm_r[kcol]] == jj ) { repfnz_col[krep] = kcol; /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; break; /* found the leading nonzero in the segment */ } } #else for (kcol = fsupc; kcol <= krep; ++kcol) { if ( dense_col[inv_perm_r[kcol]] != 0.0 ) { repfnz_col[krep] = kcol; break; /* Found the leading nonzero in the U-segment */ } } /* In this case, we always treat the L-subscripts of the busy s-node [kcol : krep] as the new fills, even if the corresponding U-segment may be all zero. */ /* Append new fills in panel_lsub[*,jj]. */ j = w_lsub_end[jj - jcol]; /*#pragma ivdep*/ for (k = xlsub[krep]; k < xlsub_end[krep]; ++k) { ksub = lsub[k]; if ( col_marker[ksub] != jj ) { col_marker[ksub] = jj; col_lsub[j++] = ksub; } } w_lsub_end[jj - jcol] = j; #endif #if ( DEBUGlevel>=2 ) if (jj == BADCOL) { printf("(%d) pdgstrf_panel_bmod[fills]: jj %d, repfnz_col[%d] %d, inv_pr[%d] %d\n", pnum, jj, krep, repfnz_col[krep], fsupc, inv_perm_r[fsupc]); printf("(%d) pdgstrf_panel_bmod[fills] xlsub %d, xlsub_end %d, #lsub[%d] %d\n", pnum,xlsub[krep],xlsub_end[krep],krep, xlsub_end[krep]-xlsub[krep]); } #endif } /* for jj ... */ #ifdef PREDICT_OPT pmod = procstat[pnum].fcops; #endif /* Perform sup-panel updates - use combined 1D + 2D updates. */ nsupc = krep - fsupc + 1; nsupr = xlsub_end[fsupc] - xlsub[fsupc]; nrow = nsupr - nsupc; if ( nsupc >= colblk && nrow >= rowblk ) { /* 2-D block update */ #ifdef GEMV2 pdgstrf_bmod2D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else pdgstrf_bmod2D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } else { /* 1-D block update */ #ifdef GEMV2 pdgstrf_bmod1D_mv2(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #else pdgstrf_bmod1D(pnum, m, w, jcol, fsupc, krep, nsupc, nsupr, nrow, repfnz, panel_lsub, w_lsub_end, spa_marker, dense, tempv, Glu, Gstat); #endif } #ifdef PREDICT_OPT pmod = procstat[pnum].fcops - pmod; kid = (pxgstrf_shared->pan_status[krep].size > 0) ? krep : (krep + pxgstrf_shared->pan_status[krep].size); desc_eft[ndesc].eft = cp_panel[kid].est + cp_panel[kid].pdiv; desc_eft[ndesc++].pmod = pmod; #endif #if ( DEBUGlevel>=2 ) if (jcol == BADPAN) printf("(%d) After busy update: dense_col[%d] %.10f\n", pnum, BADROW, dense[dbg_addr+BADROW]); #endif /* Go to the parent of "krep" */ kcol = etree[krep]; } /* while kcol < jcol ... */ #if ( DEBUGlevel>=2 ) /*if (jcol >= LOCOL && jcol <= HICOL)*/ if ( jcol==BADCOL ) check_panel_dfs_list(pnum, "after-busy", jcol, *nseg, segrep); #endif #ifdef PREDICT_OPT qsort(desc_eft, ndesc, sizeof(desc_eft_t), (int(*)())numcomp); pmod_eft = 0; for (j = 0; j < ndesc; ++j) { pmod_eft = MAX( pmod_eft, desc_eft[j].eft ) + desc_eft[j].pmod; } if ( ndesc == 0 ) { /* No modifications from descendants */ pmod_eft = 0; for (j = cp_firstkid[jcol]; j != EMPTY; j = cp_nextkid[j]) { kid = (pxgstrf_shared->pan_status[j].size > 0) ? j : (j + pxgstrf_shared->pan_status[j].size); pmod_eft = MAX( pmod_eft, cp_panel[kid].est + cp_panel[kid].pdiv ); } } cp_panel[jcol].est = pmod_eft; #endif } int check_panel_dfs_list(int pnum, char *msg, int jcol, int nseg, int *segrep) { register int k; printf("(%d) pdgstrf_panel_bmod(%s) jcol %d, nseg %d in top. order ->\n", pnum, msg, jcol, nseg); for (k = nseg-1; k >= 0; --k) printf("(%d) segrep-%d ", pnum, segrep[k]); printf("\n"); fflush(stdout); return 0; }