#include /* for getenv and atoi */ #include "pdsp_defs.h" void pdgstrf(pdgstrf_options_t *pdgstrf_options, SuperMatrix *A, int *perm_r, SuperMatrix *L, SuperMatrix *U, Gstat_t *Gstat, int *info) { /* * -- SuperLU MT routine (version 1.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * August 15, 1997 * * Purpose * ======= * * pdgstrf() computes an LU factorization of a general sparse nrow-by-ncol * matrix A using partial pivoting with row interchanges. The factorization * has the form * Pr * A = L * U * where Pr is a row permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is * upper triangular (upper trapezoidal if A->nrow < A->ncol). * * Arguments * ========= * * pdgstrf_options (input) pdgstrf_options_t* * The structure defines the parameters to control how the sparse * LU factorization is performed. The following fields must be set * by the user: * * o nprocs (int) * Number of processes to be spawned and used for factorization. * * o refact (yes_no_t) * Specifies whether this is first time or subsequent factorization. * = NO: this factorization is treated as the first one; * = YES: it means that a factorization was performed prior to this * one. Therefore, this factorization will re-use some * existing data structures, such as L and U storage, column * elimination tree, and the symbolic information of the * Householder matrix. * * o panel_size (int) * A panel consists of at most panel_size consecutive columns. * * o relax (int) * Degree of relaxing supernodes. If the number of nodes (columns) * in a subtree of the elimination tree is less than relax, this * subtree is considered as one supernode, regardless of the row * structures of those columns. * * o diag_pivot_thresh (double) * Diagonal pivoting threshold. At step j of Gaussian elimination, * if abs(A_jj) >= diag_pivot_thresh * (max_(i>=j) abs(A_ij)), * use A_jj as pivot. 0 <= diag_pivot_thresh <= 1. The default * value is 1.0, corresponding to partial pivoting. * * o usepr (yes_no_t) * Whether the pivoting will use perm_r specified by the user. * = YES: use perm_r; perm_r is input, unchanged on exit. * = NO: perm_r is determined by partial pivoting, and is output. * * o drop_tol (double) (NOT IMPLEMENTED) * Drop tolerance parameter. At step j of the Gaussian elimination, * if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij. * 0 <= drop_tol <= 1. The default value of drop_tol is 0, * corresponding to not dropping any entry. * * o perm_c (int*) * Column permutation vector of size A->ncol, which defines the * permutation matrix Pc; perm_c[i] = j means column i of A is * in position j in A*Pc. * * o perm_r (int*) * Column permutation vector of size A->nrow. * If pdgstrf_options->usepr = NO, this is an output argument. * * o work (void*) of size lwork * User-supplied work space and space for the output data structures. * Not referenced if lwork = 0; * * o lwork (int) * Specifies the length of work array. * = 0: allocate space internally by system malloc; * > 0: use user-supplied work array of length lwork in bytes, * returns error if space runs out. * = -1: the routine guesses the amount of space needed without * performing the factorization, and returns it in * superlu_memusage->total_needed; no other side effects. * * A (input) SuperMatrix* * Original matrix A, permuted by columns, of dimension * (A->nrow, A->ncol). The type of A can be: * Stype = NCP; Dtype = _D; Mtype = GE. * * perm_r (input/output) int*, dimension A->nrow * Row permutation vector which defines the permutation matrix Pr, * perm_r[i] = j means row i of A is in position j in Pr*A. * If pdgstrf_options->usepr = NO, perm_r is output argument; * If pdgstrf_options->usepr = YES, the pivoting routine will try * to use the input perm_r, unless a certain threshold criterion * is violated. In that case, perm_r is overwritten by a new * permutation determined by partial pivoting or diagonal * threshold pivoting. * * L (output) SuperMatrix* * The factor L from the factorization Pr*A=L*U; use compressed row * subscripts storage for supernodes, i.e., L has type: * Stype = SCP, Dtype = _D, Mtype = TRLU. * * U (output) SuperMatrix* * The factor U from the factorization Pr*A*Pc=L*U. Use column-wise * storage scheme, i.e., U has types: Stype = NCP, Dtype = _D, * Mtype = TRU. * * Gstat (output) Gstat_t* * Record all the statistics about the factorization; * See Gstat_t structure defined in util.h. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * > A->ncol: number of bytes allocated when memory allocation * failure occurred, plus A->ncol. * */ pdgstrf_threadarg_t *pdgstrf_threadarg; pxgstrf_shared_t pxgstrf_shared; register int nprocs = pdgstrf_options->nprocs; register int i, iinfo; double *utime = Gstat->utime; double usrtime, wtime; double usertimer_(); #if ( MACH==SUN ) thread_t *thread_id; #elif ( MACH==DEC || MACH==PTHREAD ) pthread_t *thread_id; void *status; #endif void *pdgstrf_thread(void *); /* -------------------------------------------------------------- Initializes the parallel data structures for pdgstrf_thread(). --------------------------------------------------------------*/ pdgstrf_threadarg = pdgstrf_thread_init(A, L, U, pdgstrf_options, &pxgstrf_shared, Gstat, info); if ( *info ) return; /* Start timing factorization. */ usrtime = usertimer_(); wtime = SuperLU_timer_(); /* ------------------------------------------------------------ On a SUN multiprocessor system, use Solaris thread. ------------------------------------------------------------*/ #if ( MACH==SUN ) /* Create nproc threads for concurrent factorization. */ thread_id = (thread_t *) SUPERLU_MALLOC(nprocs * sizeof(thread_t)); for (i = 1; i < nprocs; ++i) { #if ( PRNTlevel==1 ) printf(".. Create unbound threads: i %d, nprocs %d\n", i, nprocs); #endif if ( (iinfo = thr_create(NULL, 0, pdgstrf_thread, &(pdgstrf_threadarg[i]), 0, &thread_id[i])) ) { fprintf(stderr, "thr_create: %d\n", iinfo); ABORT("thr_creat()"); } } pdgstrf_thread( &(pdgstrf_threadarg[0]) ); /* Wait for all threads to terminate. */ for (i = 1; i < nprocs; i++) thr_join(thread_id[i], 0, 0); SUPERLU_FREE (thread_id); /* _SOLARIS_2 */ /* ------------------------------------------------------------ On a DEC multiprocessor system, use pthread. ------------------------------------------------------------*/ #elif ( MACH==DEC ) /* Use DECthreads ... */ /* Create nproc threads for concurrent factorization. */ thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t)); for (i = 0; i < nprocs; ++i) { if ( iinfo = pthread_create(&thread_id[i], NULL, pdgstrf_thread, &(pdgstrf_threadarg[i])) ) { fprintf(stderr, "pthread_create: %d\n", iinfo); ABORT("pthread_create()"); } /* pthread_bind_to_cpu_np(thread_id[i], i);*/ } /* Wait for all threads to terminate. */ for (i = 0; i < nprocs; i++) pthread_join(thread_id[i], &status); SUPERLU_FREE (thread_id); /* _DEC */ /* ------------------------------------------------------------ On a SGI Power Challenge or Origin multiprocessor system, use parallel C. ------------------------------------------------------------*/ #elif ( MACH==SGI || MACH==ORIGIN ) /* Use parallel C ... */ if ( getenv("MP_SET_NUMTHREADS") ) { i = atoi(getenv("MP_SET_NUMTHREADS")); if ( nprocs > i ) { printf("nprocs=%d > environment allowed: MP_SET_NUMTHREADS=%d\n", nprocs, i); exit(-1); } } #pragma parallel #pragma shared (pdgstrf_threadarg) /*#pragma numthreads (max = nprocs)*/ #pragma numthreads (nprocs) { pdgstrf_thread( pdgstrf_threadarg ); } /* _SGI or _ORIGIN */ /* ------------------------------------------------------------ On a Cray PVP multiprocessor system, use microtasking. ------------------------------------------------------------*/ #elif ( MACH==CRAY_PVP ) /* Use C microtasking. */ if ( getenv("NCPUS") ) { i = atoi(getenv("NCPUS")); if ( nprocs > i ) { printf("nprocs=%d > environment allowed: NCPUS=%d\n", nprocs, i); exit(-1); } } #pragma _CRI taskloop private (i,nprocs) shared (pdgstrf_threadarg) /* Stand-alone task loop */ for (i = 0; i < nprocs; ++i) { pdgstrf_thread( &(pdgstrf_threadarg[i]) ); } /* _CRAY_PVP */ /* ------------------------------------------------------------ Use POSIX threads. ------------------------------------------------------------*/ #elif ( MACH==PTHREAD ) /* Use pthread ... */ /* Create nproc threads for concurrent factorization. */ thread_id = (pthread_t *) SUPERLU_MALLOC(nprocs * sizeof(pthread_t)); for (i = 0; i < nprocs; ++i) { if ( iinfo = pthread_create(&thread_id[i], NULL, pdgstrf_thread, &(pdgstrf_threadarg[i])) ) { fprintf(stderr, "pthread_create: %d\n", iinfo); ABORT("pthread_create()"); } } /* Wait for all threads to terminate. */ for (i = 0; i < nprocs; i++) pthread_join(thread_id[i], &status); SUPERLU_FREE (thread_id); /* _PTHREAD */ /* ------------------------------------------------------------ On all other systems, use single processor. ------------------------------------------------------------*/ #else printf("pdgstrf() is not parallelized on this machine.\n"); printf("pdgstrf() will be run on single processor.\n"); pdgstrf_thread( &(pdgstrf_threadarg[0]) ); #endif wtime = SuperLU_timer_() - wtime; usrtime = usertimer_() - usrtime; utime[FACT] = wtime; #if ( PRNTlevel==1 ) printf(".. pdgstrf_thread() returns info %d, usrtime %.2f, wtime %.2f\n", *info, usrtime, wtime); #endif /* check_mem_leak("after pdgstrf_thread()"); */ /* ------------------------------------------------------------ Clean up and free storage after multithreaded factorization. ------------------------------------------------------------*/ pdgstrf_thread_finalize(pdgstrf_threadarg, &pxgstrf_shared, A, perm_r, L, U); }