/* ======================================================= *
 * Copyright 1998-2005 Stephen C. Grubb                    *
 * http://ploticus.sourceforge.net                         *
 * Covered by GPL; see the file ./Copyright for details.   *
 * ======================================================= */

/* PROC TABULATE - do frequency distributions, 1-D or 2-D */

/* Uses data read by most recent getdata; result becomes "current data set"

   scg 1/11/00  - added support for special units even when categories not in use
   scg 1/21/00  - changed tab[][], total[], and grantotal from int to double.
                  added accumfield option.
   scg 1/21/00  - for 2-way dist, column tags are used to set getdata field names

   scg 4/2/04   - added numfmt - %g doesn't give correct results for sets of very small values

*/
#include "pl.h"

#define WORDLEN 40
#define MAXBINS 200
#define MAXROWS 200
#define MAXCOLS 60


static int fsort(), freqsort();

int
PLP_tabulate()
{
int i;
char attr[NAMEMAXLEN], val[256];
char *line, *lineval;
int nt, lvp;
int first;

char buf[256];
double tab[MAXROWS][MAXCOLS];
double total[2][MAXROWS];
double grantotal;
char list[2][MAXROWS][WORDLEN];
int stat, j, nlist[3], select[3], ndim;
int order[2][MAXROWS], valuesgiven[2], dopercents;
int forcevertical, forcehorizontal;
char ordering[2];
char tmp[WORDLEN];
char *GL_getok();
double atof();
/* --- */
char valuelist[2][256];
int field[2];
int ix;
int showresults;
int irow;
double gran[2];
char numbuf[80];
char selectex[256];
int result;
int doranges[2];
char axis[2];
int ixx;
char hival[80], lowval[80];
double hiv[MAXBINS], lowv[MAXBINS];
double fval;
char rangesepchar[5];
char showrange[20];
char rangespec[2][80];
int axisset[2]; /* added 1/11/00 scg */
char tag[80];   /* added 1/11/00 scg */
char fieldnamelist[256]; /* added 1/21/00 scg */
int accumfield;
double inc;
char numfmt[20];


TDH_errprog( "pl proc tabulate" );


/* initialize */
field[0] = -1; field[1] = -1;
ndim = -1;
valuesgiven[0] = valuesgiven[1] = 0;
dopercents = 0;
forcevertical = 1; forcehorizontal = 0;
ordering[0] = '?'; ordering[1] = '?';
gran[0] = gran[1] = 0.0;
strcpy( selectex, "" );
doranges[0] = doranges[1] = 0;
axis[0] = 'x';
axis[1] = 'y';
strcpy( rangesepchar, "-" );
strcpy( showrange, "" );
strcpy( rangespec[0], "" );
strcpy( rangespec[1], "" );
axisset[0] = axisset[1] = 0; /* was axis1 or axis2 set explicitly - added 1/11/00 scg */
accumfield = -1;
showresults = 0;
strcpy( numfmt, "%g" );


/* get attributes.. */
first = 1;
while( 1 ) {
	line = getnextattr( first, attr, val, &lvp, &nt );
	if( line == NULL ) break;
	first = 0;
	lineval = &line[lvp];

	if( stricmp( attr, "datafield1" )==0 ) {
		field[0] = fref( val ) - 1;
		ndim = 1;
		}
	else if( stricmp( attr, "datafield2" )==0 ) {
		field[1] = fref( val ) - 1;
		ndim = 2;
		}
	else if( stricmp( attr, "accumfield" )==0 ) {
		accumfield = fref( val ) -1;
		}
	else if( stricmp( attr, "valuelist1" )==0 ) strcpy( valuelist[0], lineval );
	else if( stricmp( attr, "valuelist2" )==0 ) strcpy( valuelist[1], lineval );
	else if( stricmp( attr, "doranges1" )==0 ) {
		if( strnicmp( val, YESANS, 1 )==0 ) doranges[0] = 1;
		else doranges[0] = 0;
		}
	else if( stricmp( attr, "doranges2" )==0 ) {
		if( strnicmp( val, YESANS, 1 )==0 ) doranges[1] = 1;
		else doranges[1] = 0;
		}
	else if( stricmp( attr, "showrangelowonly" )==0 ) { 
		if( strnicmp( val, YESANS, 1 )==0 ) strcpy( showrange, "low" );
		}
	else if( stricmp( attr, "showrange" )==0 ) strcpy( showrange, val );
	else if( stricmp( attr, "rangesepchar" )==0 ) strcpy( rangesepchar, val );
	else if( stricmp( attr, "order1" )==0 ) {
		if( strnicmp( val, "mag", 3 )==0 ) ordering[0] = 'm';
		else if( strnicmp( val, "rev", 3 )==0 ) ordering[0] = 'r';
		else if( strnicmp( val, "nat", 3 )==0 ) ordering[0] = 'n';
		else if( stricmp( val, "none" )==0 ) ordering[0] = '0';
		}
	else if( stricmp( attr, "ordering2" )==0 ) {
		if( strnicmp( val, "mag", 3 ) == 0 ) ordering[1] = 'm';
		else if( strnicmp( val, "rev", 3 ) == 0 ) ordering[1] = 'r';
		else if( strnicmp( val, "nat", 3 ) == 0 ) ordering[1] = 'n';
		else if( stricmp( val, "none" ) == 0 ) ordering[1] = '0';
		}

	else if( stricmp( attr, "percents" )==0 ) {
		if( strnicmp( val, YESANS, 1 )==0 ) dopercents = 1;
		else dopercents = 0;
		}
	else if( stricmp( attr, "showresults" )==0 || stricmp( attr, "savetable" )==0 ) showresults = 1;

	else if( stricmp( attr, "select" )==0 ) strcpy( selectex, lineval );

	else if( stricmp( attr, "axis1" )==0 ) {
		axis[0] = tolower(val[0]);
		axisset[0] = 1;
		}
	else if( stricmp( attr, "axis2" )==0 ) {
		axis[1] = tolower(val[0]);
		axisset[1] = 1;
		}

	else if( stricmp( attr, "rangespec1" )==0 ) strcpy( rangespec[0], lineval );
	else if( stricmp( attr, "rangespec2" )==0 ) strcpy( rangespec[1], lineval );

	else if( stricmp( attr, "numfmt" )==0 ) strcpy( numfmt, val );

	else Eerr( 1, "tabulate attribute not recognized", attr );
	}


/* overrides and degenerate cases */
/* -------------------------- */
if( field[0] < 0 ) return( Eerr( 4984, "datafield1 must be specified", "" ) );

if( ordering[0] == '?' ) {
	if( valuelist[0][0] != '\0' ) ordering[0] = '0';
	else ordering[0] = 'n';
	}
if( ordering[1] == '?' ) {
	if( valuelist[1][0] != '\0' ) ordering[1] = '0';
	else ordering[1] = 'n';
	}



/* now do the computation work.. */
/* -------------------------- */

/* initialize tables */
grantotal = 0.0;
for( i = 0; i < MAXROWS; i++ ) {
        if( i < 2 )nlist[i] = 0;
        for( j = 0; j < MAXCOLS; j++ ) { tab[i][j] = 0.0; if( i < 2 ) { total[i][j] = 0.0; } }
	order[0][i] = order[1][i] = i;
        }


/* get value list (a comma or space delimited list of values; output distribution will
   be for only these values (in the order given?) */
for( j = 0; j < ndim; j++ ) { /* for all dimensions (1 or 2).. */


	if( valuelist[j][0] != '\0' ) {
	    int ix;
	    for( i = 0, ix = 0;  ; i++ ) {
		if( GL_getseg( tmp, valuelist[j], &ix, " ," ) ) break;
		strcpy( list[j][i], tmp ); 
		nlist[j]++;
		if( doranges[j] ) { /* find lowv and hiv */
			if( i >= MAXBINS ) return( Eerr( 9285, "Sorry, max number of bins exceeded","" ) );
			ixx = 0;
			GL_getseg( lowval, tmp, &ixx, rangesepchar );
			strcpy( hival, &tmp[ixx] );
			if( stricmp( lowval, "c" )==0 && i > 0 ) lowv[i] = hiv[i-1]; /* contiguous*/
			else lowv[i] = Econv( axis[j], lowval );
			if( Econv_error( ) ) {   /* a non-conformant value.. */
				lowv[i] = PLHUGE; /* so that we know to compare 
							to value for non-numerics */
				hiv[i] = NEGHUGE;
				}
			else 	{
				hiv[i] = Econv( axis[j], hival );
				/* Euprint( lowval, axis[j], lowv[i], "" ); */
				Euprint( lowval, axis[j], lowv[i], numfmt );
				if( tolower(showrange[0]) == 'l' ) strcpy( list[j][i], lowval );
				/* else if( tolower(showrange[0]) == 'a' ) Euprint( list[j][i], axis[j], (lowv[i]+hiv[i])/2.0, "" ); */
				else if( tolower(showrange[0]) == 'a' ) Euprint( list[j][i], axis[j], (lowv[i]+hiv[i])/2.0, numfmt );

				else sprintf( list[j][i], "%s%s%s", lowval, rangesepchar, hival );
				}
			}
		}
	    valuesgiven[j] = 1;
	    }

	/* automatic bins */
	if( rangespec[j][0] != '\0' ) {
		double binsiz, hilimit, rw;
		doranges[j] = 1; /* implied */
		nt = sscanf( rangespec[j], "%s %lf %s", lowval, &binsiz, hival );
		if( nt < 2 || nt > 3 ) return( Eerr( 2740, "2 or 3 values expected in rangespec", 
			rangespec[j] ) );
		rw = Econv( axis[j], lowval );
		if( Econv_error() ) return( Eerr( 2750, "warning: error on rangespec lowlimit", rangespec[j] ) );
		if( nt == 3 ) {
			hilimit = Econv(axis[j], hival );
			if( Econv_error() ) return( Eerr( 2750, "warning: error on rangespec hilimit", rangespec[j] ) );
			}
		else if( nt == 2 ) {
			if( !scalebeenset() ) return( Eerr( 2479, "rangespec must have 3 values since scaleing has not yet been set", "" ));
			hilimit = Elimit( axis[j], 'h', 's' );
			}
		for( i = 0; i < MAXBINS ; i++ ) {
			lowv[i] = rw;
			rw += binsiz;
			hiv[i] = rw;
			Euprint( lowval, axis[j], lowv[i], "" );
			Euprint( hival, axis[j], hiv[i], "" );
			if( tolower(showrange[0]) == 'l' ) strcpy( list[j][i], lowval );
			/* else if( tolower(showrange[0]) == 'a' ) Euprint( list[j][i], axis[j], (lowv[i]+hiv[i])/2.0, "" ); */
			else if( tolower(showrange[0]) == 'a' ) Euprint( list[j][i], axis[j], (lowv[i]+hiv[i])/2.0, numfmt ); 
			else sprintf( list[j][i], "%s%s%s", lowval, rangesepchar, hival );
			nlist[j]++;
			if( rw > hilimit ) break;
			}
		valuesgiven[j] = 1;
		}

	if( doranges[j] && !valuesgiven[j] ) 
		return( Eerr( 2052, "A values list must be given when doing ranges.", "" ));
	}



/* process from data already read in earlier.. */
if( Nrecords < 1 ) return( Eerr( 32, "No data has been read yet.", "" ) );

/* process input data.. */
ix = 0;
for( irow = 0; irow < Nrecords; irow++ ) {

 	if( selectex[0] != '\0' ) { /* process against selection condition if any.. */
                stat = do_select( selectex, irow, &result );
                if( stat != 0 ) { Eerr( stat, "Select error", selectex ); continue; }
                if( result == 0 ) continue; /* reject */
                }

        /* for each item requested (1 or 2) */
        for( i = 0; i < ndim; i++ ) {

                /* strcpy( val, data[ field[i] ] ); */
		strcpy( val, da( irow, field[i] ));
		val[WORDLEN-1] = '\0';



		/****  if ranges have not been pre defined... */
                if( !doranges[i] ) { /* classify by value */

			/**** a. if axis has been set, use special units, Econv, etc. */
			if( axisset[i] ) { /* code copied from below; uses fval instead of val */
				fval = Econv( axis[i], val );

				/* see if already encountered (compare fval) */
				for( j = 0; j < nlist[i]; j++ ) 
					if( GL_close_to( fval, atof(list[i][j]), 0.001 ))
                                        	{ select[i] = j; break; }

				if( j == nlist[i] && valuesgiven[i] ) goto NEXT; 

				/* otherwise, add the internal representation to list */
				if( j == nlist[i] ) {
					if( (i == 0 && nlist[i] >= MAXROWS) || (i == 1 && nlist[i] >= MAXCOLS) ) 
					    fprintf( PLS.errfp, "tabulate warning, sorry, table capacity exceeded, skipping %s\n", val );
					else	{
						if( !Econv_error()) 
							sprintf( list[i][j], "%f", fval );
						else strcpy( list[i][j], val );
						select[i] = j; 
						nlist[i] ++; 
						}
					}
				}
			

			/**** b. axis has not been set, dont use Econv */
			else	{

	                        /* see if we've already encountered current val */
	                        for( j = 0; j < nlist[i]; j++ ) if( strcmp( val, list[i][j] )== 0 ) 
	                                                                { select[i] = j; break; }
	
				if( j == nlist[i] && valuesgiven[i] ) goto NEXT; /* not in value list-
										   discard */
	
	                        /* add it to list */
	                        if( j == nlist[i] ) { 
					if( (i == 0 && nlist[i] >= MAXROWS) || (i == 1 && nlist[i] >= MAXCOLS) ) 
						fprintf( PLS.errfp, "tabulate warning, sorry, table capacity exceeded, skipping %s\n", val );
					else	{
						strcpy( list[i][j], val ); 
						select[i] = j; 
						nlist[i] ++; 
						}
					}
	                        }
			}


		/**** if ranges have been defined.. */
                else if( doranges[i] ) {
			fval = Econv( axis[i], val );


			/* go through list backwards so that boundary values are 
			   put into higher category.. */
			for( j = nlist[i] - 1; j >= 0; j-- ) {
				if( lowv[j] >= PLHUGE && hiv[j] <= NEGHUGE ) {
					if( strcmp( list[i][j], val )==0 ) {
						select[i] = j;
						break;
						}
					}
				if( Econv_error() ) continue;
				if( fval >= lowv[j] && fval <= hiv[j] ) {
					select[i] = j;
					break;
					}
				}
			/* if( j == nlist[i] ) goto NEXT; */ /* not in value list- discard */
			if( j < 0 ) goto NEXT;  /* not in value list- discard */
			}

                }
	if( accumfield >= 0 ) inc = atof( da( irow, accumfield ) );
	else inc = 1.0;

        if( ndim == 1 ) { 
                tab[ 0 ][ select[ 0 ] ] += inc; 
                total[0][0] += inc; 
                }
        else if( ndim == 2 ) { 
                ( tab[ select[0] ][ select[1] ] ) += inc; 
                total[0][select[0]] += inc; 
                total[1][select[1]] += inc; 
                grantotal += inc; 
                }

	NEXT: continue;
        }




/* put rows/cols in some kind of order */
if( ndim == 1 ) {
	if( ordering[0] == 'n' ) fsort( list[0], nlist[0], order[0] );
	else if( ordering[0] == 'm' ) freqsort(tab[0], nlist[0], order[0], 0 );
	else if( ordering[0] == 'r' ) freqsort(tab[0], nlist[0], order[0], 1 );
	}
else if( ndim == 2 ) {
	if( ordering[0] == 'n' ) fsort( list[0], nlist[0], order[0] );
	else if( ordering[0] == 'm' ) freqsort(tab[0], nlist[0], order[0], 0 );
	else if( ordering[0] == 'r' ) freqsort(tab[0], nlist[0], order[0], 1 );

	if( ordering[1] == 'n' ) fsort( list[1], nlist[1], order[1] );
	else if( ordering[1] == 'm' ) freqsort(total[1], nlist[1], order[1], 0 );
	else if( ordering[1] == 'r' ) freqsort(total[1], nlist[1], order[1], 1 );
	}


/* --------------------- */
/* generate the results */
/* --------------------- */

/* make a description string */
if( showresults ) {
        if( ndim == 1 ) fprintf( PLS.diagfp, "// proc tabulate has computed this distribution on field %d %s\n", field[0]+1, selectex );
        else if( ndim == 2 ) fprintf( PLS.diagfp, "// proc tabulate has computed this 2-way distribution\n// on field %d (down) by field %d (across) %s\n",
                field[0]+1, field[1]+1, selectex );
	}



PL_newdataset();


/* ------------------ */
/* for 1-way tables.. */
/* ------------------ */
if( ndim == 1 ) {


	/* do output lines.. */
        for( i = 0; i < nlist[0]; i++ ) {
		if( !doranges[0] && axisset[0] ) Euprint( tag, axis[0], atof(list[0][order[0][i]]), "" );
		else strcpy( tag, list[0][order[0][i]] );


		PL_startdatarow();
		PL_catitem( tag ); /* label */

		/* n */
		sprintf( buf, numfmt, tab[0][order[0][i]] ); 
		PL_catitem( buf );

		if( dopercents ) {
			sprintf( buf, numfmt, (double)(tab[0][order[0][i]])/(total[0][0]+0.0001)*100  );
			PL_catitem( buf );
			}
		PL_enddatarow();
		}
	}


/* ------------------ */
/* for 2-way tables.. */
/* ------------------ */
else if( ndim == 2 ) { 

	if( fieldnamelist[0] != '\0' ) {
		definefieldnames( fieldnamelist );
		fprintf( PLS.diagfp, "proc tabulate: field names are now: %s\n", fieldnamelist );
		}
	 
	/* do output lines.. */
        for( j = 0; j < nlist[0]; j++ ) {
		if( !doranges[0] && axisset[0] ) Euprint( tag, axis[0], atof(list[0][order[0][j]]), "" );
		else strcpy( tag, list[0][order[0][j]] );

		PL_startdatarow();
		PL_catitem( tag ); /* label */

                for( i = 0; i < nlist[1]; i++ ) {

			/* n */
			sprintf( numbuf, numfmt, tab[order[0][j]] [order[1][i]] );
			PL_catitem( numbuf );

			if( dopercents ) {
				sprintf( numbuf, numfmt, 
				  	(double)(tab[order[0][j]][order[1][i]])/ (total[1][order[1][i]]+0.00001) * 100 ); 
				PL_catitem( numbuf );
				}
			}
		
		/* row total */
		sprintf( numbuf, numfmt, total[0][order[0][j]] );
		PL_catitem( numbuf );

		if( dopercents ) {
			sprintf( numbuf, numfmt, (double)(total[0][order[0][j]]) / (grantotal+0.00001) * 100 ); 
			PL_catitem( numbuf );
			}
		
		PL_enddatarow();
                }

	}


PLD.curds++;
if( PLS.debug ) fprintf( PLS.diagfp, "filling data set# %d (this will now be the current data)\n", PLD.curds );

setintvar( "NRECORDS", Nrecords );
setintvar( "NFIELDS", Nfields );

if( showresults ) for( i = 0; i < Nrecords; i++ ) {
	for( j = 0; j < Nfields; j++ ) fprintf( PLS.diagfp, "[%s]", da( i, j ) );
	fprintf( PLS.diagfp, "\n" );
	}
 

return( 0 );
}



/* ================ */
/* sort bin names numerically if all numeric, alphabetically otherwise */
static int
fsort( data, nd, order )
char data[][WORDLEN];
int nd, order[];
{
int i, j, used[MAXROWS], mincell, first, allnum, foo, idiff;
double atof(), diff;

allnum = YES;
for( i = 0; i < nd; i++ ) { 
        used[i] = 0; 
        if( atof( data[i] ) < -31999) strcpy( data[i], "-31000" );
        if( !GL_goodnum( data[i], &foo )) { allnum = NO; }
        }

for( i = 0; i < nd; i++ ) {
        first = YES;
        for( j = 0; j < nd; j++ ) {
                if( used[j] ) continue;
                if( first ) { mincell = j; first = NO; }
                if( allnum ) {
			diff = atof( data[j] ) - atof( data[mincell] );
                	if( diff <= 0.0 ) mincell = j;
			}
                else 	{
			idiff = (strcmp( data[j], data[mincell] ));
			if( idiff <= 0 ) mincell = j; 
			}
                }
        order[i] = mincell;
        used[mincell] = 1;
        }

return( 0 );
}

/* ================ */
static int
freqsort( counts, nd, order, reverse )
double counts[];
int nd, order[], reverse;
{
int i, j, used[MAXROWS], firstcell, first, diff;

for( i = 0; i < nd; i++ )
        used[i] = NO;

for( i = 0; i < nd; i++ ) {
        first = YES;
        for( j = 0; j < nd; j++ ) {
                if( used[j] ) continue;
                if( first ) { firstcell = j; first = NO; }
                diff = counts[j] - counts[firstcell];
                switch (reverse)
                    {
                    case YES: if (diff >= 0) firstcell = j; break;
                    case NO:  if (diff <= 0) firstcell = j; break;
                    }
                }
        order[i] = firstcell;
        used[firstcell] = 1;
        }
return( 0 );
}

/* ======================================================= *
 * Copyright 1998-2005 Stephen C. Grubb                    *
 * http://ploticus.sourceforge.net                         *
 * Covered by GPL; see the file ./Copyright for details.   *
 * ======================================================= */


syntax highlighted by Code2HTML, v. 0.9.1