//  cssutil.c - utility for munging css files, version X0.1
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.0.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundation's web site at
//  www.fsf.org .  Other licenses may be negotiated; contact the 
//  author for details.  
//

//  include some standard files

#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//  Version string of the cssutil tool; main() prints it for the -v option.
//  Note that helptext() prints the VERSION macro rather than this string.
char version[] = "1.2";

//
//    helptext - print the cssutil usage summary to stdout.
//
//    Takes no arguments and returns nothing.  The version shown in the
//    first line comes from the VERSION macro (presumably supplied by one
//    of the crm114 headers included above - confirm), not from the
//    file-level version[] string.
//
void
helptext ()
{
  fprintf (stdout,
	   "cssutil version %s - generic css file utility.\n"
	   "Usage: cssutil [options]... css-file\n"

	   "		-b   - brief; print only summary\n"
	   "		-h   - print this help\n"
	   "		-q   - quiet mode; no warning messages\n"
	   "		-r   - report then exit (no menu)\n"
	   "		-s css-size  - if no css file found, create new\n"
	   "			       one with this many buckets.\n"
	   "		-S css-size  - same as -s, but round up to next\n"
	   "			       2^n + 1 boundary.\n"
	   "		-v   - print version and exit\n"
	   "		-D   - dump css file to stdout in CSV format.\n"
	   "		-R csv-file  - create and restore css from CSV\n",
	   VERSION);
}

//
//    main - entry point of cssutil.
//
//    Parses the command line, opens (or creates) the named .css file,
//    maps it into memory and then does one of:
//      -D       dump every bucket to stdout as "key;hash;value" lines,
//      -R file  fill a freshly created .css file from such a CSV file,
//      default  print bucket statistics and a value histogram and,
//               unless -r was given, run the interactive menu
//               (zero / subtract / divide / rescan / pack / quit).
//
//    argc, argv - the usual command line; the first non-option
//                 argument is the .css file name.
//
//    Returns 0 on normal completion.  Exits with EXIT_FAILURE on any
//    error, and with EXIT_SUCCESS after printing help or version text.
//
int
main (int argc, char **argv)
{

  long i, k;                    //  some random counters, when we need a loop
  long v;
  long sparse_spectrum_file_length = DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH;
  long user_set_css_length = 0;
  long hfsize;                  //  file size: bytes at first, buckets later
  long long sum;                // sum of the hits... can be _big_.
  int brief = 0, quiet = 0, dump = 0, restore = 0;
  int opt, fields;
  int report_only = 0;

  long *bcounts;                //  histogram; slot [histbins] is "or more"
  long maxchain;
  long curchain;
  long totchain;
  long fbuckets;
  long nchains;
  long zvbins;
  long ofbins;

  long histbins;                // how many bins for the histogram

  char cmdstr[255];             //  one line of menu input
  char cmdtok[sizeof (cmdstr)]; //  first word of that line (the command)
  char cssfile[255];
  char csvfile[255];
  float cmdval;
  int zloop, cmdloop;
  long docs_learned = -1;
  long features_learned = -1;

  //    the following for crm114.h's happiness

  char *newinputbuf;
  newinputbuf = (char *) &hfsize;

  csvfile[0] = '\0';

  histbins = FEATUREBUCKET_VALUE_MAX;
  if (histbins > FEATUREBUCKET_HISTOGRAM_MAX)
    histbins = FEATUREBUCKET_HISTOGRAM_MAX;
  bcounts = malloc (sizeof (*bcounts) * (histbins + 2));
  if (bcounts == NULL)
    {
      fprintf (stderr, "\n Couldn't allocate the histogram buffer.\n");
      exit (EXIT_FAILURE);
    }

  {
    struct stat statbuf;        //  filestat buffer
    FEATUREBUCKET_TYPE *hashes; //  the text of the hash file

    // parse cmdline options ('h' has no case of its own; it lands in
    // the default branch, which prints the help text)
    while ((opt = getopt (argc, argv, "bDhR:rqs:S:v")) != -1)
      {
        switch (opt)
          {
          case 'b':
            brief = 1;          // brief, no 'bin value ...' lines
            break;
          case 'D':
            dump = 1;           // dump css file, no cmd menu
            break;
          case 'q':
            quiet = 1;          // quiet mode, no warning messages
            break;
          case 'R':
            {
              FILE *f;
              unsigned long key, hash, value;

              // the name is copied into csvfile[] below; make sure it fits
              if (strlen (optarg) >= sizeof (csvfile))
                {
                  fprintf (stderr,
                           "\n csv file name %s is too long.\n", optarg);
                  exit (EXIT_FAILURE);
                }

              // count lines to determine number of buckets and check CSV format
              if ((f = fopen (optarg, "rb")) != NULL)
                {
                  sparse_spectrum_file_length = 0;
                  while (!feof (f))
                    if (fscanf (f, "%lu;%lu;%lu\n", &key, &hash, &value) == 3)
                      sparse_spectrum_file_length++;
                    else
                      {
                        fprintf (stderr,
                                 "\n %s is not in the right CSV format.\n",
                                 optarg);
                        exit (EXIT_FAILURE);
                      }
                  fclose (f);
                  strcpy (csvfile, optarg);     // length checked above
                }
              else
                {
                  fprintf (stderr,
                           "\n Couldn't open csv file %s; errno=%d.\n",
                           optarg, errno);
                  exit (EXIT_FAILURE);
                }
            }
            restore = 1;        // restore css file, no cmd menu
            break;
          case 'r':
            report_only = 1;    // print stats only, no cmd menu.
            break;
          case 's':             // set css size to option value
          case 'S':             // same as above but round up to next 2^n+1
            // sscanf returns EOF (nonzero!) on empty input, so compare
            // against 1 explicitly; a bucket count must also be positive.
            if (sscanf (optarg, "%ld", &sparse_spectrum_file_length) == 1
                && sparse_spectrum_file_length > 0)
              {
                if (!quiet)
                  fprintf (stderr,
                           "\nOverride css creation length to %ld\n",
                           sparse_spectrum_file_length);
                user_set_css_length = 1;
              }
            else
              {
                fprintf (stderr,
                         "On -%c flag: Missing or incomprehensible number of buckets.\n",
                         opt);
                exit (EXIT_FAILURE);
              }
            if (opt == 'S')     // round up to next 2^n+1
              {
                // Pure integer search for the smallest 2^n + 1 that is
                // >= the requested size.  (A floating-point log estimate
                // is sensitive to rounding exactly at the boundaries.)
                long pow2 = 1;

                while (pow2 + 1 < sparse_spectrum_file_length)
                  pow2 <<= 1;
                sparse_spectrum_file_length = pow2 + 1;
                user_set_css_length = 1;
              }
            break;
          case 'v':
            fprintf (stderr, " This is cssutil, version %s\n", version);
            fprintf (stderr, " Copyright 2001-2006 W.S.Yerazunis.\n");
            fprintf (stderr,
                     " This software is licensed under the GPL with ABSOLUTELY NO WARRANTY\n");
            exit (EXIT_SUCCESS);
          default:
            helptext ();
            exit (EXIT_SUCCESS);
            break;
          }
      }

    if (optind < argc)
      {
        // refuse names that would not fit (with terminator) in cssfile[]
        if (strlen (argv[optind]) >= sizeof (cssfile))
          {
            fprintf (stderr,
                     "\n css file name %s is too long.\n", argv[optind]);
            exit (EXIT_FAILURE);
          }
        strcpy (cssfile, argv[optind]);
      }
    else
      {
        helptext ();
        exit (EXIT_SUCCESS);
      }

    //       and stat it to get its length
    k = stat (cssfile, &statbuf);
    //       quick check- does the file even exist?
    if (k == 0)
      {
        if (restore)
          {
            fprintf (stderr,
                     "\n.CSS file %s exists! Restore operation aborted.\n",
                     cssfile);
            exit (EXIT_FAILURE);
          }
        hfsize = statbuf.st_size;
        if (!quiet && user_set_css_length)
          fprintf (stderr,
                   "\n.CSS file %s exists; -s, -S options ignored.\n",
                   cssfile);
      }
    else
      {
        //      file didn't exist... create it
        if (!quiet && !restore)
          fprintf (stdout, "\nHad to create .CSS file %s\n", cssfile);
        if (crm_create_cssfile
            (cssfile, sparse_spectrum_file_length, 0, 0, 0) != EXIT_SUCCESS)
          exit (EXIT_FAILURE);
        k = stat (cssfile, &statbuf);
        if (k != 0)
          {
            fprintf (stderr,
                     "\n Couldn't stat new file %s; errno=%d .\n",
                     cssfile, errno);
            exit (EXIT_FAILURE);
          }
        hfsize = statbuf.st_size;
      }
    //
    //   mmap the hash file into memory so we can bitwhack it
    hashes = (FEATUREBUCKET_TYPE *) crm_mmap_file (cssfile,
                                                   0, hfsize,
                                                   PROT_READ | PROT_WRITE,
                                                   MAP_SHARED,
                                                   NULL);
    if (hashes == MAP_FAILED)
      {
        fprintf (stderr,
                 "\n Couldn't open RW file %s; errno=%d .\n", cssfile, errno);
        exit (EXIT_FAILURE);
      }

    //   from now on, hfsize is buckets, not bytes.
    hfsize = statbuf.st_size / sizeof (FEATUREBUCKET_STRUCT);

#ifdef OSB_LEARNCOUNTS
    //       If LEARNCOUNTS is enabled, we normalize with documents-learned.
    //
    //       We use the reserved h2 == 0 setup for the learncount.
    //
    //       (Skipped for a file smaller than one bucket: the slot index
    //       is computed modulo hfsize.)
    //
    if (hfsize > 0)
      {
        char *litf = "Learnings in this file";
        char *fitf = "Features in this file";
        unsigned long hcode, h1;
        //
        hcode = strnhash (litf, strlen (litf));
        h1 = hcode % hfsize;
        if (hashes[h1].hash != hcode)
          {
            // initialize the file?
            if (hashes[h1].hash == 0 && hashes[h1].key == 0)
              {
                hashes[h1].hash = hcode;
                hashes[h1].key = 0;
                hashes[h1].value = 1;
              }
            else
              {
                fprintf (stderr, "\n Minor Caution - this file has the learncount slot in use.\n This is not a problem for Markovian classification, but it will have some\n issues with an OSB classfier.\n");
              }
          }
        docs_learned = hashes[h1].value;
        hcode = strnhash (fitf, strlen (fitf));
        h1 = hcode % hfsize;
        if (hashes[h1].hash != hcode)
          {
            // initialize the file?
            if (hashes[h1].hash == 0 && hashes[h1].key == 0)
              {
                hashes[h1].hash = hcode;
                hashes[h1].key = 0;
                hashes[h1].value = 1;
              }
            else
              {
                fprintf (stderr, "\n Minor Caution - this file has the featurecount slot in use.\n This is not a problem for Markovian classification, but it will have some\n issues with an OSB classfier.\n");
              }
          }
        features_learned = hashes[h1].value;
      }

#endif

    if (dump)
      {
        // dump the css file
        for (i = 0; i < hfsize; i++)
          {
            printf ("%lu;%lu;%lu\n",
                    hashes[i].key, hashes[i].hash, hashes[i].value);
          }
      }

    if (restore)
      {
        FILE *f;

        // restore the css file  - note that if we DIDN'T create
        //  it already, then this will fail.
        //
        if ((f = fopen (csvfile, "rb")) == NULL)
          {
            fprintf (stderr, "\n Couldn't open csv file %s; errno=%d.\n",
                     csvfile, errno);
            exit (EXIT_FAILURE);
          }
        for (i = 0; i < hfsize; i++)
          {
            // stop at the first short or malformed record instead of
            // silently leaving the remaining buckets untouched
            if (fscanf (f, "%lu;%lu;%lu\n",
                        &(hashes[i].key), &(hashes[i].hash),
                        &(hashes[i].value)) != 3)
              {
                fprintf (stderr,
                         "\n Warning: csv file %s ended or was malformed at bucket %ld.\n",
                         csvfile, i);
                break;
              }
          }
        fclose (f);
      }

    //   statistics / menu pass; repeated whenever a menu command asks
    //   for a rescan by setting zloop back to 1
    zloop = 1;
    while (zloop == 1 && !restore && !dump)
      {
        zloop = 0;
        sum = 0;
        maxchain = 0;
        curchain = 0;
        totchain = 0;
        fbuckets = 0;
        nchains = 0;
        zvbins = 0;
        ofbins = 0;
        //   calculate maximum overflow chain length
        //   (a chain is a run of consecutive buckets with nonzero key;
        //   bucket 0 is not scanned)
        for (i = 1; i < hfsize; i++)
          {
            if (hashes[i].key != 0)
              {
                //  only count the non-special buckets for feature count
                sum = sum + hashes[i].value;
                //
                fbuckets++;
                curchain++;
                if (hashes[i].value == 0)
                  zvbins++;
                if (hashes[i].value >= FEATUREBUCKET_VALUE_MAX)
                  ofbins++;
              }
            else
              {
                if (curchain > 0)
                  {
                    nchains++;
                    totchain += curchain;
                    if (curchain > maxchain)
                      maxchain = curchain;
                    curchain = 0;
                  }
              }
          }

        fprintf (stdout, "\n Sparse spectra file %s statistics: \n", cssfile);
        fprintf (stdout, "\n Total available buckets          : %12ld ",
                 hfsize);
        fprintf (stdout, "\n Total buckets in use             : %12ld  ",
                 fbuckets);
        fprintf (stdout, "\n Total in-use zero-count buckets  : %12ld  ",
                 zvbins);
        fprintf (stdout, "\n Total buckets with value >= max  : %12ld  ",
                 ofbins);
        fprintf (stdout, "\n Total hashed datums in file      : %12lld", sum);
        fprintf (stdout, "\n Documents learned                : %12ld  ",
                 docs_learned);
        fprintf (stdout, "\n Features learned                 : %12ld  ",
                 features_learned);
        fprintf (stdout, "\n Average datums per bucket        : %12.2f",
                 (fbuckets > 0) ? (sum * 1.0) / (fbuckets * 1.0) : 0);
        fprintf (stdout, "\n Maximum length of overflow chain : %12ld  ",
                 maxchain);
        fprintf (stdout, "\n Average length of overflow chain : %12.2f ",
                 (nchains > 0) ? (totchain * 1.0) / (nchains * 1.0) : 0);
        fprintf (stdout, "\n Average packing density          : %12.2f\n",
                 (fbuckets * 1.0) / (hfsize * 1.0));

        // set up histograms; slot [histbins] collects every value that
        // is >= histbins, so it has to be cleared (and printed) too
        for (i = 0; i <= histbins; i++)
          bcounts[i] = 0;
        for (v = 1; v < hfsize; v++)
          {
            if (hashes[v].value < histbins)
              {
                bcounts[hashes[v].value]++;
              }
            else
              {
                bcounts[histbins]++;    // note that bcounts is len(histbins+2)
              }
          }

        if (!brief)
          for (i = 0; i <= histbins; i++)
            {
              if (bcounts[i] > 0)
                {
                  if (i < histbins)
                    {
                      fprintf (stdout, "\n bin value %8ld found %9ld times",
                               i, bcounts[i]);
                    }
                  else
                    {
                      fprintf (stdout, "\n bin value %8ld or more found %9ld times",
                               i, bcounts[i]);
                    }
                }
            }

        fprintf (stdout, "\n");
        cmdloop = 1;
        while (!report_only && cmdloop)
          {
            size_t linelen;

            // clear command token
            cmdtok[0] = '\0';
            fprintf (stdout, "Options:\n");
            fprintf (stdout, "   Z n - zero bins at or below a value\n");
            fprintf (stdout, "   S n - subtract a constant from all bins\n");
            fprintf (stdout, "   D n - divide all bins by a constant\n");
            fprintf (stdout, "   R - rescan\n");
            fprintf (stdout, "   P - pack\n");
            fprintf (stdout, "   Q - quit\n");
            fprintf (stdout, ">>> ");
            fflush (stdout);

            // bounded line read; end of input (or a read error) ends
            // the menu instead of spinning on a stale command
            if (fgets (cmdstr, sizeof (cmdstr), stdin) == NULL)
              {
                cmdloop = 0;
                break;
              }
            linelen = strlen (cmdstr);
            if (linelen > 0 && cmdstr[linelen - 1] == '\n')
              cmdstr[linelen - 1] = '\0';
            else
              {
                // line longer than the buffer (or no trailing newline):
                // throw away whatever is left of it
                int c;

                while ((c = getc (stdin)) != EOF && c != '\n')
                  ;
              }

            // %254s: at most sizeof (cmdtok) - 1 characters
            fields = sscanf (cmdstr, "%254s %f", cmdtok, &cmdval);
            if (strlen (cmdtok) != 1)
              {
                fprintf (stdout, "Unknown command: %s\n", cmdtok);
                continue;
              }
            // NOTE(review): negative arguments to Z, S and D are not
            // rejected here - confirm whether they should be.
            switch (tolower ((unsigned char) cmdtok[0]))
              {
              case 'z':
                if (fields != 2)
                  fprintf (stdout,
                           "Z command requires a numeric argument!\n");
                else
                  {
                    fprintf (stdout, "Working...");
                    for (i = 1; i < hfsize; i++)
                      if (hashes[i].value <= cmdval)
                        hashes[i].value = 0;
                    fprintf (stdout, "done.\n");
                  }
                break;
              case 's':
                if (fields != 2)
                  fprintf (stdout,
                           "S command requires a numeric argument!\n");
                else
                  {
                    fprintf (stdout, "Working...");
                    for (i = 1; i < hfsize; i++)
                      {
                        if (hashes[i].value > (int) cmdval)
                          {
                            hashes[i].value = hashes[i].value - cmdval;
                          }
                        else
                          {
                            hashes[i].value = 0;
                          }
                      }
                    fprintf (stdout, "done.\n");
                  }
                break;
              case 'd':
                if (fields != 2)
                  fprintf (stdout,
                           "D command requires a numeric argument!\n");
                else if (cmdval == 0)
                  fprintf (stdout, "You can't divide by zero, nimrod!\n");
                else
                  {
                    fprintf (stdout, "Working...");
                    for (i = 1; i < hfsize; i++)
                      hashes[i].value = hashes[i].value / cmdval;
                    fprintf (stdout, "done.\n");
                  }
                break;
              case 'r':
                zloop = 1;      // redo the statistics pass
                cmdloop = 0;
                break;
              case 'p':
                fprintf (stdout, "Working...");
                crm_packcss (hashes, NULL, hfsize, 1, hfsize - 1);
                zloop = 1;      // show fresh statistics after packing
                cmdloop = 0;
                break;
              case 'q':
                fprintf (stdout, "Bye! \n");
                cmdloop = 0;
                break;
              default:
                fprintf (stdout, "Unknown command: %c\n", cmdtok[0]);
                break;
              }
          }
      }

    crm_munmap_file ((void *) hashes);
  }
  free (bcounts);
  return 0;
}


//  syntax highlighted by Code2HTML, v. 0.9.1