//  ports//mail/crm114/work/crm114-20070810-BlameTheSegfault.src/crm_stmt

//  crm_preprocessor.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;

//
//       the actual textual representations of the flags, with their values
//     DON'T FORGET TO ALSO MODIFY THIS IN crm114_structs.h !!

FLAG_DEF crm_flags[41] =
  {
    {"fromstart", CRM_FROMSTART},
    {"fromnext", CRM_FROMNEXT},
    {"fromend", CRM_FROMEND},
    {"newend", CRM_NEWEND},
    {"fromcurrent", CRM_FROMCURRENT},
    {"nocase", CRM_NOCASE},
    {"absent", CRM_ABSENT},
    {"basic", CRM_BASIC},
    {"backwards", CRM_BACKWARDS},
    {"literal", CRM_LITERAL},
    {"nomultiline", CRM_BYLINE},
    {"byline", CRM_BYLINE},
    {"bychar", CRM_BYCHAR},
    {"bychunk", CRM_BYCHUNK},
    {"byeof", CRM_BYEOF},
    {"eofaccepts", CRM_EOFACCEPTS},
    {"eofretry", CRM_EOFRETRY},
    {"append", CRM_APPEND},
    {"keep", CRM_KEEP},
    {"async", CRM_ASYNC},
    {"refute", CRM_REFUTE},
    {"microgroom", CRM_MICROGROOM},
    {"markovian", CRM_MARKOVIAN},
    {"osb", CRM_OSB_BAYES},
    {"correlate", CRM_CORRELATE},
    {"winnow", CRM_OSB_WINNOW},
    {"unique", CRM_UNIQUE},
    {"chi2", CRM_CHI2},
    {"entropy", CRM_ENTROPY},
    {"entropic", CRM_ENTROPY},  
    {"osbf", CRM_OSBF },
    {"hyperspace", CRM_HYPERSPACE},
    {"unigram", CRM_UNIGRAM},
    {"crosslink", CRM_CROSSLINK},
    {"default", CRM_DEFAULT},
    {"lineedit", CRM_READLINE},
    {"sks", CRM_SKS},
    {"svm", CRM_SVM},
    {"fscm", CRM_FSCM},
    {"", 0}
  };
    
#define CRM_MAXFLAGS 39
    



//    The magic flag parser.  Given a string of input, return the
//    codes that were found as the (long int) return value.  If an
//    unrecognized code is found, squalk an error (whether it is fatal
//    or not is another issue)
//
//    Note that since flags (like variables) are always ASCII, we don't
//    need to worry about 8-bit-safety.
//      
unsigned long long crm_flagparse (char *input, long inlen)  //  the user input
{
  char flagtext [MAX_PATTERN];
  char *remtext;
  long remlen;
  char *wtext;
  long flagsearch_start_here;
  long wstart;
  long wlen;
  unsigned long long outcode;

  int done;
  int i;
  int j;
  int k;
  int recog_flag;
  
  outcode = 0;

  memmove (flagtext, input, inlen);
  flagtext[inlen] = '\000';

  if (internal_trace)
    fprintf (stderr, "Flag string: %s\n", flagtext);

  //  now loop on thru the nextwords,
  remtext = flagtext;
  done = 0;
  remlen = inlen;
  wstart = 0;
  wlen = 0;
  flagsearch_start_here = 0;
  while (!done && remlen > 0)
    {
      i=crm_nextword (remtext, remlen, flagsearch_start_here, &wstart, &wlen);
      flagsearch_start_here = wstart + wlen + 1; 
      if (wlen > 0)
	{
	  //    We got a word, so aim wtext at it
	  wtext = &(remtext[wstart]);
	  if (internal_trace)
	    {
	      fprintf (stderr, "found flag, len %ld: ", wlen) ;
	      for (j = 0; j < wlen; j++) fprintf (stderr, "%c", wtext[j]);
	      fprintf (stderr, "\n");
	    };
	  
	  //    find sch in our table, squalk a nonfatal/fatal if necessary.
	  recog_flag = 0;
	  for (j = 0; j <= CRM_MAXFLAGS; j++)
	    {
	      // fprintf (stderr, " Trying %s (%ld) \n", crm_flags[j].string, crm_flags[j].value );
	      k = strlen (crm_flags[j].string);
	      if (k == wlen 
		  && 0 == strncasecmp (wtext, crm_flags[j].string, k))
		{
		  //    mark this flag as valid so we don't squalk an error
		  recog_flag = 1;
		  //     and OR this into our outcode
		  outcode = outcode | crm_flags[j].value;
		  if (user_trace) 
		    {
		      fprintf (stderr, "Mode #%d, '%s' turned on. \n",
			       j,
			       crm_flags[j].string);
		    };
		};
	    };
	  
	  //   check to see if we need to squalk an error condition
	  if (recog_flag == 0)
	    {
	      long q;
	      char foo[1024];
	      strncpy (foo, wtext, 128);
	      foo[wlen] = '\000';
	      q = nonfatalerror ("Darn...  unrecognized flag :", foo);
	    };
	  
	  
	  //  and finally,  move sch up to point at the remaining string
	  if (remlen <= 0) done = 1;
	}
      else
	done = 1;
    };
  
  if (internal_trace )
    fprintf (stderr, "Flag code is : %llx\n", outcode);
  
  return (outcode);
}

//     Get the next word in a string.  "word" is defined by the 
//     continuous span of characters that are above ascii ! (> hex 0x20
//
//     The search starts at the "start" position given; the start position
//     is updated on each call and so is mutilated.  To step through a
//     arglist, you must add the returned value of "len" to the returned
//     value of start!
//     
//     The returned value is 0/1 as to whether we found
//     a valid word, and *start and *length, which give it's position.
//
long crm_nextword ( char *input,
		    long inlen,
		    long starthere,
		    long *start,
		    long *len)
{	
  *start = starthere;
  *len = 0;
  //   find start of string (if it exists)
  while (*start < inlen && input [*start] <= 0x20 ) *start = *start + 1;

  //  check - did we hit the end and still be invalid?  If so, return 0
  if (*start == inlen) return (0);

  //    if we get to here, then we have a valid string.
  *len = 0;
  while ((*start+*len) < inlen 
	 && input [*start+*len] > 0x20 ) *len = *len + 1;
  
  return ( (*len) > 0);
}



//
//    experimental code for a statement-type-sensitive parser.
//   Not in use yet... but someday... goal is to provide better error 
//   detection.

int crm_profiled_statement_parse ( char *in,
				   long slen,
				   ARGPARSE_BLOCK *apb,
				   long amin, long amax,
				   long pmin, long pmax,
				   long bmin, long bmax,
				   long smin, long smax)
{
  return (0);
}

//      parse a CRM114 statement; this is mostly a setup routine for 
//     the generic parser.

int crm_statement_parse ( char *in, 
			  long slen,
			  ARGPARSE_BLOCK *apb)
{
#define CRM_STATEMENT_PARSE_MAXARG 10
  int i,  k;

  long ftype[CRM_STATEMENT_PARSE_MAXARG];
  long fstart[CRM_STATEMENT_PARSE_MAXARG];
  long flen [CRM_STATEMENT_PARSE_MAXARG];

  //     we call the generic parser with the right args to slice and
  //     dice the incoming statement into declension-delimited parts
  k = crm_generic_parse_line ( in, 
			       slen,
			       "<([/",
			       ">)]/",
			       "\\\\\\\\",      // this is four backslashes
			       CRM_STATEMENT_PARSE_MAXARG,
			       ftype,
			       fstart,
			       flen);

  //   now we have all these nice chunks... we split them up into the
  //   various allowed categories.


  //   start out with empties on each possible chunk
  apb->a1start = NULL; apb->a1len = 0;
  apb->p1start = NULL; apb->p1len = 0; 
  apb->p2start = NULL; apb->p2len = 0;
  apb->p3start = NULL; apb->p3len = 0;
  apb->b1start = NULL; apb->b1len = 0;
  apb->s1start = NULL; apb->s1len = 0;
  apb->s2start = NULL; apb->s2len = 0;

  //   Scan through the incoming chunks
  for (i = 0; i < k; i++)
    {
      switch (ftype[i])
       	{
	case CRM_ANGLES:
	  {   
	    //  Grab the angles, if we don't have one already 
	    if (apb->a1start == NULL)
	      {
		apb->a1start = &in[fstart[i]];
		apb->a1len = flen [i];
	      }
	    else nonfatalerror 
		   ("There are multiple flag sets on this line.",
		    " ignoring all but the first");
	  }
	  break;
	case CRM_PARENS:
	  {
	    //  grab a set of parens, cascading till we find an one
	    if (apb->p1start == NULL)
	      {
		apb->p1start = &in[fstart[i]];
		apb->p1len = flen [i];
	      }
	    else
	      if (apb->p2start == NULL)
		{
		  apb->p2start = &in[fstart[i]];
		  apb->p2len = flen [i];
		}
	      else
		if (apb->p3start == NULL)
		  {
		    apb->p3start = &in[fstart[i]];
		    apb->p3len = flen [i];
		  }
		else
		  nonfatalerror 
		    ("Too many parenthesized varlists.",
		     "ignoring the excess varlists.");
	  }
	  break;
	case CRM_BOXES:
	  {   
	    //  Grab the angles, if we don't have one already 
	    if (apb->b1start == NULL)
	      {
		apb->b1start = &in[fstart[i]];
		apb->b1len = flen [i];
	      }
	    else nonfatalerror 
		   ("There are multiple domain limits on this line.",
		    " ignoring all but the first");
	  }
	  break;
	case CRM_SLASHES:
	  {
	    //  grab a set of parens, cascading till we find an one
	    if (apb->s1start == NULL)
	      {
		apb->s1start = &in[fstart[i]];
		apb->s1len = flen [i];
	      }
	    else
	      if (apb->s2start == NULL)
		{
		  apb->s2start = &in[fstart[i]];
		  apb->s2len = flen [i];
		}
	      else
		nonfatalerror (
		       "There are too many regex sets in this statement,",
	          " ignoring all but the first.");
	  }
	  break;
	default:
	  fatalerror( "Declensional parser returned an undefined typecode!",
		      "What the HECK did you do to cause this?");
	};
    }
  return (k);    // return value is how many declensional arguments we found.
};


//     The new and improved line core parser routine.  Instead of
//     being totally ad hoc, this new parser actually retains context
//     durng the parse.
//
//     this hopefully will keep the parser from getting confused by [] in
//     the slash matching and other such abominations.
//
//     (one way to view this style of parsing is that each arg in a 
//     CRM114 statement is "declined" by it's delimiters to determine 
//     what role this variable is to play in the statement.  Kinda like
//     Latin - to a major extent, you can mix the parts around and it 
//     won't make any difference.

int crm_generic_parse_line ( 
		    char *txt,       //   the start of the program line
		    long len,        //   how long is the line
		    char *schars,    //   characters that can "start" an arg
		    char *fchars,    //   characters that "finish" an arg
		    char *echars,    //   characters that escape in an arg
		    long maxargs,    //   howm many things to search for (max)
		    long *ftype,     //   type of thing found (index by schars)
		    long *fstart,    //   starting location of found arg
		    long *flen       //   length of found arg
		    )
{
  //    the general algorithm here is to move along the input line,
  //    looking for one of the characters in schars.  When we find it,
  //    we lock onto that and commit to finding an arg of that type.
  //    We then start scanning ahead keeping count of schars minus echars.
  //    when the count hits zero, it's end for that arg and we move onward
  //    to the next arg, with the same procedure.  
  //
  //    note that when we are scanning for a new arg, we are open to args
  //    of any type (as defined by the members of schars, while in an arg
  //    we are looking only for the unescaped outstanding echar and are blind
  //    to everything else.  
  //    
  //    when not in an arg, we do not have any escape character active.
  //    
  //     We return the number of args found

  long chidx;
  char curchar;
  long argc;
  long i; 
  long itype;
  long depth;

  //    zeroize the outputs to start...
  for (i = 0; i < maxargs; i++)
    { 
      ftype[i] = -1;
      fstart[i] = 0;
      flen[i] = 0;
    };
    

  //    scan forward, looking for any member of schars

  depth = 0;
  chidx = -1;
  argc = 0;
  itype = -1;

  if (internal_trace)
    {
      fprintf (stderr, " declensional parsing for %ld chars on: ", len);
      for (i = 0; i < len; i++)
	fprintf (stderr, "%c", txt[i]);
      fprintf (stderr, "\n");
    }

  while (chidx < len  &&  argc <= maxargs)
    {
      chidx++;
      curchar = txt[chidx];
      if (itype == -1)     // are we looking for an argstart char?
	{
	  //    is curchar one of the start chars?  (this is 8-bit-safe,
	  //     because schars is always normal ASCII)
	  for (i = 0; i < strlen (schars); i++)
	    if (curchar == schars[i])
	      {
		if (internal_trace)
		  fprintf (stderr, "   found opener %c at %ld,",curchar,chidx);
		itype = i;
		fstart[argc] = chidx + 1;
		ftype [argc] = itype;
		depth = 1;
	      };
	  //  if it wasn't a start-character for an arg, we are done.
	}
      else    // nope, we're in an arg, so we check for unescaped schar
	     // and fchar characers
	{
	  //  if (curchar == fchars [itype] && txt[chidx-1] != echars[itype])
          if (curchar == fchars [itype] 
	      && (txt[chidx-1] != echars[itype] 
		  || txt[chidx-1] == txt[chidx-2]))
	    {
	      depth--;
	      if (depth == 0)
		{
		  //   we've found the end of the text arg.  Close it off and
		  //   note it into the output vectors
		  flen [argc] = chidx - fstart[argc] ;
		  if (internal_trace)
		    {
		      int q;
		      fprintf (stderr, " close %c at %ld --", curchar, chidx);
		      for (q = fstart[argc]; q < fstart[argc]+flen[argc]; q++)
			fprintf (stderr, "%c", txt[q]);
		      fprintf (stderr, "-- len %ld\n", flen[argc]);
		    };
		  itype = -1;
		  argc++;
		};
	    }
	  else
	    //if (curchar == schars [itype] && txt[chidx-1] != echars[itype])
	    if (curchar == schars [itype] 
		&& (txt[chidx-1] != echars[itype] 
		    || txt[chidx-1] == txt[chidx-2]))
	      {
		depth++;
	      };
	};
      //    if we weren't a schar or an unexcaped echar, we're done!
    };
  if (depth != 0)
    {
      char errstmt[MAX_PATTERN];
      flen[argc] = chidx - fstart[argc];
      //  
      //   GROT GROT GROT Somehow, sometimes we get flen[argc] < 0.   It's
      //   always with buggy userprograms, but we shouldn't need this anyway.
      //   So, until we find out what _we_ are doing wrong, leave the check
      //   for flen[argc] < 0 in here.
      //
      if (flen[argc] < 0) flen[argc] = 0;
      strncpy ( errstmt, &txt[fstart[argc]], 
		flen[argc] );
      nonfatalerror (" This operand doesn't seem to end.  Bug?  \n -->  ",
		     errstmt);
      argc++;
    };
  return (argc);
}

//    and to avoid all the mumbo-jumbo, an easy way to get a copy of
//    an arg found by the declensional parser.
void crm_get_pgm_arg (char *to, long tolen, char *from, long fromlen) 
{
  long len;
  
  if (to == NULL)
    return;

  if (from == NULL)
    {
      to[0] = '\000';
    }
  else
    {
      len = tolen - 1;
      if (len > fromlen ) len = fromlen ;
      memmove (to, from, len);
      to[len] = '\000';
    }
}
//  syntax highlighted by Code2HTML, v. 0.9.1