//  crm_expr_match.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;

//        And the match routine.  What a humungous mess...
int crm_expr_match (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
  long i; 
  long j;
  long k;
  long mc;
  char pch[MAX_PATTERN];
  long pchlen;
  char errstr[MAX_PATTERN+128];
  char *mdwptr;
  regex_t preg;
  int casep, nomultilinep, absentp, fromp, extended_regex_p, literal_pattern_p;
  int cflags, eflags;
  char *mtext;
  long mtextlen;
  long textoffset;
  char bindable_vars[MAX_PATTERN]; 
  long bindable_vars_len;
  char box_text[MAX_PATTERN];
  regmatch_t matches [MAX_SUBREGEX];
  long nmatches;
  long source_start;
  long source_len;
  long vtextoffset, vtextend, vtextstartlimit;
  long vpmstart, vpmend;
  long vmidx;
  //    And it all comes down to this, right here.  Matching a regex.
  //    This is the cruxpoint of the whole system.  We parse the
  //    program line args, get the flags out of the <> brackets, get the
  //    bound values out of the () parens, and the regex pattern out
  //    of the // delimiters.  Then we regcomp the pattern, and apply
  //    it to the data window (or to the variable if one is supplied).
  //    Then we either pass thru or fail to the failpoint.
  //
  //    get the flags out of the <> brackets
  //        cflags = REG_EXTENDED + REG_ICASE + REG_NEWLINE;
  //
  //  Translate to the cflags REG_EXTENDED, REG_ICASE, REG_NEWLINE
  //  (newline doesn't match wldcrd) and the eflags REG_NOTBOL (no
  //  newline at start) REG_NOTEOL (no newline at end)

  casep = 0;
  nomultilinep = 0;
  absentp = 0;
  fromp = 0;
  extended_regex_p = 1;
  literal_pattern_p = 0;

  //       Go through the flags
  //      is the ignore case flag set?
  if (apb->sflags & CRM_NOCASE)
    {
      if (user_trace)
	fprintf (stderr, "  nocase turned on...\n");
      casep = 1;
    };
  //      is the "basic regex" (obsolete, but useful) flag set?
  if (apb->sflags & CRM_BASIC)
    {
      if (user_trace)
	fprintf (stderr, "  basic regex match turned on...\n");
      extended_regex_p = 0;
    };

  if (apb->sflags & CRM_NOMULTILINE)
    {
      if (user_trace)
	fprintf (stderr, "  nomultiline turned on...\n");
      nomultilinep = 1;
    };
  
  if (apb->sflags & CRM_ABSENT)
    {
      if (user_trace)
	fprintf (stderr, "  absent flag turned on...\n");
      absentp = 1;
    };

  if (apb->sflags & CRM_LITERAL)
    {
      if (user_trace)
	fprintf (stderr, "  literal pattern search turned on...\n");
      literal_pattern_p = 1;
    };

  
  //  default is NO special fromming...
  fromp = 0;

  if (apb->sflags & CRM_FROMSTART)
    {
      if (user_trace)
	fprintf (stderr, "  fromstart turned on...\n");
      fromp = CRM_FROMSTART;
    };

  if (apb->sflags & CRM_FROMNEXT)
    {
      if (user_trace)
	fprintf (stderr, "  fromnext turned on...\n");
      fromp = CRM_FROMNEXT;
    };
  
  if (apb->sflags & CRM_FROMEND)
    {
      if (user_trace)
	fprintf (stderr, "  fromend turned on...\n");
      fromp = CRM_FROMEND;
    };
  
  if (apb->sflags & CRM_NEWEND)
    {
      if (user_trace)
	fprintf (stderr, "  newend turned on...\n");
      fromp = CRM_NEWEND;
    };

  if (apb->sflags & CRM_BACKWARDS)
    {
      if (user_trace)
	fprintf (stderr, "  backwards search turned on...\n");
      fromp = CRM_BACKWARDS;
    };
  
  if (apb->sflags & CRM_FROMCURRENT)
    {
      if (user_trace)
	fprintf (stderr, "  from-current search turned on...\n");
      fromp = CRM_FROMCURRENT;
    };
  
  //   Now, from the flags, calculate the cflags and eflags
  cflags = casep * REG_ICASE 
    + nomultilinep * REG_NEWLINE 
    + extended_regex_p * REG_EXTENDED
    + literal_pattern_p * REG_LITERAL;
  
  eflags = 0; //  
  
  //    get the bound values out of the () parenthesis
  //
  //   DANGER WILL ROBINSON!!  TAKE COVER, DOCTOR SMITH!!!  We
  //   have to be really careful here, because we need durable
  //   variable names to reference the VHT to, and an
  //   expandvar'ed variable doesn't have that durable text
  //   string somewhere.  So, we have to stuff the variable name
  //   in as a temp var and then immediately reassign it.
  //    
  crm_get_pgm_arg (bindable_vars, MAX_PATTERN, apb->p1start, apb->p1len);
  if (internal_trace)
    fprintf (stderr, " bindable vars: ***%s***\n", bindable_vars);
  bindable_vars_len = crm_nexpandvar (bindable_vars, apb->p1len, MAX_PATTERN);
  
  
  //     here's where we look for a [] var-restriction
  //
  //     Experimentally, we're adding [ :foo: 123 456 ] to 
  //     allow an externally specified start and length.
  crm_get_pgm_arg (box_text, MAX_PATTERN, apb->b1start, apb->b1len);

  //  Use crm_restrictvar to get start & length to look at.
  i = crm_restrictvar(box_text, apb->b1len, 
		      &vmidx,
		      &mdwptr,
		      &source_start, 
		      &source_len,
		      errstr);

  if (internal_trace)
    fprintf (stderr, 
	     "restricted: vmidx: %ld  mdw: %ld   start: %ld  len: %ld\n",
	     vmidx, (long) mdwptr, source_start, source_len);
  if ( i < 0)
    {
      long curstmt;
      curstmt = csl->cstmt;
      if (i == -1)
	nonfatalerror (errstr, "");
      if (i == -2)
	fatalerror (errstr, "");
      //
      //     did the FAULT handler change the next statement to execute?
      //     If so, continue from there, otherwise, we FAIL.
      if (curstmt == csl->cstmt)
	{
	  csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
	  csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1;
	};
      goto nonfatal_route_outwards;
    };
  
  //    get the regex pattern out of the // slashes
  crm_get_pgm_arg (pch, MAX_PATTERN, apb->s1start, apb->s1len);
  if (internal_trace)
    fprintf (stderr, " match pattern: =%s=\n", pch );
  pchlen = crm_nexpandvar (pch, apb->s1len, MAX_PATTERN);

  if (user_trace)
    fprintf (stderr, " match pattern expands to =%s= len %ld flags %x %x \n",
	     pch, pchlen, cflags, eflags);
  
  //    regcomp the pattern
  i = crm_regcomp (&preg, pch, pchlen, cflags);
  if ( i > 0)
    {
      long curstmt;
      curstmt = csl->cstmt;
      crm_regerror ( i, &preg, tempbuf, data_window_size);
      fatalerror ("Regular Expression Compilation Problem:", tempbuf);
      //
      //     did the FAULT handler change the next statement to execute?
      //     If so, continue from there, otherwise, we FAIL.
      if (curstmt == csl->cstmt)
	{
	  csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
	  csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1;
	};
      goto nonfatal_route_outwards;
    };


  //     Get the string to be matched upon...
#ifdef SUPERCEDED_BY_CRM_RESTRICTVAR
  vmidx = crm_vht_lookup (vht, svname, svnamelen);
  if (vht[vmidx] == NULL)
    {
      //   
      //     There was no such variable, so we need to fail.  First,
      //     we'll save the current and fail locations, then we'll let
      //     the error handler attempt fixup.  If the handler exists,
      //     and changes the FAIL location, the handler's result
      //     stands, otherwise the match does a FAIL.
      //
      long curstmt;
      curstmt = csl->cstmt;
      nonfatalerror (" Attempt to match inside nonexistent variable ( always fails!) ",
		     svname);
      //
      //     did the FAULT handler change the next statement to execute?
      //     If so, continue from there, otherwise, we FAIL.
      if (curstmt == csl->cstmt)
	{
	  csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
	  csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1;
	};
      goto nonfatal_route_outwards;
    }
#endif 

  //    do a check- if the vht->valtxt points into the
  //    cdw, or the tdw?
  mdw = cdw;    //  assume cdw unless otherwise proven...
  if (vht[vmidx]->valtxt == tdw->filetext)
    mdw=tdw;
  //  sanity check - must be tdw or cdw for searching!
  if (vht[vmidx]->valtxt != tdw->filetext
      && vht[vmidx]->valtxt != cdw->filetext)
    {
      long q;
      q = fatalerror ("Bogus text block (neither cdw nor tdw) on var ",
		box_text);
      if (q != 0)
	{
	  if (engine_exit_base != 0)
	    {
	      exit (engine_exit_base + 7);
	    }
	  else
	    exit (EXIT_FAILURE);
	};
    };
#ifdef SUPERCEDED
  vtextoffset = vht[vmidx]->vstart;
  vtextend = vtextoffset + vht[vmidx]->vlen;
  vpmstart = vht[vmidx]->mstart;
  vpmend =  vpmstart + vht[vmidx]->mlen; 
#endif

  vtextoffset = source_start;
  vtextend = source_start + source_len;
  vpmstart = vht[vmidx]->mstart;
  vpmend = vpmstart + vht[vmidx]->mlen;
  
  //    set up the start/end of the text we're matching against
  
  //       default is CRM_FROMSTART 
  textoffset = vtextoffset;
  if (fromp == CRM_FROMSTART)
    {
      textoffset = vtextoffset;
    }
  if (fromp == CRM_FROMCURRENT)
    {    
      textoffset = vpmstart;
    };
  if (fromp == CRM_NEWEND)
    {
      textoffset = vpmstart;
    };
  if (fromp == CRM_FROMNEXT)
    {
      textoffset = vpmstart + 1 ;
    };
  if (fromp == CRM_FROMEND)
    {
      textoffset = vpmend ;
    };
  if (fromp == CRM_BACKWARDS)
    { 
      if ( vpmstart > 0) 
	{ textoffset = vpmstart - 1; } else {textoffset = vpmstart; };
    };

  mtextlen = vtextend - textoffset;

#ifdef  SUPERCEDED_BY_CRM_RESTRICTVAR
  //
  //    did the user box-specify a different start?  Combine the restrictions!
  //
  //     1) start of search - must be inside vtextoffset + box_start
  if (textoffset < vtextoffset + box_start) 
    textoffset = box_start + vtextoffset;
  //
  //     2) the area searched must be <= box_start+box_len + vtextoffset
  if (mtextlen+textoffset > box_start+box_length+vtextoffset) 
    mtextlen = box_start + box_length +vtextoffset - textoffset;
  //
  //    3) the earliest point we'll allow a search to go is the start of
  //       the variable + box_start
  vtextstartlimit = vtextoffset + box_start;
#endif
  vtextstartlimit = source_start;
  if (internal_trace)
    {
      fprintf (stderr, "    start matchable zone: %ld, begin search %ld, length %ld\n",
	       vtextstartlimit, textoffset, mtextlen);
    };

  //       Here is the crux.  Do the REGEX match, maybe in a loop
  //       if we're iterating to find a result with a different end 
  //       point than previous matches.
  nmatches = MAX_SUBREGEX;
  matches[0].rm_so = 0;
  matches[0].rm_eo = 0;
  switch (fromp)
    {
    case CRM_NEWEND:
      {
	long oldend;
	long done;
	oldend = vpmend;
	done = 0;
	//        loop until we either get a match that goes 
	//        past the previous match, or until we are 
	//        at the end of the matchable text.
	while (textoffset <= vtextend - 1 
	       && done == 0)
	  {
	    textoffset++;
	    mtextlen--;
	    mtext = &mdw->filetext[textoffset];
	    i = crm_regexec (&preg, mtext, mtextlen,
			     nmatches, matches, eflags, NULL);
	    j = matches[0].rm_eo;
	    if (( (textoffset + j) > oldend) && (i == 0)) done = 1;
	  };
      };
      break;
    case CRM_BACKWARDS:
      {
	long oldstart;
	oldstart = vpmstart;
	i = -1;
	j = oldstart + 1;
	matches[0].rm_so = j;
	//        loop until we either get a match or until we have hit
	//        the start of this (possibly captured-variable) region.
	while (textoffset > vtextstartlimit
	       && (i != 0 || j > oldstart)  )
	  {
	    textoffset--;
	    mtextlen++;
	    mtext = &mdw->filetext[textoffset];
	    i = crm_regexec (&preg, mtext, mtextlen,
			     nmatches, matches, eflags, NULL);
	    j = matches[0].rm_so;
	  };
      };
    default:
      {
	mtext = &mdw->filetext[textoffset];
	i = crm_regexec ( &preg, mtext, mtextlen, 
			  nmatches, matches, eflags, NULL);
      };
    };
  
  crm_regfree (&preg);
  
  //    and now we FAIL or not...


  if ((absentp == 0 && i != 0) || (absentp == 1 && i == 0))
    {
      if (user_trace && !absentp) 
	fprintf (stderr, "Regex did not match, no absent flag, failing.\n");
      if (user_trace && absentp) 
	fprintf (stderr, "Regex matched but with absent flag, failing.\n");
      csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
      csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1;
    }
  else
    {
      if (user_trace)
	fprintf (stderr, "Regex matched.\n");
      //     if the match was succcessful, we may need to 
      //     bind some variables, so see if there's a () 
      //     (note that we cannot use the grab_delimited_string
      //     routine results here, because crm_setvar uses 
      //     indices into the program to define variable names,
      //     not char * strings.   We _could_ cheat and malloc
      //     the variable name (perhaps we should!) and then
      //     clean up when we're done, but this at least will
      //     work for now.
      
      //   CAREFUL HERE - we need to correct due to offsets
      //   from the front of the text

      //    Double careful- if some of the regex wasn't used
      //    (such as submatches) the indices and lengths will be -1.
      //    Don't bind those.  
      
      //     set the "last match was here" data...
      vht[vmidx]-> mstart = matches[0].rm_so + textoffset;
      vht[vmidx]-> mlen = matches[0].rm_eo - matches[0].rm_so;

      if (bindable_vars_len > 0 && absentp )
	nonfatalerror ("This program specifies an 'absent' match, and also "
		       "tries to bind variables that, by the above, aren't "
		       "matched!  ",
		       "We'll ignore these variable bindings for now.");
      
      if ( bindable_vars_len > 0 && !absentp )
	{
	  long vstart;
	  long vlen;
	  long vnext;
	  long done;
	  //              a place to put pre-rebind
	  //              text/starts/lengths, so we can run
	  //              reclamation on them later on.
	  //   
	  char *index_texts[MAX_SUBREGEX];
	  long index_starts[MAX_SUBREGEX], 
	    index_lengths[MAX_SUBREGEX];
	  
	  done = 0;          //  loop till we've captured all the vars
	  mc = 0;
	  vstart = 0;
	  	  while ( !done)
	    {              // bind each variable
	      //    find the start of the variable
	      while (bindable_vars[vstart] > 0x0
		     && bindable_vars[vstart] < 0x021 
		     && bindable_vars[vstart] != ')' )
		vstart++;
	      if (bindable_vars[vstart] == ')'
		  || bindable_vars[vstart] == 0x0 )
		{
		  done = 1;
		}
	      else
		{
		  
		  //    Now, the next space or ) ends the variable
		  vlen = 0;
		  while (bindable_vars[vstart+vlen] >=0x21 
			 && bindable_vars[vstart+vlen] != ')' )
		    vlen++;
		  //    have the next variable name, put out debug info.
		  if (internal_trace)
		    {
		      fprintf (stderr, "variable -");
		      for (k = 0; k < vlen; k++)
			fprintf (stderr, "%c", bindable_vars[vstart+k]);
		      fprintf (stderr, "- will be assigned from var offsets %ld to %ld "
			       "(origin offsets %ld to %ld), value ",
			       (long) matches[mc].rm_so, 
			       (long) matches[mc].rm_eo,
			       matches[mc].rm_so + textoffset,
			       matches[mc].rm_eo + textoffset );
		      for (k = matches[mc].rm_so + textoffset; 
			   k < matches[mc].rm_eo + textoffset; 
			   k++)
			fprintf (stderr, "%c", mdw->filetext[k]);
		      fprintf (stderr, "\n");
		    };
		  vnext = vstart + vlen;
		  //  HERE'S THE DANGEROUS PART..  because varible
		  //  names have been expanded, we can't assume
		  //  that the variablename in the program text
		  //  will be usable.  So, we create the varname as a temp
		  //  var, and then can reassign it with impunity.
		  {
		    static char *vn;
		    
		    //   DANGER here - we malloc the var, use it
		    //   in crm_set_windowed_nvar, and then free it.
		    //   Otherwise, we'd have a memory leak.
		    //
		    if (!vn)
		      vn = (char *) malloc (MAX_VARNAME+16);
		    if (!vn)
		      untrappableerror("Couldn't malloc vn.\n Can't fix that.","");
		    strncpy (vn, &(bindable_vars[vstart]), vlen);
		    vn[vlen] = '\000';
		    if (strcmp (vn, ":_dw:") != 0)
		      {
			{
			  long vi;
			  vi = crm_vht_lookup (vht, vn, vlen);
			  if (vht[vi] == NULL)
			    {
			      index_texts[mc] = 0;
			      index_starts[mc] = 0;
			      index_lengths[mc] = 0;
			    }
			  else
			    {
			      index_texts[mc] = vht[vi]->valtxt;
			      index_starts[mc] = vht[vi]->vstart;
			      index_lengths[mc] = vht[vi]->vlen;
			    };
			};
			//    watch out for nonparticipating () submatches...
			//    (that is, submatches that weren't used because
			//     of a|(b(c)) regexes.  These have .rm_so offsets
			//      of < 0 .
			if (matches[mc].rm_so >= 0)
			  crm_set_windowed_nvar (vn, 
						 vlen,
						 mdw->filetext, 
						 matches[mc].rm_so 
						 + textoffset,
						 matches[mc].rm_eo
						 -matches[mc].rm_so,
						 csl->cstmt);
		      }
		    else
		      {
			nonfatalerror ("This program tried to re-define the "
				       "data window!  That's very deep and "
				       "profound, but not acceptable.  ", 
				       "Therefore, I'm ignoring this "
				       "re-definition.");
		      };
		    // free (vn);
		  };
		  //    and move on to the next binding (if any)
		  vstart = vnext;
		  mc++;
		  if (mc >= MAX_SUBREGEX)
		    nonfatalerror (
				   "Exceeded MAX_SUBREGEX limit-too many parens in match",
				   " Looks like you blew the gaskets on 'er.\n"); 
		  
		};
	    };
	  //
	  //      Now do cleanup/reclamation of old memory space, if needed.
	  //
	  //      Nasty trick here - we have to do these reclamations
	  //      in a specific order, because during reclamation, the
	  //      indicies we have in the index_starts will become
	  //      altered in ways we don't have the ability to know
	  //      here.  So, we need to do the greatest index_starts
	  //      first, so that earlier index_starts won't be
	  //      damaged.  If we do them last-first, then prior ones
	  //      will still have correct starts and lengths for the
	  //      reclamation.
          //
	  //      Note that we don't have to worry about a reclaim on
	  //      a var that was "non-participating", as the var will 
          //      still be in use in the VHT and thus won't be reclaimed.
	  {
	    long i;
	    long done = 0;
	    long reclaimed;
	    long maxstart, maxi;
	    //	    fprintf (stderr, "MC is %ld\n", mc);
	    while (!done)
	      {
		maxstart = 0;
		maxi = -1;
		for (i = 0; i < mc; i++)
		  {
		    if (index_texts[i] == tdw->filetext)
		      {
			if (maxstart < index_starts[i])
			  {
			    maxi = i;
			    maxstart = index_starts[i];
			  };
		      }
		  };
		//  Now we know the last reclaim area; we can safely
		//  reclaim that area (and no other)
		if (maxi >= 0)
		  {
		    long j;
		    if (internal_trace)
		      fprintf (stderr," crm_comprss_tdw_section from match\n");
		    // because the prev shortening, current index_starts[maxi] 
		    // + index_lengths[maxi] may go past the end of tdw->nchars
		    j = index_starts[maxi] + index_lengths[maxi];
		    if (j > tdw->nchars - 1) j = tdw->nchars;
		    reclaimed = crm_compress_tdw_section
		      (index_texts[maxi],
		       index_starts[maxi],
		       j);
		    // WAS index_starts[maxi] + index_lengths[maxi] );
		    if (internal_trace)
		      fprintf (stderr,
			       " [ MatchVar #%ld (s: %ld l: %ld) reclaimed %ld. ]\n",
			       maxi, index_starts[maxi], index_lengths[maxi], 
			       reclaimed);
		    //   and zap out the reclaimed entry, so other entries
		    //   can also be reclaimed in the proper order.
		    index_starts[maxi] = -1;
		  };
		if (maxi == -1)
		  done = 1;
	      };
	  };
	};
    };


  if (0)
    {
    nonfatal_route_outwards:
      if (user_trace)
	fprintf (stderr, "The MATCH FAULTed and we're taking the TRAP out");
    };

  return (0);
};  


syntax highlighted by Code2HTML, v. 0.9.1