//  crm_preprocessor.c  - Controllable Regex Mutilator,  version v1.0
//  Copyright 2001-2006  William S. Yerazunis, all rights reserved.
//  
//  This software is licensed to the public under the Free Software
//  Foundation's GNU GPL, version 2.  You may obtain a copy of the
//  GPL by visiting the Free Software Foundations web site at
//  www.fsf.org, and a copy is included in this distribution.  
//
//  Other licenses may be negotiated; contact the 
//  author for details.  
//
//  include some standard files
#include "crm114_sysincludes.h"

//  include any local crm114 configuration file
#include "crm114_config.h"

//  include the crm114 data structures file
#include "crm114_structs.h"

//  and include the routine declarations file
#include "crm114.h"

//    the command line argc, argv
extern int prog_argc;
extern char **prog_argv;

//    the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;

//    the globals used when we need a big buffer  - allocated once, used 
//    wherever needed.  These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
//      crm preprocessor - pre-process a CRM file to make it 
//      palatable to the sorry excuse we have for a compiler.
int crm_preprocessor (CSL_CELL *csl, int flags)
{
  int lflag;
  int status;
  long i, j;
  long done;
  regex_t preg;
  int numinserts = 0;
  int maxinserts = DEFAULT_MAX_INSERTS;
  regmatch_t matches[3];

  //  regex commentary:
  //    we want to match both 
  //      "\n[ ]*insert[ ]*[file][ ]*\n"
  //    and
  //      "^[ ]*insert[ ]*[file][ ]*\n"
  //
  //    This is the file insertion regex.  Note that it does NOT allow
  //    spaces in filenames, nor does it deal with embedded #comments
  //    but then again, the "fixes" to deal with spaces in filenames
  //    also don't deal wth embedded #comments, because #comments 
  //    themselves aren't dealt with till lower down in the code..
  //
  //    However, there's another problem with the above.  The trailing
  //    newline may not be there - consider:
  //
  //      #insert foo.crm ; output /hello, world!\n/
  //
  //    which fails because we aren't regex_conforming.  
  //    So, what we really need is to grab the next nonblank token, then 
  //    either get a newline or a semicolon.  

  char *insert_regex = 
    "\n[[:blank:]]*(insert)[[:blank:]]+([[:graph:]]+)[[:blank:]]*[\n;]";
  //    "\n[[:blank:]]*(insert)[[:blank:]]+([[:graph:]]+)[[:blank:]]*[\n;]";
  //
  //
  if (internal_trace )
    fprintf (stderr, " preprocessor - #insert processing...\n");
  
  lflag = 0;
  i = 0;
  done = 0;

  //
  //     Compile the insert regex
  //  
  i = crm_regcomp (&preg, 
		   insert_regex, strlen (insert_regex),
		   REG_EXTENDED | REG_ICASE | REG_NEWLINE);
  if ( i != 0)
    {
      crm_regerror ( i, &preg, tempbuf, data_window_size);
      untrappableerror (
	 "Regular Expression Compilation Problem during INSERT processing:", 
	 tempbuf);
    };

  //
  //    Do the initial breaking pass   
  //
  crm_break_statements (0, csl->nchars, csl);

  if (internal_trace)
    fprintf (stderr, 
	     "After first pass, breaking statements we have -->>%s<<--\nlength %ld\n",
	     csl->filetext, csl->nchars);
  
  
  while (!done)
    {
      long filenamelen;
      j  = crm_regexec ( &preg, csl->filetext, csl->nchars,
			 		3, matches, lflag, NULL);
      if ( j != 0) 
	{
	  if (internal_trace)
	    fprintf (stderr, "No insert files left to do.\n");
	  done = 1;
	}
      else 
	{
	  char insertfilename [MAX_FILE_NAME_LEN];
	  struct stat statbuf;

	  filenamelen = matches[2].rm_eo - matches[2].rm_so ;

	  for (j = 0; 
	       j < filenamelen
		 && j < MAX_FILE_NAME_LEN; 
	       j++)
	    insertfilename[j] = csl->filetext[matches[2].rm_so + j];
	  insertfilename[j] = '\000';

	  //   Check to see if this was a "delimited" insertfile name
	  //   that is, wrapped in [filename] rather than plaintext -
	  //   if it is, then do a variable expansion on it.
	  if (insertfilename[0] == '['
	      && insertfilename[filenamelen-1] == ']')
	    {
	      if (user_trace)
		fprintf (stderr, "INSERT filename expand: '%s'",
			 insertfilename);
	      //  Get rid of the enclosing [ and ]
	      filenamelen = filenamelen - 2;
	      for (j = 0; j < filenamelen; j++)
		insertfilename[j] = insertfilename[j + 1];
	      insertfilename[filenamelen] = '\000';
	      filenamelen = crm_nexpandvar (insertfilename, 
					      filenamelen,
					      MAX_FILE_NAME_LEN);
	      if (user_trace)
		fprintf (stderr, " to '%s' \n", insertfilename);
	    }
	  insertfilename[filenamelen] = '\000';

	  //   We have a filename; check to see if it will blow the
	  //   gaskets on the filesystem or not:
	  //
	  if (filenamelen > MAX_FILE_NAME_LEN-1)
	    untrappableerror ("INSERT Filename was too long!  Here's the"
			      "first part of it: ", insertfilename);

	  //   To keep the matcher from looping, we change the string
	  //   'insert' to 'insert=' .  Cool, eh?  
	  //
	  csl->filetext[matches[1].rm_eo] = '=';   // smash in an "="

	  //   stat the file - if 0, file exists
	  status = stat ( insertfilename, &statbuf );
	  if (! status )
	    {
	      //
	      //    OK, now we have to "insert" the file, but we have to
	      //    do it gracefully.  In particular, the file itself
	      //    must be loaded, then newline-fixupped, then 
	      //    we know it's actual size and can actually -insert-
	      //    it.  
	      // 
	      //    We malloc a big hunk of memory, read the file in.
	      //    We expand it there (with impunity), then 
	      //    we make a temporary copy in malloced memory,
	      //    and do the real insertion.
	      CSL_CELL *ecsl;
	      char *insert_buf;
	      if (user_trace)
		{
		  fprintf (stderr, "Inserting file '%s' .\n", insertfilename);
		};

	      ecsl = (CSL_CELL *) malloc (sizeof (CSL_CELL));
	      insert_buf = malloc (sizeof (char) * max_pgmsize);
	      if (!insert_buf || !ecsl)
		untrappableerror ("Couldn't malloc enough memory to do"
				  " the insert of file ", insertfilename);

	      //   Loop prevention check - too many inserts?
	      //
	      numinserts++;
	      if (numinserts > maxinserts)
		untrappableerror ("Too many inserts!  Limit exceeded with"
				  "filename : ", insertfilename);
	      
	      ecsl->filetext = insert_buf;
	      ecsl->nchars = 0;
	      //   OK, we now have a buffer.  Read the file in...
	      {
		int fd;
		fd = open (insertfilename, O_RDONLY);
		read (fd, 
		      ecsl->filetext,
		      statbuf.st_size);
		close (fd);
		//
		//   file's read in, put in a trailing newline
		ecsl->nchars = statbuf.st_size;
		ecsl->filetext[ecsl->nchars] = '\n';
		ecsl->nchars ++;
		ecsl->filename = insertfilename;
		//
		//   now do the statement-break thing on this file
		crm_break_statements (0, ecsl->nchars, ecsl);
		//  
		//   and we have the expanded text ready to insert.
		//
		//   will it fit?
		//
		if ( (csl->nchars + ecsl->nchars + 64) 
		     > (sizeof (char) * max_pgmsize))
		  untrappableerror ( " Program file buffer overflow when "
				     " INSERTing file ", insertfilename);

		//   Does the result end with a newline?  If not, fix it.
		if (ecsl->filetext[ecsl->nchars-1] != '\n')
		  {
		    ecsl->filetext [ecsl->nchars ] = '\n';
		    ecsl->nchars++;
		  };
		
		//   Does the result end with two newlines?  Fix 
		//   that, too.
		//if (ecsl->filetext[ecsl->nchars-1] == '\n'
		//    && ecsl->filetext[ecsl->nchars-2] == '\n')
		//  {
		//    ecsl->nchars--;
		//  };

		//  Make a hole in the csl->filetext
		//
		//   (note- Fidelis' points out that we need to pace
		//    off from the end of matches[0] so as to not smash
		//    trailing stuff on the line.
		//    
		memmove (&(csl->filetext[matches[0].rm_eo + ecsl->nchars]),
			 &(csl->filetext[matches[0].rm_eo]),
			 csl->nchars - matches[0].rm_eo + 1); // +1 for '\0'!
		//
		//   and put the new text into that hole
		//
		memmove (&(csl->filetext[matches[0].rm_eo]),
			 ecsl->filetext,
			 ecsl->nchars);

		//   Mark the new length of the csl text.
		if (internal_trace)
		  fprintf (stderr, "Old length: %ld, ", csl->nchars);
		csl->nchars += ecsl->nchars;
		if (internal_trace)
		  fprintf (stderr, "new length: %ld\n ", csl->nchars);

		//    Now we clean up (de-malloc all that memory)
		free (ecsl->filetext);
		free (ecsl);
	      }
	      
	    }
	  else
	    {
	      //
	      //   Paolo's beautiful weasel hack to make missing
	      //   insert files a trappable error.  The hitch is that
	      //   the error "occurs" at preprocessor time, before the
	      //   compiler has a chance to set up the trap addresses.
	      //   So, Paolo presented the following waycool weasel hack.
	      //   "If the file is missing, 'insert' a FAULT that has the
	      //   fault message of "Missing insert file".  So, if we never
	      //   actually execute the missing lines, there's no problem,
	      //   and if we _do_, we can trap the error or not, as the 
	      //   the programmer chooses.
	      //
	      //  untrappableerror 
	      //    (" I'm having a problem inserting file ",
	      //		insertfilename);
	      //   
	      char faulttext[MAX_VARNAME];
	      long textlen;
		
	      if (user_trace)
		{
		  fprintf (stderr, "Can't find '%s' to insert.\n"
			   "Inserting a FAULT instead\n",
                                  insertfilename);
		};

	      //
	      //    Build the fault string.
	      sprintf (faulttext,
		       "\n######  THE NEXT LINE WAS AUTO_INSERTED BECAUSE THE FILE COULDN'T BE FOUND \nfault /Couldn't insert the file named '%s' that you asked for.  This is probably a bad thing./\n",
		       insertfilename);
	      textlen = strlen (faulttext)  ; //  -1 gets rid of the \0
	      //
	      //       make a hole to put the fault string into.
	      // 
	      memmove (&(csl->filetext[matches[0].rm_eo + textlen]),
		       &(csl->filetext[matches[0].rm_eo]),
		       csl->nchars - matches[0].rm_eo ); 
	      //
	      //   and put the new text into that hole
	      //
	      memmove (&(csl->filetext[matches[0].rm_eo]),
		       faulttext,
		       textlen);
	      //   Mark the new length of the csl text.
	      if (internal_trace)
		fprintf (stderr, "Added %ld chars to crmprogram\n",
			 textlen );
	      csl->nchars += textlen;
	    };
	  i = matches[1].rm_so + 1;
	};
      if (internal_trace)
	fprintf (stderr,
		 "----------Result after preprocessing-----\n"
		 "%s"
		 "\n-------------end preprocessing------\n",
		 csl->filetext);
    };
  
  //     define a hash of the expanded program for sanity checking on bugreps:
  //
  {
    char myhash[32];
    sprintf (myhash, "%08lX", strnhash (csl->filetext, csl->nchars));
    myhash[8] = '\0';
    crm_set_temp_var (":_pgm_hash:", myhash);
  };

  ///   GROT GROT GROT  for some reason, Gnu Regex segfaults if it
  //    tries to free this register.
  //  crm_regfree (&preg);
  //fprintf (stderr, "returning\n");
  return (0);
};

//
//     Set up statement breaks.
//      
//     If we're not in a nesting (paren, angle, box, slash) then
//     we need to assure that there are newlines before and after
//     any { and }, and that there is a newline after every ; and 
//     before every #.  
//
//     If we ARE in a nesting, then all characters pass unchanged.
//
//     Note that this is an "in-place" mutilation, not a copying mutilation.
//

void crm_break_statements (long ini, long nchars, CSL_CELL *csl)
{
  int seennewline;
  int in_comment;
  int neednewline;
  int paren_nest, angle_nest, box_nest, slash_nest;
  long i;
  seennewline = 1;
  neednewline = 0;
  in_comment = 0;
  paren_nest = slash_nest = angle_nest = box_nest = 0;

  if ( internal_trace )
    fprintf (stderr, "  preprocessor - breaking statmeents... \n");
  for (i = ini; i < ini + nchars; i++)
    {
      //    now, no matter what, we're looking at a non-quoted character.
      //
      //   are we looking at a nonprinting character?
      if (csl->filetext[i] < 0x021 )
	{
	  if (csl->filetext[i] == '\n')
	    {
	      //    get rid of extraneous newlines.
	      //if (internal_trace)
	      //  fprintf (stderr, "  newline .");
	      seennewline = 1;
	      neednewline = 0;
	      in_comment = 0;
	      //    Userbug containment - a newline closes all nests
	      paren_nest = slash_nest = angle_nest = box_nest = 0;
	    };
	  //   other nonprinting characters do not change things.
	}
      else
	{
	  //   we don't do any processing inside a comment!
	  if ( in_comment )
	    {
	      //   inside a comment, we don't do squat to printing chars.
	      //    unless it's an escaped hash; in that case
	      //     it's end-of-comment
	      if (csl->filetext[i] == '#'
		  && (i - 1) >= 0
		  && csl->filetext[i-1] == '\\')
		{
		  neednewline = 1;
		  seennewline = 0;
		  in_comment = 0;
		};
	    }
	  else
	    {
	      //    we are looking at a printing character, so maybe we have
	      //    to add a newline.  Or maybe not...
	      if (neednewline) 
		{
		  if ((csl->nchars+1) > (sizeof(char) * max_pgmsize))
		    untrappableerror ( "Program file buffer overflow - "
				       "post-inserting newline to: ",
				       &(csl->filetext[i]));
		  //    we need a newline and are looking at a printingchar
		  //    so we need to insert a newline.  
		  memmove ( &(csl->filetext[i+1]), 
			    &(csl->filetext[i]), 
			    strlen (&csl->filetext[i])+1);
		  csl->filetext[i] = '\n';
		  i++;
		  csl->nchars++;
		  nchars++;
		  neednewline = 0;
		  seennewline = 1;
		};
	      //
	      switch (csl->filetext[i])
		{
		case '\\':
		  {
		    //     if it's a backslash at the end of a line, 
		    //     delete +both+ the backslash and newline, making
		    //     one big line out of it.  
		    //
		    //     We do this whether or not we're in a nesting. 
		    if (  csl->filetext[i+1] == '\n' )
		      {
			if (internal_trace)
			  fprintf (stderr, " backquoted EOL - splicing.\n");
			memmove ( &(csl->filetext[i]), 
				  &(csl->filetext[i+2]), 
				  strlen (&csl->filetext[i+2])+1);
			csl->nchars--;
			csl->nchars--;
			nchars--;
			nchars--;
			i--;
		      }
		    else
		      {
			//   Otherwise, we _always_ step over the next
			//   character- it can't change nesting, it can't
			//   close a string.  Thus, the preprocessor will
			//   do nothing to it.
			//
			//
			//    TRICKY BIT HERE !!!  Notice that we do
			//   this '\' step-over test _BEFORE_ we do
			//   any other character testing, so the '\'
			//   gets to do it's escape magic before
			//   anything else can operate - and it
			//   _preempts_ any other character's
			//   actions.
			//
			i++;
		      };
		  };
		  break;

		case '{':
		case '}':
		  {
		    //    put an unquoted '{' or '}' onto it's own line.
		    //   do we need to put in a prefix new line?
		    //		  if (internal_trace)
		    //
		    //   Are we inside a nesting?  
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      { 
			if ( !seennewline )
			  {
			    if ((csl->nchars+1) > sizeof(char)*max_pgmsize)
			      untrappableerror ( "Program buffer overflow when"
						 "post-inserting newline on:",
						 &csl->filetext[i]);
			    if (internal_trace)
			      fprintf (stderr, " preinserting a newline.\n");
			    memmove ( &(csl->filetext[i+1]), 
				      &(csl->filetext[i]), 
				      strlen (&csl->filetext[i])+1);
			    csl->filetext[i] = '\n';
			    csl->nchars++;
			    nchars++;
			    i++;
			  };
			seennewline = 0;
			//   and mark that we need a newline before any more
			//   printable characters come through.
			neednewline = 1;
		      }
		  };
		  break;
		case ';':     
		  {
		    //       we can replace non-escaped semicolons with 
		    //       newlines.
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      { 
			if ( seennewline ) //  we just saw a newline
			  {  
			    //  was preceded by a newline so just get rid 
			    //  of the ;
			    if (internal_trace)
			      fprintf (stderr, 
				       "superfluous semicolon, *poof*.\n");
			    memmove ( &(csl->filetext[i]), 
				      &(csl->filetext[i+1]), 
				      strlen (&csl->filetext[i])+1);
			    csl->nchars--;
			    nchars--;
			    i--;
			    neednewline = 0;
			    seennewline = 1;
			  }
			else
			  {
			    //   this was not preceded by a newline,
			    //  so we just replace the semicolon with a 
			    //  newline before any printed characters
			    if (internal_trace)
			      fprintf (stderr, " statement break semi.\n"
				       "--> \\n \n");
			    csl->filetext[i] = '\n';
			    neednewline = 0;
			    seennewline = 1;
			  };
		      };
		  };
		  break;
		case '#':
		  {  
		    //      now, we're in a comment - everything should be
		    //      done only with the comment thing enabled.
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			in_comment = 1;
		      };
		  };
		  break;
		case '(':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			paren_nest = 1;
		      };
		  };
		  break;
		case ')':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 1 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			paren_nest = 0;
		      };
		  };
		  break;
		case '<':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			angle_nest = 1;
		      };
		  };
		  break;
		case '>':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 1 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			angle_nest = 0;
		      };
		  };
		  break;
		case '[':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 &&
			slash_nest == 0)
		      {
			box_nest = 1;
		      };
		  };
		  break;
		case ']':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 1 &&
			slash_nest == 0)
		      {
			box_nest = 0;
		      };
		  };
		  break;
		case '/':
		  {  
		    //      Update nesting if necessary
		    if (paren_nest == 0 &&
			angle_nest == 0 &&
			box_nest == 0 )
		      {
			if (slash_nest == 0)
			  {
			    slash_nest = 1;
			  }
			else
			  {
			    slash_nest = 0;
			  };
		      };
		  };
		  break;
		default: 
		  {
		    //      none of the above - it's a normal printing 
		    //  character - we can just do the 
		    //  clearing of all the "seen/need" flags 
		    seennewline = 0;
		    neednewline = 0;
		  };
		  break;
		};
	    };
	};
    };
};

  



syntax highlighted by Code2HTML, v. 0.9.1