// crm_expr_window.c - Controllable Regex Mutilator, version v1.0
// Copyright 2001-2006 William S. Yerazunis, all rights reserved.
//
// This software is licensed to the public under the Free Software
// Foundation's GNU GPL, version 2. You may obtain a copy of the
// GPL by visiting the Free Software Foundations web site at
// www.fsf.org, and a copy is included in this distribution.
//
// Other licenses may be negotiated; contact the
// author for details.
//
// include some standard files
#include "crm114_sysincludes.h"
// include any local crm114 configuration file
#include "crm114_config.h"
// include the crm114 data structures file
#include "crm114_structs.h"
// and include the routine declarations file
#include "crm114.h"
// the command line argc, argv
extern int prog_argc;
extern char **prog_argv;
// the auxilliary input buffer (for WINDOW input)
extern char *newinputbuf;
// the globals used when we need a big buffer - allocated once, used
// wherever needed. These are sized to the same size as the data window.
extern char *inbuf;
extern char *outbuf;
extern char *tempbuf;
// a helper function that should be in the C runtime lib but isn't.
//
char *my_strnchr (const char *str, long len, int c)
{
long i;
i = 0;
for (i = 0; i < len; i++)
{
if (str[i] == (char) c)
return ((char *) &(str[i]));
};
return (NULL);
}
int crm_expr_window (CSL_CELL *csl, ARGPARSE_BLOCK *apb)
{
// a window operation - two steps...first is to discard
// everything up till the first regex match, and second
// is to add more data until the second regex is
// satisfied by the incoming data. We just add the
// incoming data onto the back of the window buffer, and
// when we get a read completes.
//
// Yes, there are more efficient, less memory-intensive ways
// to do this, but this is simple and unlikely to be broken in
// subtle ways.
static long newbuflen = 0;
char pch [MAX_PATTERN];
long i;
long srcidx;
int inputsrc;
char inputsrcname[MAX_VARNAME];
long inputsrclen;
char *savedinputtxt;
long savedinputtxtlen;
char wvname [MAX_VARNAME];
long wvnamelen;
CSL_CELL* mdw;
long flen;
int regexflags;
regex_t preg;
// int inputmode;
int inputsize; // can be by char or by EOF < bychar byeof >
int inputretryEOF; // do we retry an EOF? < eoffails eofretry >
int inputEOFaccept; // accept an EOF as pat-end < acceptEOF >
int saweof;
int failout;
long vmidx;
regmatch_t matches [2]; // we're only interested in the first match.
int done;
int firsttime;
inputsrcname[0] = '\000';
inputsrclen = 0;
// wvname[0] = '\000';
//wvnamelen = 0;
srcidx = 0;
savedinputtxt = NULL;
savedinputtxtlen = 0;
failout = 0;
if (user_trace)
fprintf (stderr, "Executing a 'window' operation\n");
// there's the choice of input from a
// variable, or input from stdin. This is controlled strictly
// by whether there's a [] in the statement (someday it may
// allow other files than stdin, but not yet.) So right now, it's-
// 1) read from the variable [:foo:] if supplied, else
// 2) read from STDIN (default)
// these are inputsrc=FROM_VAR vs FROM_STDIN
#define FROM_STDIN 0
#define FROM_VAR 1
#define FROM_VAR_DONE 2
//
// Second, there's how much to read "preemptively", that is,
// to read ahead, but with the possibility of reading ahead too
// much (and thereby messing up a script or other typeahead that
// another program sharing stdin was meant to actually read.
// The three choices we support are:
// 1) read everything available (BYEOF), else
// 3) read one character at a time (BYCHAR) (default)
// these are inputsize = bychar, byeof
#define BYCHAR 0
#define BYEOF 1
#define BYCHUNK 2
#define BYLINE 999 // DANGER - BYLINE IS NOT SUPPORTED ANY MORE!!!
//
// Third, there's the question of what to do if the read doesn't
// have enough material to satisfy the second regex (i.e. we hit
// end of variable or EOF first).
//
// Our options are
//
// 1) just fail. (the default)
// 2) just accept what we got, even though it doesn't fulfill
// the paste regex (accepteof).
// these are expressed as inputEOFaccept= ...
#define EOFFAILS 0
#define EOFACCEPTS 1
//
// As to other behavior, we can also clear the eof, wait a
// bit, and try again, so we have:
//
// 1) leave EOF's alone.
// 2) try to reset the EOF before reading
// these are denoted by inputretryEOF = ...
#define EOFSTAYS 0
#define EOFRETRY 1
// check for the flags
//
// default is BYCHAR
inputsrc = 0;
inputEOFaccept = EOFFAILS;
inputsize = BYCHAR;
inputretryEOF = EOFSTAYS;
if (apb->sflags & CRM_BYCHAR)
{
if (user_trace)
fprintf (stderr, " window input by character\n");
inputsize = BYCHAR;
};
if (apb->sflags & CRM_BYCHUNK)
{
if (user_trace)
fprintf (stderr, " window input by chunks\n");
inputsize = BYCHUNK;
};
if (apb->sflags & CRM_BYEOF)
{
if (user_trace)
fprintf (stderr, " window input by EOFs\n");
inputsize = BYEOF;
};
inputEOFaccept = EOFFAILS;
if (apb->sflags & CRM_EOFACCEPTS)
{
if (user_trace)
fprintf (stderr, " window input EOF is always accepted\n");
inputEOFaccept = EOFACCEPTS;
};
inputretryEOF = EOFSTAYS;
if (apb->sflags & CRM_EOFRETRY)
{
if (user_trace)
fprintf (stderr, " window input EOF is retried\n");
inputretryEOF = EOFRETRY;
};
regexflags = REG_EXTENDED;
if (apb->sflags & CRM_NOCASE)
{
if (user_trace)
fprintf (stderr, " no case matching turned on\n ");
regexflags = regexflags | REG_ICASE;
};
if (apb->sflags & CRM_NOCASE)
{
if (user_trace)
fprintf (stderr, " no case matching turned on\n ");
regexflags = regexflags | REG_ICASE;
};
if (apb->sflags & CRM_LITERAL)
{
if (user_trace)
fprintf (stderr, " no case matching turned on\n ");
regexflags = regexflags | REG_LITERAL;
};
// part 1: dispose of old window worth of data. If no match,
// dispose of all of the old window.
//
// get the disposal pattern
//
crm_get_pgm_arg (pch, MAX_PATTERN, apb->s1start, apb->s1len);
// null window check - if no cut or paste patterns, then we
// just skip to the end of the WINDOW statement code
// which is how a WINDOW statement can be used to have a
// program "come out running" before reading stdin.
if (apb->s1len == 0 && apb->s2len == 0)
goto crm_window_no_changes_made;
// We have the first pattern in pch. We ought to look for the
// appropriate flags here (common code, anyone?) but for now,
// we'll just do a brutally straightforward expansion and then
// matching.
if (internal_trace)
fprintf (stderr, " window cut pattern ---%s---\n", pch);
flen = apb->s1len;
// expand the match pattern
flen = crm_nexpandvar (pch, apb->s1len, MAX_PATTERN);
//
// compile the regex
i = crm_regcomp (&preg, pch, flen, regexflags);
if ( i > 0)
{
crm_regerror ( i, &preg, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto invoke_bailout;
};
// Get the variable we're windowing. If there's no such
// variable, we default to :_dw:
crm_get_pgm_arg (wvname, MAX_PATTERN, apb->p1start, apb->p1len);
wvnamelen = crm_nexpandvar (wvname, apb->p1len, MAX_PATTERN);
// if no svname, then we're defaulted to :_dw:
if (strlen (wvname) == 0)
{
strcat (wvname, ":_dw:");
wvnamelen = strlen (":_dw:");
};
vmidx = crm_vht_lookup (vht, wvname,
strlen (wvname));
if (vht[vmidx] == NULL)
{
nonfatalerror ("We seem to be windowing a nonexistent variable.",
"How very bizarre.");
goto invoke_bailout;
}
mdw = NULL;
if (vht[vmidx]->valtxt == cdw->filetext) mdw = cdw;
if (vht[vmidx]->valtxt == tdw->filetext) mdw = tdw;
if (mdw == NULL)
{
nonfatalerror ("We seem to have lost the windowed var buffer",
"This is just plain sick.");
goto invoke_bailout;
}
//
//
// OK, we've got the arguments for part 1 - the cutting out
// of the old data. So, let's do the cut.
//
// execute the regex.
i = crm_regexec ( &preg,
&(vht[vmidx]->valtxt[vht[vmidx]->vstart]),
vht[vmidx]->vlen,
1, matches, 0, NULL);
crm_regfree (&preg);
// starting offset of the "keep section" is at matches[0].rm.eo
// so we use crm_slice_and_splice_window to get rid of it.
//
if (i == 0)
{
// delete everything up to and including the delimiter
crm_slice_and_splice_window (mdw,
vht[vmidx]->vstart,
-matches[0].rm_eo);
}
else
{
// didn't find the terminator pattern at all, which means we
// flush the input window completely.
crm_slice_and_splice_window (mdw,
vht[vmidx]->vstart,
-vht[vmidx]->vlen);
};
if (user_trace)
fprintf (stderr, " cut completed, variable length after cut is %ld\n",
vht[vmidx]->vlen);
//**************************************************************
// OK, part one is done- we've windowed off the first
// part of the input.
//
// Now we put the new
//
// Now we get the "put" half of the regex.
if (user_trace)
fprintf (stderr, " now finding new section to add to end.\n");
crm_get_pgm_arg (pch, MAX_PATTERN, apb->s2start, apb->s2len);
flen = apb->s2len;
if (user_trace)
fprintf (stderr, "adding input with terminator of --%s--,", pch);
// expand the match pattern
flen = crm_nexpandvar (pch, flen, MAX_PATTERN);
if (user_trace)
fprintf (stderr, " which expands to --%s--", pch);
//
// compile the paste match regex
i = crm_regcomp (&preg, pch, flen, regexflags);
if ( i > 0)
{
crm_regerror ( i, &preg, tempbuf, data_window_size);
nonfatalerror ("Regular Expression Compilation Problem:", tempbuf);
goto invoke_bailout;
};
// decide - do we suck input from stdin, or from
// a variable that's already here?
//
// Get the input source, if one is supplied (2nd set of parens is
// the var to use as input source, if it exists)
crm_get_pgm_arg (inputsrcname, MAX_PATTERN, apb->p2start, apb->p2len);
inputsrclen = apb->p2len;
if (apb->p2start)
{
// NonZero input source variable, so we're gonna take our input
// from this input variable.
inputsrc = FROM_VAR;
if (user_trace)
fprintf (stderr, " getting input from var %s\n", inputsrcname);
};
//
// Now, depending on inputmode, we set up the final pasting
// to do the right thing (the final pasting params are in
// matches[0] ).
//
// we'll set up dummy limits for now though...
//
matches[0].rm_so = 0;
matches[0].rm_eo = 0;
// Now, the WHILE loop to find satisfaction for the second
// regex, within the boundaries of from_var vs from_stdin, and
// byline vs bychar vs byeof. So it's really a read/test/maybe_loop
// loop.
done = 0;
saweof = 0;
firsttime = 1;
while (! done)
{
//
// Switch on whether we're reading from a var or from
// standard input. (either way, we use the newinputbuf)
//
switch (inputsrc)
{
case FROM_VAR:
{
// we're supposed to grab our input from an input variable.
// so we fake it as though it came from a file.
//
// Later on, we have to undo the faking, and also modify
// the length of the input variable (cutting out the stuff
// that went into the WINDOW).
// diagnostic - what was in the newinputbuf before this stmt?
if (user_trace)
{
fprintf (stderr, " Using input source from variable %s\n",
inputsrcname);
fprintf (stderr, " prior newinput buf --%s--\n",
newinputbuf);
}
// Get the source input stuff
//
srcidx = crm_vht_lookup (vht, inputsrcname, inputsrclen);
if (vht[srcidx] == NULL)
{
nonfatalerror ("Trying to take WINDOW input from"
"nonexistent variables doesn't work,"
"in this case, from :", inputsrcname);
goto invoke_bailout;
};
//
//
// malloc up some temporary space to keep the static input
// buffer's stored text
savedinputtxt = (char *)
malloc (sizeof (char) * (32 + newbuflen ));
if (savedinputtxt == NULL)
{
fatalerror ("Malloc in WINDOW failed. Aw, crud.",
"Can't WINDOW this way");
goto invoke_bailout;
};
//
// save our newinputbuf txt
strncpy (savedinputtxt,
newinputbuf,
newbuflen);
savedinputtxtlen = newbuflen;
//
// and push the contents of the variable into newinputbuf
// (we know it's no bigger than data_window_len)
strncpy (newinputbuf,
&vht[srcidx]->valtxt[vht[srcidx]->vstart],
vht[srcidx]->vlen );
newinputbuf[vht[srcidx]->vlen] = '\000';
newbuflen = vht[srcidx]->vlen;
//
// and there we have it - newintputbuf has all we will
// get from this variable.
//
// Mark the fact that we're done with this variable by
// setting inputsrc to FROM_VAR_DONE;
inputsrc = FROM_VAR_DONE;
saweof = 1;
};
break;
case FROM_VAR_DONE:
{
if (user_trace)
fprintf (stderr, " got to FROM_VAR_DONE - this should"
" NEVER happen. You've found a bug.");
saweof = 1;
}
break;
case FROM_STDIN:
{
int icount;
icount = 0;
//
// the reason we _don't_ do this on te first interation
// is that we may already have data in the temp
// buffer, and we should use that data up first.
if (!firsttime )
{
// If we're reading from stdin, then we have three options:
// read a character, read up to (and including) the newline,
// or read till EOF. After each one, we set
if (feof(stdin))
saweof = 1;
if (inputretryEOF == EOFRETRY
&& (feof (stdin) || ferror (stdin) ) )
{
if (user_trace)
fprintf (stderr, " resetting the stdin stream\n");
clearerr (stdin);
};
if (user_trace)
fprintf (stderr, " getting window input from STDIN\n");
switch (inputsize)
{
case BYLINE:
{
fatalerror (" Sorry, but BYLINE input is not supported;",
" we recommend using '\\n' in your match "
"pattern");
}
break;
case BYEOF:
{
// if BYEOF, we read as big a hunk as will fit.
// If that's less than the full buffer, we declare
// that we got an EOF as well.
if (user_trace)
fprintf (stderr, " bigchunk BYEOF read starting \n");
//
// fread doesn't stop on pipe empty, while
icount = fread (&(newinputbuf[newbuflen]), 1,
data_window_size - (newbuflen + 256),
stdin);
if (feof (stdin)) saweof = 1;
}
break;
case BYCHUNK:
{
// if BYCHUNK, we read all we can, and then we're
// off and running.
// Since we read everything available, we always
// declare we saw EOF. Use EOFRETRY to run again.
if (user_trace)
fprintf (stderr, " bigchunk BYEOF read starting \n");
//
// fread (stdin) doesn't return on pipe
// empty, while read on STDIN_FILENO does.
// So, for reading by chunks, we use read (STDIN
icount = read ( fileno (stdin),
&(newinputbuf[newbuflen]),
data_window_size / 4 );
saweof = 1;
}
break;
case BYCHAR:
default:
{
// if BYCHAR, read one character and we're done
// icount = read (0, &(newinputbuf[newbuflen]), 1);
//
if (user_trace)
fprintf (stderr, " single character BYCHAR read \n");
icount = fread (&(newinputbuf[newbuflen]), 1, 1, stdin);
};
break;
};
}
//
// end of major part of BYCHAR / BYEOF specialized code.
//
if (icount > 0)
{
newbuflen = newbuflen + icount;
newinputbuf[newbuflen] = '\000'; // put on the terminator
};
// icount < 0 means an error occurred
if (icount < 0)
{
nonfatalerror (" Something went wrong in WINDOW "
"while trying to read",
"I will keep trying. ");
};
if (feof (stdin))
saweof = 1;
};
}; // END OF SWITCH ON INPUTSRC
// mark that this is not the first time through the loop
//
firsttime = 0;
// now have an newinputbuf with something worth examining
// in it, of length newbuflen (i.e. using chars [0...newbuflen-1])
//
// So, we run the paste regex on it, and depending on the outcome,
// set "done" or not.
i = crm_regexec ( &preg,
newinputbuf,
newbuflen,
1, matches, 0, NULL);
//
// Now we deal with the result of the regex matching (or not
// matching. i== 0 for success, i > 0 for failure.
//
if (i == 0)
{
// we found the regex; do the cut/paste
//
done = 1;
if (user_trace)
fprintf (stderr, " Found the paste pattern\n");
// (and the cut/paste is already set up correctly in
// matches[0]; we don't have to do anything.
}
else
{
// Nope, the regex was not found. But if we had inputEOFaccept=
// EOFACCEPTS, then we accept it anyway.
if (saweof)
{
done = 1;
failout = 1;
if (user_trace)
fprintf (stderr, " saw EOF, EOFAccept= %d\n", inputEOFaccept);
switch (inputEOFaccept)
{
case EOFACCEPTS:
{
// In EOFENDS and EOFAIL, we take the available
// input, shove it in, and go onward. We do this
// by "faking" the matches[0] variable.
matches[0].rm_so = 0;
matches[0].rm_eo = newbuflen;
if (matches[0].rm_eo < 0) matches[0].rm_eo = 0;
failout = 0;
}
break;
case EOFFAILS:
default:
{
// Nope - got an EOF, and we aren't supposed to
// accept it. So we MIGHT be done. Or maybe not...
// if we have EOFRETRY set then we clear it and
// try again.
if (inputretryEOF == EOFRETRY)
{
clearerr (stdin);
done = 0;
failout = 0;
}
// But, if we are reading from a var, there will never
// be any more, so we are -always- done.
if (inputsrc == FROM_VAR ||
inputsrc == FROM_VAR_DONE)
{
done = 1 ;
};
};
break;
};
};
};
}; // end of the (!done) loop...
//
// It's just use the computed values from here on.
crm_regfree (&preg);
if (internal_trace)
fprintf (stderr, " now newinput buf --%s--\n", newinputbuf);
// Once we get to here, we have the new input in newinputbuf, and
// matches[0].rm_eo is the length. So, we copy the new data onto
// the end of the cdw window, and slide the new input up.
//
// start by making some space at the end of the input buffer
crm_slice_and_splice_window (mdw,
vht[vmidx]->vstart+vht[vmidx]->vlen,
matches[0].rm_eo);
// copy the pertinent part of newinputbuf into the space
// we just made.
memmove (&(vht[vmidx]->valtxt[vht[vmidx]->vstart
+ vht[vmidx]->vlen
- matches[0].rm_eo]),
newinputbuf,
matches[0].rm_eo);
// and get rid of the same characters out of newinputbuf
if (newbuflen > 0 )
memmove (newinputbuf,
&(newinputbuf[matches[0].rm_eo]),
newbuflen - matches[0].rm_eo + 1);
newbuflen = newbuflen - matches[0].rm_eo;
newinputbuf[newbuflen] = '\000';
// Now, if we had EOFFAILS, and we hit the fail condition,
// we have to set up the CSL so that it will continue execution
// in the "right" place.
if (failout == 1)
{
if (user_trace)
fprintf (stderr, " CUT match failed so we're going to fail.\n");
csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
csl->aliusstk [ csl->mct[csl->cstmt]->nest_level ] = -1;
};
// and, if we got a nonfatal error, we skip all the stuff above;
// this is cleanup that we have to do eiher way. Failure here
// is Really Bad.
invoke_bailout:
//
//
// Last little bit of cleanup is that IF we fetched from a
// variable (not a file) we have to undo our fakery of stuffing
// the var's contents into newinputbuf.
//
// This cleanup is two parts - stuffing the remains of inputsrcname
// back into inputsrcname, and then restoring the old stdin buffer
// contents from savedinputtxt and freeing the temporary
// space,
if (inputsrc == FROM_VAR || inputsrc == FROM_VAR_DONE)
{
// stuff the remaining characters back into the src var
if (user_trace)
fprintf (stderr, " restoring remains of input src variable.\n");
crm_destructive_alter_nvariable (inputsrcname, inputsrclen,
newinputbuf,
newbuflen);
// and restore the old stdin buffer
strncpy (newinputbuf,
savedinputtxt,
savedinputtxtlen + 1);
newbuflen = savedinputtxtlen;
//
};
// and free the temporary space
if (savedinputtxt) free (savedinputtxt);
crm_window_no_changes_made:
return (0);
}
syntax highlighted by Code2HTML, v. 0.9.1