/*
* re2glob - C implementation
* (c) 2007 ActiveState Software Inc.
*/
#include <tcl.h>
#define DEBUG 0
static void
ExpChopNested _ANSI_ARGS_ ((Tcl_UniChar** xstr,
int* xstrlen,
Tcl_UniChar open,
Tcl_UniChar close));
static Tcl_UniChar*
ExpLiteral _ANSI_ARGS_ ((Tcl_UniChar* nexto,
Tcl_UniChar* str,
int strlen));
static Tcl_UniChar*
ExpCollapseStar _ANSI_ARGS_ ((Tcl_UniChar* src,
Tcl_UniChar* last));
static Tcl_UniChar*
ExpCollapseQForward _ANSI_ARGS_ ((Tcl_UniChar* src,
Tcl_UniChar* last));
static Tcl_UniChar*
ExpCollapseQBack _ANSI_ARGS_ ((Tcl_UniChar* src,
Tcl_UniChar* last));
static Tcl_UniChar
ExpBackslash _ANSI_ARGS_ ((char prefix,
Tcl_UniChar* str,
int strlen));
static char*
xxx (Tcl_UniChar* x, int xl)
{
static Tcl_DString ds;
Tcl_DStringInit (&ds);
return Tcl_UniCharToUtfDString (x,xl,&ds);
}
Tcl_Obj*
exp_retoglob (str,strlen)
Tcl_UniChar* str;
int strlen;
{
/*
* Output: x2 size of input (literal where every character has to be
* quoted.
* Location: For next translated unit, in output.
* Size of last generated unit, in characters.
* Stack of output locations at opening parens. x1 size of input.
* Location for next location on stack.
*/
static Tcl_UniChar litprefix [] = {'*','*','*','='};
static Tcl_UniChar areprefix [] = {'*','*','*',':'};
static Tcl_UniChar areopts [] = {'(','?'};
static Tcl_UniChar nocapture [] = {'?',':'};
static Tcl_UniChar lookhas [] = {'?','='};
static Tcl_UniChar looknot [] = {'?','!'};
static Tcl_UniChar xcomment [] = {'?','#'};
static Tcl_UniChar classa [] = {'[','.'};
static Tcl_UniChar classb [] = {'[','='};
static Tcl_UniChar classc [] = {'[',':'};
int lastsz, expanded;
Tcl_UniChar* out;
Tcl_UniChar* nexto;
Tcl_UniChar** paren;
Tcl_UniChar** nextp;
Tcl_Obj* glob = NULL;
Tcl_UniChar* mark;
Tcl_UniChar ch;
/*
* Set things up.
*/
out = nexto = (Tcl_UniChar*) Tcl_Alloc (strlen*2*sizeof (Tcl_UniChar));
paren = nextp = (Tcl_UniChar**) Tcl_Alloc (strlen* sizeof (Tcl_UniChar*));
lastsz = -1;
expanded = 0;
/*
* Start processing ...
*/
#define CHOP(n) {str += (n); strlen -= (n);}
#define CHOPC(c) {while (*str != (c) && strlen) CHOP(1) ;}
#define EMIT(c) {lastsz = 1; *nexto++ = (c);}
#define EMITX(c) {lastsz++; *nexto++ = (c);}
#define MATCH(lit) ((strlen >= (sizeof (lit)/sizeof (Tcl_UniChar))) && (0 == Tcl_UniCharNcmp (str,(lit),sizeof(lit)/sizeof (Tcl_UniChar))))
#define MATCHC(c) (strlen && (*str == (c)))
#define PUSHPAREN {*nextp++ = nexto;}
#define UNEMIT {nexto -= lastsz; lastsz = -1;}
/* Tcl_UniCharIsDigit ? */
#define MATCH_DIGIT (MATCHC ('0') || MATCHC ('1') || \
MATCHC ('2') || MATCHC ('3') || \
MATCHC ('4') || MATCHC ('5') || \
MATCHC ('6') || MATCHC ('7') || \
MATCHC ('8') || MATCHC ('9'))
#define MATCH_HEXDIGIT (MATCH_DIGIT || \
MATCHC ('a') || MATCHC ('A') || \
MATCHC ('b') || MATCHC ('B') || \
MATCHC ('c') || MATCHC ('C') || \
MATCHC ('d') || MATCHC ('D') || \
MATCHC ('e') || MATCHC ('E') || \
MATCHC ('f') || MATCHC ('F'))
#define EMITC(c) {if (((c) == '\\') || \
((c) == '*') || \
((c) == '?') || \
((c) == '$') || \
((c) == '^') || \
((c) == '[')) { \
EMIT ('\\'); EMITX ((c)); \
} else { \
EMIT ((c));}}
#if DEBUG
#define LOG if (1) fprintf
#define FF fflush (stderr)
#define MARK(s) LOG (stderr,#s "\n"); FF;
#else
#define LOG if (0) fprintf
#define FF
#define MARK(s)
#endif
/* ***= -> literal string follows */
LOG (stderr,"RE-2-GLOB '%s'\n", xxx(str,strlen)); FF;
if (MATCH (litprefix)) {
CHOP (4);
nexto = ExpLiteral (nexto, str, strlen);
goto done;
}
/* ***: -> RE is ARE. Always for Expect. Therefore ignore */
if (MATCH (areprefix)) {
CHOP (4);
LOG (stderr,"ARE '%s'\n", xxx(str,strlen)); FF;
}
/* (?xyz) ARE options, in {bceimnpqstwx}. Not validating that the
* options are legal. We assume that the RE is valid.
*/
if (MATCH (areopts)) { /* "(?" */
CHOP (2);
mark = str; CHOPC (')');
while (mark < str) {
if (*mark == 'q') {
CHOP (1);
nexto = ExpLiteral (nexto, str, strlen);
goto done;
} else if (*mark == 'x') {
expanded = 1;
LOG (stderr,"EXPANDED\n"); FF;
}
mark++;
}
CHOP (1);
}
while (strlen) {
LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF;
LOG (stderr,"'%s'\n", xxx(str,strlen)); FF;
if (expanded) {
/* Expanded syntax, whitespace and comments, ignore. */
while (MATCHC (' ') ||
MATCHC (0x9) ||
MATCHC (0xa)) CHOP (1);
if (MATCHC ('#')) {
CHOPC (0xa);
if (strlen) CHOP (1);
continue;
}
}
if (MATCHC ('|')) {
/* branching is too complex */
goto error;
} else if (MATCHC ('(')) {
/* open parens */
CHOP (1);
if (MATCH (nocapture)) { /* "?:" */
/* non capturing -save location */
PUSHPAREN;
CHOP (2);
} else if (MATCH (lookhas) || /* "?=" */
MATCH (looknot)) { /* "?!" */
/* lookahead - ignore */
CHOP (2);
ExpChopNested (&str, &strlen, '(', ')');
} else if (MATCH (xcomment)) { /* "?#" */
/* comment - ignore */
CHOPC (')'); CHOP (1);
} else {
/* plain capturing */
PUSHPAREN;
}
} else if (MATCHC (')')) {
/* Closing parens. */
CHOP (1);
/* Everything coming after the saved result is new, and
* collapsed into a single entry for a possible coming operator
* to handle.
*/
nextp --; /* Back to last save */
mark = *nextp; /* Location where generation for this parens started */
lastsz = (nexto - mark); /* This many chars generated */
/* Now lastsz has the correct value for a possibly following
* UNEMIT
*/
} else if (MATCHC ('$') || MATCHC ('^')) {
/* anchor constraints - ignore */
CHOP (1);
} else if (MATCHC ('[')) {
/* Classes - reduce to any char [[=chars=]] [[.chars.]]
* [[:name:]] [chars] Count brackets to find end.
* These are a bit complicated ... [= =], [. .], [: {] sequences
* always have to be complete. '[' does NOT nest otherwise. And
* a ']' after the opening '[' (with only '^' allowed to
* intervene is a character, not the closing bracket. We have to
* process the class in pieces to handle all this. The Tcl level
* implementations (0-2 all have bugs one way or other, all
* different.
*/
int first = 1;
int allowed = 1;
CHOP (1);
while (strlen) {
if (first && MATCHC ('^')) {
/* ^ as first keeps allowed ok for one more cycle */
CHOP (1);
first = 0;
continue;
} else if (allowed && MATCHC (']')) {
/* Not a closing bracket! */
CHOP (1);
} else if (MATCHC (']')) {
/* Closing bracket found */
CHOP (1);
break;
} else if (MATCH (classa) ||
MATCH (classb) ||
MATCH (classc)) {
Tcl_UniChar delim[2];
delim[0] = str [1];
delim[1] = ']';
CHOP (2);
while (!MATCH (delim)) CHOP (1);
CHOP (2);
} else {
/* Any char in class */
CHOP (1);
}
/* Reset flags handling start of class */
allowed = first = 0;
}
EMIT ('?');
} else if (MATCHC ('\\')) {
/* Escapes */
CHOP (1);
if (MATCHC ('d') || MATCHC ('D') ||
MATCHC ('s') || MATCHC ('S') ||
MATCHC ('w') || MATCHC ('W')) {
/* Class shorthands - reduce to any char */
EMIT ('?');
CHOP (1);
} else if (MATCHC ('m') || MATCHC ('M') ||
MATCHC ('y') || MATCHC ('Y') ||
MATCHC ('A') || MATCHC ('Z')) {
/* constraint escapes - ignore */
CHOP (1);
} else if (MATCHC ('B')) {
/* Backslash */
EMIT ('\\');
EMITX ('\\');
CHOP (1);
} else if (MATCHC ('0')) {
/* Escape NULL */
EMIT ('\0');
CHOP (1);
} else if (MATCHC ('e')) {
/* Escape ESC */
EMIT ('\033');
CHOP (1);
} else if (MATCHC ('a')) {
/* Escape \a */
EMIT (0x7);
CHOP (1);
} else if (MATCHC ('b')) {
/* Escape \b */
EMIT (0x8);
CHOP (1);
} else if (MATCHC ('f')) {
/* Escape \f */
EMIT (0xc);
CHOP (1);
} else if (MATCHC ('n')) {
/* Escape \n */
EMIT (0xa);
CHOP (1);
} else if (MATCHC ('r')) {
/* Escape \r */
EMIT (0xd);
CHOP (1);
} else if (MATCHC ('t')) {
/* Escape \t */
EMIT (0x9);
CHOP (1);
} else if (MATCHC ('v')) {
/* Escape \v */
EMIT (0xb);
CHOP (1);
} else if (MATCHC ('c') && (strlen >= 2)) {
/* Escape \cX - reduce to (.) */
EMIT ('?');
CHOP (2);
} else if (MATCHC ('x')) {
CHOP (1);
if (MATCH_HEXDIGIT) {
/* Escape hex character */
mark = str;
while (MATCH_HEXDIGIT) CHOP (1);
if ((str - mark) > 2) { mark = str - 2; }
ch = ExpBackslash ('x',mark,str-mark);
EMITC (ch);
} else {
/* Without hex digits following this is a plain char */
EMIT ('x');
}
} else if (MATCHC ('u')) {
/* Escapes unicode short. */
CHOP (1);
mark = str;
CHOP (4);
ch = ExpBackslash ('u',mark,str-mark);
EMITC (ch);
} else if (MATCHC ('U')) {
/* Escapes unicode long. */
CHOP (1);
mark = str;
CHOP (8);
ch = ExpBackslash ('U',mark,str-mark);
EMITC (ch);
} else if (MATCH_DIGIT) {
/* Escapes, octal, and backreferences - reduce (.*) */
CHOP (1);
while (MATCH_DIGIT) CHOP (1);
EMIT ('*');
} else {
/* Plain escaped characters - copy over, requote */
EMITC (*str);
CHOP (1);
}
} else if (MATCHC ('{')) {
/* Non-greedy and greedy bounds - reduce to (*) */
CHOP (1);
if (MATCH_DIGIT) {
/* Locate closing brace and remove operator */
CHOPC ('}'); CHOP (1);
/* Remove optional greedy quantifier */
if (MATCHC ('?')) { CHOP (1);}
UNEMIT;
EMIT ('*');
} else {
/* Brace is plain character, copy over */
EMIT ('{');
/* CHOP already done */
}
} else if (MATCHC ('*') ||
MATCHC ('+') ||
MATCHC ('?')) {
/* (Non-)greedy operators - reduce to (*) */
CHOP (1);
/* Remove optional greedy quantifier */
if (MATCHC ('?')) { CHOP (1);}
UNEMIT;
EMIT ('*');
} else if (MATCHC ('.')) {
/* anychar - copy over */
EMIT ('?');
CHOP (1);
} else {
/* Plain char, copy over. */
EMIT (*str);
CHOP (1);
}
}
LOG (stderr,"'%s' <-- ",xxx(out,nexto-out)); FF;
LOG (stderr,"'%s'\n", xxx(str,strlen)); FF;
/*
* Clean up the output a bit (collapse *-sequences and absorb ?'s
* into adjacent *'s.
*/
MARK (QF)
nexto = ExpCollapseQForward (out,nexto);
LOG (stderr,"QF '%s'\n",xxx(out,nexto-out)); FF;
MARK (QB)
nexto = ExpCollapseQBack (out,nexto);
LOG (stderr,"QB '%s'\n",xxx(out,nexto-out)); FF;
MARK (QS)
nexto = ExpCollapseStar (out,nexto);
LOG (stderr,"ST '%s'\n",xxx(out,nexto-out)); FF;
/*
* Check if the result is actually useful.
* Empty or just a *, or ? are not. A series
* of ?'s is borderline, as they semi-count
* the buffer.
*/
if ((nexto == out) ||
(((nexto-out) == 1) &&
((*out == '*') ||
(*out == '?')))) {
goto error;
}
/*
* Result generation and cleanup.
*/
done:
LOG (stderr,"RESULT_ '%s'\n", xxx(out,nexto-out)); FF;
glob = Tcl_NewUnicodeObj (out,(nexto-out));
goto cleanup;
error:
LOG (stderr,"RESULT_ ERROR\n"); FF;
cleanup:
Tcl_Free ((char*)out);
Tcl_Free ((char*)paren);
return glob;
}
static void
#ifdef _AIX
ExpChopNested (Tcl_UniChar** xstr,
int* xstrlen,
Tcl_UniChar open,
Tcl_UniChar close)
#else
ExpChopNested (xstr,xstrlen, open, close)
Tcl_UniChar** xstr;
int* xstrlen;
Tcl_UniChar open;
Tcl_UniChar close;
#endif
{
Tcl_UniChar* str = *xstr;
int strlen = *xstrlen;
int level = 0;
while (strlen) {
if (MATCHC (open)) {
level ++;
} else if (MATCHC (close)) {
level --;
if (level < 0) {
CHOP (1);
break;
}
}
CHOP (1);
}
*xstr = str;
*xstrlen = strlen;
}
static Tcl_UniChar*
ExpLiteral (nexto, str, strlen)
Tcl_UniChar* nexto;
Tcl_UniChar* str;
int strlen;
{
int lastsz;
LOG (stderr,"LITERAL '%s'\n", xxx(str,strlen)); FF;
while (strlen) {
EMITC (*str);
CHOP (1);
}
return nexto;
}
static Tcl_UniChar
#ifdef _AIX
ExpBackslash (char prefix,
Tcl_UniChar* str,
int strlen)
#else
ExpBackslash (prefix, str, strlen)
char prefix;
Tcl_UniChar* str;
int strlen;
#endif
{
/* strlen <= 8 */
char buf[20];
char dst[TCL_UTF_MAX+1];
Tcl_UniChar ch;
int at = 0;
/* Construct an utf backslash sequence we can throw to Tcl */
buf [at++] = '\\';
buf [at++] = prefix;
while (strlen) {
buf [at++] = *str++;
strlen --;
}
Tcl_UtfBackslash (buf, NULL, dst);
TclUtfToUniChar (dst, &ch);
return ch;
}
static Tcl_UniChar*
ExpCollapseStar (src, last)
Tcl_UniChar* src;
Tcl_UniChar* last;
{
Tcl_UniChar* dst, *base;
int skip = 0;
int star = 0;
/* Collapses series of *'s into a single *. State machine. The
* complexity is due to the need of handling escaped characters.
*/
LOG (stderr,"Q-STAR\n"); FF;
for (dst = base = src; src < last;) {
LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
if (skip) {
skip = 0;
star = 0;
} else if (*src == '\\') {
skip = 1; /* Copy next char, whatever its value */
star = 0;
} else if (*src == '*') {
if (star) {
/* Previous char was *, do not copy the current * to collapse
* the sequence
*/
src++;
continue;
}
star = 1; /* *-series starts here */
} else {
star = 0;
}
*dst++ = *src++;
}
LOG (stderr,"@%1d /%1d '%s' <-- ", star,skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
return dst;
}
static Tcl_UniChar*
ExpCollapseQForward (src, last)
Tcl_UniChar* src;
Tcl_UniChar* last;
{
Tcl_UniChar* dst, *base;
int skip = 0;
int quest = 0;
/* Collapses series of ?'s coming after a *. State machine. The
* complexity is due to the need of handling escaped characters.
*/
LOG (stderr,"Q-Forward\n"); FF;
for (dst = base = src; src < last;) {
LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
if (skip) {
skip = 0;
quest = 0;
} else if (*src == '\\') {
skip = 1;
quest = 0;
/* Copy next char, whatever its value */
} else if (*src == '?') {
if (quest) {
/* Previous char was *, do not copy the current ? to collapse
* the sequence
*/
src++;
continue;
}
} else if (*src == '*') {
quest = 1;
} else {
quest = 0;
}
*dst++ = *src++;
}
LOG (stderr,"?%1d /%1d '%s' <-- ", quest,skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
return dst;
}
static Tcl_UniChar*
ExpCollapseQBack (src, last)
Tcl_UniChar* src;
Tcl_UniChar* last;
{
Tcl_UniChar* dst, *base;
int skip = 0;
/* Collapses series of ?'s coming before a *. State machine. The
* complexity is due to the need of handling escaped characters.
*/
LOG (stderr,"Q-Backward\n"); FF;
for (dst = base = src; src < last;) {
LOG (stderr,"/%1d '%s' <-- ",skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
if (skip) {
skip = 0;
} else if (*src == '\\') {
skip = 1;
/* Copy next char, whatever its value */
} else if (*src == '*') {
/* Move backward in the output while the previous character is
* an unescaped question mark. If there is a previous character,
* or a character before that..
*/
while ((((dst-base) > 2) && (dst[-1] == '?') && (dst[-2] != '\\')) ||
(((dst-base) == 1) && (dst[-1] == '?'))) {
dst --;
}
}
*dst++ = *src++;
}
LOG (stderr,"/%1d '%s' <-- \n",skip,xxx(base,dst-base)); FF;
LOG (stderr,"'%s'\n", xxx(src,last-src)); FF;
return dst;
}
#undef CHOP
#undef CHOPC
#undef EMIT
#undef EMITX
#undef MATCH
#undef MATCHC
#undef MATCH_DIGIT
#undef MATCH_HEXDIGIT
#undef PUSHPAREN
#undef UNEMIT
syntax highlighted by Code2HTML, v. 0.9.1