/* mxte_impl -- A table driven Tagging Engine for Python (Version 0.9) This is the Tagging Engine implementation. It can be compiled for 8-bit strings and Unicode by setting the TE_* defines appropriately. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com */ #ifndef TE_STRING_CHECK # define TE_STRING_CHECK(obj) PyString_Check(obj) #endif #ifndef TE_STRING_AS_STRING # define TE_STRING_AS_STRING(obj) PyString_AS_STRING(obj) #endif #ifndef TE_STRING_GET_SIZE # define TE_STRING_GET_SIZE(obj) PyString_GET_SIZE(obj) #endif #ifndef TE_STRING_FROM_STRING # define TE_STRING_FROM_STRING(str, size) PyString_FromStringAndSize(str, size) #endif #ifndef TE_CHAR # define TE_CHAR char #endif #ifndef TE_HANDLE_MATCH # define TE_HANDLE_MATCH string_match_append #endif #ifndef TE_ENGINE_API # define TE_ENGINE_API mxTextTools_TaggingEngine #endif #include "mcfpyapi.h" /* --- Tagging Engine ----------------------------------------------------- */ /* Non-recursive restructuring by Mike Fletcher to support SimpleParse This restructuring eliminates the use of the C call stack for processing sub-table and table directives, allowing these to be used for repetition calls if desired. while 1: while (index_in_table() and returnCode == NULL_CODE): decode the current table[index] if the current tag is new (not already processed): reset tag variables switch( tag command ): do what tag wants to do() set tag-related variables set childReturnCode (tag variable) if table: push_frame_stack() set childReturnCode == PENDING switch(childReturnCode): # figure out what to do with child's results # possibly set table-wide returnValue childSuccess append values update table-wide values set new index childFailure rewind position set new index childError signal error for whole table childPending ignore/continue processing without updating list values reset childReturnCode #done table, figure out what to do now... if no explicit return value: figure out implicit if failure: truncate result list to previous length reset position if error: report error as exception exit else: if frame_stack(): pop_frame_stack() else: return result */ /* call-stack structures used in non-recursive implementation */ #ifndef TEXTTOOLS_CALL_STACK_STRUCTURES # define TEXTTOOLS_CALL_STACK_STRUCTURES /* codes for returnCode and childReturnCode variables */ #define EOF_CODE 3 #define SUCCESS_CODE 2 #define FAILURE_CODE 1 #define ERROR_CODE 0 #define NULL_CODE -1 #define PENDING_CODE -2 typedef struct stack_entry { /* represents data stored for a particular stack recursion We want to make this as small as possible, so anything that is duplicate information (such as unpacked values of the tag or table) is ignored. Eventually this may support another field "available branches" recording backtracking points for the engine. */ void * parent; /* pointer to a parent table or NULL */ int position; /* where the engine is currently parsing for the parent table*/ int startPosition; /* position where we started parsing for the parent table */ mxTagTableObject * table; /* the parent table */ int index; /* index of the child tag in the parent table */ int childStart; /* text start position for the child table */ PyObject * results; /* the result-target of the parent */ int resultsLength; /* the length of the results list before the sub-table is called */ } recursive_stack_entry; /* Macro to reset table-specific variables XXX Not sure if loop vars are table or tag specific */ #define RESET_TABLE_VARIABLES {\ index=0;\ table_len = table->ob_size;\ returnCode = NULL_CODE;\ loopcount = -1;\ loopstart = startPosition;\ taglist_len = PyList_Size( taglist );\ } /* Macro to reset tag-specific variables */ #define RESET_TAG_VARIABLES {\ childStart = position;\ childPosition = position;\ childReturnCode = NULL_CODE;\ childResults = NULL;\ } /* Macro to decode a tag-entry into local variables */ #define DECODE_TAG {\ mxTagTableEntry *entry;\ entry = &table->entry[index];\ command = entry->cmd;\ flags = entry->flags;\ match = entry->args;\ failureJump = entry->jne;\ successJump = entry->je;\ tagobj = entry->tagobj;\ if (tagobj == NULL) { tagobj = Py_None;}\ } /* macro to push relevant local variables onto the stack and setup for child table newTable becomes table, newResults becomes taglist This is currently only called in the Table/SubTable family of commands, could be inlined there, but I find it cleaner to read here. */ #define PUSH_STACK( newTable, newResults ) {\ stackTemp = (recursive_stack_entry *) PyMem_Malloc( sizeof( recursive_stack_entry ));\ stackTemp->parent = stackParent;\ stackTemp->position = position;\ stackTemp->startPosition = startPosition;\ stackTemp->table = table;\ stackTemp->index = index;\ stackTemp->childStart = childStart;\ stackTemp->resultsLength = taglist_len;\ stackTemp->results = taglist;\ \ stackParent = stackTemp;\ childReturnCode = PENDING_CODE;\ \ startPosition = position;\ table = (mxTagTableObject *) newTable;\ taglist = newResults;\ } #define POP_STACK {\ if (stackParent) {\ childStart = stackParent->childStart;\ childPosition = position;\ position = stackParent->position;\ \ startPosition = stackParent->startPosition;\ \ childResults = taglist;\ taglist_len = stackParent->resultsLength;\ taglist = stackParent->results;\ if (table != stackParent->table ) { Py_DECREF( table ); }\ table = stackParent->table;\ table_len = table->ob_size;\ index = stackParent->index;\ \ stackTemp = stackParent->parent;\ PyMem_Free( stackParent );\ stackParent = stackTemp;\ stackTemp = NULL;\ \ childReturnCode = returnCode;\ returnCode = NULL_CODE;\ }\ } #endif /* mxTextTools_TaggingEngine(): a table driven parser engine - return codes: returnCode = 2: match ok; returnCode = 1: match failed; returnCode = 0: error - doesn't check type of passed arguments ! - doesn't increment reference counts of passed objects ! */ int TE_ENGINE_API( PyObject *textobj, int sliceleft, int sliceright, mxTagTableObject *table, PyObject *taglist, PyObject *context, int *next ) { TE_CHAR *text = NULL; /* Pointer to the text object's data */ /* local variables pushed into stack on recurse */ /* whole-table variables */ int position = sliceleft; /* current (head) position in text for whole table */ int startPosition = sliceleft; /* start position for current tag */ int table_len = table->ob_size; /* table length */ short returnCode = NULL_CODE; /* return code: -1 not set, 0 error, 1 not ok, 2 ok */ int index=0; /* index of current table entry */ int taglist_len = PyList_Size( taglist ); /* variables tracking status of the current tag */ register short childReturnCode = NULL_CODE; /* the current child's return code value */ int childStart = startPosition; register int childPosition = startPosition; PyObject *childResults = NULL; /* store's the current child's results (for table children) */ int flags=0; /* flags set in command */ int command=0; /* command */ int failureJump=0; /* rel. jump distance on 'not matched', what should the default be? */ int successJump=1; /* dito on 'matched', what should the default be? */ PyObject *match=NULL; /* matching parameter */ int loopcount = -1; /* loop counter */ int loopstart = startPosition; /* loop start position */ PyObject *tagobj = NULL; /* parentTable is our nearest parent, i.e. the next item to pop off the processing stack. We copied our local variables to it before starting a child table, and will copy back from it when we finish the child table. It's normally NULL */ recursive_stack_entry * stackParent = NULL; recursive_stack_entry * stackTemp = NULL; /* just temporary storage for parent pointers */ /* Error-management variables */ PyObject * errorType = NULL; PyObject * errorMessage = NULL; /* Initialise the buffer Here is where we will add memory-mapped file support I think... expand the TE_STRING macros to check for mmap file objects (only for str-type) and to access their values appropriately f = open('c:\\temp\\test.mem', 'r') buffer = mmap.mmap( f.fileno(), 0, access = mmap.ACCESS_READ ) */ if (!TE_STRING_CHECK(textobj)) { returnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Expected a string or unicode object to parse: found %.50s", textobj->ob_type->tp_name ); } else { text = TE_STRING_AS_STRING(textobj); if (text == NULL) { returnCode = ERROR_CODE; } } while (1) { /* this loop processes a whole table */ while ( (index < table_len) & (returnCode == NULL_CODE) & (index >= 0) ) { DPRINTF( "index %i\n", index ); DECODE_TAG if (childReturnCode == NULL_CODE ) { /* if we are not continuing processing of the child from a previous iteration we need to unpack the child into local variables */ RESET_TAG_VARIABLES childStart = position; childPosition = position; } if (command < MATCH_MAX_LOWLEVEL) { #include "lowlevelcommands.h" } else { switch (command) { /* Jumps & special commands */ #include "speccommands.h" /* non-table-recursion high-level stuff */ #include "highcommands.h" /* the recursive table commands */ #include "recursecommands.h" default: { childReturnCode = ERROR_CODE; errorType = PyExc_ValueError; errorMessage = PyString_FromFormat( "Unrecognised command code %i", command ); } } } /* we're done a single tag, process partial results for the current child This is a major re-structuring point. Previously all of this was scattered around (and duplicated among) the various command and command-group clauses. There also used to be a function call to handle the append/call functions. That's now handled inline */ /* sanity check wanted by Marc-André for skip-before-buffer */ if (childPosition < 0) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "tagobj (type %.50s) table entry %i moved/skipped beyond start of text (to position %i)", tagobj->ob_type->tp_name, index, childPosition ); } DPRINTF( "switch on return code %i\n", childReturnCode ); switch(childReturnCode) { case NULL_CODE: case SUCCESS_CODE: /* childReturnCode wasn't set or we positively matched positions are always: childStart, childPosition sub-results are: childResults unless childResults is taglist in which case we use Py_None for the tag's children unless childResults is NULL in which case we create an empty list object we call: tagobj == Py_None : do nothing... [ result tuple needed ] CallTag: entry->tagobj( resultTuple ) AppendToTagobj: entry->tagobj.append( resultTuple ) General Case: taglist.append( resultTuple ) AppendMatch: taglist.append( text[childStart:childPosition] ) AppendTagobj: taglist.append( entry->tagobj ) if LookAhead is specified: childPosition is set to childStart before continuing finally we set position = childPosition */ { PyObject * objectToCall = NULL; PyObject * objectCallResult = NULL; int releaseCallObject = 0; int releaseChildResults = 0; int releaseParameter = 1; PyObject * parameter = NULL; DPRINTF( "finishing success-code or null \n" ); if (tagobj == Py_None ) { /* XXX note: this short-circuits around "AppendTagobj" flagged items which specified tagobj == None... don't know if that's wanted or not. Similarly doesn't report AppendMatch's. Not sure what's appropriate there either. */ DPRINTF( "tagobj was none\n" ); DPRINTF( "Matched %i:%i but result not saved", childStart, childPosition ); } else { /* get the callable object */ /* normally it's taglist.append, do the exceptions first */ DPRINTF( "tagobj non-None, finding callable\n" ); if (flags & MATCH_CALLTAG) { /* want the tag itself */ objectToCall = tagobj; } else if (flags & MATCH_APPENDTAG) { /* AppendToTagobj -> want the tag's append method */ DPRINTF( "append to tag obj\n" ); objectToCall = PyObject_GetAttrString( tagobj, "append" ); DPRINTF( "got object\n"); if (objectToCall == NULL) { DPRINTF( "got invalid object\n"); returnCode = ERROR_CODE; errorType = PyExc_AttributeError; errorMessage = PyString_FromFormat( "tagobj (type %.50s) for table entry %i (flags include AppendTag) doesn't have an append method", tagobj->ob_type->tp_name, index ); } else { DPRINTF( "got valid object\n"); releaseCallObject = 1; } } else { DPRINTF( "appending to tag-list\n" ); /* append of the taglist, which we know exists, because it's a list We optimise this to use the raw List API */ objectToCall = NULL; /*PyObject_GetAttrString( taglist, "append" );*/ } if (returnCode == NULL_CODE && objectToCall && PyCallable_Check(objectToCall)==0) { /* object to call isn't callable */ DPRINTF( "object not callable\n" ); returnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "The object to call type(%.50s) for table entry %i isn't callable", objectToCall->ob_type->tp_name, index ); } if (returnCode == NULL_CODE) { /* get the parameter with which to call */ /* normally it's a result tuple, do exceptions first */ DPRINTF( "getting parameter\n" ); if (flags & MATCH_APPENDMATCH) { /* XXX need to do bounds-checking here so that: childStart >= sliceleft childPosition >= sliceleft childPosition <= sliceright */ /* MATCH_APPENDMATCH cannot occur with any other flag (makes no sense) so objectToCall _must_ be the taglist, and we just want to append the string, not a tuple wrapping the string. That is, everywhere else we use tuples, here we don't */ parameter = TE_STRING_FROM_STRING( TE_STRING_AS_STRING(textobj) + childStart, childPosition - childStart ); if (parameter == NULL) { /* error occured getting parameter, report the exception */ returnCode = ERROR_CODE; } } else if ( flags & MATCH_APPENDTAGOBJ) { /* append the tagobj itself to the results list */ if (tagobj == NULL) { parameter = Py_None; } else { parameter = tagobj; } releaseParameter = 0; } else { /* need to know what the child-list is to build resultsTuple if childResults is non-null and not taglist use it if childResults == taglist, use Py_None otherwise use Py_None ( originally we created a new empty list object, that was wrong :) ). */ if (childResults == taglist) { childResults = Py_None ; } else if (childResults != NULL) { /* exists already, with a reference from PUSH's creation */ releaseChildResults = 1; } else { /* turns out mxTextTools declares the return value to be None or [], using None is far more efficient, so I've made the code use it here */ childResults = Py_None; releaseChildResults = 0; /* we aren't increfing it locally */ } if (childResults == NULL || tagobj == NULL) { returnCode = ERROR_CODE; } else { if (flags & MATCH_CALLTAG) { parameter = Py_BuildValue( "OOiiO", taglist, textobj, childStart, childPosition, childResults ); } else if (flags & MATCH_APPENDTAG) { /* AppendToTagobj -> want to call append with a 4-tuple of values, so parameter needs to be ((x,y,z,w),) */ /* XXX can't get the darn thing to accept "((OiiO))" :( */ parameter = Py_BuildValue( "((OiiO))", Py_None, childStart, childPosition, childResults ); } else { /* either we are calling a method that requires the 4 args, or we're appending the 4-tuple to a list */ parameter = Py_BuildValue( "OiiO", tagobj, childStart, childPosition, childResults ); } if (parameter == NULL) { returnCode = ERROR_CODE; } } } DPRINTF( "done getting parameter\n" ); if (parameter == NULL && returnCode == ERROR_CODE && errorType == NULL) { errorType = PyExc_SystemError; /* following may fail, as we may have run out of memory */ errorMessage = PyString_FromFormat( "Unable to build return-value tuple" ); } /* now have both object and parameter and object is callable */ if (returnCode == NULL_CODE) { /* no errors yet */ DPRINTF( "doing call\n" ); if (objectToCall) { DPRINTF( " object call\n" ); /* explicit object to call */ Py_INCREF( objectToCall ); Py_INCREF( parameter ); DPRINTF( " lock released\n" ); objectCallResult = PyEval_CallObject( objectToCall, parameter ); DPRINTF( " call finished\n" ); Py_DECREF( objectToCall ); Py_DECREF( parameter ); DPRINTF( " lock acquired\n" ); if (objectCallResult == NULL) { DPRINTF( " null result\n" ); returnCode = ERROR_CODE; /* exception is already there, should alter error-handler to check for it */ } else { DPRINTF( " non-null result, decrefing\n" ); Py_DECREF( objectCallResult ); DPRINTF( " decrefd\n" ); } objectCallResult = NULL; } else { /* list steals reference */ DPRINTF( " list append\n" ); if (PyList_Append( taglist, parameter ) == -1) { returnCode = ERROR_CODE; /* list didn't steal ref yet */ errorType = PyExc_SystemError; /* following is likely to fail, as we've likely run out of memory */ errorMessage = PyString_FromFormat( "Unable to append result tuple to result list!" ); } } } } DPRINTF( "checking whether to release object\n" ); if (releaseCallObject) { Py_DECREF( objectToCall ); } objectToCall = NULL; releaseCallObject = 0; if (releaseChildResults) { Py_DECREF( childResults ); } childResults = NULL; releaseChildResults = 0; if (releaseParameter && parameter ) { Py_DECREF( parameter ); } parameter = NULL; releaseParameter = 1; } /* ends the else clause for reporting a result */ /* reset for lookahead */ if (flags & MATCH_LOOKAHEAD) { position = childStart; } else { position = childPosition; } index += successJump; DPRINTF( "finished success-handler code\n" ); break; } case FAILURE_CODE: /* failed, if failure jump is default, should set table returnCode */ if (childResults) { if (childResults != taglist) { /* different list, decref it since we won't be using it any more */ Py_DECREF( childResults ); } childResults = NULL; } /* XXX possible (eventual) logic error here? fail with jump of 0 might work in certain cases where the "parsing" is actually occuring outside of the current buffer (i.e. a side-effect-based parsing node that fails X times before finally succeeding). Don't see anything in current commands that can cause a problem but we may need to make this an explicitly watched idea, rather than a consequence of the child failing with a 0 failureJump value. */ position = childStart; if (failureJump == 0) { returnCode = 1; } else { index += failureJump; } break; case PENDING_CODE: /* the child tag hasn't begun parsing, this was a recursive-tag-start loop pass. PENDING_CODE is set by the stack push operation */ break; case ERROR_CODE: { /* explicit error encountered while processing this child Handle this as gracefully as possible, potentially triggering huge sets of operations, but therefore needing to be very careful about system-level errors (such as memory errors). 1) Signal whole table as err-d 2) Record any extra values for the error message? */ returnCode = ERROR_CODE; break; } default: { /* what error should be raised when an un-recognised return code is generated? */ returnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "An unknown child return code %i was generated by tag-table item %i", childReturnCode, index ); } } childReturnCode = NULL_CODE; /* single entry processing loop complete */ } /* we're done the table, figure out what to do. */ if (returnCode == NULL_CODE) { /* no explicit return code was set, but done table: index went beyond table_len (>=table_len) -> success index moved before table start (<= 0) -> failure */ if (index >= table_len) { /* success */ returnCode = SUCCESS_CODE; } else if (position >= sliceright) { /* EOF while parsing, special type of failure Eventually allow for returning the whole parse-stack for restarting the parser from a particular point. */ /*returnCode = EOF_CODE;*/ returnCode = FAILURE_CODE; } else if (index < 0) { /* explicit jump before table */ returnCode = FAILURE_CODE; } else { returnCode = FAILURE_CODE; } } if (returnCode == FAILURE_CODE) { /* truncate result list */ if (PyList_SetSlice( taglist, taglist_len, PyList_Size(taglist), NULL) ) { returnCode = ERROR_CODE; errorMessage = PyString_FromFormat( "Unable to truncate list object (likely tagging engine error) type(%.50s)", taglist->ob_type->tp_name ); } /* reset position */ position = startPosition; } if (returnCode == ERROR_CODE) { /* DO_FANCY_ERROR_REPORTING( ); This is where we will do the user-triggered error reporting (as well as reporting low-level errors such as memory/type/value). We have 3 values possibly available: errorType -> PyObject * to current error class (or NULL) if it is a MemoryError: Jettison some ballast then attempt to return a short message. Need to create this ballast somewhere for that to work. if is any other error class: create the error object and raise it decorate it with details: current table (need to incref to keep alive) current index current position childStart childPosition if it is simpleparse.stt.TextTools.ParsingError: (triggered by the user in their grammar) create a list of non-None parent tagobjs (a stack report) and add it to the object 3) Build an actual error object if possible? 4) Report the parent hierarchy of the failure point 5) */ char * msg = NULL; if (errorMessage && errorType) { /* we only report our own error if we've got all the information for it XXX Need to check that we don't have cases that are just setting type */ msg = PyString_AsString( errorMessage); PyErr_SetString( errorType, msg ); Py_DECREF( errorMessage ); } /* need to free the whole stack at once */ while (stackParent != NULL) { /* this is inefficient, should do it all-in-one-go without copying values back save for startPosition and returnCode in the last item*/ POP_STACK /* need to clean up all INCREF'd objects as we go... */ if (childResults != taglist) { /* different list, decref it since we won't be using it any more */ Py_DECREF( childResults ); } childResults = NULL; } *next = startPosition; return 0; } else { if (stackParent != NULL) { /* pop stack also sets the childReturnCode for us... */ POP_STACK } else { /* this was the root table, return the final results */ if (returnCode == FAILURE_CODE) { /* there is a clause in the docs for tag that says this will return the "error position" for the table. That requires reporting childPosition for the the last-matched position */ *next = childPosition; } else { *next = position; } return returnCode; } } } /* end of infinite loop */ }