/* mxTextTools -- Fast text manipulation routines Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com */ /* We want all our symbols to be exported */ #define MX_BUILDING_MXTEXTTOOLS /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mxTextTools.log" #endif #include "mx.h" #include "mxTextTools.h" #include #define VERSION "2.1.0" /* Initial list size used by e.g. setsplit(), setsplitx(),... */ #define INITIAL_LIST_SIZE 64 /* Maximum TagTable cache size. If this limit is reached, the cache is cleared to make room for new compile TagTables. */ #define MAX_TAGTABLES_CACHE_SIZE 100 /* Define this to enable the copy-protocol (__copy__, __deepcopy__) */ #define COPY_PROTOCOL /* --- module doc-string -------------------------------------------------- */ static char *Module_docstring = MXTEXTTOOLS_MODULE" -- Tools for fast text processing. Version "VERSION"\n\n" "Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com\n" "Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n" "Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com\n\n" " All Rights Reserved\n\n" "See the documentation for further information on copyrights,\n" "or contact the author." ; /* --- internal macros ---------------------------------------------------- */ /* --- module globals ----------------------------------------------------- */ /* Translation strings for the 8-bit versions of lower() and upper() */ static PyObject *mx_ToUpper; static PyObject *mx_ToLower; static PyObject *mxTextTools_Error; /* mxTextTools specific error */ static PyObject *mxTextTools_TagTables; /* TagTable cache dictionary */ /* Flag telling us whether the module was initialized or not. 
*/ static int mxTextTools_Initialized = 0; /* --- forward declarations ----------------------------------------------- */ /* --- module helper ------------------------------------------------------ */ static PyObject *mxTextTools_ToUpper(void) { char tr[256]; int i; for (i = 0; i < 256; i++) tr[i] = toupper((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } static PyObject *mxTextTools_ToLower(void) { char tr[256]; int i; for (i = 0; i < 256; i++) tr[i] = tolower((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } /* Create an exception object, insert it into the module dictionary under the given name and return the object pointer; this is NULL in case an error occurred. base can be given to indicate the base object to be used by the exception object. It should be NULL otherwise */ static PyObject *insexc(PyObject *moddict, char *name, PyObject *base) { PyObject *v; char fullname[256]; char *modname; char *dot; v = PyDict_GetItemString(moddict, "__name__"); if (v == NULL) modname = NULL; else modname = PyString_AsString(v); if (modname == NULL) { PyErr_Clear(); modname = MXTEXTTOOLS_MODULE; } /* The symbols from this extension are imported into simpleparse.stt.TextTools. We trim the name to not confuse the user with an overly long package path. */ strcpy(fullname, modname); dot = strchr(fullname, '.'); if (dot) dot = strchr(dot+1, '.'); if (dot) strcpy(dot+1, name); else sprintf(fullname, "%s.%s", modname, name); v = PyErr_NewException(fullname, base, NULL); if (v == NULL) return NULL; if (PyDict_SetItemString(moddict,name,v)) return NULL; return v; } /* Helper for adding integer constants to a dictionary. 
Check for errors with PyErr_Occurred() */ static void insint(PyObject *dict, char *name, int value) { PyObject *v = PyInt_FromLong((long)value); PyDict_SetItemString(dict, name, v); Py_XDECREF(v); } /* --- module interface --------------------------------------------------- */ /* --- Text Search Object ----------------------------------------------*/ staticforward PyMethodDef mxTextSearch_Methods[]; /* allocation */ static PyObject *mxTextSearch_New(PyObject *match, PyObject *translate, int algorithm) { mxTextSearchObject *so; so = PyObject_NEW(mxTextSearchObject, &mxTextSearch_Type); if (so == NULL) return NULL; so->data = NULL; so->translate = NULL; so->match = NULL; Py_INCREF(match); so->match = match; if (translate == Py_None) translate = NULL; else if (translate) { Py_Assert(PyString_Check(translate), PyExc_TypeError, "translate table must be a string"); Py_Assert(PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate string must have exactly 256 chars"); Py_INCREF(translate); } so->translate = translate; /* Init algorithm */ so->algorithm = algorithm; switch (algorithm) { case MXTEXTSEARCH_BOYERMOORE: Py_Assert(PyString_Check(match), PyExc_TypeError, "match must be a string for Boyer-Moore"); so->data = bm_init(PyString_AS_STRING(match), PyString_GET_SIZE(match)); Py_Assert(so->data != NULL, PyExc_TypeError, "error initializing the search object"); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: Py_Assert(PyString_Check(match), PyExc_TypeError, "match must be a string for FastSearch"); so->data = fs_init(PyString_AS_STRING(match), PyString_GET_SIZE(match)); Py_Assert(so->data != NULL, PyExc_TypeError, "error initializing the search object"); break; #endif case MXTEXTSEARCH_TRIVIAL: Py_Assert(PyString_Check(match) || PyUnicode_Check(match), PyExc_TypeError, "match must be a string or unicode"); Py_Assert(so->translate == NULL, PyExc_TypeError, "trivial search algorithm does not support translate"); break; default: Py_Error(PyExc_ValueError, 
"unknown or unsupported algorithm"); } return (PyObject *)so; onError: Py_DECREF(so); return NULL; } Py_C_Function_WithKeywords( mxTextSearch_TextSearch, "TextSearch(match[,translate=None,algorithm=default_algorithm])\n\n" "Create a substring search object for the string match;\n" "translate is an optional translate-string like the one used\n" "in the module re." ) { PyObject *match = 0; PyObject *translate = 0; int algorithm = -424242; Py_KeywordsGet3Args("O|Oi:TextSearch",match,translate,algorithm); if (algorithm == -424242) { if (PyUnicode_Check(match)) algorithm = MXTEXTSEARCH_TRIVIAL; else #ifdef MXFASTSEARCH algorithm = MXTEXTSEARCH_BOYERMOORE; #else algorithm = MXTEXTSEARCH_BOYERMOORE; #endif } return mxTextSearch_New(match, translate, algorithm); onError: return NULL; } static void mxTextSearch_Free(mxTextSearchObject *so) { if (so->data) { switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: bm_free(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: fs_free(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: break; } } Py_XDECREF(so->match); Py_XDECREF(so->translate); PyObject_Del(so); } /* C APIs */ #define so ((mxTextSearchObject *)self) /* Get the match length from an TextSearch object or -1 in case of an error. 
*/ int mxTextSearch_MatchLength(PyObject *self) { Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: return BM_MATCH_LEN(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: return FS_MATCH_LEN(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: if (PyString_Check(so->match)) return PyString_GET_SIZE(so->match); #ifdef HAVE_UNICODE else if (PyUnicode_Check(so->match)) return PyUnicode_GET_SIZE(so->match); #endif break; } Py_Error(mxTextTools_Error, "internal error"); onError: return -1; } static int trivial_search(const char *text, int start, int stop, const char *match, int match_len) { int ml1 = match_len - 1; register const char *tx = &text[start]; register int x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register int j = ml1; register const char *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #ifdef HAVE_UNICODE static int trivial_unicode_search(const Py_UNICODE *text, int start, int stop, const Py_UNICODE *match, int match_len) { int ml1 = match_len - 1; register const Py_UNICODE *tx = &text[start]; register int x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register int j = ml1; register const Py_UNICODE *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #endif /* Search for the match in text[start:stop]. 
Returns 1 in case a match was found and sets sliceleft, sliceright to the matching slice. Returns 0 in case no match was found and -1 in case of an error. */ int mxTextSearch_SearchBuffer(PyObject *self, char *text, int start, int stop, int *sliceleft, int *sliceright) { int nextpos; int match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: if (so->translate) { /* search with translate table */ nextpos = bm_tr_search((mxbmse_data *)so->data, text, start, stop, PyString_AS_STRING(so->translate)); } else { /* exact search */ nextpos = bm_search((mxbmse_data *)so->data, text, start, stop); } match_len = BM_MATCH_LEN(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: if (so->translate) { /* search with translate table */ nextpos = fs_tr_search((mxfse_data *)so->data, text, start, stop, PyString_AS_STRING(so->translate)); } else { /* exact search */ nextpos = fs_search((mxfse_data *)so->data, text, start, stop); } match_len = FS_MATCH_LEN(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: { const char *match; if (PyString_Check(so->match)) { match = PyString_AS_STRING(so->match); match_len = PyString_GET_SIZE(so->match); } else if (PyObject_AsCharBuffer(so->match, &match, &match_len)) goto onError; nextpos = trivial_search(text, start, stop, match, match_len); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchBuffer"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #ifdef HAVE_UNICODE int mxTextSearch_SearchUnicode(PyObject *self, Py_UNICODE *text, int start, int stop, int *sliceleft, int *sliceright) { int nextpos; int match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: Py_Error(PyExc_TypeError, "Boyer-Moore search algorithm does not support Unicode"); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: Py_Error(PyExc_TypeError, "FastSearch search algorithm does not support Unicode"); #endif case MXTEXTSEARCH_TRIVIAL: { PyObject *u; Py_UNICODE *match; if (PyUnicode_Check(so->match)) { u = NULL; match = PyUnicode_AS_UNICODE(so->match); match_len = PyUnicode_GET_SIZE(so->match); } else { u = PyUnicode_FromEncodedObject(so->match, NULL, NULL); if (u == NULL) goto onError; match = PyUnicode_AS_UNICODE(u); match_len = PyUnicode_GET_SIZE(u); } nextpos = trivial_unicode_search(text, start, stop, match, match_len); Py_XDECREF(u); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchUnicode"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #endif /* methods */ Py_C_Function( mxTextSearch_search, "TextSearch.search(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the slice (l,r)\n" "where the substring was found, (start,start) otherwise.") { PyObject *text; int start = 0; int stop = INT_MAX; int sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.search", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) { sliceleft = start; sliceright = start; } /* Return the slice */ Py_Return2("ii", sliceleft, sliceright); onError: return NULL; } Py_C_Function( mxTextSearch_find, "TextSearch.find(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the index\n" "where the substring was found, -1 otherwise.") { PyObject *text; int start = 0; int stop = INT_MAX; int sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.find", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); } #endif else Py_Error(PyExc_TypeError, 
"expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) sliceleft = -1; return PyInt_FromLong(sliceleft); onError: return NULL; } Py_C_Function( mxTextSearch_findall, "TextSearch.findall(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return a list of all\n" "non overlapping slices (l,r) in text where the match\n" "string can be found.") { PyObject *text; PyObject *list = 0; int start = 0; int stop = INT_MAX; int stop_index; int match_len; int listsize = INITIAL_LIST_SIZE; int listitem = 0; Py_Get3Args("O|ii:TextSearch.findall", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); list = PyList_New(listsize); if (!list) goto onError; match_len = mxTextSearch_MatchLength(self); if (match_len < 0) goto onError; stop_index = stop - match_len; while (start <= stop_index) { register PyObject *t,*v; int rc; int sliceleft, sliceright; /* exact search */ if (PyString_Check(text)) rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); #endif else break; if (rc < 0) goto onError; if (rc == 0) break; /* Build slice and append to list */ t = PyTuple_New(2); if (!t) goto onError; v = PyInt_FromLong(sliceleft); if (!v) goto onError; PyTuple_SET_ITEM(t,0,v); v = PyInt_FromLong(sliceright); if (!v) goto onError; PyTuple_SET_ITEM(t,1,v); if (listitem < listsize) PyList_SET_ITEM(list, listitem, t); else { PyList_Append(list, t); Py_DECREF(t); } listitem++; start = sliceright; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL); return list; 
onError: Py_XDECREF(list); return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTextSearch_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(so); return (PyObject *)so; onError: return NULL; } #endif #undef so /* --- slots --- */ static PyObject *mxTextSearch_Repr(mxTextSearchObject *self) { char *algoname; PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->match); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; switch (self->algorithm) { case MXTEXTSEARCH_BOYERMOORE: algoname = "Boyer-Moore"; break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: algoname = "FastSearch"; break; #endif case MXTEXTSEARCH_TRIVIAL: algoname = "Trivial"; break; default: algoname = ""; } sprintf(t, "<%.50s TextSearch object for %.400s at 0x%lx>", algoname, reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } static PyObject *mxTextSearch_GetAttr(mxTextSearchObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"match")) { v = self->match; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"translate")) { v = self->translate; if (v == NULL) v = Py_None; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"algorithm")) return PyInt_FromLong(self->algorithm); else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[sss]", "match", "translate", "algorithm"); return Py_FindMethod(mxTextSearch_Methods, (PyObject *)self, (char *)name); } /* Python Type Table */ PyTypeObject mxTextSearch_Type = { PyObject_HEAD_INIT(0) /* init at startup ! 
*/ 0, /*ob_size*/ "TextSearch", /*tp_name*/ sizeof(mxTextSearchObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ /* methods */ (destructor)mxTextSearch_Free, /*tp_dealloc*/ (printfunc)0, /*tp_print*/ (getattrfunc)mxTextSearch_GetAttr, /*tp_getattr*/ (setattrfunc)0, /*tp_setattr*/ (cmpfunc)0, /*tp_compare*/ (reprfunc)mxTextSearch_Repr, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_number*/ 0, /*tp_as_mapping*/ (hashfunc)0, /*tp_hash*/ (ternaryfunc)0, /*tp_call*/ (reprfunc)0, /*tp_str*/ (getattrofunc)0, /*tp_getattro*/ (setattrofunc)0, /*tp_setattro*/ }; /* Python Method Table */ statichere PyMethodDef mxTextSearch_Methods[] = { Py_MethodListEntry("search",mxTextSearch_search), Py_MethodListEntry("find",mxTextSearch_find), Py_MethodListEntry("findall",mxTextSearch_findall), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTextSearch_copy), Py_MethodListEntry("__copy__",mxTextSearch_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Character Set Object --------------------------------------------*/ staticforward PyMethodDef mxCharSet_Methods[]; /* internal */ /* 8-bit character sets are implemented using a simple 32-byte long bitmap with one bit per character. 
Addressing is done as follows: def char_is_set(ordinal): return bitmap[ordinal >> 3] & (1 << (ordinal & 7)) */ #define STRING_CHARSET_SIZE 256 #define STRING_CHARSET_BITMAP_SIZE (STRING_CHARSET_SIZE / 8) typedef struct { unsigned char bitmap[STRING_CHARSET_BITMAP_SIZE]; /* character bitmap */ } string_charset; static int init_string_charset(mxCharSetObject *cs, PyObject *definition) { register int i, j; char *def = PyString_AS_STRING(definition); const int len = PyString_GET_SIZE(definition); string_charset *lookup = 0; register unsigned char *bitmap; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build 32-byte lookup bitmap (one bit per character) */ lookup = (string_charset *)PyMem_Malloc(sizeof(string_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } memset(lookup, 0, sizeof(string_charset)); cs->mode = MXCHARSET_8BITMODE; cs->lookup = (void *)lookup; bitmap = lookup->bitmap; for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (unsigned char)'\\'; bitmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { unsigned char range_left = def[i]; unsigned char range_right = def[i+2]; for (j = range_left; j <= range_right; j++) bitmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = (unsigned char)def[i]; bitmap[j >> 3] |= 1 << (j & 7); } /* Invert bitmap if negative matching is requested */ if (!logic) { DPRINTF("init_string_charset: inverting bitmap\n"); for (i = 0; i < STRING_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #ifdef HAVE_UNICODE /* Unicode character sets are implemented using two step indexing which is a good compromise between lookup speed and memory usage. 
Lookup is done using a variable length array of 32-byte bitmap blocks. There can be 256 such blocks. Identical blocks are collapsed into a single copy. Addressing is done as follows: def char_is_set(ordinal): index = bitmapindex[ordinal >> 8] bitmap = bitmaps[index] return bitmap[(ordinal >> 3) & 31] & (1 << (ordinal & 7)) The technique used here is very similar to what is done in Python's SRE (see the BIGCHARSET patch by Martin von Loewis). Compression should be reasonably good since character sets in practice usually only contains a few single characters or longer ranges of Unicode characters. */ #define UNICODE_CHARSET_SIZE 65536 #define UNICODE_CHARSET_BITMAP_SIZE 32 #define UNICODE_CHARSET_BITMAPS (UNICODE_CHARSET_SIZE / (UNICODE_CHARSET_BITMAP_SIZE * 8)) #define UNICODE_CHARSET_BIGMAP_SIZE (UNICODE_CHARSET_SIZE / 8) typedef struct { unsigned char bitmapindex[UNICODE_CHARSET_BITMAPS]; /* Index to char bitmaps */ unsigned char bitmaps[UNICODE_CHARSET_BITMAPS][UNICODE_CHARSET_BITMAP_SIZE]; /* Variable length bitmap array */ } unicode_charset; static int init_unicode_charset(mxCharSetObject *cs, PyObject *definition) { register int i, j; Py_UNICODE *def = PyUnicode_AS_UNICODE(definition); const int len = PyUnicode_GET_SIZE(definition); unicode_charset *lookup = 0; unsigned char bigmap[UNICODE_CHARSET_BIGMAP_SIZE]; int blocks; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build bigmap */ memset(bigmap, 0, sizeof(bigmap)); for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (int)'\\'; bigmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { Py_UNICODE range_left = def[i]; Py_UNICODE range_right = def[i+2]; if (range_right >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of supported range"); } 
for (j = range_left; j <= range_right; j++) bigmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = def[i]; if (j >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of supported range"); } bigmap[j >> 3] |= 1 << (j & 7); } /* Build lookup table XXX Could add dynamic resizing here... probably not worth it though, since sizeof(unicode_charset) isn't all that large. */ lookup = (unicode_charset *)PyMem_Malloc(sizeof(unicode_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } blocks = 0; for (i = UNICODE_CHARSET_BITMAPS - 1; i >= 0; i--) { unsigned char *block = &bigmap[i << 5]; for (j = blocks - 1; j >= 0; j--) if (memcmp(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE) == 0) break; if (j < 0) { j = blocks; DPRINTF("init_unicode_charset: Creating new block %i for %i\n", j, i); memcpy(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE); blocks++; } else DPRINTF("init_unicode_charset: Reusing block %i for %i\n", j, i); lookup->bitmapindex[i] = j; } DPRINTF("init_unicode_charset: Map size: %i block(s) = %i bytes\n", blocks, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); lookup = (unicode_charset *)PyMem_Realloc(lookup, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } /* Invert bitmaps if negative matching is requested */ if (!logic) { register unsigned char *bitmap = &lookup->bitmaps[0][0]; DPRINTF("init_unicode_charset: inverting bitmaps\n"); for (i = 0; i < blocks * UNICODE_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } cs->mode = MXCHARSET_UCS2MODE; cs->lookup = (void *)lookup; return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #endif /* allocation */ static PyObject *mxCharSet_New(PyObject *definition) { mxCharSetObject *cs; cs = PyObject_NEW(mxCharSetObject, &mxCharSet_Type); if (cs == NULL) return NULL; Py_INCREF(definition); cs->definition = definition; cs->lookup = 
NULL; cs->mode = -1; if (PyString_Check(definition)) { if (init_string_charset(cs, definition)) goto onError; } #ifdef HAVE_UNICODE else if (PyUnicode_Check(definition)) { if (init_unicode_charset(cs, definition)) goto onError; } #endif else Py_Error(PyExc_TypeError, "character set definition must be string or unicode"); return (PyObject *)cs; onError: Py_DECREF(cs); return NULL; } Py_C_Function( mxCharSet_CharSet, "CharSet(definition)\n\n" "Create a character set matching object from the string" ) { PyObject *definition; Py_GetArg("O:CharSet", definition); return mxCharSet_New(definition); onError: return NULL; } static void mxCharSet_Free(mxCharSetObject *cs) { Py_XDECREF(cs->definition); if (cs->lookup) PyMem_Free(cs->lookup); PyObject_Del(cs); } /* C APIs */ #define cs ((mxCharSetObject *)self) int mxCharSet_ContainsChar(PyObject *self, register unsigned char ch) { if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[0]]; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -1; } #ifdef HAVE_UNICODE int mxCharSet_ContainsUnicodeChar(PyObject *self, register Py_UNICODE ch) { if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap; if (ch >= 256) return 0; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[ch >> 8]]; return ((bitmap[(ch >> 3) & 31] & 
(1 << (ch & 7))) != 0); } else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -1; } #endif static int mxCharSet_Contains(PyObject *self, PyObject *other) { if (PyString_Check(other)) { Py_Assert(PyString_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single character"); return mxCharSet_ContainsChar(self, PyString_AS_STRING(other)[0]); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(other)) { Py_Assert(PyUnicode_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single unicode character"); return mxCharSet_ContainsUnicodeChar(self, PyUnicode_AS_UNICODE(other)[0]); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode character"); onError: return -1; } /* In mode 1, find the position of the first character in text belonging to set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). In mode 0, find the first character not in set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). The search is done in the slice start:stop. -2 is returned in case of an error. 
*/ static int mxCharSet_FindChar(PyObject *self, unsigned char *text, int start, int stop, const int mode, const int direction) { register int i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) bitmap = ((string_charset *)cs->lookup)->bitmap; #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; bitmap = lookup->bitmaps[lookup->bitmapindex[0]]; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; onError: return -2; } #ifdef HAVE_UNICODE static int mxCharSet_FindUnicodeChar(PyObject *self, Py_UNICODE *text, int start, int stop, const int mode, const int direction) { register int i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { bitmap = ((string_charset *)cs->lookup)->bitmap; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 
7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -2; } #endif /* Return the position of the first character in text[start:stop] occurring in set or -1 in case no such 
character exists. */ static int mxCharSet_Search(PyObject *self, PyObject *text, int start, int stop, int direction) { int position; if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); position = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 1, direction); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); position = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 1, direction); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if ((direction > 0 && position >= stop) || (direction <= 0 && position < start)) position = -1; return position; onError: return -2; } /* Return the longest match of characters from set in text[start:stop]. If direction is positive, the search is done from the left (longest prefix), otherwise it is started from the right (longest suffix). -1 is returned in case of an error. */ int mxCharSet_Match(PyObject *self, PyObject *text, int start, int stop, int direction) { int position; if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); position = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 0, direction); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); position = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, direction); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (position < -1) goto onError; if (direction > 0) return position - start; else return stop-1 - position; onError: return -1; } /* Stips off characters appearing in the character set from text[start:stop] and returns the result as Python string object. 
where indicates the mode: where < 0: strip left only where = 0: strip left and right where > 0: strip right only */ static PyObject *mxCharSet_Strip(PyObject *self, PyObject *text, int start, int stop, int where) { int left,right; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), left, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyString_FromStringAndSize(PyString_AS_STRING(text) + left, max(right - left, 0)); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text) + left, max(right - left, 0)); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } static PyObject *mxCharSet_Split(PyObject *self, PyObject *text, int start, int text_len, int include_splits) { PyObject *list = NULL; PyObject *s; register int x; int listitem = 0; int listsize = INITIAL_LIST_SIZE; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } list = PyList_New(listsize); if (!list) goto onError; if (PyString_Check(text)) { unsigned char *tx = (unsigned char *)PyString_AS_STRING(text); Py_CheckStringSlice(text, start, text_len); x = start; while (x < text_len) { int z; /* Skip all text in set 
(include_splits == 0), not in set (include_splits == 1) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_UNICODE *tx = PyUnicode_AS_UNICODE(text); Py_CheckUnicodeSlice(text, start, text_len); x = start; while (x < text_len) { int z; /* Skip all text in set (include_splits == 0), not in set (include_splits == 1) */ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list, 
listitem, listsize, (PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } /* methods */ Py_C_Function( mxCharSet_contains, ".contains(char)\n\n" ) { PyObject *chr; int rc; Py_GetArg("O:CharSet.contains", chr); rc = mxCharSet_Contains(self, chr); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_search, ".search(text[, direction=1, start=0, stop=len(text)])\n\n" ) { PyObject *text; int direction = 1; int start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.search", text, direction, start, stop); rc = mxCharSet_Search(self, text, start, stop, direction); if (rc == -1) Py_ReturnNone(); if (rc < -1) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_match, ".match(text[, direction=1, start=0, stop=len(text)])\n\n" ) { PyObject *text; int direction = 1; int start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.match", text, direction, start, stop); rc = mxCharSet_Match(self, text, start, stop, direction); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_split, ".split(text[, start=0, stop=len(text)])\n\n" ) { PyObject *text; int start = 0, stop = INT_MAX; Py_Get3Args("O|ii:CharSet.split", text, start, stop); return mxCharSet_Split(self, text, start, stop, 0); onError: return NULL; } Py_C_Function( mxCharSet_splitx, ".splitx(text[, start=0, stop=len(text)])\n\n" ) { PyObject *text; int start = 0, stop = INT_MAX; Py_Get3Args("O|ii:CharSet.splitx", text, start, stop); return mxCharSet_Split(self, text, start, stop, 1); onError: return NULL; } Py_C_Function( mxCharSet_strip, ".strip(text[, where=0, start=0, stop=len(text)])\n\n" ) { PyObject *text; int where = 0; int start = 0, stop = INT_MAX; Py_Get4Args("O|iii:CharSet.strip", text, where, start, stop); return mxCharSet_Strip(self, text, start, stop, where); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxCharSet_copy, 
"copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(cs); return (PyObject *)cs; onError: return NULL; } #endif #undef cs /* --- slots --- */ static PyObject *mxCharSet_Repr(mxCharSetObject *self) { PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->definition); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; sprintf(t, "", reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } static PyObject *mxCharSet_GetAttr(mxCharSetObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"definition")) { v = self->definition; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[s]", "definition"); return Py_FindMethod(mxCharSet_Methods, (PyObject *)self, (char *)name); } /* Python Type Tables */ static PySequenceMethods mxCharSet_TypeAsSequence = { (inquiry)0, /*sq_length*/ (binaryfunc)0, /*sq_concat*/ (intargfunc)0, /*sq_repeat*/ (intargfunc)0, /*sq_item*/ (intintargfunc)0, /*sq_slice*/ (intobjargproc)0, /*sq_ass_item*/ (intintobjargproc)0, /*sq_ass_slice*/ #if PY_VERSION_HEX >= 0x02000000 (objobjproc)mxCharSet_Contains, /*sq_contains*/ #endif }; PyTypeObject mxCharSet_Type = { PyObject_HEAD_INIT(0) /* init at startup ! 
*/ 0, /* ob_size */ "Character Set", /* tp_name */ sizeof(mxCharSetObject), /* tp_basicsize */ 0, /* tp_itemsize */ /* methods */ (destructor)mxCharSet_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)mxCharSet_GetAttr, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ (cmpfunc)0, /* tp_compare */ (reprfunc)mxCharSet_Repr, /* tp_repr */ 0, /* tp_as_number */ &mxCharSet_TypeAsSequence, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ }; /* Python Method Table */ statichere PyMethodDef mxCharSet_Methods[] = { Py_MethodListEntry("contains",mxCharSet_contains), Py_MethodListEntry("search",mxCharSet_search), Py_MethodListEntry("match",mxCharSet_match), Py_MethodListEntry("strip",mxCharSet_strip), Py_MethodListEntry("split",mxCharSet_split), Py_MethodListEntry("splitx",mxCharSet_splitx), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxCharSet_copy), Py_MethodListEntry("__copy__",mxCharSet_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Tag Table Object ------------------------------------------------*/ staticforward PyMethodDef mxTagTable_Methods[]; PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable); /* internal APIs */ static PyObject *tc_get_item(register PyObject *obj, register int i) { if (PyTuple_Check(obj)) { if (i > PyTuple_GET_SIZE(obj)) return NULL; return PyTuple_GET_ITEM(obj, i); } else if (PyList_Check(obj)) { if (i > PyList_GET_SIZE(obj)) return NULL; return PyList_GET_ITEM(obj, i); } else return NULL; } static int tc_length(register PyObject *obj) { if (obj == NULL) return -1; else if (PyTuple_Check(obj)) return PyTuple_GET_SIZE(obj); else if (PyList_Check(obj)) return PyList_GET_SIZE(obj); else return -1; } /* Add a jump target to the jump dictionary */ static int 
tc_add_jumptarget(PyObject *jumpdict, PyObject *targetname, int index) { PyObject *v; v = PyDict_GetItem(jumpdict, targetname); if (v != NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %i: " "jump target already defined", index); v = PyInt_FromLong(index); if (v == NULL) goto onError; if (PyDict_SetItem(jumpdict, targetname, v)) goto onError; Py_DECREF(v); return 0; onError: return -1; } /* Convert a string command argument to either an 8-bit string or Unicode depending on the tabletype. */ static PyObject *tc_convert_string_arg(PyObject *arg, int tableposition, int tabletype) { /* Convert to strings */ if (tabletype == MXTAGTABLE_STRINGTYPE) { if (PyString_Check(arg)) return arg; #ifdef HAVE_UNICODE else if (PyUnicode_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_AsEncodedString(arg, NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %i: " "conversion from Unicode to " "string failed", tableposition); } #endif else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %i: " "command argument must be a " "string or unicode", tableposition); } #ifdef HAVE_UNICODE /* Convert to Unicode */ else if (tabletype == MXTAGTABLE_UNICODETYPE) { if (PyUnicode_Check(arg)) return arg; else if (PyString_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_Decode(PyString_AS_STRING(arg), PyString_GET_SIZE(arg), NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %i: " "conversion from string to " "Unicode failed", tableposition); } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %i: " "command argument must be a " "string or unicode", tableposition); } #endif else Py_Error(mxTextTools_Error, "unsupported table type"); return arg; onError: return NULL; } /* Cleanup any references in the tag table. 
*/

/* Releases the tagobj and args references of all ob_size entries of
   tagtable and resets the fields to NULL, so that calling the function
   more than once is harmless. Always returns 0. */

static
int tc_cleanup(mxTagTableObject *tagtable)
{
    int i;

    for (i = 0; i < tagtable->ob_size; i++) {
        mxTagTableEntry *tagtableentry = &tagtable->entry[i];

        Py_XDECREF(tagtableentry->tagobj);
        tagtableentry->tagobj = NULL;
        Py_XDECREF(tagtableentry->args);
        tagtableentry->args = NULL;
    }
    return 0;
}

/* Initialize the tag table (this is the actual Tag Table compiler)

   table is the tuple or list holding the definition with size
   entries; tagtable must provide room for that many entries.

   The first pass checks and stores all entries and records the jump
   marks; the second pass (only run if needed) replaces jump target
   names with relative offsets.

   Returns 0 on success, -1 with an exception set on failure. In the
   latter case the entries stored so far still hold their references;
   they are released by tc_cleanup().

*/

static
int init_tag_table(mxTagTableObject *tagtable,
                   PyObject *table,
                   int size,
                   int tabletype,
                   int cacheable)
{
    int i;
    PyObject *entry;
    int entry_len;
    PyObject *tagobj, *command, *args = 0, *je, *jne;
    PyObject *jumpdict, *v;
    PyObject *subtable;
    int secondpass, own_args = 0;

    /* Reset all fields to 0; this is done before anything can fail so
       that tc_cleanup() never sees uninitialized entries */
    memset(&tagtable->entry[0], 0, size * sizeof(mxTagTableEntry));

    jumpdict = PyDict_New();
    if (jumpdict == NULL)
        return -1;

    /* First pass */
    secondpass = 0;
    for (i = 0; i < size; i++) {
        mxTagTableEntry *tagtableentry = &tagtable->entry[i];

        /* Get table entry i and parse it */
        entry = tc_get_item(table, i);
        if (entry == NULL) {
            Py_ErrorWithArg(PyExc_TypeError,
                            "tag table entry %i: "
                            "not found or not a supported entry type", i);
        }

        /* Special handling for jump marks (args is set to the jump
           mark string, jump target index is the next table entry) */
        if (PyString_Check(entry)) {
            if (tc_add_jumptarget(jumpdict, entry, i + 1))
                goto onError;
            tagtableentry->tagobj = NULL;
            tagtableentry->cmd = MATCH_JUMPTARGET;
            tagtableentry->flags = 0;
            Py_INCREF(entry);
            tagtableentry->args = entry;
            tagtableentry->jne = 0;
            tagtableentry->je = 1;
            continue;
        }

        /* Get entry length */
        entry_len = tc_length(entry);
        if (entry_len < 3) {
            Py_ErrorWithArg(PyExc_TypeError,
                            "tag table entry %i: "
                            "expected an entry of the form "
                            "(tagobj,command,arg[,jne[,je]])", i);
        }

        /* Decode entry parts: (tagobj, command, args[, jne[, je]]) */
        tagobj = tc_get_item(entry, 0);
        command = tc_get_item(entry, 1);
        args = tc_get_item(entry, 2);
        if (entry_len >= 4)
            jne = tc_get_item(entry, 3);
        else
            jne = NULL;
        if (entry_len >= 5)
            je = tc_get_item(entry, 4);
        else
            je = NULL;

        if (tagobj == NULL ||
            command == NULL ||
            args == NULL ||
            (entry_len >= 4 && jne == NULL) ||
            (entry_len >= 5 && je == NULL)) {
            Py_ErrorWithArg(PyExc_TypeError,
                            "tag table entry %i: "
                            "expected an entry of the form "
                            "(tagobj,command,arg[,jne[,je]])", i);
        }

        /* Store tagobj, None gets converted to NULL */
        if (tagobj != Py_None)
            Py_INCREF(tagobj);
        else
            tagobj = NULL;
        tagtableentry->tagobj = tagobj;

        /* Decode command and flags: the low byte is the command, all
           higher bits are flags */
        Py_AssertWithArg(PyInt_Check(command),
                         PyExc_TypeError,
                         "tag table entry %i: "
                         "command must be an integer",i);
        tagtableentry->cmd = PyInt_AS_LONG(command) & 0xFF;
        tagtableentry->flags = PyInt_AS_LONG(command) - tagtableentry->cmd;

        /* Check command arguments; from here on we own a reference to
           args until it has been stored in the table entry */
        Py_INCREF(args);
        own_args = 1;

        switch (tagtableentry->cmd) {

        case MATCH_JUMP: /* == MATCH_FAIL */
        case MATCH_EOF:
        case MATCH_LOOP:
            /* args is ignored */
            break;

        case MATCH_SKIP:
        case MATCH_MOVE:
        case MATCH_LOOPCONTROL:
            Py_AssertWithArg(PyInt_Check(args),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "Skip|Move|LoopControl command argument "
                             "must be an integer", i);
            break;

        case MATCH_JUMPTARGET:
            Py_AssertWithArg(PyString_Check(args),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "JumpMark command argument must be a string",i);
            if (tc_add_jumptarget(jumpdict, args, i + 1))
                goto onError;
            break;

        case MATCH_ALLIN:
        case MATCH_ALLNOTIN:
        case MATCH_IS:
        case MATCH_ISIN:
        case MATCH_ISNOTIN:
        case MATCH_WORD:
        case MATCH_WORDSTART:
        case MATCH_WORDEND:
            /* args may be NULL after this call; the error handler
               below copes with that */
            args = tc_convert_string_arg(args, i, tabletype);
            if (args == NULL)
                goto onError;
            break;

        case MATCH_ALLINSET:
        case MATCH_ISINSET:
            Py_AssertWithArg(PyString_Check(args) &&
                             PyString_GET_SIZE(args) == 32,
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "AllInSet|IsInSet command argument must "
                             "be a set() string",i);
            break;

        case MATCH_ALLINCHARSET:
        case MATCH_ISINCHARSET:
            Py_AssertWithArg(mxCharSet_Check(args),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "AllInCharSet|IsInCharSet command argument must "
                             "be a CharSet instance",i);
            break;

        case MATCH_SWORDSTART: /* == MATCH_NOWORD */
        case MATCH_SWORDEND:
        case MATCH_SFINDWORD:
            Py_AssertWithArg(mxTextSearch_Check(args),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "sWordStart|sWordEnd|sFindWord command "
                             "argument must be a TextSearch search "
                             "object",i);
            break;

        case MATCH_TABLE:
        case MATCH_SUBTABLE:
            Py_AssertWithArg(mxTagTable_Check(args) ||
                             PyTuple_Check(args) ||
                             PyList_Check(args) ||
                             (PyInt_Check(args) &&
                              PyInt_AS_LONG(args) == MATCH_THISTABLE),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "Table|SubTable command argument "
                             "must be a tag table tuple/object or "
                             "ThisTable", i);
            /* XXX We shouldn't recursively compile tag table tuples here
                   because this will slow down the compile process
                   too much and it's not clear whether this particular
                   table will ever be used during tagging.
            */
            if (!mxTagTable_Check(args) && !PyInt_Check(args)) {
                /* Compile first, then release the definition */
                subtable = mxTagTable_New(args, tabletype, cacheable);
                Py_DECREF(args);
                args = subtable;
                if (args == NULL)
                    goto onError;
            }
            break;

        case MATCH_TABLEINLIST:
        case MATCH_SUBTABLEINLIST:
            Py_AssertWithArg(PyTuple_Check(args) &&
                             PyTuple_GET_SIZE(args) == 2 &&
                             PyList_Check(PyTuple_GET_ITEM(args, 0)) &&
                             PyInt_Check(PyTuple_GET_ITEM(args, 1)),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "TableInList|SubTableInList command argument "
                             "must be a 2-tuple (list, integer)", i);
            break;

        case MATCH_CALL:
            Py_AssertWithArg(PyCallable_Check(args),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "Call command argument "
                             "must be a callable object", i);
            break;

        case MATCH_CALLARG:
            Py_AssertWithArg(PyTuple_Check(args) &&
                             PyTuple_GET_SIZE(args) > 0 &&
                             PyCallable_Check(PyTuple_GET_ITEM(args, 0)),
                             PyExc_TypeError,
                             "tag table entry %i: "
                             "CallArg command argument "
                             "must be a tuple (fct,[arg0,arg1,...])", i);
            break;

        default:
            Py_ErrorWith2Args(PyExc_TypeError,
                              "tag table entry %i: "
                              "unknown command integer: %i",
                              i, tagtableentry->cmd);

        }

        /* Store command args; the table entry now owns the reference */
        tagtableentry->args = args;
        own_args = 0;

        /* Decode jump offsets */
        if (jne) {
            if (PyInt_Check(jne))
                tagtableentry->jne = PyInt_AS_LONG(jne);
            else if (PyString_Check(jne)) {
                /* Mark for back-patching */
                tagtableentry->jne = -424242;
                secondpass = 1;
            }
            else
                Py_ErrorWithArg(PyExc_TypeError,
                                "tag table entry %i: "
                                "jne must be an integer or string", i);
        }
        else
            tagtableentry->jne = 0;

        if (je) {
            if (PyInt_Check(je))
                tagtableentry->je = PyInt_AS_LONG(je);
            else if (PyString_Check(je)) {
                /* Mark for back-patching */
                tagtableentry->je = -424242;
                secondpass = 1;
            }
            else
                Py_ErrorWithArg(PyExc_TypeError,
                                "tag table entry %i: "
                                "je must be an integer or string", i);
        }
        else
            tagtableentry->je = 1;
    }

    /* Second pass (needed to patch string jump targets) */
    if (secondpass)
        for (i = 0; i < size; i++) {
            mxTagTableEntry *tagtableentry = &tagtable->entry[i];

            if (tagtableentry->je != -424242 &&
                tagtableentry->jne != -424242)
                continue;

            /* Entry (most probably) needs back-patching */
            entry = tc_get_item(table, i);
            if (entry == NULL) {
                Py_ErrorWithArg(PyExc_TypeError,
                                "tag table entry %i: "
                                "unexpected error (not found)", i);
            }

            /* Get entry length */
            entry_len = tc_length(entry);
            if (entry_len < 0) {
                Py_ErrorWithArg(PyExc_TypeError,
                                "tag table entry %i: "
                                "unexpected error (no length)", i);
            }

            /* Decode jump offsets */
            if (entry_len >= 4)
                jne = tc_get_item(entry, 3);
            else
                jne = NULL;
            if (entry_len >= 5)
                je = tc_get_item(entry, 4);
            else
                je = NULL;

            /* Patch jump offsets: the dictionary stores absolute
               indices, the table entries relative offsets */
            if (jne && PyString_Check(jne)) {
                v = PyDict_GetItem(jumpdict, jne);
                if (v == NULL || !PyInt_Check(v))
                    Py_ErrorWith2Args(PyExc_TypeError,
                                      "tag table entry %i: "
                                      "jne jump target '%s' not found", i,
                                      PyString_AS_STRING(jne));
                tagtableentry->jne = PyInt_AS_LONG(v) - i;
            }
            if (je && PyString_Check(je)) {
                v = PyDict_GetItem(jumpdict, je);
                if (v == NULL || !PyInt_Check(v))
                    Py_ErrorWith2Args(PyExc_TypeError,
                                      "tag table entry %i: "
                                      "je jump target '%s' not found", i,
                                      PyString_AS_STRING(je));
                tagtableentry->je = PyInt_AS_LONG(v) - i;
            }
        }

    Py_DECREF(jumpdict);
    return 0;

 onError:
    if (own_args) {
        /* args is NULL if the conversion of the argument failed */
        Py_XDECREF(args);
    }
    Py_DECREF(jumpdict);
    return -1;
}

/* Check the cache for an already compiled TagTable for this definition.
Return NULL in case of an error, Py_None without INCREF in case no such table was found or the TagTable object. */ static PyObject *consult_tagtable_cache(PyObject *definition, int tabletype, int cacheable) { PyObject *v, *key, *tt; if (!PyTuple_Check(definition) || !cacheable) return Py_None; key = PyTuple_New(2); if (key == NULL) goto onError; v = PyInt_FromLong((long) definition); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 0, v); v = PyInt_FromLong(tabletype); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 1, v); tt = PyDict_GetItem(mxTextTools_TagTables, key); Py_DECREF(key); if (tt != NULL) { Py_INCREF(tt); return tt; } return Py_None; onError: return NULL; } /* Adds the compiled tagtable to the cache. Returns -1 in case of an error, 0 on success. */ static int add_to_tagtable_cache(PyObject *definition, int tabletype, int cacheable, PyObject *tagtable) { PyObject *v, *key; int rc; if (!PyTuple_Check(definition) || !cacheable) return 0; key = PyTuple_New(2); if (key == NULL) goto onError; v = PyInt_FromLong((long) definition); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 0, v); v = PyInt_FromLong(tabletype); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 1, v); /* Hard-limit the cache size */ if (PyDict_Size(mxTextTools_TagTables) >= MAX_TAGTABLES_CACHE_SIZE) PyDict_Clear(mxTextTools_TagTables); rc = PyDict_SetItem(mxTextTools_TagTables, key, tagtable); Py_DECREF(key); if (rc) goto onError; return 0; onError: return -1; } /* allocation */ PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable) { mxTagTableObject *tagtable = 0; PyObject *v; int size; /* First, consult the TagTable cache */ v = consult_tagtable_cache(definition, tabletype, cacheable); if (v == NULL) goto onError; else if (v != Py_None) return v; size = tc_length(definition); if (size < 0) Py_Error(PyExc_TypeError, "tag table definition must be a tuple or a list"); tagtable = PyObject_NEW_VAR(mxTagTableObject, &mxTagTable_Type, size); if (tagtable == 
NULL) goto onError; if (cacheable) { Py_INCREF(definition); tagtable->definition = definition; } else tagtable->definition = NULL; tagtable->tabletype = tabletype; /* Compile table ... */ if (init_tag_table(tagtable, definition, size, tabletype, cacheable)) goto onError; /* Cache the compiled table if it is cacheable and derived from a tuple */ if (add_to_tagtable_cache(definition, tabletype, cacheable, (PyObject *)tagtable)) goto onError; return (PyObject *)tagtable; onError: Py_XDECREF(tagtable); return NULL; } Py_C_Function( mxTagTable_TagTable, "TagTable(definition[,cachable=1])\n\n" ) { PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:TagTable", definition, cacheable); return mxTagTable_New(definition, 0, cacheable); onError: return NULL; } #ifdef HAVE_UNICODE Py_C_Function( mxTagTable_UnicodeTagTable, "TagTable(definition[,cachable=1])\n\n" ) { PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:UnicodeTagTable", definition, cacheable); return mxTagTable_New(definition, 1, cacheable); onError: return NULL; } #endif static void mxTagTable_Free(mxTagTableObject *tagtable) { tc_cleanup(tagtable); Py_XDECREF(tagtable->definition); PyObject_Del(tagtable); } /* C APIs */ #define tagtable ((mxTagTableObject *)self) static PyObject *mxTagTable_CompiledDefinition(PyObject *self) { PyObject *tuple = 0, *v, *w; int i; int size; if (!mxTagTable_Check(self)) { PyErr_BadInternalCall(); goto onError; } size = tagtable->ob_size; tuple = PyTuple_New(size); if (tuple == NULL) goto onError; for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; /* Build tuple (tagobj, command, args, jne, je) */ v = PyTuple_New(5); if (v == NULL) goto onError; w = tagtableentry->tagobj; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 0, w); PyTuple_SET_ITEM(v, 1, PyInt_FromLong(tagtableentry->cmd | tagtableentry->flags)); w = tagtableentry->args; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 2, w); PyTuple_SET_ITEM(v, 3, 
PyInt_FromLong(tagtableentry->jne)); PyTuple_SET_ITEM(v, 4, PyInt_FromLong(tagtableentry->je)); if (PyErr_Occurred()) { Py_DECREF(v); goto onError; } PyTuple_SET_ITEM(tuple, i, v); } return tuple; onError: Py_XDECREF(tuple); return NULL; } /* methods */ Py_C_Function( mxTagTable_compiled, ".compiled()\n\n" ) { Py_NoArgsCheck(); return mxTagTable_CompiledDefinition(self); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTagTable_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(tagtable); return (PyObject *)tagtable; onError: return NULL; } #endif #undef tagtable /* --- slots --- */ static PyObject *mxTagTable_Repr(mxTagTableObject *self) { char t[100]; if (self->tabletype == MXTAGTABLE_STRINGTYPE) sprintf(t,"", (long)self); else if (self->tabletype == MXTAGTABLE_UNICODETYPE) sprintf(t,"", (long)self); else sprintf(t,"", (long)self); return PyString_FromString(t); } static PyObject *mxTagTable_GetAttr(mxTagTableObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"definition")) { v = self->definition; if (v == NULL) v = Py_None; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[s]", "definition"); return Py_FindMethod(mxTagTable_Methods, (PyObject *)self, (char *)name); } /* Python Type Tables */ PyTypeObject mxTagTable_Type = { PyObject_HEAD_INIT(0) /* init at startup ! 
*/ 0, /* ob_size */ "Tag Table", /* tp_name */ sizeof(mxTagTableObject), /* tp_basicsize */ sizeof(mxTagTableEntry), /* tp_itemsize */ /* methods */ (destructor)mxTagTable_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)mxTagTable_GetAttr, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ (cmpfunc)0, /* tp_compare */ (reprfunc)mxTagTable_Repr, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ }; /* Python Method Table */ statichere PyMethodDef mxTagTable_Methods[] = { Py_MethodListEntryNoArgs("compiled",mxTagTable_compiled), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTagTable_copy), Py_MethodListEntry("__copy__",mxTagTable_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Internal functions ----------------------------------------------*/ #ifdef HAVE_UNICODE /* Same as mxTextTools_Join() for Unicode objects. 
*/ static PyObject *mxTextTools_UnicodeJoin(PyObject *seq, int start, int stop, PyObject *separator) { PyObject *newstring = 0, *tempstr = 0; int newstring_len,current_len = 0; Py_UNICODE *p; int i; Py_UNICODE *sep; int sep_len; if (separator) { separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; sep = PyUnicode_AS_UNICODE(separator); sep_len = PyUnicode_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyUnicode_FromUnicode(NULL, newstring_len); if (newstring == NULL) goto onError; p = PyUnicode_AS_UNICODE(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; Py_UNICODE *st; int len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register int l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,l,r[,...])"); tempstr = PyUnicode_FromObject(PyTuple_GET_ITEM(o,0)); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? 
*/ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else { /* Must be a string entry: take the whole string */ tempstr = PyUnicode_FromObject(o); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (PyUnicode_Resize(&newstring, newstring_len)) goto onError; p = PyUnicode_AS_UNICODE(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { Py_UNICODE_COPY(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ Py_UNICODE_COPY(p, st, len_st); p += len_st; current_len += len_st; Py_DECREF(tempstr); tempstr = NULL; } /* Resize new string to the actual length */ if (PyUnicode_Resize(&newstring, current_len)) goto onError; Py_XDECREF(separator); return newstring; onError: Py_XDECREF(newstring); Py_XDECREF(separator); Py_XDECREF(tempstr); return NULL; } #endif /* Enhanced string join: also excepts tuple (text, left, right,...) entries which then cause text[left:right] to be used as string snippet. separator may be NULL; in that case, "" is used as separator. 
*/ static PyObject *mxTextTools_Join(PyObject *seq, int start, int stop, PyObject *separator) { PyObject *newstring = 0; int newstring_len, current_len = 0; char *p; int i; char *sep; int sep_len; if (separator) { #ifdef HAVE_UNICODE if (PyUnicode_Check(separator)) return mxTextTools_UnicodeJoin(seq, start, stop, separator); #endif Py_Assert(PyString_Check(separator), PyExc_TypeError, "separator must be a string"); sep = PyString_AS_STRING(separator); sep_len = PyString_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyString_FromStringAndSize((char*)NULL, newstring_len); if (newstring == NULL) goto onError; p = PyString_AS_STRING(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; char *st; int len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register int l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); #ifdef HAVE_UNICODE if (PyUnicode_Check(PyTuple_GET_ITEM(o,0))) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif Py_Assert(PyString_Check(PyTuple_GET_ITEM(o,0)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); st = PyString_AS_STRING(PyTuple_GET_ITEM(o,0)); len_st = PyString_GET_SIZE(PyTuple_GET_ITEM(o,0)); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? 
*/ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else if (PyString_Check(o)) { /* String entry: take the whole string */ st = PyString_AS_STRING(o); len_st = PyString_GET_SIZE(o); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(o)) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif else { Py_DECREF(o); Py_Error(PyExc_TypeError, "list must contain tuples or strings as entries"); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (_PyString_Resize(&newstring, newstring_len)) goto onError; p = PyString_AS_STRING(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { memcpy(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ memcpy(p,st,len_st); p += len_st; current_len += len_st; } /* Resize new string to the actual length */ if (_PyString_Resize(&newstring, current_len)) goto onError; return newstring; onError: Py_XDECREF(newstring); return NULL; } static PyObject *mxTextTools_HexStringFromString(char *str, int len) { PyObject *w = 0; int i; char *hex; static const char hexdigits[] = "0123456789abcdef"; /* Convert to HEX */ w = PyString_FromStringAndSize(NULL,2*len); if (!w) goto onError; hex = PyString_AS_STRING(w); for (i = 0; i < len; i ++) { unsigned char c = (unsigned char)*str; *hex++ = hexdigits[c >> 4]; *hex++ = hexdigits[c & 0x0F]; str++; } return w; onError: Py_XDECREF(w); return NULL; } static PyObject *mxTextTools_StringFromHexString(char *hex, int len) { PyObject *w = 0; int i; char *str; static const char hexdigits[] = "0123456789abcdef"; /* Convert to string */ Py_Assert(len % 2 == 0, PyExc_TypeError, "need 2-digit hex string argument"); len >>= 1; w = PyString_FromStringAndSize(NULL,len); if (!w) goto onError; str 
= PyString_AS_STRING(w); for (i = 0; i < len; i++,str++) { register char c; register int j; c = tolower(*hex++); for (j = 0; j < (int)sizeof(hexdigits); j++) if (c == hexdigits[j]) { *str = j << 4; break; } if (j == sizeof(hexdigits)) { DPRINTF("Failed: '%c' (%u) at %i\n",c,(unsigned int)c,i); Py_Error(PyExc_ValueError, "argument contains non-hex characters"); } c = tolower(*hex++); for (j = 0; j < (int)sizeof(hexdigits); j++) if (c == hexdigits[j]) { *str += j; break; } if (j == sizeof(hexdigits)) { DPRINTF("Failed2: '%c' (%u) at %i\n",c,(unsigned int)c,i); Py_Error(PyExc_ValueError, "argument contains non-hex characters"); } } return w; onError: Py_XDECREF(w); return NULL; } static int mxTextTools_IsASCII(PyObject *text, int left, int right) { if (PyString_Check(text)) { int len; register int i; register unsigned char *str = (unsigned char *)PyString_AS_STRING(text); len = PyString_GET_SIZE(text); Py_CheckSequenceSlice(len, left, right); for (i = left; i < right; i++) if (str[i] >= 128) return 0; return 1; } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { int len; register int i; register Py_UNICODE *str = PyUnicode_AS_UNICODE(text); len = PyUnicode_GET_SIZE(text); Py_CheckSequenceSlice(len, left, right); for (i = left; i < right; i++) if (str[i] >= 128) return 0; return 1; } #endif else Py_Error(PyExc_TypeError, "need string object"); onError: return -1; } /* Takes a list of tuples (replacement,l,r,...) and produces a taglist suitable for mxTextTools_Join() which creates a copy of text where every slice [l:r] is replaced by the given replacement. 
*/ static PyObject *mxTextTools_Joinlist(PyObject *text, PyObject *list, int pos, int text_len) { PyObject *joinlist = 0; int list_len; int i; int listitem = 0; int listsize = INITIAL_LIST_SIZE; if (PyString_Check(text)) { Py_CheckStringSlice(text, pos, text_len); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, pos, text_len); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyList_Check(list), PyExc_TypeError, "expected a list of tuples as second argument"); list_len = PyList_GET_SIZE(list); joinlist = PyList_New(listsize); if (joinlist == NULL) goto onError; for (i = 0; i < list_len; i++) { register PyObject *t; register int left, right; t = PyList_GET_ITEM(list, i); Py_Assert(PyTuple_Check(t) && (PyTuple_GET_SIZE(t) >= 3) && (PyString_Check(PyTuple_GET_ITEM(t,0)) || PyUnicode_Check(PyTuple_GET_ITEM(t,0))) && PyInt_Check(PyTuple_GET_ITEM(t,1)) && PyInt_Check(PyTuple_GET_ITEM(t,2)), PyExc_TypeError, "tuples must be of the form (string,int,int,...)"); left = PyInt_AS_LONG(PyTuple_GET_ITEM(t,1)); right = PyInt_AS_LONG(PyTuple_GET_ITEM(t,2)); Py_Assert(left >= pos, PyExc_ValueError, "list is not sorted ascending"); if (left > pos) { /* joinlist.append((text,pos,left)) */ register PyObject *v; register PyObject *w; v = PyTuple_New(3); if (v == NULL) goto onError; Py_INCREF(text); PyTuple_SET_ITEM(v,0,text); w = PyInt_FromLong(pos); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,1,w); w = PyTuple_GET_ITEM(t,1); Py_INCREF(w); PyTuple_SET_ITEM(v,2,w); if (listitem < listsize) PyList_SET_ITEM(joinlist,listitem,v); else { PyList_Append(joinlist,v); Py_DECREF(v); } listitem++; } /* joinlist.append(string) */ if (listitem < listsize) { register PyObject *v = PyTuple_GET_ITEM(t,0); Py_INCREF(v); PyList_SET_ITEM(joinlist,listitem,v); } else PyList_Append(joinlist,PyTuple_GET_ITEM(t,0)); listitem++; pos = right; } if (pos < text_len) { /* joinlist.append((text,pos,text_len)) */ register PyObject *v; register 
PyObject *w; v = PyTuple_New(3); if (v == NULL) goto onError; Py_INCREF(text); PyTuple_SET_ITEM(v,0,text); w = PyInt_FromLong(pos); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,1,w); w = PyInt_FromLong(text_len); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,2,w); if (listitem < listsize) PyList_SET_ITEM(joinlist,listitem,v); else { PyList_Append(joinlist,v); Py_DECREF(v); } listitem++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(joinlist,listitem,listsize,(PyObject*)NULL); return joinlist; onError: Py_XDECREF(joinlist); return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeCharSplit(PyObject *text, PyObject *separator, int start, int text_len) { PyObject *list = NULL; register int x; int listitem = 0; int listsize = INITIAL_LIST_SIZE; Py_UNICODE *tx; Py_UNICODE sep; text = PyUnicode_FromObject(text); if (text == NULL) { separator = NULL; goto onError; } separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; Py_CheckUnicodeSlice(text, start, text_len); Py_Assert(PyUnicode_GET_SIZE(separator) == 1, PyExc_TypeError, "separator must be a single character"); tx = PyUnicode_AS_UNICODE(text); sep = *PyUnicode_AS_UNICODE(separator); list = PyList_New(listsize); if (!list) goto onError; x = start; while (1) { PyObject *s; register int z; /* Skip to next separator */ z = x; for (;x < text_len; x++) if (tx[x] == sep) break; /* Append the slice to list */ s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x == text_len) break; /* Skip separator */ x++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL); Py_DECREF(text); Py_DECREF(separator); return list; onError: Py_XDECREF(list); Py_XDECREF(text); Py_XDECREF(separator); return NULL; } #endif static PyObject *mxTextTools_CharSplit(PyObject *text, PyObject 
*separator, int start, int text_len) { PyObject *list = 0; register int x; int listitem = 0; int listsize = INITIAL_LIST_SIZE; char *tx; char sep; #ifdef HAVE_UNICODE if (PyUnicode_Check(text) || PyUnicode_Check(separator)) return mxTextTools_UnicodeCharSplit(text, separator, start, text_len); #endif if (PyString_Check(text) && PyString_Check(separator)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "text and separator must be strings or unicode"); Py_Assert(PyString_GET_SIZE(separator) == 1, PyExc_TypeError, "separator must be a single character"); tx = PyString_AS_STRING(text); sep = *PyString_AS_STRING(separator); list = PyList_New(listsize); if (!list) goto onError; x = start; while (1) { PyObject *s; register int z; /* Skip to next separator */ z = x; for (;x < text_len; x++) if (tx[x] == sep) break; /* Append the slice to list */ s = PyString_FromStringAndSize(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x == text_len) break; /* Skip separator */ x++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeSplitAt(PyObject *text, PyObject *separator, int nth, int start, int text_len) { PyObject *tuple = 0; register int x; PyObject *s; Py_UNICODE *tx; Py_UNICODE sep; text = PyUnicode_FromObject(text); if (text == NULL) { separator = NULL; goto onError; } separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; Py_CheckUnicodeSlice(text, start, text_len); Py_Assert(PyUnicode_GET_SIZE(separator) == 1, PyExc_TypeError, "separator must be a single character"); tx = PyUnicode_AS_UNICODE(text); sep = *PyUnicode_AS_UNICODE(separator); tuple = PyTuple_New(2); if (!tuple) goto onError; if (nth > 0) { /* Skip to nth separator from the 
left */ x = start; while (1) { for (; x < text_len; x++) if (tx[x] == sep) break; if (--nth == 0 || x == text_len) break; x++; } } else if (nth < 0) { /* Skip to nth separator from the right */ x = text_len - 1; while (1) { for (; x >= start; x--) if (tx[x] == sep) break; if (++nth == 0 || x < start) break; x--; } } else Py_Error(PyExc_ValueError, "nth must be non-zero"); /* Add to tuple */ if (x < start) s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0); else s = PyUnicode_FromUnicode(&tx[start], x - start); if (!s) goto onError; PyTuple_SET_ITEM(tuple,0,s); /* Skip separator */ x++; if (x >= text_len) s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0); else s = PyUnicode_FromUnicode(&tx[x], text_len - x); if (!s) goto onError; PyTuple_SET_ITEM(tuple,1,s); Py_DECREF(text); Py_DECREF(separator); return tuple; onError: Py_XDECREF(tuple); Py_XDECREF(text); Py_XDECREF(separator); return NULL; } #endif static PyObject *mxTextTools_SplitAt(PyObject *text, PyObject *separator, int nth, int start, int text_len) { PyObject *tuple = 0; register int x; PyObject *s; char *tx; char sep; #ifdef HAVE_UNICODE if (PyUnicode_Check(text) || PyUnicode_Check(separator)) return mxTextTools_UnicodeSplitAt(text, separator, nth, start, text_len); #endif if (PyString_Check(text) && PyString_Check(separator)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "text and separator must be strings or unicode"); Py_Assert(PyString_GET_SIZE(separator) == 1, PyExc_TypeError, "separator must be a single character"); tx = PyString_AS_STRING(text); sep = *PyString_AS_STRING(separator); tuple = PyTuple_New(2); if (!tuple) goto onError; if (nth > 0) { /* Skip to nth separator from the left */ x = start; while (1) { for (; x < text_len; x++) if (tx[x] == sep) break; if (--nth == 0 || x == text_len) break; x++; } } else if (nth < 0) { /* Skip to nth separator from the right */ x = text_len - 1; while (1) { for (; x >= start; x--) if (tx[x] == sep) break; if (++nth == 0 || x < start) 
break; x--; } } else Py_Error(PyExc_ValueError, "nth must be non-zero"); /* Add to tuple */ if (x < start) s = PyString_FromStringAndSize("",0); else s = PyString_FromStringAndSize(&tx[start], x - start); if (!s) goto onError; PyTuple_SET_ITEM(tuple,0,s); /* Skip separator */ x++; if (x >= text_len) s = PyString_FromStringAndSize("",0); else s = PyString_FromStringAndSize(&tx[x], text_len - x); if (!s) goto onError; PyTuple_SET_ITEM(tuple,1,s); return tuple; onError: Py_XDECREF(tuple); return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeSuffix(PyObject *text, PyObject *suffixes, int start, int text_len, PyObject *translate) { int i; Py_UNICODE *tx; text = PyUnicode_FromObject(text); if (text == NULL) goto onError; if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected unicode"); Py_Assert(PyTuple_Check(suffixes), PyExc_TypeError, "suffixes needs to be a tuple of unicode strings"); /* XXX Add support for translate... 
*/ Py_Assert(translate == NULL, PyExc_TypeError, "translate is not supported for Unicode suffix()es"); tx = PyUnicode_AS_UNICODE(text); for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) { PyObject *suffix = PyTuple_GET_ITEM(suffixes,i); int start_cmp; suffix = PyUnicode_FromObject(suffix); if (suffix == NULL) goto onError; start_cmp = text_len - PyUnicode_GET_SIZE(suffix); if (start_cmp >= start && PyUnicode_AS_UNICODE(suffix)[0] == tx[start_cmp] && memcmp(PyUnicode_AS_UNICODE(suffix), &tx[start_cmp], PyUnicode_GET_DATA_SIZE(suffix)) == 0) { Py_DECREF(text); return suffix; } Py_DECREF(suffix); } Py_DECREF(text); Py_ReturnNone(); onError: Py_XDECREF(text); return NULL; } #endif static PyObject *mxTextTools_Suffix(PyObject *text, PyObject *suffixes, int start, int text_len, PyObject *translate) { int i; char *tx; #ifdef HAVE_UNICODE if (PyUnicode_Check(text)) return mxTextTools_UnicodeSuffix(text, suffixes, start, text_len, translate); #endif if (PyString_Check(text)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyTuple_Check(suffixes), PyExc_TypeError, "suffixes needs to be a tuple of strings"); tx = PyString_AS_STRING(text); if (translate) { char *tr; Py_Assert(PyString_Check(translate) && PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate must be a string having 256 characters"); tr = PyString_AS_STRING(translate); for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) { PyObject *suffix = PyTuple_GET_ITEM(suffixes, i); int start_cmp; register char *s; register char *t; register int j; Py_AssertWithArg(PyString_Check(suffix), PyExc_TypeError, "tuple entry %i is not a string",i); start_cmp = text_len - PyString_GET_SIZE(suffix); if (start_cmp < start) continue; /* Do the compare using a translate table */ s = PyString_AS_STRING(suffix); t = tx + start_cmp; for (j = start_cmp; j < text_len; j++, s++, t++) if (*s != tr[(unsigned char)*t]) break; if (j == text_len) { Py_INCREF(suffix); 
return suffix; } } } else for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) { PyObject *suffix = PyTuple_GET_ITEM(suffixes,i); int start_cmp; Py_AssertWithArg(PyString_Check(suffix), PyExc_TypeError, "tuple entry %i is not a string",i); start_cmp = text_len - PyString_GET_SIZE(suffix); if (start_cmp < start) continue; /* Compare without translate table */ if (PyString_AS_STRING(suffix)[0] == tx[start_cmp] && strncmp(PyString_AS_STRING(suffix), &tx[start_cmp], PyString_GET_SIZE(suffix)) == 0) { Py_INCREF(suffix); return suffix; } } Py_ReturnNone(); onError: return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodePrefix(PyObject *text, PyObject *prefixes, int start, int text_len, PyObject *translate) { int i; Py_UNICODE *tx; text = PyUnicode_FromObject(text); if (text == NULL) goto onError; if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected unicode"); Py_Assert(PyTuple_Check(prefixes), PyExc_TypeError, "prefixes needs to be a tuple of unicode strings"); /* XXX Add support for translate... 
*/ Py_Assert(translate == NULL, PyExc_TypeError, "translate is not supported for Unicode prefix()es"); tx = PyUnicode_AS_UNICODE(text); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); prefix = PyUnicode_FromObject(prefix); if (prefix == NULL) goto onError; /* Compare without translate table */ if (start + PyString_GET_SIZE(prefix) <= text_len && PyUnicode_AS_UNICODE(prefix)[0] == tx[start] && memcmp(PyUnicode_AS_UNICODE(prefix), &tx[start], PyUnicode_GET_DATA_SIZE(prefix)) == 0) { Py_INCREF(prefix); return prefix; } Py_DECREF(prefix); } Py_DECREF(text); Py_ReturnNone(); onError: Py_XDECREF(text); return NULL; } #endif static PyObject *mxTextTools_Prefix(PyObject *text, PyObject *prefixes, int start, int text_len, PyObject *translate) { int i; char *tx; #ifdef HAVE_UNICODE if (PyUnicode_Check(text)) return mxTextTools_UnicodePrefix(text, prefixes, start, text_len, translate); #endif if (PyString_Check(text)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyTuple_Check(prefixes), PyExc_TypeError, "prefixes needs to be a tuple of strings"); tx = PyString_AS_STRING(text); if (translate) { char *tr; Py_Assert(PyString_Check(translate) && PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate must be a string having 256 characters"); tr = PyString_AS_STRING(translate); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); int cmp_len; register char *s; register char *t; register int j; Py_AssertWithArg(PyString_Check(prefix), PyExc_TypeError, "tuple entry %i is not a string",i); cmp_len = PyString_GET_SIZE(prefix); if (start + cmp_len > text_len) continue; /* Do the compare using a translate table */ s = PyString_AS_STRING(prefix); t = tx + start; for (j = 0; j < cmp_len; j++, s++, t++) if (*s != tr[(unsigned char)*t]) break; if (j == cmp_len) { Py_INCREF(prefix); return prefix; } } } else for 
(i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); Py_AssertWithArg(PyString_Check(prefix), PyExc_TypeError, "tuple entry %i is not a string",i); if (start + PyString_GET_SIZE(prefix) > text_len) continue; /* Compare without translate table */ if (PyString_AS_STRING(prefix)[0] == tx[start] && strncmp(PyString_AS_STRING(prefix), &tx[start], PyString_GET_SIZE(prefix)) == 0) { Py_INCREF(prefix); return prefix; } } Py_ReturnNone(); onError: return NULL; } /* Stips off characters appearing in the character set from text[start:stop] and returns the result as Python string object. where indicates the mode: where < 0: strip left only where = 0: strip left and right where > 0: strip right only */ static PyObject *mxTextTools_SetStrip(char *tx, int tx_len, char *setstr, int setstr_len, int start, int stop, int where) { int left, right; Py_Assert(setstr_len == 32, PyExc_TypeError, "separator needs to be a set as obtained from set()"); Py_CheckBufferSlice(tx_len, start, stop); /* Strip left */ if (where <= 0) { register int x; for (x = start; x < stop; x++) if (!Py_CharInSet(tx[x], setstr)) break; left = x; } else left = start; /* Strip right */ if (where >= 0) { register int x; for (x = stop - 1; x >= start; x--) if (!Py_CharInSet(tx[x], setstr)) break; right = x + 1; } else right = stop; return PyString_FromStringAndSize(tx + left, max(right - left, 0)); onError: return NULL; } static PyObject *mxTextTools_SetSplit(char *tx, int tx_len, char *setstr, int setstr_len, int start, int text_len) { PyObject *list = NULL; register int x; int listitem = 0; int listsize = INITIAL_LIST_SIZE; Py_Assert(setstr_len == 32, PyExc_TypeError, "separator needs to be a set as obtained from set()"); Py_CheckBufferSlice(tx_len,start,text_len); list = PyList_New(listsize); if (!list) goto onError; x = start; while (x < text_len) { int z; /* Skip all text in set */ for (;x < text_len; x++) { register unsigned int c = (unsigned char)tx[x]; register 
unsigned int block = (unsigned char)setstr[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } /* Skip all text not in set */ z = x; for (;x < text_len; x++) { register unsigned int c = (unsigned char)tx[x]; register unsigned int block = (unsigned char)setstr[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } /* Append the slice to list if it is not empty */ if (x > z) { PyObject *s; s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } static PyObject *mxTextTools_SetSplitX(char *tx, int tx_len, char *setstr, int setstr_len, int start, int text_len) { PyObject *list = NULL; register int x; int listitem = 0; int listsize = INITIAL_LIST_SIZE; Py_Assert(setstr_len == 32, PyExc_TypeError, "separator needs to be a set as obtained from set()"); Py_CheckBufferSlice(tx_len,start,text_len); list = PyList_New(listsize); if (!list) goto onError; x = start; while (x < text_len) { PyObject *s; register int z; /* Skip all text not in set */ z = x; for (;x < text_len; x++) { register unsigned int c = (unsigned char)tx[x]; register unsigned int block = (unsigned char)setstr[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } /* Append the slice to list */ s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; /* Skip all text in set */ z = x; for (;x < text_len; x++) { register unsigned int c = (unsigned char)tx[x]; register unsigned int block = (unsigned char)setstr[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } /* Append the slice to list if it is not 
empty */ s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } static PyObject *mxTextTools_Upper(PyObject *text) { PyObject *ntext; register unsigned char *s; register unsigned char *orig; register int i; unsigned char *tr; int len; Py_Assert(PyString_Check(text), PyExc_TypeError, "expected a Python string"); len = PyString_GET_SIZE(text); ntext = PyString_FromStringAndSize(NULL,len); if (!ntext) goto onError; /* Translate */ tr = (unsigned char *)PyString_AS_STRING(mx_ToUpper); orig = (unsigned char *)PyString_AS_STRING(text); s = (unsigned char *)PyString_AS_STRING(ntext); for (i = 0; i < len; i++, s++, orig++) *s = tr[*orig]; return ntext; onError: return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeUpper(PyObject *text) { PyObject *ntext; register Py_UNICODE *s; register Py_UNICODE *orig; register int i; int len; text = PyUnicode_FromObject(text); if (text == NULL) goto onError; len = PyUnicode_GET_SIZE(text); ntext = PyUnicode_FromUnicode(NULL, len); if (!ntext) goto onError; /* Translate */ orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text); s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext); for (i = 0; i < len; i++, s++, orig++) *s = Py_UNICODE_TOUPPER(*orig); Py_DECREF(text); return ntext; onError: Py_XDECREF(text); return NULL; } #endif static PyObject *mxTextTools_Lower(PyObject *text) { PyObject *ntext; register unsigned char *s; register unsigned char *orig; register int i; unsigned char *tr; int len; Py_Assert(PyString_Check(text), PyExc_TypeError, "expected a Python string"); len = PyString_GET_SIZE(text); ntext = PyString_FromStringAndSize(NULL,len); if (!ntext) goto onError; /* Translate */ tr = (unsigned char 
*)PyString_AS_STRING(mx_ToLower); orig = (unsigned char *)PyString_AS_STRING(text); s = (unsigned char *)PyString_AS_STRING(ntext); for (i = 0; i < len; i++, s++, orig++) *s = tr[*orig]; return ntext; onError: return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeLower(PyObject *text) { PyObject *ntext; register Py_UNICODE *s; register Py_UNICODE *orig; register int i; int len; text = PyUnicode_FromObject(text); if (text == NULL) goto onError; len = PyUnicode_GET_SIZE(text); ntext = PyUnicode_FromUnicode(NULL, len); if (!ntext) goto onError; /* Translate */ orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text); s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext); for (i = 0; i < len; i++, s++, orig++) *s = Py_UNICODE_TOLOWER(*orig); Py_DECREF(text); return ntext; onError: Py_XDECREF(text); return NULL; } #endif /* --- Module functions ------------------------------------------------*/ /* Interface to the tagging engine in mxte.c */ Py_C_Function_WithKeywords( mxTextTools_tag, "tag(text,tagtable,sliceleft=0,sliceright=len(text),taglist=[],context=None) \n""" "Produce a tag list for a string, given a tag-table\n" "- returns a tuple (success, taglist, nextindex)\n" "- if taglist == None, then no taglist is created" ) { PyObject *text; PyObject *tagtable; int sliceright = INT_MAX; int sliceleft = 0; PyObject *taglist = 0; int taglist_len; PyObject *context = 0; int next, result; PyObject *res; Py_KeywordsGet6Args("OO|iiOO:tag", text,tagtable,sliceleft,sliceright,taglist,context); if (taglist == NULL) { /* not given, so use default: an empty list */ taglist = PyList_New(0); if (taglist == NULL) goto onError; taglist_len = 0; } else { Py_INCREF(taglist); Py_Assert(PyList_Check(taglist) || taglist == Py_None, PyExc_TypeError, "taglist must be a list or None"); if (taglist != Py_None) { taglist_len = PyList_Size(taglist); if (taglist_len < 0) goto onError; } else taglist_len = 0; } Py_Assert(mxTagTable_Check(tagtable) || PyTuple_Check(tagtable) || 
PyList_Check(tagtable), PyExc_TypeError, "tagtable must be a TagTable instance, list or tuple"); /* Prepare the argument for the Tagging Engine and let it process the request */ if (PyString_Check(text)) { Py_CheckStringSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, MXTAGTABLE_STRINGTYPE, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_STRINGTYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing strings"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_TaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, 1, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_UNICODETYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing Unicode"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_UnicodeTaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #endif else Py_Error(PyExc_TypeError, "text must be a string or unicode"); /* Check for exceptions during matching */ if (result == 0) goto onError; /* Undo changes to taglist in case of a match failure (result == 1) */ if (result == 1 && taglist != Py_None) { DPRINTF(" undoing changes: del taglist[%i:%i]\n", taglist_len, PyList_Size(taglist)); if (PyList_SetSlice(taglist, taglist_len, PyList_Size(taglist), NULL)) goto onError; } /* Convert result to the documented external values: 0 - no match, 1 - match. 
*/ result--; /* Build result tuple */ res = PyTuple_New(3); if (!res) goto onError; PyTuple_SET_ITEM(res,0,PyInt_FromLong(result)); PyTuple_SET_ITEM(res,1,taglist); PyTuple_SET_ITEM(res,2,PyInt_FromLong(next)); return res; onError: if (!PyErr_Occurred()) Py_Error(PyExc_SystemError, "NULL result without error in builtin tag()"); Py_XDECREF(taglist); return NULL; } /* An extended version of string.join() for taglists: */ Py_C_Function( mxTextTools_join, "join(joinlist,sep='',start=0,stop=len(joinlist))\n\n" "Copy snippets from different strings together producing a\n" "new string\n" "The first argument must be a list of tuples or strings;\n" "tuples must be of the form (string,l,r[,...]) and turn out\n" "as string[l:r]\n" "NOTE: the syntax used for negative slices is different\n" "than the Python standard: -1 corresponds to the first\n" "character *after* the string, e.g. ('Example',0,-1) gives\n" "'Example' and not 'Exampl', like in Python\n" "sep is an optional separator string, start and stop\n" "define the slice of joinlist that is taken into accont." 
) { PyObject *joinlist = NULL; int joinlist_len; PyObject *separator = NULL; int start=0, stop=INT_MAX; Py_Get4Args("O|Oii:join", joinlist,separator,start,stop); Py_Assert(PySequence_Check(joinlist), PyExc_TypeError, "first argument needs to be a sequence"); joinlist_len = PySequence_Length(joinlist); Py_Assert(joinlist_len >= 0, PyExc_TypeError, "first argument needs to have a __len__ method"); Py_CheckSequenceSlice(joinlist_len, start, stop); /* Short-cut */ if ((stop - start) <= 0) return PyString_FromString(""); return mxTextTools_Join(joinlist, start, stop, separator); onError: return NULL; } /* Special compare function for taglist-tuples, comparing the text-slices given: - slices starting at a smaller index come first - for slices starting at the same index, the longer one wins */ Py_C_Function( mxTextTools_cmp, "cmp(a,b)\n\n" "Compare two valid taglist tuples w/r to their slice\n" "position; this is useful for sorting joinlists.") { PyObject *v,*w; int cmp; Py_Get2Args("OO:cmp",v,w); Py_Assert(PyTuple_Check(v) && PyTuple_Check(w) && PyTuple_GET_SIZE(v) >= 3 && PyTuple_GET_SIZE(w) >= 3, PyExc_TypeError, "invalid taglist-tuple"); cmp = PyObject_Compare(PyTuple_GET_ITEM(v,1),PyTuple_GET_ITEM(w,1)); if (cmp != 0) return PyInt_FromLong(cmp); cmp = - PyObject_Compare(PyTuple_GET_ITEM(v,2),PyTuple_GET_ITEM(w,2)); return PyInt_FromLong(cmp); onError: return NULL; } Py_C_Function( mxTextTools_joinlist, "joinlist(text,list,start=0,stop=len(text))\n\n" "Takes a list of tuples (replacement,l,r,...) 
and produces\n" "a taglist suitable for join() which creates a copy\n" "of text where every slice [l:r] is replaced by the\n" "given replacement\n" "- the list must be sorted using cmp() as compare function\n" "- it may not contain overlapping slices\n" "- the slices may not contain negative indices\n" "- if the taglist cannot contain overlapping slices, you can\n" " give this function the taglist produced by tag() directly\n" " (sorting is not needed, as the list will already be sorted)\n" "- start and stop set the slice to work in, i.e. text[start:stop]" ) { PyObject *list; PyObject *text; int text_len = INT_MAX; int pos = 0; Py_Get4Args("OO|ii:joinlist",text,list,pos,text_len); return mxTextTools_Joinlist(text, list, pos, text_len); onError: return NULL; } Py_C_Function( mxTextTools_charsplit, "charsplit(text,char,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings at char and\n" "return the result as list of strings." ) { PyObject *text, *separator; int text_len = INT_MAX; int start = 0; Py_Get4Args("OO|ii:charsplit", text,separator,start,text_len); return mxTextTools_CharSplit(text, separator, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_splitat, "splitat(text,char,nth=1,start=0,stop=len(text))\n\n" "Split text[start:stop] into two substrings at the nth\n" "occurance of char and return the result as 2-tuple. If the\n" "character is not found, the second string is empty. nth may\n" "be negative: the search is then done from the right and the\n" "first string is empty in case the character is not found." 
) { PyObject *text, *separator; int text_len = INT_MAX; int start = 0; int nth = 1; Py_Get5Args("OO|iii:splitat", text,separator,nth,start,text_len); return mxTextTools_SplitAt(text, separator, nth, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_suffix, "suffix(text,suffixes,start=0,stop=len(text)[,translate])\n\n" "Looks at text[start:stop] and returns the first matching\n" "suffix out of the tuple of strings given in suffixes.\n" "If no suffix is found to be matching, None is returned.\n" "The optional 256 char translate string is used to translate\n" "the text prior to comparing it with the given suffixes." ) { PyObject *text, *suffixes, *translate = NULL; int text_len = INT_MAX; int start = 0; Py_Get5Args("OO|iiO:suffix", text,suffixes,start,text_len,translate); return mxTextTools_Suffix(text, suffixes, start, text_len, translate); onError: return NULL; } Py_C_Function( mxTextTools_prefix, "prefix(text,prefixes,start=0,stop=len(text)[,translate])\n\n" "Looks at text[start:stop] and returns the first matching\n" "prefix out of the tuple of strings given in prefixes.\n" "If no prefix is found to be matching, None is returned.\n" "The optional 256 char translate string is used to translate\n" "the text prior to comparing it with the given suffixes." 
) { PyObject *text, *prefixes, *translate = NULL; int text_len = INT_MAX; int start = 0; Py_Get5Args("OO|iiO:prefix", text,prefixes,start,text_len,translate); return mxTextTools_Prefix(text, prefixes, start, text_len, translate); onError: return NULL; } Py_C_Function( mxTextTools_set, "set(string,logic=1)\n\n" "Returns a character set for string: a bit encoded version\n" "of the characters occurring in string.\n" "- logic can be set to 0 if all characters *not* in string\n" " should go into the set") { PyObject *sto; char *s,*st; int len_s; int logic = 1; int i; Py_Get3Args("s#|i:set", s,len_s,logic); sto = PyString_FromStringAndSize(NULL,32); if (sto == NULL) goto onError; st = PyString_AS_STRING(sto); if (logic) { memset(st,0x00,32); for (i = 0; i < len_s; i++,s++) { int j = (unsigned char)*s; st[j >> 3] |= 1 << (j & 7); } } else { memset(st,0xFF,32); for (i = 0; i < len_s; i++,s++) { int j = (unsigned char)*s; st[j >> 3] &= ~(1 << (j & 7)); } } return sto; onError: return NULL; } Py_C_Function( mxTextTools_setfind, "setfind(text,set,start=0,stop=len(text))\n\n" "Find the first occurence of any character from set in\n" "text[start:stop]\n set must be a string obtained with set()\n" "DEPRECATED: use CharSet().search() instead." 
) { PyObject *text; PyObject *set; int text_len = INT_MAX; int start = 0; register int x; register char *tx; register unsigned char *setstr; Py_Get4Args("OO|ii:setfind",text,set,start,text_len); Py_Assert(PyString_Check(text), PyExc_TypeError, "first argument needs to be a string"); Py_Assert(PyString_Check(set) && PyString_GET_SIZE(set) == 32, PyExc_TypeError, "second argument needs to be a set"); Py_CheckStringSlice(text,start,text_len); x = start; tx = PyString_AS_STRING(text) + x; setstr = (unsigned char *)PyString_AS_STRING(set); for (;x < text_len; tx++, x++) if (Py_CharInSet(*tx,setstr)) break; if (x == text_len) /* Not found */ return PyInt_FromLong(-1L); else return PyInt_FromLong(x); onError: return NULL; } Py_C_Function( mxTextTools_setstrip, "setstrip(text,set,start=0,stop=len(text),mode=0)\n\n" "Strip all characters in text[start:stop] appearing in set.\n" "mode indicates where to strip (<0: left; =0: left and right;\n" ">0: right). set must be a string obtained with set()\n" "DEPRECATED: use CharSet().strip() instead." ) { char *tx; int tx_len; char *setstr; int setstr_len; int start = 0; int stop = INT_MAX; int mode = 0; Py_Get7Args("s#s#|iii:setstip", tx,tx_len,setstr,setstr_len,start,stop,mode); return mxTextTools_SetStrip(tx, tx_len, setstr, setstr_len, start, stop, mode); onError: return NULL; } Py_C_Function( mxTextTools_setsplit, "setsplit(text,set,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings using set,\n" "omitting the splitting parts and empty substrings.\n" "set must be a string obtained from set()\n" "DEPRECATED: use CharSet().split() instead." 
) { char *tx; int tx_len; char *setstr; int setstr_len; int start = 0; int stop = INT_MAX; Py_Get6Args("s#s#|ii:setsplit", tx,tx_len,setstr,setstr_len,start,stop); return mxTextTools_SetSplit(tx, tx_len, setstr, setstr_len, start, stop); onError: return NULL; } Py_C_Function( mxTextTools_setsplitx, "setsplitx(text,set,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings using set, so\n" "that every second entry consists only of characters in set.\n" "set must be a string obtained with set()\n" "DEPRECATED: use CharSet().splitx() instead." ) { int text_len = INT_MAX; int start = 0; char *tx; int tx_len; char *setstr; int setstr_len; Py_Get6Args("s#s#|ii:setsplitx", tx,tx_len,setstr,setstr_len,start,text_len); return mxTextTools_SetSplitX(tx, tx_len, setstr, setstr_len, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_upper, "upper(text)\n\n" "Return text converted to upper case.") { PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Upper(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeUpper(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_lower, "lower(text)\n\n" "Return text converted to lower case.") { PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Lower(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeLower(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_str2hex, "str2hex(text)\n\n" "Return text converted to a string consisting of two byte\n" "HEX values.") { char *str; int len; Py_Get2Args("s#",str,len); return mxTextTools_HexStringFromString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_hex2str, "hex2str(text)\n\n" "Return text interpreted as two byte HEX values converted\n" "to a string.") { char *str; int len; 
Py_Get2Args("s#",str,len); return mxTextTools_StringFromHexString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_isascii, "isascii(text,start=0,stop=len(text))\n\n" "Return 1/0 depending on whether text only contains ASCII\n" "characters." ) { PyObject *text; int start=0, stop = INT_MAX; int rc; Py_GetArgObject(text); rc = mxTextTools_IsASCII(text, start, stop); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } /* --- module init --------------------------------------------------------- */ /* Python Method Table */ static PyMethodDef Module_methods[] = { Py_MethodWithKeywordsListEntry("tag",mxTextTools_tag), Py_MethodListEntry("join",mxTextTools_join), Py_MethodListEntry("cmp",mxTextTools_cmp), Py_MethodListEntry("joinlist",mxTextTools_joinlist), Py_MethodListEntry("set",mxTextTools_set), Py_MethodListEntry("setfind",mxTextTools_setfind), Py_MethodListEntry("setsplit",mxTextTools_setsplit), Py_MethodListEntry("setsplitx",mxTextTools_setsplitx), Py_MethodListEntry("setstrip",mxTextTools_setstrip), Py_MethodWithKeywordsListEntry("TextSearch",mxTextSearch_TextSearch), Py_MethodListEntry("CharSet",mxCharSet_CharSet), Py_MethodListEntry("TagTable",mxTagTable_TagTable), #ifdef HAVE_UNICODE Py_MethodListEntry("UnicodeTagTable",mxTagTable_UnicodeTagTable), #endif Py_MethodListEntrySingleArg("upper",mxTextTools_upper), Py_MethodListEntrySingleArg("lower",mxTextTools_lower), Py_MethodListEntry("charsplit",mxTextTools_charsplit), Py_MethodListEntry("splitat",mxTextTools_splitat), Py_MethodListEntry("suffix",mxTextTools_suffix), Py_MethodListEntry("prefix",mxTextTools_prefix), Py_MethodListEntry("hex2str",mxTextTools_hex2str), Py_MethodListEntry("str2hex",mxTextTools_str2hex), Py_MethodListEntrySingleArg("isascii",mxTextTools_isascii), {NULL,NULL} /* end of list */ }; /* Cleanup function */ static void mxTextToolsModule_Cleanup(void) { mxTextTools_TagTables = NULL; /* Reset mxTextTools_Initialized flag */ mxTextTools_Initialized = 0; } 
MX_EXPORT(void) initmxTextTools(void) { PyObject *module, *moddict; if (mxTextTools_Initialized) Py_Error(PyExc_SystemError, "can't initialize "MXTEXTTOOLS_MODULE" more than once"); /* Init type objects */ PyType_Init(mxTextSearch_Type); #ifdef MXFASTSEARCH PyType_Init(mxFS_Type); #endif PyType_Init(mxCharSet_Type); PyType_Init(mxTagTable_Type); /* create module */ module = Py_InitModule4(MXTEXTTOOLS_MODULE, /* Module name */ Module_methods, /* Method list */ Module_docstring, /* Module doc-string */ (PyObject *)NULL, /* always pass this as *self */ PYTHON_API_VERSION); /* API Version */ if (!module) goto onError; /* Init TagTable cache */ if ((mxTextTools_TagTables = PyDict_New()) == NULL) goto onError; /* Register cleanup function */ if (Py_AtExit(mxTextToolsModule_Cleanup)) /* XXX what to do if we can't register that function ??? */; /* Add some symbolic constants to the module */ moddict = PyModule_GetDict(module); PyDict_SetItemString(moddict, "__version__", PyString_FromString(VERSION)); mx_ToUpper = mxTextTools_ToUpper(); PyDict_SetItemString(moddict, "to_upper", mx_ToUpper); mx_ToLower = mxTextTools_ToLower(); PyDict_SetItemString(moddict, "to_lower", mx_ToLower); /* Let the tag table cache live in the module dictionary; we just keep a weak reference in mxTextTools_TagTables around. 
*/ PyDict_SetItemString(moddict, "tagtable_cache", mxTextTools_TagTables); Py_DECREF(mxTextTools_TagTables); insint(moddict, "BOYERMOORE", MXTEXTSEARCH_BOYERMOORE); insint(moddict, "FASTSEARCH", MXTEXTSEARCH_FASTSEARCH); insint(moddict, "TRIVIAL", MXTEXTSEARCH_TRIVIAL); /* Init exceptions */ if ((mxTextTools_Error = insexc(moddict, "Error", PyExc_StandardError)) == NULL) goto onError; /* Type objects */ Py_INCREF(&mxTextSearch_Type); PyDict_SetItemString(moddict, "TextSearchType", (PyObject *)&mxTextSearch_Type); Py_INCREF(&mxCharSet_Type); PyDict_SetItemString(moddict, "CharSetType", (PyObject *)&mxCharSet_Type); Py_INCREF(&mxTagTable_Type); PyDict_SetItemString(moddict, "TagTableType", (PyObject *)&mxTagTable_Type); /* Tag Table command symbols (these will be exposed via simpleparse.stt.TextTools.Constants.TagTables) */ insint(moddict, "_const_AllIn", MATCH_ALLIN); insint(moddict, "_const_AllNotIn", MATCH_ALLNOTIN); insint(moddict, "_const_Is", MATCH_IS); insint(moddict, "_const_IsIn", MATCH_ISIN); insint(moddict, "_const_IsNot", MATCH_ISNOTIN); insint(moddict, "_const_IsNotIn", MATCH_ISNOTIN); insint(moddict, "_const_Word", MATCH_WORD); insint(moddict, "_const_WordStart", MATCH_WORDSTART); insint(moddict, "_const_WordEnd", MATCH_WORDEND); insint(moddict, "_const_AllInSet", MATCH_ALLINSET); insint(moddict, "_const_IsInSet", MATCH_ISINSET); insint(moddict, "_const_AllInCharSet", MATCH_ALLINCHARSET); insint(moddict, "_const_IsInCharSet", MATCH_ISINCHARSET); insint(moddict, "_const_Fail", MATCH_FAIL); insint(moddict, "_const_Jump", MATCH_JUMP); insint(moddict, "_const_EOF", MATCH_EOF); insint(moddict, "_const_Skip", MATCH_SKIP); insint(moddict, "_const_Move", MATCH_MOVE); insint(moddict, "_const_JumpTarget", MATCH_JUMPTARGET); insint(moddict, "_const_sWordStart", MATCH_SWORDSTART); insint(moddict, "_const_sWordEnd", MATCH_SWORDEND); insint(moddict, "_const_sFindWord", MATCH_SFINDWORD); insint(moddict, "_const_NoWord", MATCH_NOWORD); insint(moddict, "_const_Call", 
MATCH_CALL); insint(moddict, "_const_CallArg", MATCH_CALLARG); insint(moddict, "_const_Table", MATCH_TABLE); insint(moddict, "_const_SubTable", MATCH_SUBTABLE); insint(moddict, "_const_TableInList", MATCH_TABLEINLIST); insint(moddict, "_const_SubTableInList", MATCH_SUBTABLEINLIST); insint(moddict, "_const_Loop", MATCH_LOOP); insint(moddict, "_const_LoopControl", MATCH_LOOPCONTROL); /* Tag Table command flags */ insint(moddict, "_const_CallTag", MATCH_CALLTAG); insint(moddict, "_const_AppendToTagobj", MATCH_APPENDTAG); insint(moddict, "_const_AppendTagobj", MATCH_APPENDTAGOBJ); insint(moddict, "_const_AppendMatch", MATCH_APPENDMATCH); insint(moddict, "_const_LookAhead", MATCH_LOOKAHEAD); /* Tag Table argument integers */ insint(moddict, "_const_To", MATCH_JUMP_TO); insint(moddict, "_const_MatchOk", MATCH_JUMP_MATCHOK); insint(moddict, "_const_MatchFail", MATCH_JUMP_MATCHFAIL); insint(moddict, "_const_ToEOF", MATCH_MOVE_EOF); insint(moddict, "_const_ToBOF", MATCH_MOVE_BOF); insint(moddict, "_const_Here", MATCH_FAIL_HERE); insint(moddict, "_const_ThisTable", MATCH_THISTABLE); insint(moddict, "_const_Break", MATCH_LOOPCONTROL_BREAK); insint(moddict, "_const_Reset", MATCH_LOOPCONTROL_RESET); DPRINTF("sizeof(string_charset)=%i bytes\n", sizeof(string_charset)); #ifdef HAVE_UNICODE DPRINTF("sizeof(unicode_charset)=%i bytes\n", sizeof(unicode_charset)); #endif /* We are now initialized */ mxTextTools_Initialized = 1; onError: /* Check for errors and report them */ if (PyErr_Occurred()) Py_ReportModuleInitError(MXTEXTTOOLS_MODULE); return; }