ports//math/libRmath/work/R-2.4.0/src/main/pcre.c

/*
 *  R : A Computer Language for Statistical Data Analysis
 *  Copyright (C) 1995, 1996  Robert Gentleman and Ross Ihaka
 *  Copyright (C) 1997--2006  Robert Gentleman, Ross Ihaka and the
 *                            R Development Core Team
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Pulic License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street Fifth Floor, Boston, MA 02110-1301  USA
 */

/* <UTF8>
   regex code should be OK.
   substitution code does ASCII comparisons only.
   regexpr returned pos and match length in bytes not chars.
*/

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <Defn.h>
#include <sys/types.h>		/* probably not needed */
#include <Rmath.h>		/* imax2 */

#ifdef HAVE_PCRE_PCRE_H
# include <pcre/pcre.h>
#else
# include <pcre.h>
#endif

#ifdef SUPPORT_UTF8
# include <R_ext/rlocale.h>
# include <wchar.h>
# include <wctype.h>
#endif

SEXP attribute_hidden do_pgrep(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP pat, vec, ind, ans;
    int i, j, n, nmatches;
    int igcase_opt, value_opt, useBytes, erroffset;
    int options = 0;
    const char *errorptr;
    pcre *re_pcre;
    const unsigned char *tables;

    checkArity(op, args);
    pat = CAR(args); args = CDR(args);
    vec = CAR(args); args = CDR(args);
    igcase_opt = asLogical(CAR(args)); args = CDR(args);
    value_opt = asLogical(CAR(args)); args = CDR(args);
    if (igcase_opt == NA_INTEGER) igcase_opt = 0;
    if (value_opt == NA_INTEGER) value_opt = 0;
    useBytes = asLogical(CAR(args)); args = CDR(args);
    if (useBytes == NA_INTEGER) useBytes = 0;


    if (length(pat) < 1) errorcall(call, R_MSG_IA);

    /* NAs are removed in R code so this isn't used */
    /* it's left in case we change our minds again */
    /* special case: NA pattern matches only NAs in vector */
    if (STRING_ELT(pat,0) == NA_STRING) {
	n = length(vec);
	nmatches = 0;
	PROTECT(ind = allocVector(LGLSXP, n));
	for(i = 0; i < n; i++) {
	    if(STRING_ELT(vec,i) == NA_STRING){
		INTEGER(ind)[i] = 1;
		nmatches++;
	    }
	    else
		INTEGER(ind)[i] = 0;
	}
	if (value_opt) {
	    ans = allocVector(STRSXP, nmatches);
	    j = 0;
	    for (i = 0 ; i < n ; i++)
		if (INTEGER(ind)[i]) {
		    SET_STRING_ELT(ans, j++, STRING_ELT(vec, i));
		}
	} else {
	    ans = allocVector(INTSXP, nmatches);
	    j = 0;
	    for (i = 0 ; i < n ; i++)
		if (INTEGER(ind)[i]) INTEGER(ans)[j++] = i + 1;
	}
	UNPROTECT(1);
    return ans;
    }
    /* end NA pattern handling */

#ifdef SUPPORT_UTF8
    if(useBytes) ;
    else if(utf8locale) options = PCRE_UTF8;
    else if(mbcslocale)
	warning(_("perl = TRUE is only fully implemented in UTF-8 locales"));
    if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(pat, 0))))
	errorcall(call, _("regular expression is invalid in this locale"));
#endif
    if (igcase_opt) options |= PCRE_CASELESS;

    tables = pcre_maketables();
    re_pcre = pcre_compile(CHAR(STRING_ELT(pat, 0)), options, &errorptr,
			   &erroffset, tables);
    if (!re_pcre) errorcall(call, _("invalid regular expression '%s'"),
			    CHAR(STRING_ELT(pat, 0)));

    n = length(vec);
    ind = allocVector(LGLSXP, n);
    nmatches = 0;
    for (i = 0 ; i < n ; i++) {
	int rc, ovector;
	char *s = CHAR(STRING_ELT(vec, i));
	if (STRING_ELT(vec,i) == NA_STRING){
	    INTEGER(ind)[i] = 0;
	    continue;
	}
#ifdef SUPPORT_UTF8
	if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(vec, i)))) {
	    warningcall(call, _("input string %d is invalid in this locale"),
			i+1);
	    continue;
	}
#endif
	rc = pcre_exec(re_pcre, NULL, s, strlen(s), 0, 0, &ovector, 0);
	if (rc >= 0) {
	    INTEGER(ind)[i] = 1;
	    nmatches++;
	}
	else INTEGER(ind)[i] = 0;
    }
    (pcre_free)(re_pcre);
    pcre_free((void *)tables);
    PROTECT(ind);
    if (value_opt) {
	SEXP nmold = getAttrib(vec, R_NamesSymbol), nm;
	ans = allocVector(STRSXP, nmatches);
	for (i = 0, j = 0; i < n ; i++)
	    if (LOGICAL(ind)[i])
		SET_STRING_ELT(ans, j++, STRING_ELT(vec, i));
	/* copy across names and subset */
	if (!isNull(nmold)) {
	    nm = allocVector(STRSXP, nmatches);
	    for (i = 0, j = 0; i < n ; i++)
		if (LOGICAL(ind)[i])
		    SET_STRING_ELT(nm, j++, STRING_ELT(nmold, i));
	    setAttrib(ans, R_NamesSymbol, nm);
	}
    } else {
	ans = allocVector(INTSXP, nmatches);
	for (i = 0, j = 0 ; i < n ; i++)
	    if (LOGICAL(ind)[i]) INTEGER(ans)[j++] = i + 1;
    }
    UNPROTECT(1);
    return ans;
}

/* The following R functions do substitution for regular expressions,
 * either once or globally.
 * The functions are loosely patterned on the "sub" and "gsub" in "nawk". */

static int length_adj(char *orig, char *repl, int *ovec, int nsubexpr, 
		      Rboolean useBytes)
{
    int k, n, nb;
    char *p = repl;
    Rboolean upper = FALSE, lower = FALSE;

    n = strlen(repl) - (ovec[1] - ovec[0]);
    while (*p) {
	if (*p == '\\') {
	    if ('1' <= p[1] && p[1] <= '9') {
		k = p[1] - '0';
		if (k > nsubexpr)
		    error(_("invalid backreference %d in regular expression"),
			  k);
		nb = ovec[2*k+1] - ovec[2*k];
#ifdef SUPPORT_UTF8
		if(nb >0 && !useBytes && mbcslocale && (upper || lower)) {
		    wctrans_t tr = wctrans(upper ? "toupper" : "tolower");
		    int j, nc;
		    char *xi, *p;
		    wchar_t *wc;
		    p = xi = (char *) alloca((nb+1)*sizeof(char));
		    R_CheckStack();
		    for(j = 0; j < nb; j++) *p++ = orig[ovec[2*k]+j];
		    *p = '\0';
		    nc = mbstowcs(NULL, xi, 0);
		    if(nc >= 0) {
			wc = (wchar_t *) alloca((nc+1)*sizeof(wchar_t));
			R_CheckStack();
			mbstowcs(wc, xi, nc + 1);
			for(j = 0; j < nc; j++) wc[j] = towctrans(wc[j], tr);
			nb = wcstombs(NULL, wc, 0);
		    }
		}
#endif
		n += nb - 2;
		p++;
	    } else if (p[1] == 'U') {
		p++; n -= 2;
		upper = TRUE; lower = FALSE;
	    } else if (p[1] == 'L') {
		p++; n -= 2;
		upper = FALSE; lower = TRUE;
	    } else if (p[1] == 0) {
				/* can't escape the final '\0' */
		n--;
	    } else {
		n--;
		p++;
	    }
	}
	p++;
    }
    return n;
}

static char *string_adj(char *target, char *orig, char *repl, int *ovec, 
			Rboolean useBytes)
{
    int i, k, nb;
    char *p = repl, *t = target, c;
    Rboolean upper = FALSE, lower = FALSE;

    while (*p) {
	if (*p == '\\') {
	    if ('1' <= p[1] && p[1] <= '9') {
		k = p[1] - '0';
		/* Here we need to work in chars */
		nb = ovec[2*k+1] - ovec[2*k];
#ifdef SUPPORT_UTF8
		if(nb > 0 && !useBytes && mbcslocale && (upper || lower)) {
		    wctrans_t tr = wctrans(upper ? "toupper" : "tolower");
		    int j, nc;
		    char *xi, *p;
		    wchar_t *wc;
		    p = xi = (char *) alloca((nb+1)*sizeof(char));
		    R_CheckStack();
		    for(j = 0; j < nb; j++) *p++ = orig[ovec[2*k]+j];
		    *p = '\0';
		    nc = mbstowcs(NULL, xi, 0);
		    if(nc >= 0) {
			wc = (wchar_t *) alloca((nc+1)*sizeof(wchar_t));
			R_CheckStack();
			mbstowcs(wc, xi, nc + 1);
			for(j = 0; j < nc; j++) wc[j] = towctrans(wc[j], tr);
			nb = wcstombs(NULL, wc, 0);
			wcstombs(xi, wc, nb + 1);
			for(j = 0; j < nb; j++) *t++ = *xi++;
		    }
		} else
#endif
		    for (i = ovec[2*k] ; i < ovec[2*k+1] ; i++) {
			c = orig[i];
			*t++ = upper ? toupper(c) : (lower ? tolower(c) : c);
		    }
		p += 2;
	    } else if (p[1] == 'U') {
		p += 2;
		upper = TRUE; lower = FALSE;
	    } else if (p[1] == 'L') {
		p += 2;
		upper = FALSE; lower = TRUE;
	    } else if (p[1] == 0) {
		p += 1;
	    } else {
		p += 1;
		*t++ = *p++;
	    }
	} else *t++ = *p++;
    }
    return t;
}


SEXP attribute_hidden do_pgsub(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP pat, rep, vec, ans;
    int i, j, n, ns, nns, nmatch, offset, re_nsub;
    int global, igcase_opt, useBytes, erroffset, eflag, last_end;
    int options = 0;
    char *s, *t, *u, *uu;
    const char *errorptr;
    pcre *re_pcre;
    pcre_extra *re_pe;
    const unsigned char *tables;

    checkArity(op, args);

    global = PRIMVAL(op);

    pat = CAR(args); args = CDR(args);
    rep = CAR(args); args = CDR(args);
    vec = CAR(args); args = CDR(args);
    igcase_opt = asLogical(CAR(args)); args = CDR(args);
    if (igcase_opt == NA_INTEGER) igcase_opt = 0;
    useBytes = asLogical(CAR(args)); args = CDR(args);
    if (useBytes == NA_INTEGER) useBytes = 0;

#ifdef SUPPORT_UTF8
    if(useBytes) ;
    else if(utf8locale) options = PCRE_UTF8;
    else if(mbcslocale)
	warning(_("perl = TRUE is only fully implemented in UTF-8 locales"));
    if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(pat, 0))))
	errorcall(call, _("'pattern' is invalid in this locale"));
    if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(rep, 0))))
	errorcall(call, _("'replacement' is invalid in this locale"));
#endif
    if (length(pat) < 1 || length(rep) < 1)
	errorcall(call, R_MSG_IA);


    if (igcase_opt) options |= PCRE_CASELESS;

    tables = pcre_maketables();
    re_pcre = pcre_compile(CHAR(STRING_ELT(pat, 0)), options, &errorptr,
			   &erroffset, tables);
    if (!re_pcre) errorcall(call, _("invalid regular expression '%s'"),
			    CHAR(STRING_ELT(pat, 0)));
    re_nsub = pcre_info(re_pcre, NULL, NULL);
    re_pe = pcre_study(re_pcre, 0, &errorptr);

    n = length(vec);
    PROTECT(ans = allocVector(STRSXP, n));

    for (i = 0 ; i < n ; i++) {
	int ovector[30];
	offset = 0;
	nmatch = 0;
	/* NA `pat' are removed in R code, the C code is left */
	/* in case we change our minds again */
	/* NA matches only itself */
        if (STRING_ELT(vec,i) == NA_STRING){
	    if (STRING_ELT(pat,0) == NA_STRING)
		SET_STRING_ELT(ans, i, STRING_ELT(rep,0));
	    else
		SET_STRING_ELT(ans, i, NA_STRING);
	    continue;
	}
	if (STRING_ELT(pat, 0) == NA_STRING){
	    SET_STRING_ELT(ans, i, STRING_ELT(vec,i));
	    continue;
	}
	/* end NA handling */
	s = CHAR(STRING_ELT(vec, i));
	t = CHAR(STRING_ELT(rep, 0));
	nns = ns = strlen(s);

#ifdef SUPPORT_UTF8
	if(!useBytes && mbcslocale && !mbcsValid(s)) {
	    errorcall(call, _("input string %d is invalid in this locale"),
		      i+1);
	}
#endif
	/* Looks like PCRE_NOTBOL is not needed in this version,
	   but leave in as a precaution */
	eflag = 0; last_end = -1;
	while (pcre_exec(re_pcre, re_pe, s, nns, offset, eflag,
			 ovector, 30) >= 0) {
	    nmatch += 1;
	    /* Do not repeat a 0-length match after a match, so
	       gsub("a*", "x", "baaac") is "xbxcx" not "xbxxcx" */
	    if(ovector[1] > last_end) {
		ns += length_adj(s, t, ovector, re_nsub, useBytes);
		last_end = ovector[1];
	    }
	    offset = ovector[1];
	    if (s[offset] == '\0' || !global) break;
	    /* If we have a 0-length match, move on a char */
	    if(ovector[1] == ovector[0]) {
#ifdef SUPPORT_UTF8
		if(!useBytes && mbcslocale) {
		    wchar_t wc; int used, pos = 0; mbstate_t mb_st;
		    mbs_init(&mb_st);
		    while( (used = Mbrtowc(&wc, s+pos, MB_CUR_MAX, &mb_st)) ) {
			pos += used;
			if(pos > offset) {
			    offset = pos;
			    break;
			}
		    }
		} else
#endif
		    offset++;
	    }
	    eflag = PCRE_NOTBOL;
	}
	if (nmatch == 0)
	    SET_STRING_ELT(ans, i, STRING_ELT(vec, i));
	else {
	    SET_STRING_ELT(ans, i, allocString(ns));
	    offset = 0;
	    s = CHAR(STRING_ELT(vec, i));
	    t = CHAR(STRING_ELT(rep, 0));
	    uu = u = CHAR(STRING_ELT(ans, i));
	    eflag = 0; last_end = -1;
	    while (pcre_exec(re_pcre, re_pe, s, nns, offset, eflag,
			     ovector, 30) >= 0) {
		/* printf("%s, %d, %d %d\n", s, offset,
		   ovector[0], ovector[1]); */
		for (j = offset; j < ovector[0]; j++) *u++ = s[j];
		if(ovector[1] > last_end) {
		    u = string_adj(u, s, t, ovector, useBytes);
		    last_end = ovector[1];
		}
		offset = ovector[1];
		if (s[offset] == '\0' || !global) break;
		if(ovector[1] == ovector[0]) { 
		    /* advance by a char */
#ifdef SUPPORT_UTF8
		    if(!useBytes && mbcslocale) {
			wchar_t wc; int used, pos = 0; mbstate_t mb_st;
			mbs_init(&mb_st);
			while( (used = Mbrtowc(&wc, s+pos, MB_CUR_MAX, &mb_st)) ) {
			    pos += used;
			    if(pos > offset) {
				for(j = offset; j < pos; j++) *u++ = s[j]; 
				offset = pos;
				break;
			    }
			}
		    } else
#endif
			*u++ = s[offset++];
		}

		eflag = PCRE_NOTBOL;
	    }
	    for (j = offset ; s[j] ; j++)
		*u++ = s[j];
	    *u = '\0';
	}
    }
    (pcre_free)(re_pe);
    (pcre_free)(re_pcre);
    pcre_free((void *)tables);
    UNPROTECT(1);
    return ans;
}


#include "RBufferUtils.h"
SEXP attribute_hidden do_pregexpr(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP pat, text, ans, matchlen;
    int i, n, st, erroffset;
    int options = 0, useBytes;
    const char *errorptr;
    pcre *re_pcre;
    const unsigned char *tables;
    /* To make this thread-safe remove static here and remove
       test on R_FreeStringBuffer below */
    static R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    checkArity(op, args);
    pat = CAR(args); args = CDR(args);
    text = CAR(args); args = CDR(args);
    useBytes = asLogical(CAR(args)); args = CDR(args);
    if (useBytes == NA_INTEGER) useBytes = 0;

    if (length(pat) < 1 || length(text) < 1 ) errorcall(call, R_MSG_IA);
    if (!isString(pat)) PROTECT(pat = coerceVector(pat, STRSXP));

#ifdef SUPPORT_UTF8
    if(useBytes) ;
    else if(utf8locale) options = PCRE_UTF8;
    else if(mbcslocale)
	warning(_("perl = TRUE is only fully implemented in UTF-8 locales"));
#endif

#ifdef SUPPORT_UTF8
    if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(pat, 0))))
	errorcall(call, _("regular expression is invalid in this locale"));
#endif
    tables = pcre_maketables();
    re_pcre = pcre_compile(CHAR(STRING_ELT(pat, 0)), options,
			   &errorptr, &erroffset, tables);
    if (!re_pcre) errorcall(call, _("invalid regular expression '%s'"),
			    CHAR(STRING_ELT(pat, 0)));
    n = length(text);
    PROTECT(ans = allocVector(INTSXP, n));
    PROTECT(matchlen = allocVector(INTSXP, n));

    for (i = 0 ; i < n ; i++) {
	int rc, ovector[3];
	char *s = CHAR(STRING_ELT(text, i));
	if (STRING_ELT(text,i) == NA_STRING){
	    INTEGER(ans)[i] = INTEGER(matchlen)[i] = R_NaInt;
	    continue;
	}
#ifdef SUPPORT_UTF8
	if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(text, i)))) {
	    warningcall(call, _("input string %d is invalid in this locale"),
			i+1);
	    INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
	    continue;
	}
#endif
	rc = pcre_exec(re_pcre, NULL, s, strlen(s), 0, 0, ovector, 3);
	if (rc >= 0) {
	    st = ovector[0];
	    INTEGER(ans)[i] = st + 1; /* index from one */
	    INTEGER(matchlen)[i] = ovector[1] - st;
#ifdef SUPPORT_UTF8
	    if(!useBytes && mbcslocale) {
		int mlen = ovector[1] - st;
		/* Unfortunately these are in bytes, so we need to
		   use chars instead */
		R_AllocStringBuffer(imax2(st, mlen+1), &cbuff);
		if(st > 0) {
		    memcpy(cbuff.data, CHAR(STRING_ELT(text, i)), st);
		    cbuff.data[st] = '\0';
		    INTEGER(ans)[i] = 1 + mbstowcs(NULL, cbuff.data, 0);
		    if(INTEGER(ans)[i] <= 0) /* an invalid string */
			INTEGER(ans)[i] = NA_INTEGER;
		}
		memcpy(cbuff.data, CHAR(STRING_ELT(text, i))+st, mlen);
		cbuff.data[mlen] = '\0';
		INTEGER(matchlen)[i] = mbstowcs(NULL, cbuff.data, 0);
		if(INTEGER(matchlen)[i] < 0) /* an invalid string */
		    INTEGER(matchlen)[i] = NA_INTEGER;
	    }
#endif
	} else {
	    INTEGER(ans)[i] = INTEGER(matchlen)[i] = -1;
	}
    }
    /* see comment above */
    if(cbuff.bufsize != MAXELTSIZE) R_FreeStringBuffer(&cbuff);
    (pcre_free)(re_pcre);
    pcre_free((void *)tables);
    setAttrib(ans, install("match.length"), matchlen);
    UNPROTECT(2);
    return ans;
}

SEXP attribute_hidden do_gpregexpr(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP pat, text, ansList, ans, matchlen;
    SEXP matchbuf, matchlenbuf;
    int bufsize = 1024;
    int i, n, st, erroffset;
    int options = 0, useBytes;
    const char *errorptr;
    pcre *re_pcre;
    const unsigned char *tables;
    /* To make this thread-safe remove static here and remove
       test on R_FreeStringBuffer below */
    static R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    checkArity(op, args);
    pat = CAR(args); args = CDR(args);
    text = CAR(args); args = CDR(args);
    useBytes = asLogical(CAR(args)); args = CDR(args);
    if (useBytes == NA_INTEGER) useBytes = 0;

    if (length(pat) < 1 || length(text) < 1 ) errorcall(call, R_MSG_IA);

#ifdef SUPPORT_UTF8
    if(useBytes) ;
    else if(utf8locale) options = PCRE_UTF8;
    else if(mbcslocale)
	warning(_("perl = TRUE is only fully implemented in UTF-8 locales"));
#endif

#ifdef SUPPORT_UTF8
    if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(pat, 0))))
	errorcall(call, _("regular expression is invalid in this locale"));
#endif
    tables = pcre_maketables();
    re_pcre = pcre_compile(CHAR(STRING_ELT(pat, 0)), options,
			   &errorptr, &erroffset, tables);
    if (!re_pcre) errorcall(call, _("invalid regular expression '%s'"),
			    CHAR(STRING_ELT(pat, 0)));
    n = length(text);
    PROTECT(ansList = allocVector(VECSXP, n));
    matchbuf = PROTECT(allocVector(INTSXP, bufsize));
    matchlenbuf = PROTECT(allocVector(INTSXP, bufsize));

    for (i = 0 ; i < n ; i++) {
	char *s = CHAR(STRING_ELT(text, i));
        int j, foundAll, foundAny, matchIndex, start;
        foundAll = foundAny = start = 0;
        matchIndex = -1;
	if (STRING_ELT(text,i) == NA_STRING){ 
            PROTECT(ans = allocVector(INTSXP, 1));
            PROTECT(matchlen = allocVector(INTSXP, 1));
	    INTEGER(ans)[0] = INTEGER(matchlen)[0] = R_NaInt;
            setAttrib(ans, install("match.length"), matchlen);
            SET_VECTOR_ELT(ansList, i, ans);
            UNPROTECT(2);
	    continue;
	}
#ifdef SUPPORT_UTF8
	if(!useBytes && mbcslocale && !mbcsValid(CHAR(STRING_ELT(text, i)))) {
	    warningcall(call, _("input string %d is invalid in this locale"),
			i+1);
            PROTECT(ans = allocVector(INTSXP, 1)); 
            PROTECT(matchlen = allocVector(INTSXP, 1));
	    INTEGER(ans)[0] = INTEGER(matchlen)[0] = -1;
            setAttrib(ans, install("match.length"), matchlen);
            SET_VECTOR_ELT(ansList, i, ans);
            UNPROTECT(2);
	    continue;
	}
#endif
        while (!foundAll) {
            int rc, ovector[3];
            rc = pcre_exec(re_pcre, NULL, s, strlen(s), start, 0, ovector, 3);
            if (rc >= 0) {
                if ((matchIndex + 1) == bufsize) {
                    /* Reallocate match buffers */
                    int newbufsize = bufsize * 2;
                    SEXP tmp;
                    tmp = allocVector(INTSXP, 2 * bufsize);
                    for (j = 0; j < bufsize; j++)
                        INTEGER(tmp)[j] = INTEGER(matchlenbuf)[j];
                    UNPROTECT(1);
                    matchlenbuf = tmp;
                    PROTECT(matchlenbuf);
                    tmp = allocVector(INTSXP, 2 * bufsize);
                    for (j = 0; j < bufsize; j++)
                        INTEGER(tmp)[j] = INTEGER(matchbuf)[j];
                    matchbuf = tmp;
                    UNPROTECT(2);
                    PROTECT(matchbuf);
                    PROTECT(matchlenbuf);
                    bufsize = newbufsize;
                }
                matchIndex++;
                foundAny = 1;
                st = ovector[0];
                INTEGER(matchbuf)[matchIndex] = st + 1; /* index from one */
                INTEGER(matchlenbuf)[matchIndex] = ovector[1] - st;
                if (INTEGER(matchlenbuf)[matchIndex] == 0)
                    start = ovector[0] + 1;
                else
                    start = ovector[1];
#ifdef SUPPORT_UTF8
                if(!useBytes && mbcslocale) {
                    int mlen = ovector[1] - st;
                    /* Unfortunately these are in bytes, so we need to
                       use chars instead */
                    R_AllocStringBuffer(imax2(st, mlen+1), &cbuff);
                    if(st > 0) {
                        memcpy(cbuff.data, CHAR(STRING_ELT(text, i)), st);
                        cbuff.data[st] = '\0';
                        INTEGER(matchbuf)[matchIndex] = 1 + mbstowcs(NULL, cbuff.data, 0);
                        if(INTEGER(matchbuf)[matchIndex] <= 0) { /* an invalid string */
                            INTEGER(matchbuf)[matchIndex] = NA_INTEGER;
                            foundAll = 1; /* if we get here, we are done */
                        }
                    }
                    memcpy(cbuff.data, CHAR(STRING_ELT(text, i))+st, mlen);
                    cbuff.data[mlen] = '\0';
                    INTEGER(matchlenbuf)[matchIndex] = mbstowcs(NULL, cbuff.data, 0);
                    if(INTEGER(matchlenbuf)[matchIndex] < 0) {/* an invalid string */
                        INTEGER(matchlenbuf)[matchIndex] = NA_INTEGER;
                        foundAll = 1; 
                    }
                    if (!foundAll)
                        start = INTEGER(matchbuf)[matchIndex] 
                            + INTEGER(matchlenbuf)[matchIndex];
                }
#endif
            } else {
                foundAll = 1;
                if (!foundAny)
                    matchIndex = 0;
            }
        }
        PROTECT(ans = allocVector(INTSXP, matchIndex + 1));
        PROTECT(matchlen = allocVector(INTSXP, matchIndex + 1));
        if (foundAny) {
            /* copy from buffers */
            for (j = 0; j <= matchIndex; j++) {
                INTEGER(ans)[j] = INTEGER(matchbuf)[j];
                INTEGER(matchlen)[j] = INTEGER(matchlenbuf)[j];
            }
        } else {
            INTEGER(ans)[0] = INTEGER(matchlen)[0] = -1;
        }
        setAttrib(ans, install("match.length"), matchlen);
        SET_VECTOR_ELT(ansList, i, ans);
        UNPROTECT(2);
    }
    /* see comment above */
    if(cbuff.bufsize != MAXELTSIZE) R_FreeStringBuffer(&cbuff);
    (pcre_free)(re_pcre);
    pcre_free((void *)tables);
    UNPROTECT(3);
    return ansList;
}
syntax highlighted by Code2HTML, v. 0.9.1