ports//lang/parrot/work/parrot-0.5.1/src/charset/unicode.c

/*
Copyright (C) 2005-2007, The Perl Foundation.
$Id: unicode.c 23777 2007-12-12 03:36:35Z petdance $

=head1 NAME

src/charset/unicode.c

=head1 DESCRIPTION

This file implements the charset functions for unicode data

=over 4

=cut

*/

#include "parrot/parrot.h"
#include "unicode.h"
#include "ascii.h"
#include "tables.h"

/* HEADERIZER HFILE: src/charset/unicode.h */

/* HEADERIZER BEGIN: static */

static INTVAL compare(PARROT_INTERP,
    ARGIN(const STRING *lhs),
    ARGIN(const STRING *rhs))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2)
        __attribute__nonnull__(3);

PARROT_CANNOT_RETURN_NULL
static STRING* compose(PARROT_INTERP, NOTNULL(STRING *src))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static size_t compute_hash(PARROT_INTERP,
    ARGIN(const STRING *src),
    size_t seed)
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static INTVAL cs_rindex(PARROT_INTERP,
    NOTNULL(STRING *source_string),
    SHIM(STRING *search_string),
    UINTVAL offset)
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

PARROT_CANNOT_RETURN_NULL
static STRING* decompose(PARROT_INTERP, SHIM(STRING *src))
        __attribute__nonnull__(1);

static void downcase(PARROT_INTERP, NOTNULL(STRING *src))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static void downcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static INTVAL find_cclass(PARROT_INTERP,
    INTVAL flags,
    NOTNULL(STRING *source_string),
    UINTVAL offset,
    UINTVAL count)
        __attribute__nonnull__(1)
        __attribute__nonnull__(3);

static INTVAL find_not_cclass(PARROT_INTERP,
    INTVAL flags,
    NOTNULL(STRING *source_string),
    UINTVAL offset,
    UINTVAL count)
        __attribute__nonnull__(1)
        __attribute__nonnull__(3);

PARROT_CANNOT_RETURN_NULL
static STRING * get_graphemes(PARROT_INTERP,
    NOTNULL(STRING *source_string),
    UINTVAL offset,
    UINTVAL count)
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

PARROT_CANNOT_RETURN_NULL
static STRING * get_graphemes_inplace(PARROT_INTERP,
    NOTNULL(STRING *source_string),
    UINTVAL offset,
    UINTVAL count,
    NOTNULL(STRING *dest_string))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2)
        __attribute__nonnull__(5);

static INTVAL is_cclass(PARROT_INTERP,
    INTVAL flags,
    NOTNULL(STRING *source_string),
    UINTVAL offset)
        __attribute__nonnull__(1)
        __attribute__nonnull__(3);

static void set_graphemes(PARROT_INTERP,
    NOTNULL(STRING *source_string),
    UINTVAL offset,
    UINTVAL replace_count,
    NOTNULL(STRING *insert_string))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2)
        __attribute__nonnull__(5);

PARROT_CANNOT_RETURN_NULL
static STRING * string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
        __attribute__nonnull__(1);

static void titlecase(PARROT_INTERP, NOTNULL(STRING *src))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static void titlecase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

PARROT_CANNOT_RETURN_NULL
static STRING* to_charset(PARROT_INTERP,
    NOTNULL(STRING *src),
    NOTNULL(STRING *dest))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2)
        __attribute__nonnull__(3);

static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
        __attribute__nonnull__(1);

static void upcase(PARROT_INTERP, NOTNULL(STRING *src))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static void upcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

static UINTVAL validate(PARROT_INTERP, NOTNULL(STRING *src))
        __attribute__nonnull__(1)
        __attribute__nonnull__(2);

/* HEADERIZER END: static */

#ifdef EXCEPTION
#  undef EXCEPTION
#endif

#if PARROT_HAS_ICU
#  include <unicode/ucnv.h>
#  include <unicode/utypes.h>
#  include <unicode/uchar.h>
#  include <unicode/ustring.h>
#  include <unicode/unorm.h>
#endif
#define EXCEPTION(err, str) \
    real_exception(interp, NULL, err, str)

#define UNIMPL EXCEPTION(UNIMPLEMENTED, "unimplemented unicode")

/*

=item C<static void
set_graphemes(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL replace_count, NOTNULL(STRING *insert_string))>

RT#48260: Not yet documented!!!

=cut

*/

static void
set_graphemes(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL replace_count, NOTNULL(STRING *insert_string))
{
    ENCODING_SET_CODEPOINTS(interp, source_string, offset,
            replace_count, insert_string);
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING *
get_graphemes(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL count)>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING *
get_graphemes(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL count)
{
    return ENCODING_GET_CODEPOINTS(interp, source_string, offset, count);
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING *
get_graphemes_inplace(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL count, NOTNULL(STRING *dest_string))>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING *
get_graphemes_inplace(PARROT_INTERP, NOTNULL(STRING *source_string),
        UINTVAL offset, UINTVAL count, NOTNULL(STRING *dest_string))
{
    return ENCODING_GET_CODEPOINTS_INPLACE(interp, source_string,
            offset, count, dest_string);
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING*
to_charset(PARROT_INTERP, NOTNULL(STRING *src), NOTNULL(STRING *dest))>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING*
to_charset(PARROT_INTERP, NOTNULL(STRING *src), NOTNULL(STRING *dest))
{
    const charset_converter_t conversion_func =
            Parrot_find_charset_converter(interp, src->charset,
                    Parrot_unicode_charset_ptr);
    if (conversion_func)
         return conversion_func(interp, src, dest);
    return Parrot_utf8_encoding_ptr->to_encoding(interp, src, dest);
}


/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING*
compose(PARROT_INTERP, NOTNULL(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING*
compose(PARROT_INTERP, NOTNULL(STRING *src))
{
#if PARROT_HAS_ICU
    STRING *dest;
    int src_len, dest_len;
    UErrorCode err;
    /*
       U_STABLE int32_t U_EXPORT2
       unorm_normalize(const UChar *source, int32_t sourceLength,
       UNormalizationMode mode, int32_t options,
       UChar *result, int32_t resultLength,
       UErrorCode *status);
       */
    dest_len = src_len = src->strlen;
    dest = string_make_direct(interp, NULL, src_len * sizeof (UChar),
            src->encoding, src->charset, 0);
    err = U_ZERO_ERROR;
    dest_len = unorm_normalize((UChar *)src->strstart, src_len,
            UNORM_DEFAULT,      /* default is NFC */
            0,                  /* options 0 default - no specific icu
                                 * version */
            (UChar *)dest->strstart, dest_len,
            &err);
    dest->bufused = dest_len * sizeof (UChar);
    if (!U_SUCCESS(err)) {
        err = U_ZERO_ERROR;
        Parrot_reallocate_string(interp, dest, dest->bufused);
        dest_len = unorm_normalize((UChar *)src->strstart, src_len,
                UNORM_DEFAULT,      /* default is NFC */
                0,                  /* options 0 default - no specific
                                     * icu version */
                (UChar *)dest->strstart, dest_len,
                &err);
        PARROT_ASSERT(U_SUCCESS(err));
        dest->bufused = dest_len * sizeof (UChar);
    }
    dest->strlen = dest_len;
    return dest;
#else
    real_exception(interp, NULL, E_LibraryNotLoadedError,
            "no ICU lib loaded");
#endif
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING*
decompose(PARROT_INTERP, SHIM(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING*
decompose(PARROT_INTERP, SHIM(STRING *src))
{
    UNIMPL;
}

/*

=item C<static void
upcase(PARROT_INTERP, NOTNULL(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

static void
upcase(PARROT_INTERP, NOTNULL(STRING *src))
{
#if PARROT_HAS_ICU

    UErrorCode err;
    int dest_len, src_len, needed;

    if (src->bufused == src->strlen &&
            src->encoding == Parrot_utf8_encoding_ptr) {
        Parrot_ascii_charset_ptr->upcase(interp, src);
        return;
    }
    src = Parrot_utf16_encoding_ptr->to_encoding(interp, src, NULL);
    /*
       U_CAPI int32_t U_EXPORT2
       u_strToUpper(UChar *dest, int32_t destCapacity,
       const UChar *src, int32_t srcLength,
       const char *locale,
       UErrorCode *pErrorCode);
       */
    err = U_ZERO_ERROR;
    /* use all available space - see below XXX */
    /* TODO downcase, titlecase too */
    dest_len = PObj_buflen(src) / sizeof (UChar);
    src_len = src->bufused / sizeof (UChar);
    /*
     * XXX troubles:
     *   t/op/string_cs_45  upcase unicode:"\u01f0"
     *   this creates \u004a \u030c J+NON-SPACING HACEK
     *   the string needs resizing, *if* the src buffer is
     *   too short. *But* with icu 3.2/3.4 the src string is
     *   overwritten with partial result, despite the icu docs sayeth:
     *
     *      The source string and the destination buffer
     *      are allowed to overlap.
     *
     *  Workaround:  'preflighting' returns needed length
     *  Alternative: forget about inplace operation - create new result
     *
     *  TODO downcase, titlecase
     */
    needed = u_strToUpper(NULL, 0,
            (UChar *)src->strstart, src_len,
            NULL,       /* locale = default */
            &err);
    if (needed > dest_len) {
        Parrot_reallocate_string(interp, src, needed * sizeof (UChar));
        dest_len = needed;
    }
    err = U_ZERO_ERROR;
    dest_len = u_strToUpper((UChar *)src->strstart, dest_len,
            (UChar *)src->strstart, src_len,
            NULL,       /* locale = default */
            &err);
    PARROT_ASSERT(U_SUCCESS(err));
    src->bufused = dest_len * sizeof (UChar);
    /* downgrade if possible */
    if (dest_len == (int)src->strlen)
        src->encoding = Parrot_ucs2_encoding_ptr;
    else {
        /* string is likely still ucs2 if it was earlier
         * but strlen changed tue to combining char
         */
        src->strlen = dest_len;
    }
#else
    real_exception(interp, NULL, E_LibraryNotLoadedError,
            "no ICU lib loaded");
#endif
}

/*

=item C<static void
downcase(PARROT_INTERP, NOTNULL(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

static void
downcase(PARROT_INTERP, NOTNULL(STRING *src))
{
#if PARROT_HAS_ICU

    UErrorCode err;
    int dest_len, src_len;

    if (src->bufused == src->strlen &&
            src->encoding == Parrot_utf8_encoding_ptr) {
        Parrot_ascii_charset_ptr->downcase(interp, src);
        return;
    }

    src = Parrot_utf16_encoding_ptr->to_encoding(interp, src, NULL);
    /*
U_CAPI int32_t U_EXPORT2
u_strToLower(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             const char *locale,
             UErrorCode *pErrorCode);
     */
    err = U_ZERO_ERROR;
    src_len = src->bufused / sizeof (UChar);
    dest_len = u_strToLower((UChar *)src->strstart, src_len,
            (UChar *)src->strstart, src_len,
            NULL,       /* locale = default */
            &err);
    src->bufused = dest_len * sizeof (UChar);
    if (!U_SUCCESS(err)) {
        err = U_ZERO_ERROR;
        Parrot_reallocate_string(interp, src, src->bufused);
        dest_len = u_strToLower((UChar *)src->strstart, dest_len,
                (UChar *)src->strstart, src_len,
                NULL,       /* locale = default */
                &err);
        PARROT_ASSERT(U_SUCCESS(err));
    }
    /* downgrade if possible */
    if (dest_len == (int)src->strlen)
        src->encoding = Parrot_ucs2_encoding_ptr;
#else
    real_exception(interp, NULL, E_LibraryNotLoadedError,
            "no ICU lib loaded");
#endif
}

/*

=item C<static void
titlecase(PARROT_INTERP, NOTNULL(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

static void
titlecase(PARROT_INTERP, NOTNULL(STRING *src))
{
#if PARROT_HAS_ICU

    UErrorCode err;
    int dest_len, src_len;

    if (src->bufused == src->strlen &&
            src->encoding == Parrot_utf8_encoding_ptr) {
        Parrot_ascii_charset_ptr->titlecase(interp, src);
        return;
    }
    src = Parrot_utf16_encoding_ptr->to_encoding(interp, src, NULL);
    /*
U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode);
     */
    err = U_ZERO_ERROR;
    src_len = src->bufused / sizeof (UChar);
    dest_len = u_strToTitle((UChar *)src->strstart, src_len,
            (UChar *)src->strstart, src_len,
            NULL,       /* default titleiter */
            NULL,       /* locale = default */
            &err);
    src->bufused = dest_len * sizeof (UChar);
    if (!U_SUCCESS(err)) {
        err = U_ZERO_ERROR;
        Parrot_reallocate_string(interp, src, src->bufused);
        dest_len = u_strToTitle((UChar *)src->strstart, dest_len,
                (UChar *)src->strstart, src_len,
                NULL, NULL,
                &err);
        PARROT_ASSERT(U_SUCCESS(err));
    }
    /* downgrade if possible */
    if (dest_len == (int)src->strlen)
        src->encoding = Parrot_ucs2_encoding_ptr;
#else
    real_exception(interp, NULL, E_LibraryNotLoadedError,
            "no ICU lib loaded");
#endif
}

/*

=item C<static void
upcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))>

RT#48260: Not yet documented!!!

=cut

*/

static void
upcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
{
    UNIMPL;
}

/*

=item C<static void
downcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))>

RT#48260: Not yet documented!!!

=cut

*/

static void
downcase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
{
    UNIMPL;
}

/*

=item C<static void
titlecase_first(PARROT_INTERP, NOTNULL(STRING *source_string))>

RT#48260: Not yet documented!!!

=cut

*/

static void
titlecase_first(PARROT_INTERP, NOTNULL(STRING *source_string))
{
    UNIMPL;
}

/*

=item C<static INTVAL
compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))>

RT#48260: Not yet documented!!!

=cut

*/

static INTVAL
compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
{
    String_iter l_iter, r_iter;
    UINTVAL offs, cl, cr, min_len, l_len, r_len;

    /* TODO make optimized equal - strings are equal length then already */
    ENCODING_ITER_INIT(interp, lhs, &l_iter);
    ENCODING_ITER_INIT(interp, rhs, &r_iter);
    l_len = lhs->strlen;
    r_len = rhs->strlen;
    min_len = l_len > r_len ? r_len : l_len;
    for (offs = 0; offs < min_len; ++offs) {
        cl = l_iter.get_and_advance(interp, &l_iter);
        cr = r_iter.get_and_advance(interp, &r_iter);
        if (cl != cr)
            return cl < cr ? -1 : 1;
    }
    if (l_len < r_len) {
        return -1;
    }
    if (l_len > r_len) {
        return 1;
    }
    return 0;
}


/*

=item C<static INTVAL
cs_rindex(PARROT_INTERP, NOTNULL(STRING *source_string),
        SHIM(STRING *search_string), UINTVAL offset)>

RT#48260: Not yet documented!!!

=cut

*/

static INTVAL
cs_rindex(PARROT_INTERP, NOTNULL(STRING *source_string),
        SHIM(STRING *search_string), UINTVAL offset)
{
    UNIMPL;
}

/*

=item C<static UINTVAL
validate(PARROT_INTERP, NOTNULL(STRING *src))>

RT#48260: Not yet documented!!!

=cut

*/

static UINTVAL
validate(PARROT_INTERP, NOTNULL(STRING *src))
{
    UINTVAL offset;

    for (offset = 0; offset < string_length(interp, src); ++offset) {
        const UINTVAL codepoint = ENCODING_GET_CODEPOINT(interp, src, offset);
        /* Check for Unicode non-characters */
        if (codepoint >= 0xfdd0 &&
            (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe) &&
                codepoint <= 0x10ffff)
                    return 0;
    }
    return 1;
}

/*

=item C<static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)>

RT#48260: Not yet documented!!!

=cut

*/

static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
#if PARROT_HAS_ICU
            /* XXX which one
               return u_charDigitValue(codepoint);
               */
    if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1;
    if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1;
    if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1;
    if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1;
    if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1;
    if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1;
    if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1;
    if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1;
    if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1;
    if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1;
    if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1;
    if ((flags & enum_cclass_word) &&
        (u_isalnum(codepoint) || codepoint == '_')) return 1;
    return 0;
#else
    if (codepoint < 256) {
        return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
    }
    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        switch (codepoint) {
            case 0x1680: case 0x180e: case 0x2000: case 0x2001:
            case 0x2002: case 0x2003: case 0x2004: case 0x2005:
            case 0x2006: case 0x2007: case 0x2008: case 0x2009:
            case 0x200a: case 0x202f: case 0x205f: case 0x3000:
                return 1;
            default:
                break;
        }
    }
    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
        if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1;
        if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1;
        if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1;
        if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1;
        if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1;
        if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1;
        if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1;
        if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1;
        if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1;
        if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1;
        if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1;
        if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1;
        if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1;
        if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1;
        if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1;
        if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1;
        if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1;
        if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1;
        if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1;
        if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1;
        if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1;
        if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1;
    }
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric))
        real_exception(interp, NULL, E_LibraryNotLoadedError,
            "no ICU lib loaded");
    return 0;
#endif
}

/*

=item C<static INTVAL
is_cclass(PARROT_INTERP, INTVAL flags,
          NOTNULL(STRING *source_string), UINTVAL offset)>

RT#48260: Not yet documented!!!

=cut

*/

static INTVAL
is_cclass(PARROT_INTERP, INTVAL flags,
          NOTNULL(STRING *source_string), UINTVAL offset)
{
    UINTVAL codepoint;

    if (offset >= source_string->strlen)
        return 0;
    codepoint = ENCODING_GET_CODEPOINT(interp, source_string, offset);
    if (codepoint >= 256) {
        return u_iscclass(interp, codepoint, flags) != 0;
    }
    return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;
}

/*

=item C<static INTVAL
find_cclass(PARROT_INTERP, INTVAL flags,
            NOTNULL(STRING *source_string), UINTVAL offset, UINTVAL count)>

RT#48260: Not yet documented!!!

=cut

*/

static INTVAL
find_cclass(PARROT_INTERP, INTVAL flags,
            NOTNULL(STRING *source_string), UINTVAL offset, UINTVAL count)
{
    UINTVAL pos = offset;
    UINTVAL end = offset + count;
    UINTVAL codepoint;

    PARROT_ASSERT(source_string != 0);
    end = source_string->strlen < end ? source_string->strlen : end;
    for (; pos < end; ++pos) {
        codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
        if (codepoint >= 256) {
            if (u_iscclass(interp, codepoint, flags))
                    return pos;
        }
        else {
            if (Parrot_iso_8859_1_typetable[codepoint] & flags) {
                return pos;
            }
        }
    }
    return end;
}

/*

=item C<static INTVAL
find_not_cclass(PARROT_INTERP, INTVAL flags,
                NOTNULL(STRING *source_string), UINTVAL offset, UINTVAL count)>

RT#48260: Not yet documented!!!

=cut

*/

static INTVAL
find_not_cclass(PARROT_INTERP, INTVAL flags,
                NOTNULL(STRING *source_string), UINTVAL offset, UINTVAL count)
{
    UINTVAL pos = offset;
    UINTVAL end = offset + count;
    UINTVAL codepoint;
    int bit;

    PARROT_ASSERT(source_string);
    end = source_string->strlen < end ? source_string->strlen : end;
    for (; pos < end; ++pos) {
        codepoint = ENCODING_GET_CODEPOINT(interp, source_string, pos);
        if (codepoint >= 256) {
            for (bit = enum_cclass_uppercase;
                    bit <= enum_cclass_word ; bit <<= 1) {
                if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
                    return pos;
            }
        }
        else {
            if (!(Parrot_iso_8859_1_typetable[codepoint] & flags)) {
                return pos;
            }
        }
    }
    return end;
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
static STRING *
string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
static STRING *
string_from_codepoint(PARROT_INTERP, UINTVAL codepoint)
{
    String_iter iter;
    STRING * const dest = string_make(interp, "", 1, "unicode", 0);

    dest->strlen = 1;
    ENCODING_ITER_INIT(interp, dest, &iter);
    iter.set_and_advance(interp, &iter, codepoint);
    dest->bufused = iter.bytepos;

    return dest;
}

/*

=item C<static size_t
compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)>

RT#48260: Not yet documented!!!

=cut

*/

static size_t
compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed)
{
    String_iter iter;
    size_t hashval = seed;
    UINTVAL offs;

    ENCODING_ITER_INIT(interp, src, &iter);
    for (offs = 0; offs < src->strlen; ++offs) {
        const UINTVAL c = iter.get_and_advance(interp, &iter);
        hashval += hashval << 5;
        hashval += c;
    }
    return hashval;
}

/*

=item C<PARROT_CANNOT_RETURN_NULL
const CHARSET *
Parrot_charset_unicode_init(PARROT_INTERP)>

RT#48260: Not yet documented!!!

=cut

*/

PARROT_CANNOT_RETURN_NULL
const CHARSET *
Parrot_charset_unicode_init(PARROT_INTERP)
{
    CHARSET * const return_set = Parrot_new_charset(interp);
    static const CHARSET base_set = {
        "unicode",
        get_graphemes,
        get_graphemes_inplace,
        set_graphemes,
        to_charset,
        compose,
        decompose,
        upcase,
        downcase,
        titlecase,
        upcase_first,
        downcase_first,
        titlecase_first,
        compare,
        mixed_cs_index,
        cs_rindex,
        validate,
        is_cclass,
        find_cclass,
        find_not_cclass,
        string_from_codepoint,
        compute_hash,
        NULL
    };

    STRUCT_COPY(return_set, &base_set);
    /*
     * for now use utf8
     * TODO replace it with a fixed uint_16 or uint_32 encoding
     *      XXX if this is changed, modify string_make so it
     *          still takes "utf8" when fed "unicode" as charset!
     */
    return_set->preferred_encoding = Parrot_utf8_encoding_ptr;
    Parrot_register_charset(interp, "unicode", return_set);
    return return_set;
}


/*
 * Local variables:
 *   c-file-style: "parrot"
 * End:
 * vim: expandtab shiftwidth=4:
 */
syntax highlighted by Code2HTML, v. 0.9.1