/* Copyright (C) 2001-2007, The Perl Foundation. $Id: utf8.c 23777 2007-12-12 03:36:35Z petdance $ =head1 NAME src/encodings/utf8.c - UTF-8 encoding =head1 DESCRIPTION UTF-8 (L). =head2 Functions =over 4 =cut */ #include "parrot/parrot.h" #include "../unicode.h" #include "utf8.h" /* HEADERIZER HFILE: src/encodings/utf8.h */ /* HEADERIZER BEGIN: static */ static void become_encoding(PARROT_INTERP, SHIM(STRING *src)) __attribute__nonnull__(1); PARROT_PURE_FUNCTION static UINTVAL bytes(SHIM_INTERP, NOTNULL(STRING *src)) __attribute__nonnull__(2); static UINTVAL codepoints(PARROT_INTERP, NOTNULL(STRING *src)) __attribute__nonnull__(1) __attribute__nonnull__(2); static UINTVAL get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset) __attribute__nonnull__(2); PARROT_CANNOT_RETURN_NULL static STRING * get_bytes(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count) __attribute__nonnull__(1) __attribute__nonnull__(2); PARROT_CANNOT_RETURN_NULL static STRING * get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *return_string)) __attribute__nonnull__(1); static UINTVAL get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset) __attribute__nonnull__(1) __attribute__nonnull__(2); PARROT_CANNOT_RETURN_NULL static STRING * get_codepoints(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count) __attribute__nonnull__(1) __attribute__nonnull__(2); PARROT_CANNOT_RETURN_NULL static STRING * get_codepoints_inplace(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count, NOTNULL(STRING *return_string)) __attribute__nonnull__(1) __attribute__nonnull__(2) __attribute__nonnull__(5); static void iter_init(SHIM_INTERP, ARGIN(const STRING *src), NOTNULL(String_iter *iter)) __attribute__nonnull__(2) __attribute__nonnull__(3); static void set_byte(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL byte) __attribute__nonnull__(1) __attribute__nonnull__(2); static void set_bytes(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes)) __attribute__nonnull__(1); static void set_codepoint(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL codepoint) __attribute__nonnull__(1) __attribute__nonnull__(2); static void set_codepoints(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints)) __attribute__nonnull__(1); PARROT_CAN_RETURN_NULL static STRING * to_encoding(PARROT_INTERP, NOTNULL(STRING *src), NULLOK(STRING *dest)) __attribute__nonnull__(1) __attribute__nonnull__(2); static UINTVAL utf8_characters(PARROT_INTERP, ARGIN(const utf8_t *ptr), UINTVAL byte_len) __attribute__nonnull__(1) __attribute__nonnull__(2); static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr)) __attribute__nonnull__(1) __attribute__nonnull__(2); static UINTVAL utf8_decode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i)) __attribute__nonnull__(1) __attribute__nonnull__(2); PARROT_CANNOT_RETURN_NULL static void * utf8_encode(PARROT_INTERP, NOTNULL(void *ptr), UINTVAL c) __attribute__nonnull__(1) __attribute__nonnull__(2); static void utf8_encode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i), UINTVAL c) __attribute__nonnull__(1) __attribute__nonnull__(2); static void utf8_set_position(SHIM_INTERP, NOTNULL(String_iter *i), UINTVAL pos) __attribute__nonnull__(2); PARROT_WARN_UNUSED_RESULT PARROT_CANNOT_RETURN_NULL static const void * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n) __attribute__nonnull__(1); PARROT_CANNOT_RETURN_NULL static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n) __attribute__nonnull__(1); /* HEADERIZER END: static */ #define UNIMPL real_exception(interp, NULL, UNIMPLEMENTED, "unimpl utf8") const char Parrot_utf8skip[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* ascii */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* bogus */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* scripts */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* scripts */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* cjk etc. */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6 /* cjk etc. */ }; #if 0 typedef unsigned char utf8_t; #endif /* =item C Returns the number of characters in the C bytes from C<*ptr>. =cut */ static UINTVAL utf8_characters(PARROT_INTERP, ARGIN(const utf8_t *ptr), UINTVAL byte_len) { const utf8_t *u8ptr = ptr; const utf8_t *u8end = u8ptr + byte_len; UINTVAL characters = 0; while (u8ptr < u8end) { u8ptr += UTF8SKIP(u8ptr); characters++; } if (u8ptr > u8end) { real_exception(interp, NULL, MALFORMED_UTF8, "Unaligned end in UTF-8 string\n"); } return characters; } /* =item C Returns the integer for the UTF-8 character found at C<*ptr>. =cut */ static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr)) { const utf8_t *u8ptr = ptr; UINTVAL c = *u8ptr; if (UTF8_IS_START(c)) { UINTVAL len = UTF8SKIP(u8ptr); UINTVAL count; c &= UTF8_START_MASK(len); for (count = 1; count < len; count++) { u8ptr++; if (!UTF8_IS_CONTINUATION(*u8ptr)) { real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n"); } c = UTF8_ACCUMULATE(c, *u8ptr); } if (UNICODE_IS_SURROGATE(c)) { real_exception(interp, NULL, MALFORMED_UTF8, "Surrogate in UTF-8 string\n"); } } else if (!UNICODE_IS_INVARIANT(c)) { real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n"); } return c; } /* =item C Returns the UTF-8 encoding of integer C. =cut */ PARROT_CANNOT_RETURN_NULL static void * utf8_encode(PARROT_INTERP, NOTNULL(void *ptr), UINTVAL c) { const UINTVAL len = UNISKIP(c); /* the const is good on u8ptr, but using ptr on other variables avoids the * need to do a yucky cast to remove constness */ const utf8_t * const u8ptr = (utf8_t *)ptr; utf8_t *u8end = (utf8_t *)ptr + len - 1; if (c > 0x10FFFF || UNICODE_IS_SURROGATE(c)) { real_exception(interp, NULL, INVALID_CHARACTER, "Invalid character for UTF-8 encoding\n"); } while (u8end > u8ptr) { *u8end-- = (utf8_t)((c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK); c >>= UTF8_ACCUMULATION_SHIFT; } *u8end = (utf8_t)((c & UTF8_START_MASK(len)) | UTF8_START_MARK(len)); return (utf8_t *)ptr + len; } /* =item C Moves C C characters forward. =cut */ PARROT_CANNOT_RETURN_NULL static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n) { const utf8_t *u8ptr = (const utf8_t *)ptr; while (n-- > 0) { u8ptr += UTF8SKIP(u8ptr); } return u8ptr; } /* =item C Moves C C characters back. =cut */ PARROT_WARN_UNUSED_RESULT PARROT_CANNOT_RETURN_NULL static const void * utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n) { const utf8_t *u8ptr = (const utf8_t *)ptr; while (n-- > 0) { u8ptr--; while (UTF8_IS_CONTINUATION(*u8ptr)) u8ptr--; } return u8ptr; } /* =back =head2 Iterator Functions =over 4 =cut */ /* =item C The UTF-8 implementation of the string iterator's C function. */ static UINTVAL utf8_decode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i)) { const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos); UINTVAL c = *u8ptr; if (UTF8_IS_START(c)) { UINTVAL len = UTF8SKIP(u8ptr); c &= UTF8_START_MASK(len); i->bytepos += len; for (len--; len; len--) { u8ptr++; if (!UTF8_IS_CONTINUATION(*u8ptr)) { real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n"); } c = UTF8_ACCUMULATE(c, *u8ptr); } if (UNICODE_IS_SURROGATE(c)) { real_exception(interp, NULL, MALFORMED_UTF8, "Surrogate in UTF-8 string\n"); } } else if (!UNICODE_IS_INVARIANT(c)) { real_exception(interp, NULL, MALFORMED_UTF8, "Malformed UTF-8 string\n"); } else { i->bytepos++; } i->charpos++; return c; } /* =item C The UTF-8 implementation of the string iterator's C function. =cut */ static void utf8_encode_and_advance(PARROT_INTERP, NOTNULL(String_iter *i), UINTVAL c) { const STRING * const s = i->str; unsigned char * const pos = (unsigned char *)s->strstart + i->bytepos; unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c); i->bytepos += (new_pos - pos); /* XXX possible buffer overrun exception? */ PARROT_ASSERT(i->bytepos <= PObj_buflen(s)); i->charpos++; } /* =item C The UTF-8 implementation of the string iterator's C function. =cut */ /* XXX Should use quickest direction */ static void utf8_set_position(SHIM_INTERP, NOTNULL(String_iter *i), UINTVAL pos) { const utf8_t *u8ptr = (const utf8_t *)i->str->strstart; i->charpos = pos; while (pos-- > 0) { u8ptr += UTF8SKIP(u8ptr); } i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CAN_RETURN_NULL static STRING * to_encoding(PARROT_INTERP, NOTNULL(STRING *src), NULLOK(STRING *dest)) { STRING *result; String_iter src_iter; UINTVAL offs, dest_len, dest_pos, src_len; const int in_place = (dest == NULL); unsigned char *new_pos, *pos, *p; if (src->encoding == Parrot_utf8_encoding_ptr) return in_place ? src : string_copy(interp, src); src_len = src->strlen; if (in_place) { result = src; } else { result = dest; } /* init iter before possilby changing encoding */ ENCODING_ITER_INIT(interp, src, &src_iter); result->charset = Parrot_unicode_charset_ptr; result->encoding = Parrot_utf8_encoding_ptr; result->strlen = src_len; if (!src->strlen) return dest; if (in_place) { /* need intermediate memory */ p = (unsigned char *)mem_sys_allocate(src_len); } else { Parrot_reallocate_string(interp, dest, src_len); p = (unsigned char *)dest->strstart; } if (src->charset == Parrot_ascii_charset_ptr) { for (dest_len = 0; dest_len < src_len; ++dest_len) { p[dest_len] = ((unsigned char*)src->strstart)[dest_len]; } result->bufused = dest_len; } else { dest_len = src_len; dest_pos = 0; for (offs = 0; offs < src_len; ++offs) { const UINTVAL c = src_iter.get_and_advance(interp, &src_iter); if (dest_len - dest_pos < 6) { UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5); if (need < 16) need = 16; dest_len += need; if (in_place) p = (unsigned char *)mem_sys_realloc(p, dest_len); else { result->bufused = dest_pos; Parrot_reallocate_string(interp, dest, dest_len); p = (unsigned char *)dest->strstart; } } pos = p + dest_pos; new_pos = (unsigned char *)utf8_encode(interp, pos, c); dest_pos += (new_pos - pos); } result->bufused = dest_pos; } if (in_place) { Parrot_reallocate_string(interp, src, src->bufused); memcpy(src->strstart, p, src->bufused); mem_sys_free(p); } return result; } /* =item C RT#48260: Not yet documented!!! =cut */ static UINTVAL get_codepoint(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset) { const utf8_t * const start = (const utf8_t *)utf8_skip_forward(src->strstart, offset); return utf8_decode(interp, start); } /* =item C RT#48260: Not yet documented!!! =cut */ static void set_codepoint(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL codepoint) { const void *start; void *p; DECL_CONST_CAST; start = utf8_skip_forward(src->strstart, offset); p = const_cast(start); utf8_encode(interp, p, codepoint); } /* =item C RT#48260: Not yet documented!!! =cut */ static UINTVAL get_byte(SHIM_INTERP, ARGIN(const STRING *src), UINTVAL offset) { unsigned char *contents = (unsigned char *)src->strstart; if (offset >= src->bufused) { /* real_exception(interp, NULL, 0, "get_byte past the end of the buffer (%i of %i)", offset, src->bufused);*/ return 0; } return contents[offset]; } /* =item C RT#48260: Not yet documented!!! =cut */ static void set_byte(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL byte) { unsigned char *contents; if (offset >= src->bufused) { real_exception(interp, NULL, 0, "set_byte past the end of the buffer"); } contents = (unsigned char *)src->strstart; contents[offset] = (unsigned char)byte; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CANNOT_RETURN_NULL static STRING * get_codepoints(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count) { String_iter iter; UINTVAL start; STRING * const return_string = Parrot_make_COW_reference(interp, src); iter_init(interp, src, &iter); iter.set_position(interp, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start ; iter.set_position(interp, &iter, offset + count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; return_string->hashval = 0; return return_string; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CANNOT_RETURN_NULL static STRING * get_bytes(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count) { STRING * const return_string = Parrot_make_COW_reference(interp, src); return_string->encoding = src->encoding; /* XXX */ return_string->charset = src->charset; return_string->strstart = (char *)return_string->strstart + offset ; return_string->bufused = count; return_string->strlen = count; return_string->hashval = 0; return return_string; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CANNOT_RETURN_NULL static STRING * get_codepoints_inplace(PARROT_INTERP, NOTNULL(STRING *src), UINTVAL offset, UINTVAL count, NOTNULL(STRING *return_string)) { String_iter iter; UINTVAL start; Parrot_reuse_COW_reference(interp, src, return_string); iter_init(interp, src, &iter); iter.set_position(interp, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start ; iter.set_position(interp, &iter, offset + count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; return_string->hashval = 0; return return_string; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CANNOT_RETURN_NULL static STRING * get_bytes_inplace(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *return_string)) { UNIMPL; } /* =item C RT#48260: Not yet documented!!! =cut */ static void set_codepoints(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_codepoints)) { UNIMPL; } /* =item C RT#48260: Not yet documented!!! =cut */ static void set_bytes(PARROT_INTERP, SHIM(STRING *src), UINTVAL offset, UINTVAL count, SHIM(STRING *new_bytes)) { UNIMPL; } /* =item C Unconditionally makes the string be in this encoding, if that's valid =cut */ static void become_encoding(PARROT_INTERP, SHIM(STRING *src)) { UNIMPL; } /* =item C RT#48260: Not yet documented!!! =cut */ static UINTVAL codepoints(PARROT_INTERP, NOTNULL(STRING *src)) { String_iter iter; /* * this is used to initially calculate src->strlen, * therefore we must scan the whole string */ iter_init(interp, src, &iter); while (iter.bytepos < src->bufused) iter.get_and_advance(interp, &iter); return iter.charpos; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_PURE_FUNCTION static UINTVAL bytes(SHIM_INTERP, NOTNULL(STRING *src)) { return src->bufused; } /* =item C RT#48260: Not yet documented!!! =cut */ static void iter_init(SHIM_INTERP, ARGIN(const STRING *src), NOTNULL(String_iter *iter)) { iter->str = src; iter->bytepos = 0; iter->charpos = 0; iter->get_and_advance = utf8_decode_and_advance; iter->set_and_advance = utf8_encode_and_advance; iter->set_position = utf8_set_position; } /* =item C RT#48260: Not yet documented!!! =cut */ PARROT_CANNOT_RETURN_NULL ENCODING * Parrot_encoding_utf8_init(PARROT_INTERP) { ENCODING * const return_encoding = Parrot_new_encoding(interp); static const ENCODING base_encoding = { "utf8", 4, /* Max bytes per codepoint 0 .. 0x10ffff */ to_encoding, get_codepoint, set_codepoint, get_byte, set_byte, get_codepoints, get_codepoints_inplace, get_bytes, get_bytes_inplace, set_codepoints, set_bytes, become_encoding, codepoints, bytes, iter_init }; STRUCT_COPY(return_encoding, &base_encoding); Parrot_register_encoding(interp, "utf8", return_encoding); return return_encoding; } /* =back =head1 SEE ALSO F, F, F, F. =cut */ /* * Local variables: * c-file-style: "parrot" * End: * vim: expandtab shiftwidth=4: */