/* widechar.c - handle multibyte and UTF-8 encoding Copyright (C) 1996-2000 Paul Sheer This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. */ #include #include #define MB_MARKER_DENSITY 64 /* 1 | 7 | 0vvvvvvv 2 | 11 | 110vvvvv 10vvvvvv 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv */ /* force utf-8 only multibyte encoding - i.e. ignore locale settings */ int option_utf_interpretation = 0; unsigned char *wcrtomb_ucs4_to_utf8 (wchar_t c) { static unsigned char r[32]; int i = 0; #undef APPEND #define APPEND(x) r[i++] = (unsigned char) (x) if (c < (1 << 7)) { APPEND (c); } else if (c < (1 << 11)) { APPEND ((c >> 6) | 0xC0); APPEND ((c & 0x3F) | 0x80); } else if (c < (1 << 16)) { APPEND ((c >> 12) | 0xE0); APPEND (((c >> 6) & 0x3F) | 0x80); APPEND ((c & 0x3F) | 0x80); } else if (c < (1 << 21)) { APPEND ((c >> 18) | 0xE0); APPEND (((c >> 12) & 0x3F) | 0x80); APPEND (((c >> 6) & 0x3F) | 0x80); APPEND ((c & 0x3F) | 0x80); } APPEND ('\0'); return r; } /* makes sense to me... (although only goes to 21 bits) */ static inline int mbrtowc_utf8_to_ucs4 (wchar_t * c, char *t, int n, void *x /* no shifting with utf8 */ ) { unsigned char *s = (unsigned char *) t; if (!*s) { *c = 0; return 0; } if (*s < 0x80) { *c = (wchar_t) * s; return 1; } if (*s < 0xC0) return -1; if (*s < 0xE0) { if (n < 2) return -2; if ((s[1] & 0xC0) != 0x80) return -1; *c = ((wchar_t) (s[0] & 0x1F) << 6) | (wchar_t) (s[1] & 0x3F); if (*c < (1 << 7)) return -1; return 2; } if (*s < 0xF0) { if (n < 3) return -2; if ((s[1] & 0xC0) != 0x80) return -1; if ((s[2] & 0xC0) != 0x80) return -1; *c = ((wchar_t) (s[0] & 0x0F) << 12) | ((wchar_t) (s[1] & 0x3F) << 6) | (wchar_t) (s[2] & 0x3F); if (*c < (1 << 11)) return -1; return 3; } if (*s < 0xF8) { if (n < 4) return -2; if ((s[1] & 0xC0) != 0x80) return -1; if ((s[2] & 0xC0) != 0x80) return -1; if ((s[3] & 0xC0) != 0x80) return -1; *c = ((wchar_t) (s[0] & 0x07) << 18) | ((wchar_t) (s[1] & 0x3F) << 12) | ((wchar_t) (s[2] & 0x3F) << 6) | (wchar_t) (s[3] & 0x3F); if (*c < (1 << 16)) return -1; return 4; } if (*s < 0xFC) { if (n < 5) return -2; if ((s[1] & 0xC0) != 0x80) return -1; if ((s[2] & 0xC0) != 0x80) return -1; if ((s[3] & 0xC0) != 0x80) return -1; if ((s[4] & 0xC0) != 0x80) return -1; *c = ((wchar_t) (s[0] & 0x03) << 24) | ((wchar_t) (s[1] & 0x3F) << 18) | ((wchar_t) (s[2] & 0x3F) << 12) | ((wchar_t) (s[3] & 0x3F) << 6) | (wchar_t) (s[4] & 0x3F); if (*c < (1 << 21)) return -1; return 5; } if (*s < 0xFE) { if (n < 6) return -2; if ((s[1] & 0xC0) != 0x80) return -1; if ((s[2] & 0xC0) != 0x80) return -1; if ((s[3] & 0xC0) != 0x80) return -1; if ((s[4] & 0xC0) != 0x80) return -1; if ((s[5] & 0xC0) != 0x80) return -1; *c = ((wchar_t) (s[0] & 0x01) << 30) | ((wchar_t) (s[1] & 0x3F) << 24) | ((wchar_t) (s[2] & 0x3F) << 18) | ((wchar_t) (s[3] & 0x3F) << 12) | ((wchar_t) (s[4] & 0x3F) << 6) | (wchar_t) (s[5] & 0x3F); if (*c < (1 << 26)) return -1; return 6; } return -1; } #if 0 /* last arg is len of t to convert NOT len of c */ int mbstowcs_utf8_to_ucs4 (wchar_t * c, char *t, int n) { int v = 0; while (n) { int r; if ((r = mbrtowc_utf8_to_ucs4 (c, t, n, 0)) == -1) { *c++ = *t++; v++; n--; } else if (r == -2) { break; } else { t += r; n -= r; v++; c++; } } return v; } wchar_t *mbstowcs_dup (unsigned char *s) { wchar_t *t; t = CMalloc ((strlen ((char *) s) + 1) * sizeof (wchar_t)); t[mbstowcs_utf8_to_ucs4 (t, (char *) s, strlen ((char *) s))] = 0; return t; } int wchar_t_strlen (wchar_t * p) { int v; for (v = 0; *p; p++, v++); return v; } #endif static inline struct mb_rule apply_mb_rules_going_right_utf8_to_ucs4 (WEdit * edit, long byte_index, struct mb_rule mb_rule) { wchar_t wc; unsigned char p[16]; int n; if (mb_rule.end) { mb_rule.end--; mb_rule.ch = -1; return mb_rule; } for (n = 0; n < 6; n++) { int r; p[n] = edit_get_byte (edit, byte_index + n); r = mbrtowc_utf8_to_ucs4 (&wc, (char *) p, n + 1, &mb_rule.shift_state); if (r >= 0) { mb_rule.end = n; mb_rule.ch = wc; return mb_rule; } if (r == -1) { mb_rule.end = 0; mb_rule.ch = (unsigned long) *p | 0x80000000; return mb_rule; } } mb_rule.end = 0; mb_rule.ch = -1; return mb_rule; } static inline struct mb_rule apply_mb_rules_going_right (WEdit * edit, long byte_index, struct mb_rule mb_rule) { #ifdef HAVE_WCHAR_H #ifdef HAVE_MBRTOWC wchar_t wc; unsigned char p[16]; int n; if (mb_rule.end) { mb_rule.end--; mb_rule.ch = -1; return mb_rule; } for (n = 0; n < MB_CUR_MAX; n++) { int r; p[n] = edit_get_byte (edit, byte_index + n); r = mbrtowc (&wc, (char *) p, n + 1, &mb_rule.shift_state); if (r >= 0) { mb_rule.end = n; mb_rule.ch = wc; return mb_rule; } if (r == -1) { mb_rule.end = 0; mb_rule.ch = *p; return mb_rule; } } mb_rule.end = 0; mb_rule.ch = -1; #endif #endif return mb_rule; } struct mb_rule get_mb_rule (WEdit * edit, long byte_index) { long i; if ( #ifndef HAVE_WCHAR_H !option_utf_interpretation || #endif (MB_CUR_MAX == 1 && !option_utf_interpretation)) { struct mb_rule r; r.end = 0; r.ch = edit_get_byte (edit, byte_index); return r; } if (edit->mb_invalidate) { struct _mb_marker *s; while (edit->mb_marker && edit->mb_marker->offset >= edit->last_get_mb_rule) { s = edit->mb_marker->next; free (edit->mb_marker); edit->mb_marker = s; } if (edit->mb_marker) { edit->last_get_mb_rule = edit->mb_marker->offset; edit->mb_rule = edit->mb_marker->rule; } else { edit->last_get_mb_rule = -1; memset (&edit->mb_rule, 0, sizeof (edit->mb_rule)); } edit->mb_invalidate = 0; } if (byte_index > edit->last_get_mb_rule) { if (option_utf_interpretation) { for (i = edit->last_get_mb_rule + 1; i <= byte_index; i++) { edit->mb_rule = apply_mb_rules_going_right_utf8_to_ucs4 (edit, i, edit->mb_rule); if (i > (edit->mb_marker ? edit->mb_marker->offset + MB_MARKER_DENSITY : MB_MARKER_DENSITY)) { struct _mb_marker *s; s = edit->mb_marker; edit->mb_marker = malloc (sizeof (struct _mb_marker)); edit->mb_marker->next = s; edit->mb_marker->offset = i; edit->mb_marker->rule = edit->mb_rule; } } } else { for (i = edit->last_get_mb_rule + 1; i <= byte_index; i++) { edit->mb_rule = apply_mb_rules_going_right (edit, i, edit->mb_rule); if (i > (edit->mb_marker ? edit->mb_marker->offset + MB_MARKER_DENSITY : MB_MARKER_DENSITY)) { struct _mb_marker *s; s = edit->mb_marker; edit->mb_marker = malloc (sizeof (struct _mb_marker)); edit->mb_marker->next = s; edit->mb_marker->offset = i; edit->mb_marker->rule = edit->mb_rule; } } } } else if (byte_index < edit->last_get_mb_rule) { struct _mb_marker *s; for (;;) { if (!edit->mb_marker) { memset (&edit->mb_rule, 0, sizeof (edit->mb_rule)); if (option_utf_interpretation) { for (i = -1; i <= byte_index; i++) edit->mb_rule = apply_mb_rules_going_right_utf8_to_ucs4 (edit, i, edit->mb_rule); } else { for (i = -1; i <= byte_index; i++) edit->mb_rule = apply_mb_rules_going_right (edit, i, edit->mb_rule); } break; } if (byte_index >= edit->mb_marker->offset) { edit->mb_rule = edit->mb_marker->rule; if (option_utf_interpretation) { for (i = edit->mb_marker->offset + 1; i <= byte_index; i++) edit->mb_rule = apply_mb_rules_going_right_utf8_to_ucs4 (edit, i, edit->mb_rule); } else { for (i = edit->mb_marker->offset + 1; i <= byte_index; i++) edit->mb_rule = apply_mb_rules_going_right (edit, i, edit->mb_rule); } break; } s = edit->mb_marker->next; free (edit->mb_marker); edit->mb_marker = s; } } edit->last_get_mb_rule = byte_index; return edit->mb_rule; } long edit_get_wide_byte (WEdit * edit, long byte_index) { struct mb_rule r; r = get_mb_rule (edit, byte_index); return r.ch; }