/* #define TEST_GET_EAST_ASIA_STR_WIDTH 1 */
#include <assert.h>
#include <locale.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "eawidth.h"
/*
* If the amount of columns the cursor advances on a TAB character depends
* on the current position, set this to a negative number (i.e. -8 for tab
* stops every eight columns. If static, set to a positive number. Zero if
* tabs are ignored.
*/
static const int tab_width = -8;
typedef struct {
unsigned short start, end;
east_asia_type type;
} eaw_db_type;
static const eaw_db_type eaw_db[] = {
{ 0x0020,0x007E,narrow },
{ 0x00A1,0x00A1,ambiguous }, /*INVERTED EXCLAMATION MARK*/
{ 0x00A2,0x00A3,narrow },
{ 0x00A4,0x00A4,ambiguous }, /*CURRENCY SIGN*/
{ 0x00A5,0x00A6,narrow },
{ 0x00A7,0x00A8,ambiguous },
{ 0x00AA,0x00AA,ambiguous }, /*FEMININE ORDINAL INDICATOR*/
{ 0x00AC,0x00AC,narrow }, /*NOT SIGN*/
{ 0x00AD,0x00AD,ambiguous }, /*SOFT HYPHEN*/
{ 0x00AF,0x00AF,narrow }, /*MACRON*/
{ 0x00B0,0x00B4,ambiguous },
{ 0x00B6,0x00BA,ambiguous },
{ 0x00BC,0x00BF,ambiguous },
{ 0x00C6,0x00C6,ambiguous }, /*LATIN CAPITAL LETTER AE*/
{ 0x00D0,0x00D0,ambiguous }, /*LATIN CAPITAL LETTER ETH*/
{ 0x00D7,0x00D8,ambiguous },
{ 0x00DE,0x00E1,ambiguous },
{ 0x00E6,0x00E6,ambiguous }, /*LATIN SMALL LETTER AE*/
{ 0x00E8,0x00EA,ambiguous },
{ 0x00EC,0x00ED,ambiguous },
{ 0x00F0,0x00F0,ambiguous }, /*LATIN SMALL LETTER ETH*/
{ 0x00F2,0x00F3,ambiguous },
{ 0x00F7,0x00FA,ambiguous },
{ 0x00FC,0x00FC,ambiguous }, /*LATIN SMALL LETTER U WITH DIAERESIS*/
{ 0x00FE,0x00FE,ambiguous }, /*LATIN SMALL LETTER THORN*/
{ 0x0101,0x0101,ambiguous }, /*LATIN SMALL LETTER A WITH MACRON*/
{ 0x0111,0x0111,ambiguous }, /*LATIN SMALL LETTER D WITH STROKE*/
{ 0x0113,0x0113,ambiguous }, /*LATIN SMALL LETTER E WITH MACRON*/
{ 0x011B,0x011B,ambiguous }, /*LATIN SMALL LETTER E WITH CARON*/
{ 0x0126,0x0127,ambiguous },
{ 0x012B,0x012B,ambiguous }, /*LATIN SMALL LETTER I WITH MACRON*/
{ 0x0131,0x0133,ambiguous },
{ 0x0138,0x0138,ambiguous }, /*LATIN SMALL LETTER KRA*/
{ 0x013F,0x0142,ambiguous },
{ 0x0144,0x0144,ambiguous }, /*LATIN SMALL LETTER N WITH ACUTE*/
{ 0x0148,0x014A,ambiguous },
{ 0x014D,0x014D,ambiguous }, /*LATIN SMALL LETTER O WITH MACRON*/
{ 0x0152,0x0153,ambiguous },
{ 0x0166,0x0167,ambiguous },
{ 0x016B,0x016B,ambiguous }, /*LATIN SMALL LETTER U WITH MACRON*/
{ 0x01CE,0x01CE,ambiguous }, /*LATIN SMALL LETTER A WITH CARON*/
{ 0x01D0,0x01D0,ambiguous }, /*LATIN SMALL LETTER I WITH CARON*/
{ 0x01D2,0x01D2,ambiguous }, /*LATIN SMALL LETTER O WITH CARON*/
{ 0x01D4,0x01D4,ambiguous }, /*LATIN SMALL LETTER U WITH CARON*/
{ 0x01D6,0x01D6,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+MACRON*/
{ 0x01D8,0x01D8,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+ACUTE*/
{ 0x01DA,0x01DA,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+CARON*/
{ 0x01DC,0x01DC,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+GRAVE*/
{ 0x0251,0x0251,ambiguous }, /*LATIN SMALL LETTER ALPHA*/
{ 0x0261,0x0261,ambiguous }, /*LATIN SMALL LETTER SCRIPT G*/
{ 0x02C7,0x02C7,ambiguous }, /*CARON*/
{ 0x02C9,0x02CB,ambiguous },
{ 0x02CD,0x02CD,ambiguous }, /*MODIFIER LETTER LOW MACRON*/
{ 0x02D0,0x02D0,ambiguous }, /*MODIFIER LETTER TRIANGULAR COLON*/
{ 0x02D8,0x02DB,ambiguous },
{ 0x02DD,0x02DD,ambiguous }, /*DOUBLE ACUTE ACCENT*/
{ 0x0300,0x0362,ambiguous },
{ 0x0391,0x03A9,ambiguous },
{ 0x03B1,0x03C1,ambiguous },
{ 0x03C3,0x03C9,ambiguous },
{ 0x0401,0x0401,ambiguous }, /*CYRILLIC CAPITAL LETTER IO*/
{ 0x0410,0x044F,ambiguous },
{ 0x0451,0x0451,ambiguous }, /*CYRILLIC SMALL LETTER IO*/
{ 0x1100,0x115F,wide },
{ 0x2010,0x2010,ambiguous }, /*HYPHEN*/
{ 0x2013,0x2016,ambiguous },
{ 0x2018,0x2019,ambiguous },
{ 0x201C,0x201D,ambiguous },
{ 0x2020,0x2021,ambiguous },
{ 0x2025,0x2027,ambiguous },
{ 0x2030,0x2030,ambiguous }, /*PER MILLE SIGN*/
{ 0x2032,0x2033,ambiguous },
{ 0x2035,0x2035,ambiguous }, /*REVERSED PRIME*/
{ 0x203B,0x203B,ambiguous }, /*REFERENCE MARK*/
{ 0x2074,0x2074,ambiguous }, /*SUPERSCRIPT FOUR*/
{ 0x207F,0x207F,ambiguous }, /*SUPERSCRIPT LATIN SMALL LETTER N*/
{ 0x2081,0x2084,ambiguous },
{ 0x20A9,0x20A9,half_width }, /*WON SIGN*/
{ 0x20AC,0x20AC,ambiguous }, /*EURO SIGN*/
{ 0x2103,0x2103,ambiguous }, /*DEGREE CELSIUS*/
{ 0x2105,0x2105,ambiguous }, /*CARE OF*/
{ 0x2109,0x2109,ambiguous }, /*DEGREE FAHRENHEIT*/
{ 0x2113,0x2113,ambiguous }, /*SCRIPT SMALL L*/
{ 0x2121,0x2122,ambiguous },
{ 0x2126,0x2126,ambiguous }, /*OHM SIGN*/
{ 0x212B,0x212B,ambiguous }, /*ANGSTROM SIGN*/
{ 0x2154,0x2155,ambiguous },
{ 0x215B,0x215B,ambiguous }, /*VULGAR FRACTION ONE EIGHTH*/
{ 0x215E,0x215E,ambiguous }, /*VULGAR FRACTION SEVEN EIGHTHS*/
{ 0x2160,0x216B,ambiguous },
{ 0x2170,0x2179,ambiguous },
{ 0x2190,0x2199,ambiguous },
{ 0x21D2,0x21D2,ambiguous }, /*RIGHTWARDS DOUBLE ARROW*/
{ 0x21D4,0x21D4,ambiguous }, /*LEFT RIGHT DOUBLE ARROW*/
{ 0x2200,0x2200,ambiguous }, /*FOR ALL*/
{ 0x2202,0x2203,ambiguous },
{ 0x2207,0x2208,ambiguous },
{ 0x220B,0x220B,ambiguous }, /*CONTAINS AS MEMBER*/
{ 0x220F,0x220F,ambiguous }, /*N-ARY PRODUCT*/
{ 0x2211,0x2211,ambiguous }, /*N-ARY SUMMATION*/
{ 0x2215,0x2215,ambiguous }, /*DIVISION SLASH*/
{ 0x221A,0x221A,ambiguous }, /*SQUARE ROOT*/
{ 0x221D,0x2220,ambiguous },
{ 0x2223,0x2223,ambiguous }, /*DIVIDES*/
{ 0x2225,0x2225,ambiguous }, /*PARALLEL TO*/
{ 0x2227,0x222C,ambiguous },
{ 0x222E,0x222E,ambiguous }, /*CONTOUR INTEGRAL*/
{ 0x2234,0x2237,ambiguous },
{ 0x223C,0x223D,ambiguous },
{ 0x2248,0x2248,ambiguous }, /*ALMOST EQUAL TO*/
{ 0x224C,0x224C,ambiguous }, /*ALL EQUAL TO*/
{ 0x2252,0x2252,ambiguous }, /*APPROXIMATELY EQUAL TO OR THE IMAGE OF*/
{ 0x2260,0x2261,ambiguous },
{ 0x2264,0x2267,ambiguous },
{ 0x226A,0x226B,ambiguous },
{ 0x226E,0x226F,ambiguous },
{ 0x2282,0x2283,ambiguous },
{ 0x2286,0x2287,ambiguous },
{ 0x2295,0x2295,ambiguous }, /*CIRCLED PLUS*/
{ 0x2299,0x2299,ambiguous }, /*CIRCLED DOT OPERATOR*/
{ 0x22A5,0x22A5,ambiguous }, /*UP TACK*/
{ 0x22BF,0x22BF,ambiguous }, /*RIGHT TRIANGLE*/
{ 0x2312,0x2312,ambiguous }, /*ARC*/
{ 0x2460,0x24BF,ambiguous },
{ 0x24D0,0x24E9,ambiguous },
{ 0x2500,0x254B,ambiguous },
{ 0x2550,0x2574,ambiguous },
{ 0x2580,0x258F,ambiguous },
{ 0x2592,0x25A1,ambiguous },
{ 0x25A3,0x25A9,ambiguous },
{ 0x25B2,0x25B3,ambiguous },
{ 0x25B6,0x25B7,ambiguous },
{ 0x25BC,0x25BD,ambiguous },
{ 0x25C0,0x25C1,ambiguous },
{ 0x25C6,0x25C8,ambiguous },
{ 0x25CB,0x25CB,ambiguous }, /*WHITE CIRCLE*/
{ 0x25CE,0x25D1,ambiguous },
{ 0x25E2,0x25E5,ambiguous },
{ 0x25EF,0x25EF,ambiguous }, /*LARGE CIRCLE*/
{ 0x2605,0x2606,ambiguous },
{ 0x2609,0x2609,ambiguous }, /*SUN*/
{ 0x260E,0x260F,ambiguous },
{ 0x261C,0x261C,ambiguous }, /*WHITE LEFT POINTING INDEX*/
{ 0x261E,0x261E,ambiguous }, /*WHITE RIGHT POINTING INDEX*/
{ 0x2640,0x2640,ambiguous }, /*FEMALE SIGN*/
{ 0x2642,0x2642,ambiguous }, /*MALE SIGN*/
{ 0x2660,0x2661,ambiguous },
{ 0x2663,0x2665,ambiguous },
{ 0x2667,0x266A,ambiguous },
{ 0x266C,0x266D,ambiguous },
{ 0x266F,0x266F,ambiguous }, /*MUSIC SHARP SIGN*/
{ 0x2E80,0x3009,wide },
{ 0x300A,0x300B,ambiguous },
{ 0x300C,0x3019,wide },
{ 0x301A,0x301B,ambiguous },
{ 0x301C,0x303E,wide },
{ 0x3041,0xD7A3,wide },
{ 0xE000,0xF8FF,ambiguous },
{ 0xF900,0xFA2D,wide },
{ 0xFE30,0xFE6B,wide },
{ 0xFF01,0xFF5E,full_width },
{ 0xFF61,0xFFDC,half_width },
{ 0xFFE0,0xFFE6,full_width },
{ 0xFFE8,0xFFEE,half_width },
};
static int
eaw_db_cmp (const void *ck, const void *ce) {
const eaw_db_type *key = ck, *element = ce;
assert(key != NULL);
assert(element != NULL);
if (key->start < element->start) return -1;
else if (key->end > element->end) return 1;
return 0;
}
static int
is_cjk_locale (const char *locale_name) {
static const char c[] = "zh"; /* Chinese */
static const char j[] = "ja"; /* Japanese */
static const char k[] = "ko"; /* Korean */
if (NULL == locale_name) return 0;
if (strncmp(locale_name, c, sizeof(c)) == 0) return 1;
if (strncmp(locale_name, j, sizeof(j)) == 0) return 1;
if (strncmp(locale_name, k, sizeof(k)) == 0) return 1;
return 0;
}
east_asia_type
get_east_asia_type (wchar_t unicode) {
assert(0xFFFF != unicode && 0xFFFE != unicode);
if (unicode > 0xFFFF) {
/*
* Plane 2 is intended for CJK ideographs
*/
if (unicode >= 0x20000 && unicode <= 0x2FFFD) return wide;
return ambiguous;
}
else {
eaw_db_type *pos, key;
size_t n;
n = sizeof(eaw_db) / sizeof(eaw_db_type);
key.start = key.end = (unsigned short) unicode;
pos = bsearch(&key, eaw_db, n, sizeof(eaw_db_type), eaw_db_cmp);
if (NULL != pos) return pos->type;
}
return neutral;
}
int
east_asia_mblen (const char *locale_name, const char *s, size_t n, int x)
{
wchar_t *wcs, *p;
int width = 0;
if (NULL == s) s = "";
/*
* Getting the locale name via setlocale() is expensive, so we prefer
* to have it passed to us.
*/
if (NULL == locale_name) {
locale_name = setlocale(LC_CTYPE, NULL);
if (NULL == locale_name) return INT_MAX;
}
wcs = (wchar_t *) calloc(n, sizeof(wchar_t));
if (NULL == wcs) return INT_MAX;
#if defined __GLIBC__ && !__GLIBC_PREREQ(2,2)
#warning wide character support is broken. Glibc 2.2 or better needed.
#endif
if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX;
switch (get_east_asia_type(*wcs)) {
case neutral:
/*
* Put characters that print nothing here.
*
* XXX: Yes, I know there are a lot more than this in ISO-10646, but
* this function is intended to calculate the width of strings for
* fixed width terminals displaying legacy CJK character sets.
* State-of-the-art Unicode handling terminals probably won't need
* this function anyway.
*/
if (0x0000 == *wcs) break; /* NULL */
if (0x0007 == *wcs) break; /* BELL */
/* FIXME: there will probably be ASCII chars after the escape
* code, which will be counted as part of the width even though they
* aren't displayed.
*/
if (0x001B == *wcs) break; /* ESC */
if (0xFEFF == *wcs) break; /* ZWNBSP aka BOM (magic, signature) */
/*
* Special characters go here
*/
if (0x0008 == *wcs) { /* BACKSPACE */
width = -1;
break;
}
if (0x0009 == *wcs) { /* TAB */
if (tab_width < 0) width = x % abs(tab_width);
else width = tab_width;
break;
}
/*FALLTHRU*/
case narrow:
case half_width:
width = 1;
break;
case wide:
case full_width:
width = 2;
break;
case ambiguous:
width = is_cjk_locale(locale_name) ? 2 : 1;
break;
default:
width = INT_MAX;
}
free(wcs);
return width;
}
int
get_east_asia_str_n_width (const char *locale_name, const char *s, size_t n, int x)
{
int total_width = 0;
wchar_t *wcs, *p;
if (NULL == s) s = "";
/*
* Getting the locale name via setlocale() is expensive, so we prefer
* to have it passed to us.
*/
if (NULL == locale_name) {
locale_name = setlocale(LC_CTYPE, NULL);
if (NULL == locale_name) return INT_MAX;
}
wcs = (wchar_t *) calloc(n, sizeof(wchar_t));
if (NULL == wcs) return INT_MAX;
#if defined __GLIBC__ && !__GLIBC_PREREQ(2,2)
#warning wide character support is broken. Glibc 2.2 or better needed.
#endif
if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX;
for (p = wcs; L'\0' != *p; p++) {
int width = 0;
switch (get_east_asia_type(*p)) {
case neutral:
/*
* Put characters that print nothing here.
*
* XXX: Yes, I know there are a lot more than this in ISO-10646, but
* this function is intended to calculate the width of strings for
* fixed width terminals displaying legacy CJK character sets.
* State-of-the-art Unicode handling terminals probably won't need
* this function anyway.
*/
if (0x0000 == *p) break; /* NULL */
if (0x0007 == *p) break; /* BELL */
/* FIXME: there will probably be ASCII chars after the escape
* code, which will be counted as part of the width even though they
* aren't displayed.
*/
if (0x001B == *p) break; /* ESC */
if (0xFEFF == *p) break; /* ZWNBSP aka BOM (magic, signature) */
/*
* Special characters go here
*/
if (0x0008 == *p) { /* BACKSPACE */
width = -1;
break;
}
if (0x0009 == *p) { /* TAB */
if (tab_width < 0) width = x % abs(tab_width);
else width = tab_width;
break;
}
/*FALLTHRU*/
case narrow:
case half_width:
width = 1;
break;
case wide:
case full_width:
width = 2;
break;
case ambiguous:
width = is_cjk_locale(locale_name) ? 2 : 1;
break;
default: abort(); /* Doh! */
}
x += width;
total_width += width;
}
free(wcs);
return total_width;
}
int
get_east_asia_str_width (const char *locale_name, const char *s, int x) {
size_t n;
int rc;
n = strlen(s) + 1;
rc = get_east_asia_str_n_width (locale_name, s, n, x);
if (rc == INT_MAX)
return strlen (s);
return rc;
}
#if TEST_GET_EAST_ASIA_STR_WIDTH
#include <stdio.h>
int
main (int argc, char *argv[]) {
int i;
char *lc;
const char *fmt = "word #%d ('%s') length is %zu, width is %u\n";
lc = setlocale(LC_CTYPE, "");
if (NULL == lc) {
fputs("couldn't set the default locale for LC_CTYPE\n", stderr);
exit(EXIT_FAILURE);
}
if (printf("character type locale is '%s'\n", lc) < 0) {
perror(NULL);
exit(EXIT_FAILURE);
}
for (i = 1; argc < 2 || i < argc; i++) {
char *s;
size_t length;
unsigned width;
if (argc < 2) {
if (scanf("%as", &s) < 1 && ferror(stdin)) {
perror(NULL);
exit(EXIT_FAILURE);
}
else if (feof(stdin)) break;
}
else s = strdup(argv[(size_t) i]);
if (NULL == s) {
perror(NULL);
exit(EXIT_FAILURE);
}
length = strlen(s);
width = get_east_asia_str_width(lc, s, 0);
if (printf(fmt, i, s, length, width) < 0) {
perror(NULL);
exit(EXIT_FAILURE);
}
free(s);
}
return 0;
}
#endif
syntax highlighted by Code2HTML, v. 0.9.1