// --------------------------------------------------------------------------- // - cucd.hpp - // - unicode database (ucd) library definitions - // --------------------------------------------------------------------------- // - This program is free software; you can redistribute it and/or modify - // - it provided that this copyright notice is kept intact. - // - - // - This program is distributed in the hope that it will be useful, but - // - without any warranty; without even the implied warranty of - // - merchantability or fitness for a particular purpose. In no event shall - // - the copyright holder be liable for any direct, indirect, incidental or - // - special damages arising in any way out of the use of this software. - // --------------------------------------------------------------------------- // - copyright (c) 1999-2007 amaury darsch - // --------------------------------------------------------------------------- #include "ccnf.hpp" namespace afnix { /// the ucd version const t_byte UCD_MAJOR = 5; const t_byte UCD_MINOR = 0; const t_byte UCD_PATCH = 0; /// general category value const t_byte UCD_GCV_LU = 0x00; // letter, uppercase const t_byte UCD_GCV_LL = 0x01; // letter, lowercase const t_byte UCD_GCV_LT = 0x02; // letter, titlecase const t_byte UCD_GCV_LM = 0x03; // letter, modifier const t_byte UCD_GCV_LO = 0x04; // letter, other const t_byte UCD_GCV_MN = 0x10; // mark, nonspacing const t_byte UCD_GCV_MC = 0x11; // mark, spacing combining const t_byte UCD_GCV_ME = 0x12; // mark, enclosing const t_byte UCD_GCV_ND = 0x20; // number, decimal digit const t_byte UCD_GCV_NL = 0x21; // number, letter const t_byte UCD_GCV_NO = 0x22; // number, other const t_byte UCD_GCV_PC = 0x30; // punctuation, connector const t_byte UCD_GCV_PD = 0x31; // punctuation, dash const t_byte UCD_GCV_PS = 0x32; // punctuation, open const t_byte UCD_GCV_PE = 0x33; // punctuation, close const t_byte UCD_GCV_PI = 0x34; // punctuation, initial quote const t_byte UCD_GCV_PF = 0x35; // punctuation, final quote const t_byte UCD_GCV_PO = 0x36; // punctuation, other const t_byte UCD_GCV_SM = 0x40; // symbol, math const t_byte UCD_GCV_SC = 0x41; // symbol, currency const t_byte UCD_GCV_SK = 0x42; // symbol, modifier const t_byte UCD_GCV_SO = 0x43; // symbol, other const t_byte UCD_GCV_ZS = 0x50; // separator, space const t_byte UCD_GCV_ZL = 0x51; // separator, line const t_byte UCD_GCV_ZP = 0x52; // separator, paragraph const t_byte UCD_GCV_CC = 0x60; // other, control const t_byte UCD_GCV_CF = 0x61; // other, format const t_byte UCD_GCV_CS = 0x62; // other, surrogate const t_byte UCD_GCV_CO = 0x63; // other, private use const t_byte UCD_GCV_CN = 0x64; // other, not assigned /// bidirectional class value (BCV) const t_byte UCD_BCV_L = 0x00; // left-to-right const t_byte UCD_BCV_LRE = 0x01; // left-to-right embedding const t_byte UCD_BCV_LRO = 0x02; // left-to-right override const t_byte UCD_BCV_R = 0x03; // right-to-left const t_byte UCD_BCV_AL = 0x04; // right-to-left arabic const t_byte UCD_BCV_RLE = 0x05; // right-to-left embedding const t_byte UCD_BCV_RLO = 0x06; // right-to-left override const t_byte UCD_BCV_PDF = 0x07; // pop directional format const t_byte UCD_BCV_EN = 0x08; // european number const t_byte UCD_BCV_ES = 0x09; // european number separator const t_byte UCD_BCV_ET = 0x0A; // european number terminator const t_byte UCD_BCV_AN = 0x0B; // arabic number const t_byte UCD_BCV_CS = 0x0C; // common number separator const t_byte UCD_BCV_NSM = 0x0D; // non-spacing mark const t_byte UCD_BCV_BN = 0x0E; // boundary neutral const t_byte UCD_BCV_B = 0x0F; // paragraph separator const t_byte UCD_BCV_S = 0x10; // segment separator const t_byte UCD_BCV_WS = 0x11; // whitespace const t_byte UCD_BCV_ON = 0x12; // other neutrals /// character decomposition mapping value const t_byte UCD_DMV_NIL = 0x00; // no decompostion const t_byte UCD_DMV_FNT = 0x01; // font variant const t_byte UCD_DMV_NOB = 0x02; // no-break of space or hyphen const t_byte UCD_DMV_INI = 0x03; // initial presentation form (arabic) const t_byte UCD_DMV_MED = 0x04; // medial presentation form (arabic) const t_byte UCD_DMV_FIN = 0x05; // final presentation form (arabic) const t_byte UCD_DMV_ISO = 0x06; // isolated presentation form (arabic) const t_byte UCD_DMV_CIR = 0x07; // encircled form const t_byte UCD_DMV_SUP = 0x08; // superscript form const t_byte UCD_DMV_SUB = 0x09; // subscript form const t_byte UCD_DMV_VER = 0x0A; // vertical layout presentation form const t_byte UCD_DMV_WID = 0x0B; // wide compatibility character const t_byte UCD_DMV_NRW = 0x0C; // narrow compatibility character const t_byte UCD_DMV_SML = 0x0D; // small variant form const t_byte UCD_DMV_SQR = 0x0E; // CJK squared font variant const t_byte UCD_DMV_FRA = 0x0F; // vulgar fraction form const t_byte UCD_DMV_CPT = 0x10; // unspecified compatible character /// the canonical decomposition field const long UCD_CDV_MAX = 18; /// the upper case mapping max fields const long UCD_UCM_MAX = 3; /// the lower case mapping max fields const long UCD_LCM_MAX = 3; /// the title case mapping max fields const long UCD_TCM_MAX = 3; /// the ucd structure struct ucd_s { t_quad d_code; // code point value const char* p_name; // code point name t_byte d_pgcv; // general category value long d_pccc; // canonical combining class t_byte d_pbcv; // bidirectional class value t_byte d_cexc; // composition exclusion flag t_byte d_pdmv; // decomposition mapping value // canonical decomposition t_quad d_cdmv[UCD_CDV_MAX]; // upper case mapping t_quad d_umap[UCD_UCM_MAX]; // lower case mapping t_quad d_lmap[UCD_LCM_MAX]; // title case mapping t_quad d_tmap[UCD_TCM_MAX]; }; /// @return a plane size by plane index const long c_ucdpsize (const long index); /// @return a plane array by plane index const ucd_s* c_ucdplane (const long index); /// @return a ucd structure by code point const ucd_s* c_getucd (const t_quad code); /// perform a normal form decomposition (canonical) /// @param dst the destination buffer /// @param code the code to decompose bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad code); /// perform a normal form decomposition (canonical) /// @param dst the destination buffer /// @param src the source buffer to decompose bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad src[UCD_CDV_MAX]); /// @return a nil allocated string t_quad* c_ucdnil (void); /// normalize character buffer into a canonical form /// @param s the string to convert /// @param size the string size t_quad* c_ucdnrm (const char* s, const long size); /// normalize a string into a canonical form /// @param s the string to convert /// @param size the string size t_quad* c_ucdnrm (const t_quad* s, const long size); /// put an array in a canonical order form /// @param buf the character buffer to sort /// @param size the buffer size void c_ucdcof (t_quad* buf, const long size); /// convert a unicode character to lower case /// @param dst the destination buffer /// @param code the code to decompose /// @return the destination size long c_ucdtol (t_quad dst[UCD_LCM_MAX], const t_quad code); /// convert a unicode character to upper case /// @param dst the destination buffer /// @param code the code to decompose /// @return the destination size long c_ucdtou (t_quad dst[UCD_UCM_MAX], const t_quad code); /// @return true if the code point is not combining bool c_ucdncc (const t_quad code); };