// --------------------------------------------------------------------------- // - cucd.cxx - // - unicode database (ucd) library functions implementation - // --------------------------------------------------------------------------- // - This program is free software; you can redistribute it and/or modify - // - it provided that this copyright notice is kept intact. - // - - // - This program is distributed in the hope that it will be useful, but - // - without any warranty; without even the implied warranty of - // - merchantability or fitness for a particular purpose. In no event shall - // - the copyright holder be liable for any direct, indirect, incidental or - // - special damages arising in any way out of the use of this software. - // --------------------------------------------------------------------------- // - copyright (c) 1999-2007 amaury darsch - // --------------------------------------------------------------------------- #include "cucd.hxx" #include "cthr.hpp" #include "csys.hpp" namespace afnix { // ------------------------------------------------------------------------- // - private section - // ------------------------------------------------------------------------- // the plane size 0x8000 = 32768 static const long UCD_DPA_MAX = 65536; // the indirect plane size 0x8000 = 32768 static const long UCD_IPA_MAX = 32768; // the indirect plane access array static const ucd_s*** p_ipa = nilp; // static mutex creation function static void* mtx_create (void); // mutex or network services static void* mtx = mtx_create (); // this function destroy the mutex at exit static void mtx_destroy (void) { // destroy each ipa if (p_ipa != nilp) { for (long i = 0; i < UCD_IPA_MAX; i++) delete [] p_ipa[i]; } // destroy the indirect array delete [] p_ipa; // destroy the mutex c_mtxdestroy (mtx); } // this function initialize a mutex statically and register its // destruction to be done at exit static void* mtx_create (void) { void* mtx = c_mtxcreate (); c_atexit (mtx_destroy); return mtx; } // this function loads a plane by index static void ucd_load_plane (const long index) { // check the plane index if ((index < 0) || (index >= UCD_IPA_MAX)) return; // lock the mutex c_mtxlock (mtx); // preload the indirect array if (p_ipa == nilp) { p_ipa = new const ucd_s**[UCD_IPA_MAX]; for (long i = 0; i < UCD_IPA_MAX; i++) p_ipa[i] = nilp; } if (p_ipa[index] == nilp) { // initialize the array const ucd_s** dpa = new const ucd_s*[UCD_DPA_MAX]; for (long i = 0; i < UCD_DPA_MAX; i++) dpa[i] = nilp; // get the plane size and data const long psize = c_ucdpsize (index); const ucd_s* plane = c_ucdplane (index); // initialize the direct access plane for (long i = 0; i < psize; i++) { long pidx = (long) (plane[i].d_code & 0x0000FFFF); dpa[pidx] = &plane[i]; } p_ipa[index] = dpa; } c_mtxunlock (mtx); } // fill an array with a canonical decompostion and an index static bool ucd_fill_nfd (t_quad dst[UCD_CDV_MAX], long& index, const t_quad code) { // check the index if (index >= UCD_CDV_MAX) return false; // get the ucd record const ucd_s* ucd = c_getucd (code); if (ucd == nilp) { dst[index++] = code; return true; } // check if a canonical decomposition exists if (ucd->d_pdmv != UCD_DMV_NIL) { dst[index++] = code; return true; } // check if the first decomposition is null - if yes then there is no // decomposition and the character is mapped in placed if (ucd->d_cdmv[0] == nilq) { dst[index++] = code; return true; } // here a decomposition exists - the decomposition mapping value is nil // so it means that it is a canonical decomposition - let's go with it! for (long i = 0; i < UCD_CDV_MAX; i++) { // get the decomposition value t_quad c = ucd->d_cdmv[i]; if (c == nilq) break; // recursivelly remap it ucd_fill_nfd (dst, index, c); } return true; } // this procedure find the ccc value for a code point static inline long ucd_find_ccc (const t_quad code) { // get the ucd record const ucd_s* ucd = c_getucd (code); // extract ccc value return (ucd == nilp) ? 0 : ucd->d_pccc; } // ------------------------------------------------------------------------- // - public section - // ------------------------------------------------------------------------- // return the ucd plane size by index const long c_ucdpsize (const long index) { switch (index) { case 0x0000: return ucd_get_psize_0000 (); case 0x0001: return ucd_get_psize_0001 (); case 0x0002: return ucd_get_psize_0002 (); case 0x0003: return ucd_get_psize_0003 (); case 0x0004: return ucd_get_psize_0004 (); case 0x0005: return ucd_get_psize_0005 (); case 0x0006: return ucd_get_psize_0006 (); case 0x0007: return ucd_get_psize_0007 (); case 0x0008: return ucd_get_psize_0008 (); case 0x0009: return ucd_get_psize_0009 (); case 0x000A: return ucd_get_psize_000A (); case 0x000B: return ucd_get_psize_000B (); case 0x000C: return ucd_get_psize_000C (); case 0x000D: return ucd_get_psize_000D (); case 0x000E: return ucd_get_psize_000E (); default: break; } return 0; } // return the ucd plane array by index const ucd_s* c_ucdplane (const long index) { switch (index) { case 0x0000: return ucd_get_plane_0000 (); case 0x0001: return ucd_get_plane_0001 (); case 0x0002: return ucd_get_plane_0002 (); case 0x0003: return ucd_get_plane_0003 (); case 0x0004: return ucd_get_plane_0004 (); case 0x0005: return ucd_get_plane_0005 (); case 0x0006: return ucd_get_plane_0006 (); case 0x0007: return ucd_get_plane_0007 (); case 0x0008: return ucd_get_plane_0008 (); case 0x0009: return ucd_get_plane_0009 (); case 0x000A: return ucd_get_plane_000A (); case 0x000B: return ucd_get_plane_000B (); case 0x000C: return ucd_get_plane_000C (); case 0x000D: return ucd_get_plane_000D (); case 0x000E: return ucd_get_plane_000E (); default: break; } return nilp; } // return a ucd structure by code point const ucd_s* c_getucd (const t_quad code) { // get the plane index long plane = code >> 16; // check if we are loaded ucd_load_plane (plane); // get the indirect plane if (p_ipa == nilp) return nilp; const ucd_s** dpa = p_ipa[plane]; if (dpa == nilp) return nilp; // get the ucd structure long index = (long) (code & 0x0000FFFF); return dpa[index]; } // fill an array with a canonical decomposition bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad code) { // initialize the array for (long i = 0; i < UCD_CDV_MAX; i++) dst[i] = nilq; // get the decomposition long index = 0; bool result = ucd_fill_nfd (dst, index, code); if (result == false) return false; // update the result with the ccc coding c_ucdcof (dst, UCD_CDV_MAX); return true; } // fill an array with a canonical decomposition - this one is for // test purpose only since the array is bounded bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad src[UCD_CDV_MAX]) { // initialize the array for (long i = 0; i < UCD_CDV_MAX; i++) dst[i] = nilq; // get the decomposition long index = 0; bool status = true; for (long i = 0; i < UCD_CDV_MAX; i++) { t_quad code = src[i]; if (code == nilq) break; status &= ucd_fill_nfd (dst, index, code); } if (status == false) return false; // update the result with the ccc coding c_ucdcof (dst, UCD_CDV_MAX); return true; } // return a nil allocated string t_quad* c_ucdnil (void) { t_quad* result = new t_quad[1]; result[0] = nilq; return result; } // normalize a character buffer into a canonical form t_quad* c_ucdnrm (const char* s, const long size) { // check for nil first if ((s == nilp) || (size == 0)) return c_ucdnil (); // create a temporary buffer that holds the quad representation t_quad* buf = new t_quad[size]; for (long i = 0; i < size; i++) buf[i] = ((t_quad) s[i]) & 0x000000FF; try { // convert the buffer t_quad* result = c_ucdnrm (buf, size); // clean and return delete [] buf; return result; } catch (...) { delete [] buf; throw; } } // normalize a string into a canonical form t_quad* c_ucdnrm (const t_quad* s, const long size){ if ((s == nilp) || (size <= 0)) return c_ucdnil (); // allocate a buffer of sufficent size an initialize it long len = size * UCD_CDV_MAX + 1; t_quad* buf = new t_quad[len]; for (long i = 0; i < len; i++) buf[i] = nilq; // loop in the string and update the buffer long pos = 0; for (long i = 0; i < size; i++) { // get the code and check for nil t_quad code = s[i]; if (code == nilq) { buf[pos++] = nilq; break; } // get the character mapping t_quad dst[UCD_CDV_MAX]; if (c_ucdnfd (dst, code) == false) { delete [] buf; return c_ucdnil (); } // update the buffer with the mapping for (long j = 0; j < UCD_CDV_MAX; j++) { t_quad c = dst[j]; if (c == nilq) break; buf[pos++] = c; } } // put the buffer in canonical order c_ucdcof (buf, len); return buf; } // put a character array in a canonical order form void c_ucdcof (t_quad* buf, const long size) { // check for 0 order if ((buf == nilp) || (size == 0)) return; // order in place for (long i = 1; i < size; i++) { // get the code point and exit if null t_quad code = buf[i]; if (code == nilq) break; // get the code point ccc - if the ccc is 0 continue long ccci = ucd_find_ccc (code); if (ccci == 0) continue; // find the initial position for swaping - the scan position is 0 // or the first position with a ccc of 0 long pos = i; for (long j = pos; j >= 0; j--) { long cccj = ucd_find_ccc (buf[j]); if (cccj == 0) break; pos = j; } // loop from position and eventuall swap if a lower condition is found // as a matter of fact we do no swap but rather rotate from left to right for (long j = pos; j < i; j++) { long cccj = ucd_find_ccc (buf[j]); if (ccci < cccj) { for (long k = i; k > j; k--) buf[k] = buf[k-1]; buf[j] = code; i = j; break; } } } } // convert a unicode character to lower case long c_ucdtol (t_quad dst[UCD_LCM_MAX], const t_quad code) { // get the ucd record and do nothing if it does not exist const ucd_s* ucd = c_getucd (code); if (ucd == nilp) { dst[0] = code; return 1; } // loop in the lower map long result = 0; for (long i = 0; i < UCD_LCM_MAX; i++) { t_quad c = ucd->d_lmap[i]; if (c == nilq) break; dst[i] = c; result++; } // if the result is null just map the existing character if (result == 0) dst[0] = code; return 1; } // convert a unicode character to upper case long c_ucdtou (t_quad dst[UCD_UCM_MAX], const t_quad code) { // get the ucd record and do nothing if it does not exist const ucd_s* ucd = c_getucd (code); if (ucd == nilp) { dst[0] = code; return 1; } // loop in the upper map long result = 0; for (long i = 0; i < UCD_UCM_MAX; i++) { t_quad c = ucd->d_umap[i]; if (c == nilq) break; dst[i] = c; result++; } // if the result is null just map the existing character if (result == 0) dst[0] = code; return 1; } // return true if the code point is not combining bool c_ucdncc (const t_quad code) { // get the ucd record and do nothing if it does not exist const ucd_s* ucd = c_getucd (code); if (ucd == nilp) return false; // check for ccc 0 return (ucd->d_pccc == 0); } };