ports//lang/afnix/work/afnix-src-1-5-2/src/lib/std/shl/ucd/cucd.hpp

// ---------------------------------------------------------------------------
// - cucd.hpp                                                                -
// - unicode database (ucd) library  definitions                             -
// ---------------------------------------------------------------------------
// - This program is free software;  you can redistribute it  and/or  modify -
// - it provided that this copyright notice is kept intact.                  -
// -                                                                         -
// - This program  is  distributed in  the hope  that it will be useful, but -
// - without  any  warranty;  without  even   the   implied    warranty   of -
// - merchantability or fitness for a particular purpose.  In no event shall -
// - the copyright holder be liable for any  direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software.     -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch                                   -
// ---------------------------------------------------------------------------

#include "ccnf.hpp"

namespace afnix {

  /// the ucd version
  const t_byte UCD_MAJOR   = 5;
  const t_byte UCD_MINOR   = 0;
  const t_byte UCD_PATCH   = 0;

  /// general category value
  const t_byte UCD_GCV_LU  = 0x00; // letter, uppercase           
  const t_byte UCD_GCV_LL  = 0x01; // letter, lowercase           
  const t_byte UCD_GCV_LT  = 0x02; // letter, titlecase           
  const t_byte UCD_GCV_LM  = 0x03; // letter, modifier            
  const t_byte UCD_GCV_LO  = 0x04; // letter, other               
  
  const t_byte UCD_GCV_MN  = 0x10; // mark, nonspacing            
  const t_byte UCD_GCV_MC  = 0x11; // mark, spacing combining     
  const t_byte UCD_GCV_ME  = 0x12; // mark, enclosing             
  
  const t_byte UCD_GCV_ND  = 0x20; // number, decimal digit       
  const t_byte UCD_GCV_NL  = 0x21; // number, letter              
  const t_byte UCD_GCV_NO  = 0x22; // number, other               
  
  const t_byte UCD_GCV_PC  = 0x30; // punctuation, connector      
  const t_byte UCD_GCV_PD  = 0x31; // punctuation, dash           
  const t_byte UCD_GCV_PS  = 0x32; // punctuation, open           
  const t_byte UCD_GCV_PE  = 0x33; // punctuation, close          
  const t_byte UCD_GCV_PI  = 0x34; // punctuation, initial quote   
  const t_byte UCD_GCV_PF  = 0x35; // punctuation, final quote    
  const t_byte UCD_GCV_PO  = 0x36; // punctuation, other          
  
  const t_byte UCD_GCV_SM  = 0x40; // symbol, math                
  const t_byte UCD_GCV_SC  = 0x41; // symbol, currency            
  const t_byte UCD_GCV_SK  = 0x42; // symbol, modifier            
  const t_byte UCD_GCV_SO  = 0x43; // symbol, other               
  
  const t_byte UCD_GCV_ZS  = 0x50; // separator, space            
  const t_byte UCD_GCV_ZL  = 0x51; // separator, line             
  const t_byte UCD_GCV_ZP  = 0x52; // separator, paragraph        
  
  const t_byte UCD_GCV_CC  = 0x60; // other, control              
  const t_byte UCD_GCV_CF  = 0x61; // other, format               
  const t_byte UCD_GCV_CS  = 0x62; // other, surrogate            
  const t_byte UCD_GCV_CO  = 0x63; // other, private use          
  const t_byte UCD_GCV_CN  = 0x64; // other, not assigned         
  
  /// bidirectional class value (BCV) 
  const t_byte UCD_BCV_L   = 0x00; // left-to-right              
  const t_byte UCD_BCV_LRE = 0x01; // left-to-right embedding    
  const t_byte UCD_BCV_LRO = 0x02; // left-to-right override     
  const t_byte UCD_BCV_R   = 0x03; // right-to-left              
  const t_byte UCD_BCV_AL  = 0x04; // right-to-left arabic       
  const t_byte UCD_BCV_RLE = 0x05; // right-to-left embedding    
  const t_byte UCD_BCV_RLO = 0x06; // right-to-left override     
  const t_byte UCD_BCV_PDF = 0x07; // pop directional format     
  const t_byte UCD_BCV_EN  = 0x08; // european number            
  const t_byte UCD_BCV_ES  = 0x09; // european number separator  
  const t_byte UCD_BCV_ET  = 0x0A; // european number terminator 
  const t_byte UCD_BCV_AN  = 0x0B; // arabic number              
  const t_byte UCD_BCV_CS  = 0x0C; // common number separator    
  const t_byte UCD_BCV_NSM = 0x0D; // non-spacing mark           
  const t_byte UCD_BCV_BN  = 0x0E; // boundary neutral           
  const t_byte UCD_BCV_B   = 0x0F; // paragraph separator        
  const t_byte UCD_BCV_S   = 0x10; // segment separator          
  const t_byte UCD_BCV_WS  = 0x11; // whitespace                 
  const t_byte UCD_BCV_ON  = 0x12; // other neutrals             
  
  /// character decomposition mapping value 
  const t_byte UCD_DMV_NIL = 0x00; // no decompostion
  const t_byte UCD_DMV_FNT = 0x01; // font variant
  const t_byte UCD_DMV_NOB = 0x02; // no-break of space or hyphen
  const t_byte UCD_DMV_INI = 0x03; // initial presentation form (arabic)
  const t_byte UCD_DMV_MED = 0x04; // medial presentation form (arabic)
  const t_byte UCD_DMV_FIN = 0x05; // final presentation form (arabic)
  const t_byte UCD_DMV_ISO = 0x06; // isolated presentation form (arabic)
  const t_byte UCD_DMV_CIR = 0x07; // encircled form
  const t_byte UCD_DMV_SUP = 0x08; // superscript form
  const t_byte UCD_DMV_SUB = 0x09; // subscript form
  const t_byte UCD_DMV_VER = 0x0A; // vertical layout presentation form
  const t_byte UCD_DMV_WID = 0x0B; // wide compatibility character
  const t_byte UCD_DMV_NRW = 0x0C; // narrow compatibility character
  const t_byte UCD_DMV_SML = 0x0D; // small variant form
  const t_byte UCD_DMV_SQR = 0x0E; // CJK squared font variant
  const t_byte UCD_DMV_FRA = 0x0F; // vulgar fraction form
  const t_byte UCD_DMV_CPT = 0x10; // unspecified compatible character
  
  /// the canonical decomposition field
  const long UCD_CDV_MAX = 18;
  /// the upper case mapping max fields
  const long UCD_UCM_MAX = 3;
  /// the lower case mapping max fields
  const long UCD_LCM_MAX = 3;
  /// the title case mapping max fields
  const long UCD_TCM_MAX = 3;

  /// the ucd structure 
  struct ucd_s {
    t_quad      d_code; // code point value           
    const char* p_name; // code point name            
    t_byte      d_pgcv; // general category value     
    long        d_pccc; // canonical combining class  
    t_byte      d_pbcv; // bidirectional class value
    t_byte      d_cexc; // composition exclusion flag
    t_byte      d_pdmv; // decomposition mapping value 
    // canonical decomposition    
    t_quad      d_cdmv[UCD_CDV_MAX];
    // upper case mapping
    t_quad      d_umap[UCD_UCM_MAX];
    // lower case mapping         
    t_quad      d_lmap[UCD_LCM_MAX];      
    // title case mapping
    t_quad      d_tmap[UCD_TCM_MAX];
  };

  /// @return a plane size by plane index
  const long c_ucdpsize (const long index);

  /// @return a plane array by plane index
  const ucd_s* c_ucdplane (const long index);

  /// @return a ucd structure by code point
  const ucd_s* c_getucd (const t_quad code);

  /// perform a normal form decomposition (canonical)
  /// @param dst  the destination buffer
  /// @param code the code to decompose
  bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad code);

  /// perform a normal form decomposition (canonical)
  /// @param dst the destination buffer
  /// @param src the source buffer to decompose
  bool c_ucdnfd (t_quad dst[UCD_CDV_MAX], const t_quad src[UCD_CDV_MAX]);

  /// @return a nil allocated string
  t_quad* c_ucdnil (void);

  /// normalize character buffer into a canonical form
  /// @param s    the string to convert
  /// @param size the string size
  t_quad* c_ucdnrm (const char* s, const long size);

  /// normalize a string into a canonical form
  /// @param s    the string to convert
  /// @param size the string size
  t_quad* c_ucdnrm (const t_quad* s, const long size);

  /// put an array in a canonical order form
  /// @param buf  the character buffer to sort
  /// @param size the buffer size
  void c_ucdcof (t_quad* buf, const long size);

  /// convert a unicode character to lower case
  /// @param dst  the destination buffer
  /// @param code the code to decompose
  /// @return the destination size
  long c_ucdtol (t_quad dst[UCD_LCM_MAX], const t_quad code);

  /// convert a unicode character to upper case
  /// @param dst  the destination buffer
  /// @param code the code to decompose
  /// @return the destination size
  long c_ucdtou (t_quad dst[UCD_UCM_MAX], const t_quad code);

  /// @return true if the code point is not combining
  bool c_ucdncc (const t_quad code);
};
syntax highlighted by Code2HTML, v. 0.9.1