/*- * Copyright 2005 Guram Dukashvili * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ //--------------------------------------------------------------------------- #include #include //--------------------------------------------------------------------------- namespace utf8 { //--------------------------------------------------------------------------- uintptr_t cpansi = 0; uintptr_t cpoem = 0; const utf8cp * cpansip = NULL; const utf8cp * cpoemp = NULL; //--------------------------------------------------------------------------- static const struct CharsetEntry { const char * charsetName_; uint32_t codepage_; } charsetNames[] = { { "BIG5", 950 }, { "CP1250", 1250 }, { "CP1251", 1251 }, { "CP1252", 1252 }, { "CP1253", 1253 }, { "CP1254", 1254 }, { "CP1255", 1255 }, { "CP1256", 1256 }, { "CP1257", 1257 }, { "CP1258", 1258 }, { "CP932", 932 }, { "CP936", 936 }, { "CP949", 949 }, { "CP950", 950 }, { "EUCJP", 20932 }, { "GB2312", 936 }, { "IBM037", 37 }, { "IBM1026", 1026 }, { "IBM424", 424 }, { "IBM437", 437 }, { "IBM500", 500 }, { "IBM850", 850 }, { "IBM852", 852 }, { "IBM855", 855 }, { "IBM857", 857 }, { "IBM860", 860 }, { "IBM861", 861 }, { "IBM862", 862 }, { "IBM863", 863 }, { "IBM864", 864 }, { "IBM865", 865 }, { "IBM866", 866 }, { "IBM869", 869 }, { "IBM874", 874 }, { "IBM875", 875 }, { "ISO88591", 28591 }, { "ISO885910", 28600 }, { "ISO885913", 28603 }, { "ISO885914", 28604 }, { "ISO885915", 28605 }, { "ISO88592", 28592 }, { "ISO88593", 28593 }, { "ISO88594", 28594 }, { "ISO88595", 28595 }, { "ISO88596", 28596 }, { "ISO88597", 28597 }, { "ISO88598", 28598 }, { "ISO88599", 28599 }, { "UTF8", CP_UTF8 }, { "KOI8-R", 20866 }, { "KOI8-U", 20866 } }; //--------------------------------------------------------------------------- static #ifndef __BCPLUSPLUS__ inline #endif uintptr_t mapCharsetNameToCodepage(const char * charset) { const struct CharsetEntry * bsa = charsetNames; intptr_t low = 0, high = sizeof(charsetNames) / sizeof(charsetNames[0]) - 1; intptr_t pos, c; while( low <= high ){ pos = (low + high) / 2; #if HAVE_STRCASECMP c = strcasecmp(charset,bsa[pos].charsetName_); #elif HAVE__STRICMP c = _stricmp(charset,bsa[pos].charsetName_); #elif HAVE_STRICMP c = stricmp(charset,bsa[pos].charsetName_); #else c = strcmp(charset,bsa[pos].charsetName_); #endif if( c > 0 ){ low = pos + 1; } else if( c < 0 ){ high = pos - 1; } else return bsa[pos].codepage_; } return CP_UTF8; } //--------------------------------------------------------------------------- static #ifndef __BCPLUSPLUS__ inline #endif uintptr_t getANSICodepage() { #if defined(__WIN32__) || defined(__WIN64__) return GetACP(); #else uintptr_t cp = CP_UTF8; if( cpansi == 0 ){ char * lang; if ( (lang = getenv( "LC_ALL" )) || (lang = getenv( "LANGUAGE" )) || (lang = getenv( "LANG" )) ){ char * buf = (char *) malloc(strlen(lang) + 1); strcpy(buf,lang); lang = buf; do { char * next, * dialect, * charset, * country; if( (next = strchr(lang,':')) != NULL ) *next++ = '\0'; if( (dialect = strchr(lang,'@')) != NULL ) *dialect++ = '\0'; if( (charset = strchr(lang,'.')) != NULL ) *charset++ = '\0'; if( (country = strchr(lang,'_')) != NULL ) *country++ = '\0'; if( (cp = mapCharsetNameToCodepage(charset)) == CP_UTF8 ) break; lang = next; } while( lang != NULL ); free(lang); } cpansi = cp; } else { cp = cpansi; } return cp; #endif } //--------------------------------------------------------------------------- static inline uintptr_t getOEMCodepage() { #if defined(__WIN32__) || defined(__WIN64__) return GetOEMCP(); #else return getANSICodepage(); #endif } //--------------------------------------------------------------------------- const utf8cp * findCodePage(uintptr_t cp) { if( cp == CP_ACP ){ cp = cpansi; } else if( cp == CP_OEMCP ){ cp = cpoem; } if( cp != CP_UTF8 ){ intptr_t low = 0, high = utf8cpsCount - 1; while( low <= high ){ const utf8cp * e = utf8cps + (low + high) / 2; if( cp > (uint32_t) *(uint32_t *) e->cps2utf8s ){ low = e - utf8cps + 1; } else if( cp < (uint32_t) *(uint32_t *) e->cps2utf8s ){ high = e - utf8cps - 1; } else return e; } } return NULL; } //--------------------------------------------------------------------------- uintptr_t detectCodePages() { cpansip = findCodePage(cpansi = getANSICodepage()); cpoemp = findCodePage(cpoem = getOEMCodepage()); return cpansi; } //--------------------------------------------------------------------------- uintptr_t getCodePage(uintptr_t cp) { if( cpansi == 0 || cpoem == 0 ) detectCodePages(); if( cp == CP_ACP ){ cp = cpansi; } else if( cp == CP_OEMCP ){ cp = cpoem; } return cp; } //--------------------------------------------------------------------------- } // namespace utf8 //---------------------------------------------------------------------------