/* ====================================================================
 * The Kannel Software License, Version 1.0
 *
 * Copyright (c) 2001-2005 Kannel Group
 * Copyright (c) 1998-2001 WapIT Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Kannel Group (http://www.kannel.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Kannel" and "Kannel Group" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please
 *    contact org@kannel.org.
 *
 * 5. Products derived from this software may not be called "Kannel",
 *    nor may "Kannel" appear in their name, without prior written
 *    permission of the Kannel Group.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.
 * IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Kannel Group. For more information on
 * the Kannel Group, please see <http://www.kannel.org/>.
 *
 * Portions of this software are based upon software originally written at
 * WapIT Ltd., Helsinki, Finland for the Kannel project.
 */

/*
 * gwlib/charset.c - character set conversions
 *
 * This file implements the character set conversions declared in charset.h.
 *
 * Richard Braakman
 */

#include "gwlib/gwlib.h"

#if HAVE_ICONV_H
#include <errno.h>
#include <iconv.h>
#endif

/* Map GSM default alphabet characters to ISO-Latin-1 characters.
 * The greek characters at positions 16 and 18 through 26 are not
 * mappable. They are mapped to '?' characters.
 * The escape character, at position 27, is mapped to a space,
 * though normally the function that indexes into this table will
 * treat it specially.
 */
static const unsigned char gsm_to_latin1[128] = {
     '@', 0xa3,  '$', 0xa5, 0xe8, 0xe9, 0xf9, 0xec,   /* 0 - 7 */
    0xf2, 0xc7,   10, 0xd8, 0xf8,   13, 0xc5, 0xe5,   /* 8 - 15 */
     '?',  '_',  '?',  '?',  '?',  '?',  '?',  '?',   /* 16 - 23 */
     '?',  '?',  '?',  ' ', 0xc6, 0xe6, 0xdf, 0xc9,   /* 24 - 31 */
     ' ',  '!',  '"',  '#', 0xa4,  '%',  '&', '\'',   /* 32 - 39 */
     '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',   /* 40 - 47 */
     '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',   /* 48 - 55 */
     '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',   /* 56 - 63 */
    0xa1,  'A',  'B',  'C',  'D',  'E',  'F',  'G',   /* 64 - 71 */
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',   /* 72 - 79 */
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',   /* 80 - 87 */
     'X',  'Y',  'Z', 0xc4, 0xd6, 0xd1, 0xdc, 0xa7,   /* 88 - 95 */
    0xbf,  'a',  'b',  'c',  'd',  'e',  'f',  'g',   /* 96 - 103 */
     'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',   /* 104 - 111 */
     'p',  'q',  'r',  's',  't',  'u',  'v',  'w',   /* 112 - 119 */
     'x',  'y',  'z', 0xe4, 0xf6, 0xf1, 0xfc, 0xe0    /* 120 - 127 */
};

/* This is the extension table defined in GSM 03.38. It is the mapping
 * used for the character after a GSM 27 (Escape) character. Characters
 * that are not in this table are converted as if they had not been
 * escaped, i.e. through gsm_to_latin1.
 * The euro symbol, which is an escaped 'e' (101), has no ISO-Latin-1
 * code point. It is mapped to 128 here, and latin1_to_gsm maps 128 back
 * to an escaped 'e', so that the conversion round-trips.
 * The table is terminated by an entry with a negative gsmesc.
 */
static const struct {
    int gsmesc;
    int latin1;
} gsm_escapes[] = {
    {  10, 12 },   /* ASCII page break */
    {  20, '^' },
    {  40, '{' },
    {  41, '}' },
    {  47, '\\' },
    {  60, '[' },
    {  61, '~' },
    {  62, ']' },
    {  64, '|' },
    { 101, 128 },  /* euro symbol, see above */
    {  -1, -1 }
};

/* Code used for non-representable characters */
#define NRP '?'

/* Map ISO-Latin-1 characters to the GSM default alphabet. Negative
 * values are encoded as ESC (code 27) followed by the absolute value
 * of the number.
 */
static const int latin1_to_gsm[256] = {
    NRP, NRP, NRP, NRP, NRP, NRP, NRP, NRP,           /* 0 - 7 */
    /* TAB approximates to space */
    /* LF and CR map to self */
    /* Page break maps to escaped LF */
    NRP, ' ',  10, NRP, -10,  13, NRP, NRP,           /* 8 - 15 */
    /* 16, 18-26 are nonprintable in latin1, and in GSM are greek
     * characters that are unrepresentable in latin1. So we let them
     * map to self, to create a way to specify them. */
     16, NRP,  18,  19,  20,  21,  22,  23,           /* 16 - 23 */
     24,  25,  26, NRP, NRP, NRP, NRP, NRP,           /* 24 - 31 */
    /* $ maps to 2 */
    ' ', '!', '"', '#',   2, '%', '&', '\'',          /* 32 - 39 */
    '(', ')', '*', '+', ',', '-', '.', '/',           /* 40 - 47 */
    '0', '1', '2', '3', '4', '5', '6', '7',           /* 48 - 55 */
    '8', '9', ':', ';', '<', '=', '>', '?',           /* 56 - 63 */
    /* @ maps to 0 */
      0, 'A', 'B', 'C', 'D', 'E', 'F', 'G',           /* 64 - 71 */
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',           /* 72 - 79 */
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',           /* 80 - 87 */
    /* [ is an escaped < */
    /* \ is an escaped / */
    /* ] is an escaped > */
    /* ^ is an escaped Greek Lambda */
    /* _ maps to 17 */
    'X', 'Y', 'Z', -60, -47, -62, -20,  17,           /* 88 - 95 */
    /* The backquote cannot be represented at all */
    NRP, 'a', 'b', 'c', 'd', 'e', 'f', 'g',           /* 96 - 103 */
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',           /* 104 - 111 */
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',           /* 112 - 119 */
    /* { is an escaped ( */
    /* | is an escaped inverted ! */
    /* } is an escaped ) */
    /* ~ is an escaped = */
    'x', 'y', 'z', -40, -64, -41, -61, NRP,           /* 120 - 127 */
    /* 128 maps to an escaped 'e' (see gsm_escapes) */
    -101, NRP, NRP, NRP, NRP, NRP, NRP, NRP,          /* 128 - 135 */
    NRP, NRP, NRP, NRP, NRP, NRP, NRP, NRP,           /* 136 - 143 */
    NRP, NRP, NRP, NRP, NRP, NRP, NRP, NRP,           /* 144 - 151 */
    NRP, NRP, NRP, NRP, NRP, NRP, NRP, NRP,           /* 152 - 159 */
    /* 160 - 167 */
    ' ',
    64,   /* Inverted ! */
    'c',  /* approximation of cent marker */
    1,    /* Pounds sterling */
    36,   /* International currency symbol */
    3,    /* Yen */
    64,   /* approximate broken bar as inverted ! */
    95,   /* Section marker */
    /* 168 - 175 */
    '"',  /* approximate dieresis */
    'C',  /* approximate copyright marker */
    'a',  /* approximate ordfeminine */
    '<',  /* approximate french << */
    '!',  /* approximate logical not sign */
    '-',  /* approximate hyphen */
    'R',  /* approximate registered marker */
    '-',  /* approximate macron */
    /* 176 - 183 */
    'o',  /* approximate degree marker */
    NRP,  /* plusminus */
    '2',  /* approximate superscript 2 */
    '3',  /* approximate superscript 3 */
    '\'', /* approximate acute accent */
    'u',  /* approximate greek mu */
    NRP,  /* paragraph marker */
    '.',  /* approximate bullet */
    /* 184 - 191 */
    ',',  /* approximate cedilla */
    'i',  /* approximate dotless i */
    'o',  /* approximate ordmasculine */
    '>',  /* approximate french >> */
    NRP,  /* onequarter */
    NRP,  /* onehalf */
    NRP,  /* threequarters */
    96,   /* Inverted ? */
    /* 192 - 199 */
    'A',  /* approximate A grave */
    'A',  /* approximate A acute */
    'A',  /* approximate A circumflex */
    'A',  /* approximate A tilde */
    91,   /* A dieresis */
    14,   /* A ring */
    28,   /* AE ligature */
    9,    /* C cedilla */
    /* 200 - 207 */
    'E',  /* approximate E grave */
    31,   /* E acute */
    'E',  /* approximate E circumflex */
    'E',  /* approximate E dieresis */
    'I',  /* approximate I grave */
    'I',  /* approximate I acute */
    'I',  /* approximate I circumflex */
    'I',  /* approximate I dieresis */
    /* 208 - 215 */
    NRP,  /* Eth */
    93,   /* N tilde */
    'O',  /* approximate O grave */
    'O',  /* approximate O acute */
    'O',  /* approximate O circumflex */
    'O',  /* approximate O tilde */
    92,   /* O dieresis */
    'x',  /* approximate multiplication sign */
    /* 216 - 223 */
    11,   /* O slash */
    'U',  /* approximate U grave */
    'U',  /* approximate U acute */
    'U',  /* approximate U circumflex */
    94,   /* U dieresis */
    'Y',  /* approximate Y acute */
    NRP,  /* Thorn */
    30,   /* german double-s */
    /* 224 - 231 */
    127,  /* a grave */
    'a',  /* approximate a acute */
    'a',  /* approximate a circumflex */
    'a',  /* approximate a tilde */
    123,  /* a dieresis */
    15,   /* a ring */
    29,   /* ae ligature */
    'c',  /* approximate c cedilla as c */
    /* 232 - 239 */
    4,    /* e grave */
    5,    /* e acute */
    'e',  /* approximate e circumflex */
    'e',  /* approximate e dieresis */
    7,    /* i grave */
    'i',  /* approximate i acute */
    'i',  /* approximate i circumflex */
    'i',  /* approximate i dieresis */
    /* 240 - 247 */
    NRP,  /* eth */
    125,  /* n tilde */
    8,    /* o grave */
    'o',  /* approximate o acute */
    'o',  /* approximate o circumflex */
    'o',  /* approximate o tilde */
    124,  /* o dieresis */
    NRP,  /* division sign */
    /* 248 - 255 */
    12,   /* o slash */
    6,    /* u grave */
    'u',  /* approximate u acute */
    'u',  /* approximate u circumflex */
    126,  /* u dieresis */
    'y',  /* approximate y acute */
    NRP,  /* thorn */
    'y',  /* approximate y dieresis */
};

/*
 * Register aliases for Windows character sets so that libxml/libiconv
 * can recognise them.
 */
struct alias_t {
    char *real;
    char *alias;
};

typedef struct alias_t alias_t;

/* Alias table; terminated by an entry whose `real' member is NULL. */
alias_t chars_aliases[] = {
    { "CP1250", "WIN-1250" },
    { "CP1250", "WINDOWS-1250" },
    { "CP1251", "WIN-1251" },
    { "CP1251", "WINDOWS-1251" },
    { "CP1252", "WIN-1252" },
    { "CP1252", "WINDOWS-1252" },
    { "CP1253", "WIN-1253" },
    { "CP1253", "WINDOWS-1253" },
    { "CP1254", "WIN-1254" },
    { "CP1254", "WINDOWS-1254" },
    { "CP1257", "WIN-1257" },
    { "CP1257", "WINDOWS-1257" },
    { NULL }
};

/*
 * Register every entry of chars_aliases with libxml as an encoding alias.
 */
void charset_init(void)
{
    int i;

    for (i = 0; chars_aliases[i].real != NULL; i++) {
        xmlAddEncodingAlias(chars_aliases[i].real, chars_aliases[i].alias);
    }
}

/*
 * Drop the encoding aliases registered with libxml.
 */
void charset_shutdown(void)
{
    xmlCleanupEncodingAliases();
}

/*
 * Convert `ostr' in place from the GSM default alphabet to ISO-Latin-1.
 *
 * An escape code (27) that is followed by another octet is removed and
 * the following octet is looked up in gsm_escapes first; if it is not
 * listed there it is converted like an unescaped character. A trailing
 * escape code is converted through gsm_to_latin1 (i.e. to a space).
 * Octets >= 128 are left untouched.
 */
void charset_gsm_to_latin1(Octstr *ostr)
{
    long pos, len;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        int c, new, i;

        c = octstr_get_char(ostr, pos);
        if (c == 27 && pos + 1 < len) {
            /* GSM escape code. Delete it, then process the next
             * character specially. */
            octstr_delete(ostr, pos, 1);
            len--;
            c = octstr_get_char(ostr, pos);
            for (i = 0; gsm_escapes[i].gsmesc >= 0; i++) {
                if (gsm_escapes[i].gsmesc == c)
                    break;
            }
            /* If the loop ran to the terminator, gsmesc is -1 there and
             * cannot equal c, so the fallback branches are taken. */
            if (gsm_escapes[i].gsmesc == c)
                new = gsm_escapes[i].latin1;
            else if (c < 128)
                new = gsm_to_latin1[c];
            else
                continue;
        } else if (c < 128) {
            new = gsm_to_latin1[c];
        } else {
            continue;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}

/*
 * Convert `ostr' in place from ISO-Latin-1 to the GSM default alphabet.
 *
 * Characters whose latin1_to_gsm entry is negative are written as an
 * escape code (27) followed by the absolute value of the entry, so the
 * string may grow.
 */
void charset_latin1_to_gsm(Octstr *ostr)
{
    long pos, len;
    int c, new;
    unsigned char esc = 27;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        c = octstr_get_char(ostr, pos);
        gw_assert(c >= 0);
        /* latin1_to_gsm has 256 entries, so 255 is the last valid index */
        gw_assert(c < 256);
        new = latin1_to_gsm[c];
        if (new < 0) {
            /* Escaped GSM code: put the escape in front of the current
             * octet and step over it. */
            octstr_insert_data(ostr, pos, &esc, 1);
            pos++;
            len++;
            new = -new;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}

/*
 * Convert `ostr' in place from the GSM default alphabet to the NRC
 * (national replacement character set) ISO 21 German. Only the octets
 * listed in the switch below are changed; all others are kept as is.
 */
void charset_gsm_to_nrc_iso_21_german(Octstr *ostr)
{
    long pos, len;
    int c, new;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        c = octstr_get_char(ostr, pos);
        switch (c) {
            /* GSM value; NRC value */
            case 0x5b: new = 0x5b; break; /* Ä */
            case 0x5c: new = 0x5c; break; /* Ö */
            case 0x5e: new = 0x5d; break; /* Ü */
            case 0x7b: new = 0x7b; break; /* ä */
            case 0x7c: new = 0x7c; break; /* ö */
            case 0x7e: new = 0x7d; break; /* ü */
            case 0x1e: new = 0x7e; break; /* ß */
            case 0x5f: new = 0x5e; break; /* § */
            default: new = c;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}

/*
 * Convert `ostr' in place from the NRC ISO 21 German to the GSM default
 * alphabet. This is the inverse of charset_gsm_to_nrc_iso_21_german().
 */
void charset_nrc_iso_21_german_to_gsm(Octstr *ostr)
{
    long pos, len;
    int c, new;

    len = octstr_len(ostr);
    for (pos = 0; pos < len; pos++) {
        c = octstr_get_char(ostr, pos);
        switch (c) {
            /* NRC value; GSM value */
            case 0x5b: new = 0x5b; break; /* Ä */
            case 0x5c: new = 0x5c; break; /* Ö */
            case 0x5d: new = 0x5e; break; /* Ü */
            case 0x7b: new = 0x7b; break; /* ä */
            case 0x7c: new = 0x7c; break; /* ö */
            case 0x7d: new = 0x7e; break; /* ü */
            case 0x7e: new = 0x1e; break; /* ß */
            case 0x5e: new = 0x5f; break; /* § */
            default: new = c;
        }
        if (new != c)
            octstr_set_char(ostr, pos, new);
    }
}

/*
 * Truncate the GSM encoded string `gsm' to at most `max' octets.
 * Returns 1 if the string was longer than `max' and has been shortened,
 * 0 if it was left unchanged.
 */
int charset_gsm_truncate(Octstr *gsm, long max)
{
    if (octstr_len(gsm) > max) {
        /* If the last GSM character was an escaped character,
         * then chop off the escape as well as the character. */
        if (octstr_get_char(gsm, max - 1) == 27)
            octstr_truncate(gsm, max - 1);
        else
            octstr_truncate(gsm, max);
        return 1;
    }
    return 0;
}

/*
 * Convert `from', encoded in `charset_from', to UTF-8 and store the
 * result in a newly created Octstr in `*to'.
 *
 * Returns 0 without any conversion (just a duplicate) if `charset_from'
 * is "UTF-8", -2 if libxml has no handler for `charset_from' (in that
 * case `*to' is not touched), and the return value of xmlCharEncInFunc()
 * otherwise.
 */
int charset_to_utf8(Octstr *from, Octstr **to, Octstr *charset_from)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
        *to = octstr_duplicate(from);
        return 0;
    }

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    xmlBufferAdd(frombuffer, (const xmlChar *) octstr_get_cstr(from),
                 octstr_len(from));

    ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);

    *to = octstr_create_from_data((const char *) tobuffer->content,
                                  tobuffer->use);

    /* Memory cleanup. The handler has to be released as well: handlers
     * backed by iconv are allocated by xmlFindCharEncodingHandler(). */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}

/*
 * Convert the UTF-8 string `utf8' to `charset_to' and store the result
 * in a newly created Octstr in `*to'.
 *
 * Returns -2 if libxml has no handler for `charset_to' (in that case
 * `*to' is not touched). Otherwise returns the return value of
 * xmlCharEncOutFunc(), except that values below -2 are reported as -1.
 */
int charset_from_utf8(Octstr *utf8, Octstr **to, Octstr *charset_to)
{
    int ret;
    xmlCharEncodingHandlerPtr handler = NULL;
    xmlBufferPtr frombuffer = NULL;
    xmlBufferPtr tobuffer = NULL;

    handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
    if (handler == NULL)
        return -2;

    /* Build the libxml buffers for the transcoding. */
    tobuffer = xmlBufferCreate();
    frombuffer = xmlBufferCreate();
    xmlBufferAdd(frombuffer, (const xmlChar *) octstr_get_cstr(utf8),
                 octstr_len(utf8));

    ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
    if (ret < -2)
        /* Libxml seems to be here a little uncertain what would be the
         * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
        ret = -1;

    *to = octstr_create_from_data((const char *) tobuffer->content,
                                  tobuffer->use);

    /* Memory cleanup. The handler has to be released as well: handlers
     * backed by iconv are allocated by xmlFindCharEncodingHandler(). */
    xmlBufferFree(tobuffer);
    xmlBufferFree(frombuffer);
    xmlCharEncCloseFunc(handler);

    return ret;
}

/*
 * Convert `string' in place from `charset_from' to `charset_to' using
 * iconv.
 *
 * Returns 0 on success and -1 on failure; on failure `string' is left
 * unchanged. Always returns -1 if built without iconv support.
 */
int charset_convert(Octstr *string, char *charset_from, char *charset_to)
{
#if HAVE_ICONV_H
    char *from_buf, *to_buf, *pointer;
    size_t inbytes, outbytes, converted;
    int ret, saved_errno;
    iconv_t cd;

    if (!charset_from || !charset_to || !string) /* sanity check */
        return -1;

    cd = iconv_open(charset_to, charset_from);
    /* Did I succeed in getting a conversion descriptor ? */
    if (cd == (iconv_t)(-1)) {
        /* I guess not */
        error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.",
              charset_from, charset_to);
        return -1;
    }

    from_buf = octstr_get_cstr(string);
    /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
    inbytes = octstr_len(string);
    outbytes = inbytes * 4;
    pointer = to_buf = gw_malloc(outbytes + 1);
    memset(to_buf, 0, outbytes + 1);

    converted = iconv(cd, (char**)&from_buf, &inbytes, &pointer, &outbytes);
    /* Remember errno right away; the calls below may overwrite it. */
    saved_errno = errno;
    iconv_close(cd);

    if (converted != (size_t)(-1)) {
        /* conversion succeeded */
        octstr_delete(string, 0, octstr_len(string));
        octstr_append_data(string, to_buf, pointer - to_buf);
        if (converted)
            debug("charset", 0, "charset_convert did %d non-reversible conversions",
                  (int) converted);
        ret = 0;
    } else {
        error(0,"Failed to convert string from <%s> to <%s>, errno was <%d>",
              charset_from, charset_to, saved_errno);
        if (saved_errno == EILSEQ) {
            /* `string' has not been modified on this path, so from_buf
             * still points into its data and the offset is meaningful. */
            debug("charset_convert", 0,
                  "Found an invalid multibyte sequence at position <%ld>",
                  (long) (from_buf - octstr_get_cstr(string)));
        }
        ret = -1;
    }

    gw_free(to_buf);
    return ret;
#else
    /* no conversion done due to not having iconv */
    return -1;
#endif
}