/*
 * ratCode.c --
 *
 *      This file contains basic support for decoding and encoding of
 *      strings coded in various MIME-encodings.
 *
 * TkRat software and its included text is Copyright 1996-2002 by
 * Martin Forssén
 *
 * The full text of the legal notice is contained in the file called
 * COPYRIGHT, included with this distribution.
 */

#include "rat.h"

/*
 * List used when decoding QP: the sixteen upper-case hexadecimal digits,
 * indexed by nibble value, plus the terminating null. The array is not
 * static, so it has external linkage.
 */
char alphabetHEX[17] = "0123456789ABCDEF";

/*
 * List used when decoding base64
 * It consists of 64 chars plus '=' and null; the index of a character
 * in the list is its 6-bit value and index 64 is the '=' padding mark.
 */
static char alphabet64[66] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";

/*
 * List used when decoding modified base64
 * It consists of 64 chars plus '=' and null. It differs from alphabet64
 * only in using ',' where alphabet64 has '/'.
 */
static char modified64[66] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,=";

/*
 * Limits used by the header encoder in this file: the first bounds a
 * folded header line, the second sizes the work buffers that hold one
 * encoded word.
 */
#define RFC2047_MAX_LINE_LENGTH 75
#define RFC2047_MAX_ENCODED_WORD_LENGTH 75

/*
 * Forward declarations of the file-local helpers.
 */
static int FindMimeHdr(Tcl_Interp *interp, unsigned char *hdr,
        unsigned char **sPtr, unsigned char **ePtr, Tcl_Encoding *encoding,
        int *code, unsigned char **data, int *length);
static int RatUtf8to16(const unsigned char *src, unsigned char *dst);
static int RatUtf16to8(const unsigned char *src, unsigned char *dst);
static int RatCheckEncoding(Tcl_Interp *interp, char *encoding_name,
        const char *string, int length);
static int CreateEncWord(Tcl_Interp *interp, Tcl_Encoding enc,
        const char *charset, unsigned char *raw, int length,
        Tcl_DString *dest, int maxUse);

/*
 *----------------------------------------------------------------------
 *
 * FindMimeHdr --
 *
 *      Find a string encoded according to rfc2047, i.e. the first
 *      "=?charset?B?data?=" or "=?charset?Q?data?=" word in hdr.
 *
 * Results:
 *      Returns 1 if an encoded word was found and 0 otherwise.
 *      Returns data in most arguments: on success sPtr and ePtr
 *      delimit the whole encoded word, data and length describe its
 *      payload, code tells which of the two encodings is used and
 *      encoding holds whatever RatGetEncoding() gave for the charset.
* * Side effects: * None * *---------------------------------------------------------------------- */ static int FindMimeHdr(Tcl_Interp *interp, unsigned char *hdr, unsigned char **sPtr, unsigned char **ePtr, Tcl_Encoding *encoding, int *code, unsigned char **data, int *length) { unsigned char *sCharset, *eCharset, *cPtr, c; for (cPtr = hdr; *cPtr; cPtr++) { if ('=' == cPtr[0] && '?' == cPtr[1]) { *sPtr = cPtr; sCharset = cPtr+2; for (cPtr+=2; '?' != *cPtr && *cPtr; cPtr++); if ('?' != *cPtr) return 0; if ('?' != cPtr[2]) continue; switch (cPtr[1]) { case 'b': case 'B': *code = ENCBASE64; break; case 'q': case 'Q': *code = ENCQUOTEDPRINTABLE; break; default: continue; } eCharset = cPtr; *data = cPtr+3; for (cPtr+=3, *length = 0; *cPtr && ('?' != *cPtr || '=' != cPtr[1]); cPtr++, (*length)++); if ('?' != *cPtr) return 0; *ePtr = cPtr+2; c = *eCharset; *eCharset = '\0'; *encoding = RatGetEncoding(interp, (char*)sCharset); *eCharset = c; return 1; } } return 0; } /* *---------------------------------------------------------------------- * * RatDecodeHeader -- * * Decodes a header line encoded according to rfc2047. 
*
 * Results:
 *      Returns a pointer to a static storage area. The literal "" is
 *      returned when data is NULL or empty.
 *
 * Side effects:
 *      None
 *
 * TODO: handle address entries correctly
 *
 * NOTE(review): part of this function is missing from this copy of the
 *      file, see the marker in the body below.
 *
 *----------------------------------------------------------------------
 */

char*
RatDecodeHeader(Tcl_Interp *interp, const char *data, int adr)
{
    /* ds is initialised on the first call and emptied on every later call */
    static Tcl_DString ds, tmp;
    static int initialized = 0;
    unsigned char *sPtr, *ePtr, *decoded, *text, *cPtr,
                  *point = (unsigned char*)data;
    int length, code, first = 1;
    unsigned long dlen;
    unsigned int i;
    Tcl_Encoding encoding;
    Tcl_DString *myPtr = NULL;

    if (!data || !*data) {
        return "";
    }
    if (!initialized) {
        Tcl_DStringInit(&ds);
        initialized = 1;
    } else {
        Tcl_DStringSetLength(&ds, 0);
    }

    /*
     * Check for headers from buggy programs (with raw eight-bit data
     * in them). If any byte has the high bit set the whole header is
     * run through Tcl_ExternalToUtfDString() with a NULL encoding and
     * the converted copy is scanned instead of the caller's string.
     */
    for (cPtr = (unsigned char*)data; *cPtr; cPtr++) {
        if (*cPtr & 0x80) {
            myPtr = (Tcl_DString*)ckalloc(sizeof(Tcl_DString));
            Tcl_DStringInit(myPtr);
            Tcl_ExternalToUtfDString(NULL, data, -1, myPtr);
            data = Tcl_DStringValue(myPtr);
            point = (unsigned char*)data;
            break;
        }
    }

    /*
     * Walk the encoded words. point marks the end of what has been
     * handled so far, sPtr the start of the word that was just found.
     */
    while (FindMimeHdr(interp, point, &sPtr, &ePtr, &encoding, &code,
                       &text, &length)) {
        if (sPtr != point) {
            if (!first) {
                /*
                 * Text between two encoded words is copied only if it
                 * contains something other than white space.
                 */
                for (cPtr = point; cPtr < sPtr && isspace(*cPtr); cPtr++);
                if (cPtr < sPtr) {
                    Tcl_DStringAppend(&ds, (char*)point, sPtr-point);
                }
            } else {
                /*
                 * NOTE(review): source text is missing here. The statement
                 * below starts as "for (i=0; i" and continues with
                 * ">4)&0x3);", which belongs to a different loop. The
                 * names used from here on (lbuf, buf, srcLength,
                 * dataIndex, cte, src, charset, ...) are not declared
                 * anywhere in the visible text, so the rest of
                 * RatDecodeHeader and the start of another decoding
                 * routine appear to have been lost. Restore from a
                 * pristine copy before building.
                 */
                for (i=0; i>4)&0x3);
                /*
                 * Index 64 of alphabet64 is '='; a padding mark in the
                 * third or fourth position ends the group early.
                 */
                if (strchr(alphabet64, '=')-alphabet64 != lbuf[2]) {
                    buf[srcLength++] = lbuf[1] << 4 | ((lbuf[2]>>2)&0xf);
                    if (strchr(alphabet64, '=')-alphabet64 != lbuf[3]) {
                        buf[srcLength++] = lbuf[2] << 6 | (lbuf[3]&0x3f);
                    }
                }
            }
        } else if (cte == ENCQUOTEDPRINTABLE) {
            /*
             * Decode quoted-printable into buf until the input is used up
             * or buf is full.
             */
            src = buf;
            for (srcLength = 0;
                 dataIndex < length && srcLength < sizeof(buf); ) {
                if ('=' == data[dataIndex]) {
                    if ('\r' == data[dataIndex+1]) {
                        /* "=" CR plus one more byte: skipped */
                        dataIndex += 3;
                    } else if ('\n' == data[dataIndex+1]) {
                        /* "=" LF: skipped */
                        dataIndex += 2;
                    } else {
                        /*
                         * "=XY": two hex digits make one byte.
                         * NOTE(review): strchr() gives NULL for anything
                         * not in alphabetHEX (which holds upper-case
                         * digits only) and the result is not checked.
                         */
                        buf[srcLength++] = 16*(strchr(alphabetHEX, data[dataIndex+1])-alphabetHEX) + strchr(alphabetHEX, data[dataIndex+2])-alphabetHEX;
                        dataIndex += 3;
                    }
                } else {
                    buf[srcLength++] = data[dataIndex++];
                }
            }
        } else {
            /* Any other value of cte: the data is used directly */
            src = data;
/* (still in the else-branch opened above: all of the input is taken at once) */
            srcLength = length;
            dataIndex = length;
            allocated = 0;
        }

        /*
         * Append this chunk to the result. When a charset is given the
         * chunk goes through Tcl_ExternalToUtfDString() with enc first,
         * otherwise the bytes are appended unchanged.
         */
        if (charset) {
            Tcl_ExternalToUtfDString(enc, src, srcLength, &tmpDs);
            Tcl_DStringAppend(dsPtr, Tcl_DStringValue(&tmpDs),
                              Tcl_DStringLength(&tmpDs));
            Tcl_DStringFree(&tmpDs);
        } else {
            Tcl_DStringAppend(dsPtr, src, srcLength);
        }
    }

    /*
     * When a charset was given, remove every '\r' from the result in
     * place and shrink the string accordingly.
     */
    if (charset) {
        len = Tcl_DStringLength(dsPtr);
        for (src = dst = Tcl_DStringValue(dsPtr); *src; src++) {
            if (*src != '\r') {
                *dst++ = *src;
            } else {
                len--;
            }
        }
        Tcl_DStringSetLength(dsPtr, len);
    }
    return dsPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * CreateEncWord --
 *
 *      Tries to create an encoded word (if needed) from the given string.
 *      It uses at most length bytes from raw and stores the result in
 *      dest. The result will be no more than maxUse characters.
 *
 * Results:
 *      Returns non-zero if the encoding was successful. Zero is returned
 *      if the text could not be converted completely to enc or if the
 *      encoded word did not fit.
 *
 * Side effects:
 *      None.
 *
 * NOTE(review): two for-statements in this function are truncated in
 *      this copy of the file, see the markers in the body below.
 *
 *----------------------------------------------------------------------
 */

static int
CreateEncWord(Tcl_Interp *interp, Tcl_Encoding enc, const char *charset,
              unsigned char *raw, int length, Tcl_DString *dest, int maxUse)
{
    /* buf receives the finished encoded word, buf2 the converted bytes */
    unsigned char buf[RFC2047_MAX_ENCODED_WORD_LENGTH+1],
                  buf2[RFC2047_MAX_ENCODED_WORD_LENGTH+1];
    Tcl_EncodingState state;
    int i, consumed, wrote, d;

    /*
     * Check if we must encode this
     */
    /*
     * NOTE(review): source text is missing between "for (i=0; i" and
     * "sizeof(buf)-1) {". What is left shows only that maxUse gets
     * limited to sizeof(buf)-1; the check announced by the comment above
     * is not visible.
     */
    for (i=0; i sizeof(buf)-1) {
        maxUse = sizeof(buf)-1;
    }

    /*
     * Try to convert to external encoding. The attempt fails unless all
     * length bytes of raw were consumed.
     */
    if (TCL_OK != Tcl_UtfToExternal(interp, enc, raw, length,
                                    TCL_ENCODING_START|TCL_ENCODING_END,
                                    &state, buf2, sizeof(buf2), &consumed,
                                    &wrote, NULL)
        || consumed != length) {
        return 0;
    }

    /*
     * Convert into quoted-printable, check that we have room all the time
     */
    snprintf(buf, sizeof(buf), "=?%s?Q?", charset);
    /*
     * NOTE(review): source text is missing between "i" and "= maxUse-2)".
     * The remainder shows that a byte is written either as '=' followed
     * by two digits from alphabetHEX or as itself, and that the function
     * gives up with 0 when a test against maxUse-2 succeeds.
     */
    for (i=0, d=strlen(buf); i= maxUse-2) {
                return 0;
            }
            buf[d++] = '=';
            buf[d++] = alphabetHEX[buf2[i]>>4];
            buf[d++] = alphabetHEX[buf2[i]&0xf];
        } else {
            buf[d++] = buf2[i];
        }
    }
    /* The loop stopped before every converted byte had been written */
    if (i < wrote) {
        return 0;
    }
    /* Close the encoded word and hand it over */
    buf[d++] = '?';
    buf[d++] = '=';
    Tcl_DStringAppend(dest, buf, d);
    return 1;
}

/*
*----------------------------------------------------------------------
 *
 * RatEncodeHeaderLine --
 *
 *      Encodes one header line according to MIME (rfc2047).
 *      The nameLength argument should tell how long the header name is in
 *      characters. This is so that the line folding can do its job properly.
 *
 * Results:
 *      A block of encoded header line. This block of data will be valid
 *      until the next call to this function. NULL is returned if line
 *      is NULL.
 *
 * Side effects:
 *      None.
 *
 * NOTE(review): source text is missing in two places in this part of
 *      the file, see the markers below.
 *
 *----------------------------------------------------------------------
 */

char*
RatEncodeHeaderLine (Tcl_Interp *interp, Tcl_Obj *line, int nameLength)
{
    /* ds holds the result and is emptied at the start of every call */
    static Tcl_DString ds;
    static int initialized = 0;
    Tcl_Obj **objv;
    /* pre starts out as the length of the header name */
    int i, objc, l, l1, pre = nameLength, maxUse;
    char *s;
    const char *charset;
    Tcl_Encoding enc;

    if (NULL == line) {
        return NULL;
    }
    if (!initialized) {
        Tcl_DStringInit(&ds);
        initialized = 1;
    } else {
        Tcl_DStringSetLength(&ds, 0);
    }

    /*
     * Find suitable encoding. The candidates are the elements of the
     * global array element option(charset_candidates).
     */
    Tcl_ListObjGetElements(interp, Tcl_GetVar2Ex(interp, "option", "charset_candidates", TCL_GLOBAL_ONLY), &objc, &objv);
    s = Tcl_GetStringFromObj(line, &l);
    /*
     * NOTE(review): source text is missing between "for (i=0; i" and
     * "0 && !isspace(s[l]); l--);". The code that selects charset and
     * enc and the head of the loop around the statements below are not
     * visible.
     */
    for (i=0; i0 && !isspace(s[l]); l--);
            if (0 == l) {
                l = RFC2047_MAX_LINE_LENGTH-pre;
            }
        }
        maxUse = RFC2047_MAX_LINE_LENGTH-pre;
        /*
         * Retry with less text until an encoded word can be made: back up
         * to the previous white space if there is one, otherwise raise
         * maxUse to 1024 and drop one byte.
         */
        while (!CreateEncWord(interp, enc, charset, s, l, &ds, maxUse)) {
            for (l1 = l-1; l1 > 0 && !isspace(s[l1]); l1--);
            if (0 < l1) {
                l = l1;
            } else {
                maxUse = 1024;
                l--;
            }
        }
        s += l;
        if (*s) {
            /* More text follows: start a new line in the output */
            Tcl_DStringAppend(&ds, "\r\n", 2);
            /*
             * NOTE(review): source text is missing between "pre" and
             * "personal) {". The end of RatEncodeHeaderLine and the
             * head of the routine that follows (it walks a list through
             * adrPtr->next) are not visible.
             */
            for (pre=0; isspace(*s) && prepersonal) {
            /*
             * A personal name holding any byte with the high bit set is
             * replaced by its encoded form.
             * NOTE(review): after the replacement cPtr points at the
             * buffer returned by RatEncodeHeaderLine and the scan simply
             * goes on from there; there is no break.
             */
            for (cPtr = adrPtr->personal; *cPtr; cPtr++) {
                if (*cPtr & 0x80) {
                    oPtr = Tcl_NewStringObj(adrPtr->personal, -1);
                    cPtr = RatEncodeHeaderLine(interp, oPtr, 0);
                    Tcl_DecrRefCount(oPtr);
                    ckfree(adrPtr->personal);
                    adrPtr->personal = cpystr(cPtr);
                }
            }
        }
        adrPtr = adrPtr->next;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * RatGetEncoding --
 *
 *      Return the tcl-encoding attached to the given name. This name
 *      may be mapped from a MIME-name into a tcl-name: it is converted
 *      to lower case and looked up in the global array charsetMapping,
 *      and used as it is when the array has no such element.
*
 * Results:
 *      A Tcl_Encoding token, or NULL if name is NULL or Tcl_GetEncoding()
 *      does not know the encoding. A non-NULL encoding must be freed by
 *      the caller by calling Tcl_FreeEncoding().
 *
 * Side effects:
 *      None.
 *
 *
 *----------------------------------------------------------------------
 */

Tcl_Encoding
RatGetEncoding(Tcl_Interp *interp, const char *name)
{
    const char *tclName;
    char lname[256];

    if (NULL == name) {
        return NULL;
    }

    /*
     * The mapping array is indexed by lower-case names; names longer
     * than the buffer are truncated.
     */
    strlcpy(lname, name, sizeof(lname));
    lcase(lname);
    tclName = Tcl_GetVar2(interp, "charsetMapping", lname, TCL_GLOBAL_ONLY);
    return Tcl_GetEncoding(interp, tclName ? tclName : lname);
}

/*
 *----------------------------------------------------------------------
 *
 * RatCheckEncoding --
 *
 *      Check if the given encoding can encode the given string
 *
 * Results:
 *      Non-zero if all characters in the given string can be encoded
 *      successfully. Zero if the encoding is unknown, if a character
 *      has no representation in it, or if the conversion stops for any
 *      other reason before the whole string has been read.
 *
 * Side effects:
 *      None.
 *
 * Notes:
 *      A negative length means that string is NUL-terminated, as in the
 *      Tcl conversion routines.
 *
 *----------------------------------------------------------------------
 */

static int
RatCheckEncoding(Tcl_Interp *interp, char *encoding_name,
                 const char *string, int length)
{
    Tcl_EncodingState state;
    Tcl_Encoding enc;
    char buf[1024];
    int flags = TCL_ENCODING_STOPONERROR|TCL_ENCODING_START;
    int ret, in, ok = 1;

    if (NULL == (enc = RatGetEncoding(interp, encoding_name))) {
        return 0;
    }
    if (length < 0) {
        length = (int)strlen(string);
    }

    /*
     * The converted bytes are of no interest, so a small scratch buffer
     * is reused for every chunk.
     */
    while (length > 0) {
        in = 0;
        ret = Tcl_UtfToExternal(interp, enc, string, length, flags,
                                &state, buf, sizeof(buf), &in, NULL, NULL);
        if (TCL_OK == ret) {
            break;
        }

        /*
         * Only a full scratch buffer is a reason to go on. Every other
         * status is final, and a call that consumed nothing would make
         * this loop spin forever on the same bytes.
         */
        if (TCL_CONVERT_NOSPACE != ret || in <= 0) {
            ok = 0;
            break;
        }
        string += in;
        length -= in;

        /* Later chunks continue the stream kept in state */
        flags &= ~TCL_ENCODING_START;
    }
    Tcl_FreeEncoding(enc);
    return ok;
}

/*
 *----------------------------------------------------------------------
 *
 * RatCheckEncodingsCmd --
 *
 *      See ../doc/interface for a description of arguments and result.
 *
 * Results:
 *      See above
 *
 * Side effects:
 *      None.
*
 * NOTE(review): source text is missing in this part of the file, see the
 *      marker in the body below.
 *
 *----------------------------------------------------------------------
 */

int
RatCheckEncodingsCmd(ClientData dummy, Tcl_Interp *interp, int objc,
                     Tcl_Obj *const objv[])
{
    int i, listLength, srcLen;
    Tcl_Obj *oPtr, *vPtr;
    char *src;

    if (3 != objc) {
        Tcl_AppendResult(interp, "Usage: ", Tcl_GetString(objv[0]), \
                         " variable charsets", (char*) NULL);
        return TCL_ERROR;
    }

    /*
     * objv[1] names the variable holding the text and objv[2] is the
     * list of charsets.
     * NOTE(review): vPtr is handed to Tcl_GetStringFromObj() without a
     * NULL check - confirm what happens when the variable does not exist.
     */
    vPtr = Tcl_GetVar2Ex(interp, Tcl_GetString(objv[1]), NULL, 0);
    Tcl_ListObjLength(interp, objv[2], &listLength);
    src = Tcl_GetStringFromObj(vPtr, &srcLen);
    /*
     * NOTE(review): source text is missing between "for (i=0; i" and
     * " 0; l -= 3, cPtr += 3) {". The rest of RatCheckEncodingsCmd and
     * the head of the routine below (which declares buf, cPtr, l, ll and
     * dPtr) are not visible.
     *
     * What remains is a base64 encoder: every round turns up to three
     * input bytes into four characters from alphabet64, with '=' in
     * place of the characters that have no input byte behind them.
     */
    for (i=0; i 0; l -= 3, cPtr += 3) {
        buf[0] = alphabet64[cPtr[0] >> 2];
        buf[1] = alphabet64[((cPtr[0] << 4) + (l>1 ? (cPtr[1]>>4) : 0))&0x3f];
        buf[2] = l > 1 ? alphabet64[((cPtr[1]<<2) + (l>2 ? (cPtr[2]>>6) : 0)) & 0x3f] : '=';
        buf[3] = l > 2 ? alphabet64[cPtr[2] & 0x3f] : '=';
        Tcl_AppendToObj(dPtr, (char*)buf, 4);
        /* New line after 18 groups (72 characters) and after the last one */
        if (18 == ++ll || l < 4) {
            Tcl_AppendToObj(dPtr, "\n", 1);
            ll = 0;
        }
    }
    return dPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * RatUtf8to16 --
 *
 *      Convert the given utf-8 character to UCS-2
 *
 * Results:
 *      Returns the number of bytes consumed from src, which is
 *      between one and three.
 *
 * Side effects:
 *      None.
* * *---------------------------------------------------------------------- */ static int RatUtf8to16(const unsigned char *src, unsigned char *dst) { if (0 == (*src & 0x80)) { dst[0] = 0; dst[1] = *src; return 1; } else if (0xc0 == (*src & 0xe0)) { if (!(src[1] & 0x80)) { return 1; } dst[0] = (src[0] & 0x1f) >> 2; dst[1] = ((src[0] & 0x03) << 6) + (src[1] & 0x3f); return 2; } else if (0xe0 == (*src & 0xf0)) { if (!(src[1] & 0x80) && !(src[2] & 0x80)) { return 1; } dst[0] = ((src[0] & 0x0f) << 4) + ((src[1] & 0x3f) >> 2); dst[1] = ((src[1] & 0x03) << 6) + (src[2] & 0x3f); return 3; } else { dst[0] = 0; dst[1] = *src; return 1; } } /* *---------------------------------------------------------------------- * * RatUtf16to8 -- * * Convert the given UCS-2 character to utf-8 * * Results: * Returns the length of the generated string on success. * On failure a negative number is returned. * * Side effects: * None. * * *---------------------------------------------------------------------- */ static int RatUtf16to8(const unsigned char *src, unsigned char *dst) { if (src[0] >= 0x08) { dst[0] = 0xe0 | (src[0] >> 4); dst[1] = 0x80 | ((src[0] & 0x0f) << 2) | (src[1] >> 6); dst[2] = 0x80 | (src[1] & 0x3f); return 3; } else if (src[0] || src[1] > 0x7f) { dst[0] = 0xc0 | (src[0] << 2) | (src[1] >> 6); dst[1] = 0x80 | (src[1] & 0x3f); return 2; } else { dst[0] = src[1]; return 1; } } /* *---------------------------------------------------------------------- * * RatUtf8toMutf7 -- * * Convert the given utf-8 encoded text to modified utf-7 * * Results: * Returns a pointer to a static buffer containing the new text * * Side effects: * None. 
* * *---------------------------------------------------------------------- */ char* RatUtf8toMutf7(const char *signed_src) { static unsigned char *dst = NULL; static int dstlen = 0; unsigned char buf[3], *src = (unsigned char*)signed_src; int len = 0, overflow = 0; if (dstlen < strlen((char*)src)*3+1) { dstlen = strlen((char*)src)*3; dst = (unsigned char *)ckrealloc(dst, dstlen); } while (*src) { if ('&' == *src) { if (dstlen <= len+2) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } dst[len++] = '&'; dst[len++] = '-'; src++; } else if (*src & 0x80) { if (dstlen <= len+6) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } dst[len++] = '&'; do { if (dstlen <= len+5) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } if (overflow) { buf[0] = buf[3]; if (*src & 0x80) { src += RatUtf8to16(src, buf+1); } else { buf[1] = buf[2] = 0; } overflow = 0; } else { src += RatUtf8to16(src, buf); if (*src & 0x80) { src += RatUtf8to16(src, buf+2); overflow = 1; } else { buf[2] = buf[3] = 0; } } dst[len++] = modified64[buf[0] >> 2]; dst[len++] = modified64[((buf[0] << 4) + (buf[1]>>4)) & 0x3f]; if (buf[1] || buf[2]) { dst[len++] = modified64[((buf[1]<<2) + (buf[2]>>6)) & 0x3f]; if (buf[2]) { dst[len++] = modified64[buf[2] & 0x3f]; } } } while (*src & 0x80 || overflow); if (strchr(modified64, *src) || '\0' == *src) { dst[len++] = '-'; } } else { if (dstlen <= len+1) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } dst[len++] = *src++; } } dst[len] = '\0'; return (char*)dst; } /* *---------------------------------------------------------------------- * * RatMutf7toUtf8 -- * * Convert the given modified utf-7 encoded text to utf-8 * * Results: * Returns the length of the generated string on success. * On failure a negative number is returned. * * Side effects: * None. 
* * *---------------------------------------------------------------------- */ char* RatMutf7toUtf8(const char *signed_src) { static unsigned char *dst = NULL; static int dstlen = 0; unsigned char utf16[2], lbuf[4], *src = (unsigned char*)signed_src; int i, l, len=0, odd; if (dstlen < strlen((char*)src)*3) { dstlen = strlen((char*)src)*3; dst = (unsigned char *)ckrealloc(dst, dstlen); } while (*src) { if (len >= dstlen) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } if ('&' == *src && '-' == src[1]) { dst[len++] = '&'; src += 2; } else if ('&' == *src) { src++; odd = 0; do { for (i=0; i<4; i++) { if (strchr(modified64, *src)) { lbuf[i] = strchr(modified64, *src++) - modified64; } else { lbuf[i] = 0; } } if (odd) { odd = 0; if (len >= dstlen+6) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } utf16[1] = (lbuf[0] << 2) | (lbuf[1] >> 4); len += RatUtf16to8(utf16, dst+len); utf16[0] = (lbuf[1] << 4) | (lbuf[2] >> 2); utf16[1] = (lbuf[2] << 6) | lbuf[3]; if (utf16[0] != 0 || utf16[1] != 0) { l = RatUtf16to8(utf16, dst+len); len += l; } } else { if (len >= dstlen+3) { dstlen += 128; dst = (unsigned char *)ckrealloc(dst, dstlen); } utf16[0] = (lbuf[0] << 2) | (lbuf[1] >> 4); utf16[1] = (lbuf[1] << 4) | (lbuf[2] >> 2); len += RatUtf16to8(utf16, dst+len); utf16[0] = (lbuf[2] << 6) | lbuf[3]; odd = 1; } } while (strchr(modified64, *src)); if ('-' == *src) { src++; } } else { dst[len++] = *src++; } } dst[len] = '\0'; return (char*)dst; } /* *---------------------------------------------------------------------- * * RatEncodeQP - * * Encode the given text to QP * * Results: * Returns an intialized Tcl_DString pointer. It is up to the caller to * free this when not needing it anymore. * * Side effects: * None. 
* * *---------------------------------------------------------------------- */ Tcl_DString* RatEncodeQP(const unsigned char *line) { Tcl_DString *ds = (Tcl_DString*)ckalloc(sizeof(*ds)); const unsigned char *c; unsigned char buf[4]; Tcl_DStringInit(ds); for (c=line; *c; c++) { if ('=' == *c || 0x80 <= *c) { snprintf(buf, sizeof(buf), "=%02X", *c); Tcl_DStringAppend(ds, buf, 3); } else { Tcl_DStringAppend(ds, c, 1); } } return ds; } /* *---------------------------------------------------------------------- * * RatEncodeQPCmd -- * * See ../doc/interface * * Results: * A standard tcl result * * Side effects: * None * * *---------------------------------------------------------------------- */ int RatEncodeQPCmd(ClientData dummy, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]) { Tcl_Encoding enc; Tcl_DString ext, *encoded; if (objc != 3) { Tcl_AppendResult(interp, "Bad usage", TCL_STATIC); return TCL_ERROR; } enc = Tcl_GetEncoding(interp, Tcl_GetString(objv[1])); Tcl_UtfToExternalDString(enc, Tcl_GetString(objv[2]), -1, &ext); encoded = RatEncodeQP(Tcl_DStringValue(&ext)); Tcl_DStringFree(&ext); Tcl_DStringResult(interp, encoded); Tcl_FreeEncoding(enc); ckfree(encoded); return TCL_OK; } /* *---------------------------------------------------------------------- * * RatDecodeQP - * * Dencode the given text from QP * * Results: * Returns a pointer to a string. This string has been allocated with * ckalloc and it is up to the caller to free it when not needing it. * * Side effects: * None. 
* * *---------------------------------------------------------------------- */ unsigned char* RatDecodeQP(unsigned char *line) { unsigned char *s, *d; d = s = line; while (*s) { if ('=' == *s && isxdigit(s[1]) && isxdigit(s[2])) { *d++ = ((strchr(alphabetHEX, s[1])-alphabetHEX)<<4) + (strchr(alphabetHEX, s[2])-alphabetHEX); s += 3; } else { *d++ = *s++; } } *d = '\0'; return line; } /* *---------------------------------------------------------------------- * * RatDecodeQPCmd -- * * See ../doc/interface * * Results: * A standard tcl result * * Side effects: * None * * *---------------------------------------------------------------------- */ int RatDecodeQPCmd(ClientData dummy, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]) { Tcl_Encoding enc; Tcl_DString utf; char *text; if (objc != 3) { Tcl_AppendResult(interp, "Bad usage", TCL_STATIC); return TCL_ERROR; } enc = Tcl_GetEncoding(interp, Tcl_GetString(objv[1])); text = cpystr(Tcl_GetString(objv[2])); RatDecodeQP(text); Tcl_ExternalToUtfDString(enc, text, -1, &utf); ckfree(text); Tcl_DStringResult(interp, &utf); Tcl_FreeEncoding(enc); return TCL_OK; } /* * Test code for Mutf7 <-> utf8 functions static void Test(unsigned char *in) { unsigned char stage1[1024], stage2[1024]; printf("In: %s\n", in); fflush(stdin); RatUtf8toMutf7(in, stage1, sizeof(stage1)); printf("Stage1: %s\n", stage1); fflush(stdin); RatMutf7toUtf8(stage1, stage2, sizeof(stage2)); printf("Stage2: %s\n", stage2); fflush(stdin); if (strcmp(stage2, in)) { printf("ERROR\n"); } printf("\n"); } int main() { Test("fÃ¥r"); Test("RäksmörgÃ¥s"); Test("Ã¥"); Test("åä"); Test("åäö"); Test("åäöå"); Test("åäöåä"); Test("åäöåäö"); return 0; } */