/* * File: utf7.c * * Author: Ulli Horlacher (framstag@rus.uni-stuttgart.de) * * History: * * 1995-08-12 Framstag initial version * 1996-03-22 Framstag replaced some unsigned char with char * 1996-04-04 Framstag fixed memory leak in utf2iso * 1997-02-23 Framstag modified str_* function names * 1997-07-04 Framstag added NULL argument for utf2iso * 1997-07-10 Framstag '=' is not allowed in mbase64! * bugfix: corrupt UTF7 string will now be ignored * 1998-07-21 Framstag allow single spaces in UTF7 strings * 1998-10-29 Framstag iso2utf -> iso2utf7 * 1999-03-13 Framstag added uni2utf() and utf2uni() * * UTF-7 and Unicode coding routines for the sendfile package. * Look at utf7.h for a list of the functions. * * Copyright © 1995-1999 Ulli Horlacher * This file is covered by the GNU General Public License */ #include #include #include "string.h" /* extended string functions */ #include "utf7.h" /* * utf2iso - UTF-7 to ISO Latin-1 decoding * * INPUT: fnf - unix file name flag; if >0 substitute '/' with '_' * utf - UTF-7 encoded string * * OUTPUT: iso - ISO Latin-1 string * show - ISO Latin-1 string without control codes * shell - ISO Latin-1 string without control codes and meta characters * * RETURN: 2 digit binary code * - if no digit is set: no special chars found * - if 1st digit is set: Unicode characters or '/' or '\0' found * - if 2nd digit is set: meta chars or control code chars found * * REMARK: iso, show and shell may be NULL in which case they will be ignored */ int utf2iso(int fnf, char *iso, char *show, char *shell, char *utf) { int ucc, /* Unicode character count */ flags=0; /* output flags */ char *cp, *cp2, *cp3, /* char pointers for positioning substrings */ mbase_part[LEN_UTF]; /* mbase64 string */ pstr_t *uni_part; /* Unicode part pstring */ /* initialisize the strings */ if (iso) *iso=0; if (show) *show=0; if (shell) *shell=0; uni_part=pstr_create(LEN_UNI); /* loop over the UTF-7 encoded string to find mbase64 parts */ for (cp=utf, cp2=utf+1; *cp!=0; cp++, cp2++) { /* mbase64 shift char? */ if (*cp=='+') { /* end of mbase64 part? */ if (*cp2=='-' || *cp2==0) { if (iso) strcat(iso,"+"); if (show) strcat(show,"+"); if (shell) strcat(shell,"+"); /* still more string to parse? */ if (*cp2!=0) { cp++; cp2++; } } else { /* find end of mbase64 part */ for (cp3=cp2; *cp3!='-'; cp3++); /* cut out mbase64 part string */ *mbase_part=0; strncat(mbase_part,cp2,cp3-cp2); /* decode it to Unicode */ decode_mbase64(uni_part,mbase_part); /* loop over Unicode pstring to look for ISO Latin-1 chars */ for (ucc=1; ucc<=uni_part->length; ucc+=2) { /* next character a ISO Latin-1 char? */ if (uni_part->string[ucc]==0) add_char(fnf,iso,show,shell,uni_part->string[ucc+1],&flags); else { /* substitute non valid Unicode character with '_' */ flags = flags|1; if (iso) strcat(iso,"_"); if (show) strcat(show,"_"); if (shell) strcat(shell,"_"); } } /* adjust the pointers */ cp = cp3; cp2 = ++cp3; } } else /* add a ISO Latin-1 char */ add_char(fnf,iso,show,shell,*cp,&flags); } /* dont allow "." or ".." as file names */ if (fnf) { if (iso && str_eq(iso,".")) { strcpy(iso,"_"); flags = flags|1; } if (show && str_eq(show,".")) { strcpy(show,"_"); flags = flags|1; } if (shell && str_eq(shell,".")) { strcpy(shell,"_"); flags = flags|1; } if (iso && str_eq(iso,"..")) { strcpy(iso,"__"); flags = flags|1; } if (show && str_eq(show,"..")) { strcpy(show,"__"); flags = flags|1; } if (shell && str_eq(shell,"..")) { strcpy(shell,"__"); flags = flags|1; } } /* free memory for no longer used Unicode pstring */ pstr_delete(uni_part); return(flags); } /* * add_char - add a char depending on its range * * INPUT: fnf - unix file name flag; if >0 substitute '/' with '_' * c - char to add * flags - return flags for utf2iso function * * OUTPUT: iso - ISO Latin-1 string * show - ISO Latin-1 string without control codes * shell - ISO Latin-1 string without control codes and meta characters * flags - return flags for utf2iso function */ void add_char(int fnf, char *iso, char *show, char *shell, char c, int *flags) { unsigned char sc[2]; /* string to add */ const char *meta="\"!#$&'()*?\\`| "; /* (bourne) shell meta characters */ /* build the string to add */ sc[0] = c; sc[1] = 0; /* is it a non valid char for a UNIX file name? */ if (*sc==0 || (*sc=='/' && fnf)) { *flags = *flags|1; *sc = '_'; } /* add the char to the iso-string */ if (iso) strcat(iso,(char *)sc); /* is it a control code? */ if (*sc<32 || (*sc>126 && *sc<161)) { *flags = *flags|2; *sc = '_'; } /* add the char to the show-string */ if (show) strcat(show,(char *)sc); /* is it a meta char? */ if (strchr(meta,*(char *)sc)) { *flags = *flags|2; *sc = '_'; } /* add the char to the shell-string */ if (shell) strcat(shell,(char *)sc); } /* * iso2utf - ISO Latin-1 to UTF-7 encoding * * INPUT: iso - ISO Latin-1 string * * OUTPUT: utf - UTF-7 encoded string */ void iso2utf(char *utf_name, char *iso_name) { iso2utf7(utf_name,iso_name,1); } /* * iso2utf7 - ISO Latin-1 to UTF-7 encoding * * INPUT: iso - ISO Latin-1 string * withspace - flag for encoding spaces * * OUTPUT: utf - UTF-7 encoded string */ void iso2utf7(char *utf_name, char *iso_name, int withspace) { char *cp, *cp2, /* string pointers */ *DO_set="abcdefghijklmnopqrstuvwxyz" /* mbase64 D and O sets */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "01234567890" "'(),-./:?!\"#$%&*;<>@[]^_`{}|"; char iso_part[LEN_ISO], /* ISO Latin-1 part string */ mbase_part[LEN_UTF]; /* mbase64 part string */ pstr_t *uni_part; /* Unicode part pstring */ /* initialisize the strings */ utf_name[0]=0; uni_part=pstr_create(LEN_UNI); /* scan the iso-string */ for (cp=iso_name; *cp!=0 ; cp++) { /* char in DO set (or is it a middle single space)? */ if (strchr(DO_set,*cp) || (cp!=iso_name && *(cp+1) && withspace && *cp==32 && *(cp+1)!=32)) strncat(utf_name,cp,1); else { /* add UTF-7 shift char */ strcat(utf_name,"+"); /* +- short encoding? */ if (*cp=='+') strcat(utf_name,"-"); else { /* search for the next char in the DO set */ cp2=cp; cp2++; while (strchr(DO_set,*cp2)==NULL && *cp2!=0) cp2++; /*while ((strchr(DO_set,(int)*cp2)==NULL) && (*cp2!=NULL)) cp2++;*/ /* cut out the iso-part string */ *iso_part=0; strncat(iso_part,cp,cp2-cp); /* translate it to Unicode */ iso2uni(uni_part,iso_part); /* encode to mbase64 */ encode_mbase64(mbase_part,uni_part); /* add it to the utf string */ strcat(utf_name,mbase_part); strcat(utf_name,"-"); cp=cp2-1; } } } /* free memory for no longer used Unicode pstring */ pstr_delete(uni_part); } /* * uni2utf - Unicode to UTF-7 encoding * * INPUT: uni - unicode pstring * * OUTPUT: utf - UTF-7 encoded string */ void uni2utf(char *utf, pstr_t *uni) { /* encode to mbase64 */ encode_mbase64(utf+1,uni); /* build utf7 string */ utf[0]='+'; strcat(utf,"-"); } /* * utf2uni - UTF-7 to Unicode decoding * * INPUT: utf - UTF-7 encoded string * * OUTPUT: uni - unicode pstring */ void utf2uni(pstr_t *uni, char *utf) { char *cp, *cp2, *cp3, /* char pointers for positioning substrings */ mbase_part[LEN_UTF]; /* mbase64 string */ pstr_t *uni_part; /* Unicode part pstring */ uni_part=pstr_create(LEN_UNI); /* loop over the UTF-7 encoded string to find mbase64 parts */ for (cp=utf, cp2=utf+1; *cp!=0; cp++, cp2++) { /* mbase64 shift char? */ if (*cp=='+') { /* short end of mbase64 part ("+-")? */ if (*cp2=='-' || *cp2==0) { if (pstr_addchar(uni,0)<0) return; if (pstr_addchar(uni,'+')<0) return; /* still more string to parse? */ if (*cp2!=0) { cp++; cp2++; } } else { /* find end of mbase64 part */ for (cp3=cp2; *cp3!='-'; cp3++); /* cut out mbase64 part string */ *mbase_part=0; strncat(mbase_part,cp2,cp3-cp2); /* decode it to Unicode */ decode_mbase64(uni_part,mbase_part); /* add unicode part string */ if (pstr_addpstring(uni,uni_part)<0) return; /* adjust the pointers */ cp = cp3; cp2 = ++cp3; } } else { /* add a ISO Latin-1 char */ if (pstr_addchar(uni,0)<0) return; if (pstr_addchar(uni,*cp)<0) return; } } /* free memory for no longer used Unicode pstring */ pstr_delete(uni_part); } /* * iso2uni - transform ISO Latin-1 to Unicode * * INPUT: iso - ISO Latin-1 string * * OUTPUT: uni - Unicode pstring */ void iso2uni(pstr_t *uni, char *iso) { char *cp; /* character pointer */ /* Unicode length is 0 at start */ uni->length=0; /* loop over iso string */ for (cp=iso; *cp!=0; cp++) { /* first byte of Unicode character is always 0 */ if (pstr_addchar(uni,0)<0) return; /* add the ISO Latin-1 char byte */ if (pstr_addchar(uni,*cp)<0) return; } } /* * The functions decode_mbase64 and encode_mbase64 are based on encdec.c * by Jürgen Hägg which has been debugged and rewritten to use as C functions. * The original header was: * * Written by Jürgen Hägg 1993 * Version 1.1 * * (This filter is written for use in a MTA written in perl.) * * Please send comments and bugfixes when you find them. * Permission to use and change this program is given for any purpose * as long as this note remains unchanged. * * The usage() is the manual. * Use encdec as you wish :-) * */ void decode_mbase64(pstr_t *outstring, char *instring) { int i, j, num, len, err; long d, val; char *p, *c; static char vec[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; unsigned char nw[4]; outstring->length = 0; len = strlen(instring); strcat(instring,"=="); for (i=0; i=0; j--) { nw[j] = val & 255; val >>= 8; } for (j=0; jlength/2 != outstring->length/2.) outstring->length--; } void encode_mbase64(char *outstring, pstr_t *instring) { int n = 0, iz, oz = 0, i; unsigned char c; long val = 0; unsigned char enc[4]; static char vec[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; for (iz=1; iz<=instring->length; iz++) { c=instring->string[iz]; if (n++<=2) { val <<= 8; val += c; continue; } for (i=0; i<4; i++) { enc[i] = val&63; val >>= 6; } for (i=3; i>=0; i--) outstring[oz++] = vec[(int)enc[i]]; n = 1; val = c; } if (n==1) { val <<= 16; for (i=0; i<4; i++) { enc[i] = val&63; val >>= 6; } enc[0] = enc[1] = 64; } if (n==2) { val <<= 8; for (i=0; i<4; i++) { enc[i] = val&63; val >>= 6; } enc[0] = 64; } if (n==3) for (i=0; i<4; i++) { enc[i] = val&63; val >>= 6; } if (n) for (i=3; i>=0; i--) { c = vec[(int)enc[i]]; if (c!='=') outstring[oz++] = c; } outstring[oz] = 0; }