/*
* File: utf7.c
*
* Author: Ulli Horlacher (framstag@rus.uni-stuttgart.de)
*
* History:
*
* 1995-08-12 Framstag initial version
* 1996-03-22 Framstag replaced some unsigned char with char
* 1996-04-04 Framstag fixed memory leak in utf2iso
* 1997-02-23 Framstag modified str_* function names
* 1997-07-04 Framstag added NULL argument for utf2iso
* 1997-07-10 Framstag '=' is not allowed in mbase64!
* bugfix: corrupt UTF7 string will now be ignored
* 1998-07-21 Framstag allow single spaces in UTF7 strings
* 1998-10-29 Framstag iso2utf -> iso2utf7
* 1999-03-13 Framstag added uni2utf() and utf2uni()
*
* UTF-7 and Unicode coding routines for the sendfile package.
* Look at utf7.h for a list of the functions.
*
* Copyright © 1995-1999 Ulli Horlacher
* This file is covered by the GNU General Public License
*/
#include <stdio.h>
#include <stdlib.h>
#include "string.h" /* extended string functions */
#include "utf7.h"
/*
 * utf2iso_append - append a string to every output string that is in use
 *
 * INPUT:  s     - string to append
 *
 * OUTPUT: iso, show, shell - each non-NULL string gets s appended
 */
static void utf2iso_append(char *iso, char *show, char *shell, const char *s) {
  if (iso) strcat(iso, s);
  if (show) strcat(show, s);
  if (shell) strcat(shell, s);
}

/*
 * utf2iso - UTF-7 to ISO Latin-1 decoding
 *
 * INPUT:  fnf   - unix file name flag; if non-zero, '/' is substituted
 *                 with '_' and the results "." and ".." are rewritten
 *                 to "_" and "__"
 *         utf   - UTF-7 encoded string
 *
 * OUTPUT: iso   - ISO Latin-1 string
 *         show  - ISO Latin-1 string without control codes
 *         shell - ISO Latin-1 string without control codes and meta characters
 *
 * RETURN: 2 digit binary code
 *         - if no digit is set:            no special chars found
 *         - if 1st digit (value 1) is set: a character or name had to be
 *           substituted (non Latin-1 Unicode char, '/', '\0', "." or "..")
 *         - if 2nd digit (value 2) is set: meta chars or control code
 *           chars found
 *
 * REMARK: iso, show and shell may be NULL in which case they will be ignored.
 *         No buffer sizes are passed in, so the caller has to supply
 *         output buffers that are large enough for the decoded string.
 *         A "+..." section that is not closed by '-' is decoded up to the
 *         end of the string instead of scanning past the terminating NUL.
 */
int utf2iso(int fnf, char *iso, char *show, char *shell, char *utf) {
  int ucc,                    /* Unicode character count */
      flags = 0;              /* output flags */
  char *cp, *cp2, *cp3;       /* char pointers for positioning substrings */
  char mbase_part[LEN_UTF];   /* mbase64 string */
  pstr_t *uni_part;           /* Unicode part pstring */
  size_t part_len;            /* length of the current mbase64 section */

  /* initialize the strings */
  if (iso) *iso = 0;
  if (show) *show = 0;
  if (shell) *shell = 0;
  uni_part = pstr_create(LEN_UNI);

  /* loop over the UTF-7 encoded string to find mbase64 parts */
  for (cp = utf, cp2 = utf + 1; *cp != 0; cp++, cp2++) {

    /* plain character: hand it over as ISO Latin-1 char */
    if (*cp != '+') {
      add_char(fnf, iso, show, shell, *cp, &flags);
      continue;
    }

    /* "+-" (or a '+' at the very end) stands for a literal '+' */
    if (*cp2 == '-' || *cp2 == 0) {
      utf2iso_append(iso, show, shell, "+");
      /* still more string to parse? then skip the '-' as well */
      if (*cp2 != 0) {
        cp++;
        cp2++;
      }
      continue;
    }

    /* find end of mbase64 part; never run past the end of the string */
    for (cp3 = cp2; *cp3 != '-' && *cp3 != 0; cp3++)
      ;

    /* cut out mbase64 part string; keep room for the terminating NUL and
       for the two padding characters decode_mbase64 may append in place */
    part_len = (size_t)(cp3 - cp2);
    if (part_len > sizeof mbase_part - 3) part_len = sizeof mbase_part - 3;
    *mbase_part = 0;
    strncat(mbase_part, cp2, part_len);

    /* decode it to Unicode */
    decode_mbase64(uni_part, mbase_part);

    /* loop over Unicode pstring (payload starts at index 1, two bytes per
       character, high byte first) to look for ISO Latin-1 chars */
    for (ucc = 1; ucc <= uni_part->length; ucc += 2) {
      if (uni_part->string[ucc] == 0) {
        /* high byte is 0: the low byte is a ISO Latin-1 char */
        add_char(fnf, iso, show, shell, uni_part->string[ucc + 1], &flags);
      } else {
        /* substitute non ISO Latin-1 Unicode character with '_' */
        flags = flags | 1;
        utf2iso_append(iso, show, shell, "_");
      }
    }

    /* unterminated section: the input ends here */
    if (*cp3 == 0) break;

    /* continue behind the closing '-' (the loop increment steps over it) */
    cp = cp3;
    cp2 = cp3 + 1;
  }

  /* dont allow "." or ".." as file names */
  if (fnf) {
    char *out[3];
    int k;

    out[0] = iso;
    out[1] = show;
    out[2] = shell;
    for (k = 0; k < 3; k++) {
      if (!out[k]) continue;
      if (str_eq(out[k], ".")) {
        strcpy(out[k], "_");
        flags = flags | 1;
      } else if (str_eq(out[k], "..")) {
        strcpy(out[k], "__");
        flags = flags | 1;
      }
    }
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);
  return(flags);
}
/*
 * add_char - append one character to the three output strings, replacing
 *            it with '_' wherever it is not acceptable for that string
 *
 * INPUT:  fnf   - unix file name flag; if non-zero substitute '/' with '_'
 *         c     - char to add
 *         flags - current return flags of the utf2iso function
 *
 * OUTPUT: iso   - ISO Latin-1 string
 *         show  - ISO Latin-1 string without control codes
 *         shell - ISO Latin-1 string without control codes and meta characters
 *         flags - value 1 is or'ed in if '\0' (or '/' with fnf set) was
 *                 substituted, value 2 if a control code or shell meta
 *                 character was substituted
 *
 * REMARK: iso, show and shell may be NULL; such a string is skipped.
 *         A substitution made for one string carries over to the
 *         following ones (iso -> show -> shell).
 */
void add_char(int fnf, char *iso, char *show, char *shell, char c, int *flags) {
  /* (bourne) shell meta characters */
  static const char shell_meta[] = "\"!#$&'()*?\\`| ";
  unsigned char ch = (unsigned char)c;   /* current (maybe substituted) char */
  char piece[2] = { 0, 0 };              /* one-character string to append */

  /* not valid inside a UNIX file name? */
  if (ch == 0 || (fnf && ch == '/')) {
    *flags |= 1;
    ch = '_';
  }
  piece[0] = (char)ch;
  if (iso) strcat(iso, piece);

  /* control code (0..31 or 127..160)? */
  if (ch < 32 || (ch > 126 && ch < 161)) {
    *flags |= 2;
    ch = '_';
  }
  piece[0] = (char)ch;
  if (show) strcat(show, piece);

  /* shell meta character? */
  if (strchr(shell_meta, (char)ch) != NULL) {
    *flags |= 2;
    ch = '_';
  }
  piece[0] = (char)ch;
  if (shell) strcat(shell, piece);
}
/*
 * iso2utf - ISO Latin-1 to UTF-7 encoding
 *
 * Shorthand for iso2utf7() with the withspace flag set to 1.
 *
 * INPUT:  iso_name - ISO Latin-1 string
 *
 * OUTPUT: utf_name - UTF-7 encoded string
 */
void iso2utf(char *utf_name, char *iso_name) {
  const int withspace = 1;   /* value handed on as iso2utf7's withspace flag */

  iso2utf7(utf_name, iso_name, withspace);
}
/*
 * iso2utf7 - ISO Latin-1 to UTF-7 encoding
 *
 * Characters of the D and O sets are copied as they are, '+' is written
 * as "+-" and every run of other characters is written as "+<mbase64>-".
 *
 * INPUT:  iso_name  - ISO Latin-1 string
 *         withspace - if non-zero, a single space that is neither the
 *                     first nor the last character and is not followed
 *                     by another space is copied as it is instead of
 *                     being mbase64 encoded
 *
 * OUTPUT: utf_name  - UTF-7 encoded string
 *
 * REMARK: No size is passed for utf_name; the caller has to supply a
 *         buffer that is large enough for the encoded string.
 *         A run that would not fit into the internal buffers is split
 *         into several consecutive "+...-" sections instead of
 *         overflowing them.
 */
void iso2utf7(char *utf_name, char *iso_name, int withspace) {
  static const char DO_set[] =        /* mbase64 D and O sets */
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "01234567890"
    "'(),-./:?!\"#$%&*;<>@[]^_`{}|";
  char *cp, *cp2;                     /* string pointers */
  char iso_part[LEN_ISO],             /* ISO Latin-1 part string */
       mbase_part[LEN_UTF];           /* mbase64 part string */
  pstr_t *uni_part;                   /* Unicode part pstring */
  size_t max_run, by_mbase;           /* longest run the buffers can hold */

  /* iso_part needs room for the NUL; every ISO char becomes 2 Unicode
     bytes, i.e. 8/3 mbase64 characters, and mbase_part needs a NUL too */
  max_run = sizeof iso_part - 1;
  by_mbase = (sizeof mbase_part - 1) * 3 / 8;
  if (by_mbase < max_run) max_run = by_mbase;

  /* initialize the strings */
  utf_name[0] = 0;
  uni_part = pstr_create(LEN_UNI);

  /* scan the iso-string */
  for (cp = iso_name; *cp != 0; cp++) {

    /* char in DO set (or is it a middle single space)? copy it */
    if (strchr(DO_set, *cp) ||
        (cp != iso_name && *(cp+1) && withspace && *cp == 32 && *(cp+1) != 32)) {
      strncat(utf_name, cp, 1);
      continue;
    }

    /* '+' itself uses the short encoding "+-" */
    if (*cp == '+') {
      strcat(utf_name, "+-");
      continue;
    }

    /* degenerate buffer sizes: nothing can be encoded, avoid looping */
    if (max_run == 0) break;

    /* add UTF-7 shift char */
    strcat(utf_name, "+");

    /* search for the next char in the DO set (or the end of the string) */
    cp2 = cp + 1;
    while (*cp2 != 0 && strchr(DO_set, *cp2) == NULL) cp2++;

    /* too long for the part buffers? encode the rest in a further section */
    if ((size_t)(cp2 - cp) > max_run) cp2 = cp + max_run;

    /* cut out the iso-part string */
    *iso_part = 0;
    strncat(iso_part, cp, (size_t)(cp2 - cp));

    /* translate it to Unicode */
    iso2uni(uni_part, iso_part);

    /* encode to mbase64 */
    encode_mbase64(mbase_part, uni_part);

    /* add it to the utf string */
    strcat(utf_name, mbase_part);
    strcat(utf_name, "-");

    /* the loop increment moves on to the first char behind the run */
    cp = cp2 - 1;
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);
}
/*
 * uni2utf - Unicode to UTF-7 encoding
 *
 * The whole Unicode pstring is written as one section "+<mbase64>-".
 *
 * INPUT:  uni - unicode pstring
 *
 * OUTPUT: utf - UTF-7 encoded string
 */
void uni2utf(char *utf, pstr_t *uni) {
  size_t end;   /* index of the terminating NUL */

  /* shift char first, the mbase64 text goes right behind it */
  utf[0] = '+';
  encode_mbase64(utf + 1, uni);

  /* close the section with '-' */
  end = strlen(utf);
  utf[end] = '-';
  utf[end + 1] = 0;
}
/*
 * utf2uni - UTF-7 to Unicode decoding
 *
 * INPUT:  utf - UTF-7 encoded string
 *
 * OUTPUT: uni - unicode pstring; the decoded characters are added to it
 *               (it is not reset in this function)
 *
 * REMARK: Decoding stops silently as soon as adding to uni fails.
 *         A "+..." section that is not closed by '-' is decoded up to the
 *         end of the string instead of scanning past the terminating NUL.
 *         The temporary pstring is released on every way out.
 */
void utf2uni(pstr_t *uni, char *utf) {
  char *cp, *cp2, *cp3;       /* char pointers for positioning substrings */
  char mbase_part[LEN_UTF];   /* mbase64 string */
  pstr_t *uni_part;           /* Unicode part pstring */
  size_t part_len;            /* length of the current mbase64 section */

  uni_part = pstr_create(LEN_UNI);

  /* loop over the UTF-7 encoded string to find mbase64 parts */
  for (cp = utf, cp2 = utf + 1; *cp != 0; cp++, cp2++) {

    /* plain character: add it as 0 byte followed by the ISO Latin-1 char */
    if (*cp != '+') {
      if (pstr_addchar(uni, 0) < 0) break;
      if (pstr_addchar(uni, *cp) < 0) break;
      continue;
    }

    /* "+-" (or a '+' at the very end) stands for a literal '+' */
    if (*cp2 == '-' || *cp2 == 0) {
      if (pstr_addchar(uni, 0) < 0) break;
      if (pstr_addchar(uni, '+') < 0) break;
      /* still more string to parse? then skip the '-' as well */
      if (*cp2 != 0) {
        cp++;
        cp2++;
      }
      continue;
    }

    /* find end of mbase64 part; never run past the end of the string */
    for (cp3 = cp2; *cp3 != '-' && *cp3 != 0; cp3++)
      ;

    /* cut out mbase64 part string; keep room for the terminating NUL and
       for the two padding characters decode_mbase64 may append in place */
    part_len = (size_t)(cp3 - cp2);
    if (part_len > sizeof mbase_part - 3) part_len = sizeof mbase_part - 3;
    *mbase_part = 0;
    strncat(mbase_part, cp2, part_len);

    /* decode it to Unicode */
    decode_mbase64(uni_part, mbase_part);

    /* add unicode part string */
    if (pstr_addpstring(uni, uni_part) < 0) break;

    /* unterminated section: the input ends here */
    if (*cp3 == 0) break;

    /* continue behind the closing '-' (the loop increment steps over it) */
    cp = cp3;
    cp2 = cp3 + 1;
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);
}
/*
 * iso2uni - transform ISO Latin-1 to Unicode
 *
 * Every ISO Latin-1 char is written as two bytes: a 0 byte followed by
 * the char itself.
 *
 * INPUT:  iso - ISO Latin-1 string
 *
 * OUTPUT: uni - Unicode pstring; its length is reset to 0 first
 *
 * REMARK: The conversion stops silently as soon as adding to uni fails.
 */
void iso2uni(pstr_t *uni, char *iso) {
  const char *src = iso;   /* read position in the iso string */

  /* start with an empty Unicode pstring */
  uni->length = 0;

  while (*src != 0) {
    /* high byte (always 0) first, then the ISO Latin-1 char byte */
    if (pstr_addchar(uni, 0) < 0 || pstr_addchar(uni, *src) < 0) return;
    src++;
  }
}
/*
* The functions decode_mbase64 and encode_mbase64 are based on encdec.c
* by Jürgen Hägg which has been debugged and rewritten to use as C functions.
* The original header was:
*
* Written by Jürgen Hägg 1993 <jh@efd.lth.se>
* Version 1.1
*
* (This filter is written for use in a MTA written in perl.)
*
* Please send comments and bugfixes when you find them.
* Permission to use and change this program is given for any purpose
* as long as this note remains unchanged.
*
* The usage() is the manual.
* Use encdec as you wish :-)
*
*/
/*
 * decode_mbase64 - decode a mbase64 string to a byte pstring
 *
 * INPUT:  instring  - mbase64 string without padding; it is only read
 *
 * OUTPUT: outstring - pstring with the decoded bytes; its length is reset
 *                     to 0 first and is always even on return (an odd
 *                     trailing byte is dropped)
 *
 * REMARK: The input is processed in groups of 4 characters. Characters
 *         missing in the last group are treated as '=' padding without
 *         touching instring. A group with a character that is not in the
 *         alphabet is skipped. A single left over character is ignored.
 */
void decode_mbase64(pstr_t *outstring, char *instring) {
  static const char
    vec[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
  size_t len = strlen(instring);
  size_t i;

  outstring->length = 0;

  /* a group needs at least 2 characters to yield a byte */
  for (i = 0; i + 1 < len; i += 4) {
    char quad[4];          /* current group, padded with '=' */
    unsigned char nw[3];   /* the (up to) 3 decoded bytes */
    long val = 0;          /* 24 bit value of the group */
    int j, num, err = 0;

    for (j = 0; j < 4; j++)
      quad[j] = (i + (size_t)j < len) ? instring[i + (size_t)j] : '=';

    /* number of bytes in this group: 2 chars -> 1, 3 chars -> 2, 4 -> 3 */
    if (quad[2] == '=') num = 1;
    else if (quad[3] == '=') num = 2;
    else num = 3;

    /* num bytes are carried by the first num+1 characters */
    for (j = 0; j <= num; j++) {
      const char *p = strchr(vec, quad[j]);
      if (p == NULL) {
        err = 1;
        break;
      }
      val += (long)(p - vec) << ((3 - j) * 6);
    }
    if (err) continue;

    /* split the value into bytes, most significant byte first */
    for (j = 2; j >= 0; j--) {
      nw[j] = (unsigned char)(val & 255);
      val >>= 8;
    }
    for (j = 0; j < num; j++) pstr_addchar(outstring, nw[j]);
  }

  /* Unicode characters are 2 bytes wide: drop an incomplete last one */
  if (outstring->length % 2 != 0) outstring->length--;
}
/*
 * encode_mbase64 - encode a byte pstring to a mbase64 string
 *
 * INPUT:  instring  - pstring; the bytes string[1] .. string[length]
 *                     are encoded
 *
 * OUTPUT: outstring - NUL terminated mbase64 string without '=' padding
 *
 * REMARK: No size is passed for outstring; the caller has to supply a
 *         buffer that is large enough (4 characters per 3 input bytes,
 *         plus the terminating NUL).
 */
void encode_mbase64(char *outstring, pstr_t *instring) {
  static const char
    alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  int pos = 1;                        /* the pstring payload starts at 1 */
  int remaining = instring->length;   /* bytes not yet encoded */
  int out = 0;                        /* write position in outstring */

  while (remaining > 0) {
    int take = (remaining < 3) ? remaining : 3;   /* bytes in this group */
    unsigned long group = 0;                      /* 24 bit group value */
    int k;

    /* pack up to 3 bytes, first byte in the highest position;
       missing bytes of the last group count as 0 */
    for (k = 0; k < 3; k++) {
      group <<= 8;
      if (k < take) group |= (unsigned char)instring->string[pos + k];
    }

    /* n bytes are written as n+1 characters, the padding is left out */
    for (k = 0; k <= take; k++)
      outstring[out++] = alphabet[(group >> (18 - 6 * k)) & 63];

    pos += take;
    remaining -= take;
  }

  outstring[out] = 0;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */