/* ports//net/sendfile/work/sendfile-2.1a/src/utf7.c */

/*
 * File:	utf7.c
 *
 * Author:	Ulli Horlacher (framstag@rus.uni-stuttgart.de)
 *
 * History:	
 * 
 *   1995-08-12 Framstag	initial version
 *   1996-03-22 Framstag	replaced some unsigned char with char
 *   1996-04-04 Framstag	fixed memory leak in utf2iso
 *   1997-02-23 Framstag	modified str_* function names
 *   1997-07-04 Framstag	added NULL argument for utf2iso
 *   1997-07-10	Framstag	'=' is not allowed in mbase64!
 * 				bugfix: corrupt UTF7 string will now be ignored
 *   1998-07-21 Framstag	allow single spaces in UTF7 strings
 *   1998-10-29 Framstag	iso2utf -> iso2utf7
 *   1999-03-13 Framstag	added uni2utf() and utf2uni()
 *
 * UTF-7 and Unicode coding routines for the sendfile package.
 * Look at utf7.h for a list of the functions.
 *
 * Copyright © 1995-1999 Ulli Horlacher
 * This file is covered by the GNU General Public License
 */


#include <stdio.h>
#include <stdlib.h>

#include "string.h"     /* extended string functions */
#include "utf7.h"


/*
 * utf2iso - UTF-7 to ISO Latin-1 decoding
 *
 * INPUT:  fnf   - unix file name flag; if nonzero, '/' is substituted with
 *                 '_' and the names "." and ".." are rewritten to "_" and "__"
 *         utf	 - UTF-7 encoded string (NUL terminated)
 *
 * OUTPUT: iso	 - ISO Latin-1 string
 *         show	 - ISO Latin-1 string without control codes
 *         shell - ISO Latin-1 string without control codes and meta characters
 *
 * RETURN: bit mask
 *         - 0:           no special chars found
 *         - bit 0 (1):   a Unicode character outside ISO Latin-1, a '/'
 *                        (with fnf), a '\0' or a "." / ".." name was found
 *                        and substituted
 *         - bit 1 (2):   meta chars or control code chars found
 *
 * REMARK: iso, show and shell may be NULL in which case they will be ignored.
 *         The output strings are filled with strcat() and are not bounds
 *         checked here; the caller has to supply buffers that are big enough.
 *         A mbase64 part without a closing '-' is taken to extend to the end
 *         of the string.
 */
int utf2iso(int fnf, char *iso, char *show, char *shell, char *utf) {
  int ucc,			/* Unicode character count */
      flags=0;			/* output flags */
  size_t part_len;		/* length of the current mbase64 part */
  char *cp, *cp2, *cp3, 	/* char pointers for positioning substrings */
       mbase_part[LEN_UTF];	/* mbase64 string */
  pstr_t *uni_part;		/* Unicode part pstring */

  /* initialize the strings */
  if (iso)   *iso=0;
  if (show)  *show=0;
  if (shell) *shell=0;
  uni_part=pstr_create(LEN_UNI);

  /* loop over the UTF-7 encoded string to find mbase64 parts;
     cp2 always points to the character following cp */
  for (cp=utf, cp2=utf+1; *cp!=0; cp++, cp2++) {

    /* mbase64 shift char? */
    if (*cp=='+') {

      /* "+-" (or a '+' at the very end) stands for a literal '+' */
      if (*cp2=='-' || *cp2==0) {

        if (iso)   strcat(iso,"+");
        if (show)  strcat(show,"+");
        if (shell) strcat(shell,"+");

        /* still more string to parse? then skip the '-' as well */
        if (*cp2!=0) {
          cp++;
          cp2++;
        }

      } else {

        /* find end of mbase64 part; also stop at the end of the string so
           that a missing '-' cannot make us read beyond the input */
        for (cp3=cp2; *cp3!='-' && *cp3!=0; cp3++);

        /* cut out mbase64 part string, never writing more than mbase_part
           can hold: keep room for the terminating NUL plus two bytes of
           slack for the decoder, which may pad its input in place */
        part_len = (size_t)(cp3-cp2);
        if (part_len > sizeof(mbase_part)-3) part_len = sizeof(mbase_part)-3;
        *mbase_part=0;
        strncat(mbase_part,cp2,part_len);

        /* decode it to Unicode */
        decode_mbase64(uni_part,mbase_part);

        /* loop over Unicode pstring (2 bytes per character, pstring index
           starts at 1) to look for ISO Latin-1 chars */
        for (ucc=1; ucc<=uni_part->length; ucc+=2) {

          /* high byte 0 means the character is in the ISO Latin-1 range */
          if (uni_part->string[ucc]==0)
            add_char(fnf,iso,show,shell,uni_part->string[ucc+1],&flags);
          else {

            /* substitute non valid Unicode character with '_' */
            flags = flags|1;
            if (iso)   strcat(iso,"_");
            if (show)  strcat(show,"_");
            if (shell) strcat(shell,"_");
          }
        }

        /* unterminated part: the whole rest of the input is consumed */
        if (*cp3==0) break;

        /* adjust the pointers: continue behind the closing '-' */
        cp = cp3;
        cp2 = cp3+1;

      }
    } else

      /* add a ISO Latin-1 char */
      add_char(fnf,iso,show,shell,*cp,&flags);
  }

  /* dont allow "." or ".." as file names */
  if (fnf) {
    if (iso   && str_eq(iso,".")) {
      strcpy(iso,"_");
      flags = flags|1;
    }
    if (show  && str_eq(show,".")) {
      strcpy(show,"_");
      flags = flags|1;
    }
    if (shell && str_eq(shell,".")) {
      strcpy(shell,"_");
      flags = flags|1;
    }
    if (iso   && str_eq(iso,"..")) {
      strcpy(iso,"__");
      flags = flags|1;
    }
    if (show  && str_eq(show,"..")) {
      strcpy(show,"__");
      flags = flags|1;
    }
    if (shell && str_eq(shell,"..")) {
      strcpy(shell,"__");
      flags = flags|1;
    }
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);

  return(flags);
}


/*
 * add_char - append one character to the three output strings, replacing
 *            it by '_' step by step as the strings get more restrictive
 *
 * INPUT:  fnf   - unix file name flag; if nonzero substitute '/' with '_'
 *         c	 - char to add
 *         flags - current return flags of the utf2iso function
 *
 * OUTPUT: iso	 - ISO Latin-1 string
 *         show	 - ISO Latin-1 string without control codes
 *         shell - ISO Latin-1 string without control codes and meta characters
 *         flags - bit 0 (1) is set when a '\0' or (with fnf) a '/' was
 *                 replaced, bit 1 (2) when a control code or a shell meta
 *                 character was replaced
 *
 * REMARK: iso, show and shell may each be NULL; such a string is skipped.
 *         A substitution made for one string carries over to the following
 *         ones (iso -> show -> shell).
 */
void add_char(int fnf, char *iso, char *show, char *shell, char c, int *flags) {
  static const char shell_meta[]="\"!#$&'()*?\\`| "; /* (bourne) shell metas */
  unsigned char one[2];		/* the character as a 1 char string */
  int printable;		/* is it outside both control code ranges? */

  one[0] = c;
  one[1] = 0;

  /* stage 1: characters that can never be part of a UNIX file name */
  if (one[0]==0 || (fnf && one[0]=='/')) {
    *flags |= 1;
    one[0] = '_';
  }
  if (iso) strcat(iso,(char *)one);

  /* stage 2: control codes are 0..31 and 127..160 */
  printable = one[0]>=32 && (one[0]<=126 || one[0]>=161);
  if (!printable) {
    *flags |= 2;
    one[0] = '_';
  }
  if (show) strcat(show,(char *)one);

  /* stage 3: shell meta characters */
  if (strchr(shell_meta,*(char *)one)!=NULL) {
    *flags |= 2;
    one[0] = '_';
  }
  if (shell) strcat(shell,(char *)one);
}


/*
 * iso2utf - ISO Latin-1 to UTF-7 encoding
 *
 * Convenience wrapper: calls iso2utf7() with the withspace flag set to 1.
 *
 * INPUT:  iso_name - ISO Latin-1 string
 *
 * OUTPUT: utf_name - UTF-7 encoded string
 */
void iso2utf(char *utf_name, char *iso_name) {
  const int withspace_on = 1;	/* value handed to iso2utf7 as withspace */

  iso2utf7(utf_name,iso_name,withspace_on);
}


/*
 * iso2utf7 - ISO Latin-1 to UTF-7 encoding
 *
 * INPUT:  iso_name	- ISO Latin-1 string
 *         withspace	- if nonzero, a single space that is neither the
 *                        first nor the last character and is not followed
 *                        by another space is copied literally; otherwise
 *                        every space is mbase64 encoded
 *
 * OUTPUT: utf_name	- UTF-7 encoded string
 *
 * REMARK: utf_name is filled with strcat() and is not bounds checked here;
 *         the caller has to supply a buffer that is big enough.
 *         A run of characters that has to be encoded and would not fit into
 *         the internal work buffers is split into several "+...-" sequences.
 */
void iso2utf7(char *utf_name, char *iso_name, int withspace) {
  char *cp, *cp2;				/* string pointers */
  const char
       *DO_set="abcdefghijklmnopqrstuvwxyz"	/* mbase64 D and O sets */
               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "01234567890"
               "'(),-./:?!\"#$%&*;<>@[]^_`{}|";
  char iso_part[LEN_ISO],			/* ISO Latin-1 part string */
       mbase_part[LEN_UTF];			/* mbase64 part string */
  pstr_t *uni_part;				/* Unicode part pstring */
  size_t max_run;				/* max chars per encoded part */

  /* initialize the strings */
  utf_name[0]=0;
  uni_part=pstr_create(LEN_UNI);

  /* Longest run (in ISO chars) that can be encoded in one go without
     overflowing iso_part or mbase_part: every ISO char becomes 2 Unicode
     bytes and every 3 bytes become 4 mbase64 chars. */
  max_run = sizeof(iso_part)-1;
  if (max_run > 3*((sizeof(mbase_part)-1)/4)/2)
    max_run = 3*((sizeof(mbase_part)-1)/4)/2;
  if (max_run < 1) max_run = 1;		/* always make progress */
  /* NOTE(review): uni_part is assumed to hold at least 2*max_run bytes;
     iso2uni() stops silently when pstr_addchar() fails - confirm LEN_UNI */

  /* scan the iso-string */
  for (cp=iso_name; *cp!=0 ; cp++) {

    /* char in DO set (or is it a middle single space)? */
    if (strchr(DO_set,*cp) ||
        (cp!=iso_name && *(cp+1) && withspace && *cp==32 && *(cp+1)!=32))
      strncat(utf_name,cp,1);
    else {

      /* add UTF-7 shift char */
      strcat(utf_name,"+");

      /* +- short encoding? */
      if (*cp=='+')
        strcat(utf_name,"-");
      else {

        /* search for the next char in the DO set, the end of the string
           or the end of what the work buffers can take */
        cp2=cp+1;
        while (*cp2!=0 && (size_t)(cp2-cp)<max_run &&
               strchr(DO_set,*cp2)==NULL) cp2++;

        /* cut out the iso-part string */
        *iso_part=0;
        strncat(iso_part,cp,(size_t)(cp2-cp));

        /* translate it to Unicode */
        iso2uni(uni_part,iso_part);

        /* encode to mbase64 */
        encode_mbase64(mbase_part,uni_part);

        /* add it to the utf string */
        strcat(utf_name,mbase_part);
        strcat(utf_name,"-");

        /* the loop increment moves cp on to cp2 */
        cp=cp2-1;

      }
    }
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);
}


/*
 * uni2utf - Unicode to UTF-7 encoding
 *
 * The whole Unicode string is written as one mbase64 part: "+<mbase64>-".
 * An empty Unicode string yields an empty UTF-7 string.
 *
 * INPUT:  uni	 	- unicode pstring
 *
 * OUTPUT: utf	 	- UTF-7 encoded string; not bounds checked here, the
 *                        caller has to supply a buffer that is big enough
 */
void uni2utf(char *utf, pstr_t *uni) {

  /* encode to mbase64, leaving utf[0] free for the shift char */
  encode_mbase64(utf+1,uni);

  /* nothing was encoded: "+-" must not be emitted here because it is the
     UTF-7 notation for a literal '+' */
  if (utf[1]==0) {
    utf[0]=0;
    return;
  }

  /* build utf7 string */
  utf[0]='+';
  strcat(utf,"-");
}


/*
 * utf2uni - UTF-7 to Unicode decoding
 *
 * INPUT:  utf	 	- UTF-7 encoded string (NUL terminated)
 *
 * OUTPUT: uni	 	- unicode pstring; every directly encoded character
 *                        is stored as a 0 byte followed by the character
 *
 * REMARK: decoding stops as soon as uni cannot take more data (a pstr_add*
 *         call reports an error). A mbase64 part without a closing '-' is
 *         taken to extend to the end of the string.
 *         NOTE(review): uni->length is not reset in this function, so the
 *         result is presumably appended to whatever uni already holds -
 *         confirm against the callers.
 */
void utf2uni(pstr_t *uni, char *utf) {
  size_t part_len;		/* length of the current mbase64 part */
  char *cp, *cp2, *cp3, 	/* char pointers for positioning substrings */
       mbase_part[LEN_UTF];	/* mbase64 string */
  pstr_t *uni_part;		/* Unicode part pstring */

  uni_part=pstr_create(LEN_UNI);

  /* loop over the UTF-7 encoded string to find mbase64 parts;
     every error leaves the loop with break so that uni_part gets freed */
  for (cp=utf, cp2=utf+1; *cp!=0; cp++, cp2++) {

    /* mbase64 shift char? */
    if (*cp=='+') {

      /* short end of mbase64 part ("+-")? this is a literal '+' */
      if (*cp2=='-' || *cp2==0) {

        if (pstr_addchar(uni,0)<0) break;
        if (pstr_addchar(uni,'+')<0) break;

        /* still more string to parse? then skip the '-' as well */
        if (*cp2!=0) {
          cp++;
          cp2++;
        }

      } else {

        /* find end of mbase64 part; also stop at the end of the string so
           that a missing '-' cannot make us read beyond the input */
        for (cp3=cp2; *cp3!='-' && *cp3!=0; cp3++);

        /* cut out mbase64 part string, never writing more than mbase_part
           can hold: keep room for the terminating NUL plus two bytes of
           slack for the decoder, which may pad its input in place */
        part_len = (size_t)(cp3-cp2);
        if (part_len > sizeof(mbase_part)-3) part_len = sizeof(mbase_part)-3;
        *mbase_part=0;
        strncat(mbase_part,cp2,part_len);

        /* decode it to Unicode */
        decode_mbase64(uni_part,mbase_part);

        /* add unicode part string */
        if (pstr_addpstring(uni,uni_part)<0) break;

        /* unterminated part: the whole rest of the input is consumed */
        if (*cp3==0) break;

        /* adjust the pointers: continue behind the closing '-' */
        cp = cp3;
        cp2 = cp3+1;

      }

    } else {

      /* add a ISO Latin-1 char */
      if (pstr_addchar(uni,0)<0) break;
      if (pstr_addchar(uni,*cp)<0) break;

    }
  }

  /* free memory for no longer used Unicode pstring */
  pstr_delete(uni_part);
}


/*
 * iso2uni - transform ISO Latin-1 to Unicode
 *
 * Each ISO Latin-1 character is stored as two bytes: a 0 byte followed by
 * the character itself. The conversion ends early when pstr_addchar()
 * reports an error.
 *
 * INPUT:  iso  - ISO Latin-1 string
 *
 * OUTPUT: uni  - Unicode pstring (its former content is discarded)
 */
void iso2uni(pstr_t *uni, char *iso) {
  char *src=iso;	/* next ISO Latin-1 char to convert */

  /* start with an empty Unicode string */
  uni->length=0;

  while (*src!=0) {

    /* high byte (always 0) first, then the ISO Latin-1 char byte */
    if (pstr_addchar(uni,0)<0 || pstr_addchar(uni,*src)<0) break;

    src++;
  }
}


/*
 * The functions decode_mbase64 and encode_mbase64 are based on encdec.c
 * by Jürgen Hägg which has been debugged and rewritten to use as C functions.
 * The original header was:
 *
 *	Written by Jürgen Hägg 1993 <jh@efd.lth.se>
 *	Version 1.1
 *
 *	(This filter is written for use in a MTA written in perl.)
 *
 *	Please send comments and bugfixes when you find them.
 *	Permission to use and change this program is given for any purpose
 *	as long as this note remains unchanged.
 *
 *	The usage() is the manual.
 *	Use encdec as you wish :-)
 *
 */

/*
 * decode_mbase64 - decode a mbase64 string to a Unicode pstring
 *
 * INPUT:  instring  - mbase64 string (NUL terminated); it is only read
 *
 * OUTPUT: outstring - decoded bytes; its former content is discarded
 *
 * REMARK: The input is handled in groups of 4 characters (= 3 bytes).
 *         A trailing group of 3 characters gives 2 bytes, one of 2
 *         characters gives 1 byte and a single trailing character is
 *         ignored. A '=' at the 3rd or 4th position of a group is treated
 *         as padding; a group that contains any other character outside the
 *         mbase64 alphabet is skipped. Finally the output is cut down to an
 *         even number of bytes because a Unicode character takes 2 bytes.
 */
void decode_mbase64(pstr_t *outstring, char *instring) {
  static const char
    vec[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  size_t i, len, left;		/* group offset, input length, chars left */
  int j, num, err;
  int full = 0;			/* set when outstring takes no more bytes */
  long d, val;
  const char *p, *c;
  unsigned char nw[4];

  outstring->length = 0;
  len = strlen(instring);

  /* the condition i+1<len drops a single trailing character */
  for (i=0; i+1<len && !full; i+=4) {
    c = instring+i;
    left = len-i;

    /* num = number of bytes in this group; num+1 characters are read */
    if (left<3 || c[2]=='=') num = 1;
    else if (left<4 || c[3]=='=') num = 2;
    else num = 3;

    /* collect 6 bits per character, first character in the top bits */
    err = 0;
    val = 0;
    for (j=0; j<=num; j++) {
      p = strchr(vec,c[j]);

      /* strchr() also matches the terminating NUL of vec: reject that */
      if (p==NULL || *p==0) {
        err = 1;
        break;
      }
      d = p-vec;
      d <<= (3-j)*6;
      val += d;
    }
    if (err) continue;

    /* split the 24 bit value into 3 bytes, most significant first */
    for (j=2; j>=0; j--) {
      nw[j] = val & 255;
      val >>= 8;
    }

    /* NOTE(review): assumes that once pstr_addchar() has failed it will
       keep failing, so decoding stops there - confirm in the pstring code */
    for (j=0; j<num; j++) {
      if (pstr_addchar(outstring,nw[j])<0) {
        full = 1;
        break;
      }
    }
  }

  /* a Unicode character needs 2 bytes: drop an incomplete last one */
  if (outstring->length%2 != 0) outstring->length--;
}


/*
 * encode_mbase64 - encode a Unicode pstring to a mbase64 string
 *
 * INPUT:  instring  - pstring with the bytes to encode (index starts at 1)
 *
 * OUTPUT: outstring - NUL terminated mbase64 string without '=' padding;
 *                     it is not bounds checked here, the caller has to
 *                     supply a buffer that is big enough
 *
 * REMARK: 3 input bytes give 4 output characters; a rest of 2 bytes gives
 *         3 characters and a rest of 1 byte gives 2 characters.
 */
void encode_mbase64(char *outstring, pstr_t *instring) {
  static char
    alphabet[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
  long group = 0;		/* up to 3 collected bytes (24 bits) */
  int held = 0;			/* number of bytes collected in group */
  int pos, k, out = 0;
  unsigned char octet;

  for (pos=1; pos<=instring->length; pos++) {
    octet = instring->string[pos];

    /* group complete? write its 4 sextets (top one first) and restart */
    if (held==3) {
      for (k=3; k>=0; k--)
        outstring[out++] = alphabet[(group>>(6*k))&63];
      group = 0;
      held = 0;
    }

    group = (group<<8)+octet;
    held++;
  }

  /* last group: move the bytes to the top of the 24 bits and write only
     the held+1 sextets which carry data */
  if (held>0) {
    group <<= 8*(3-held);
    for (k=3; k>=3-held; k--)
      outstring[out++] = alphabet[(group>>(6*k))&63];
  }

  outstring[out] = 0;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */