/* $Header: /home/agc/src/libutf-2.10/RCS/urelang.c,v 1.9 1997/10/20 12:37:33 agc Exp $ */

/*
 * Copyright © 1996-1997 Alistair G. Crooks.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Alistair G. Crooks.
 * 4. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <config.h>

#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif

#include <stdio.h>
#include <ctype.h>

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef HAVE_STRING_H
#include <string.h>
#endif

#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif

#include "utf.h"
#include "ure.h"

/*************************************************************************/
/* basic unicode routines */

/*
 * cv[] maps a character code to its value as a hexadecimal digit
 * ('0'-'9' => 0-9, 'a'-'f' and 'A'-'F' => 10-15).  Every other
 * character maps to 0, so callers cannot tell '0' from a non-digit.
 *
 * Define EBCDIC_CHAR_SET if you use EBCDIC - ASCII is the default.
 * In EBCDIC the digits live at 0xf0-0xf9, lower case a-f at 0x81-0x86
 * and upper case A-F at 0xc1-0xc6.
 */

#ifdef EBCDIC_CHAR_SET
static const unsigned char cv[256] = {
/* 0 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 1 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 4 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 5 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 6 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 7 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 8 */	0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 9 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* a */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* b */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* c */	0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* d */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* e */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* f */	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0
};
#else
static const unsigned char cv[256] = {
/* 0 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 1 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0,
/* 4 */	0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 5 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 6 */	0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 7 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 8 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 9 */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* a */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* b */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* c */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* d */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* e */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* f */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#endif /* !EBCDIC_CHAR_SET */

/*
 * Convert at most `cc' characters of `s' into a number in base `base'.
 * Conversion stops early at the terminating NUL.  Each character is
 * mapped through cv[]; no check is made that it is a valid digit for
 * `base'.  The accumulated long is returned as an int.
 */
int
AsciiToNumber(char *s, int cc, int base)
{
	long	total = 0;

	while (*s != 0 && cc > 0) {
		total = (total * base) + cv[(unsigned char) *s];
		s++;
		cc--;
	}
	return total;
}

/*
 * Convert the text of a character constant at `cp' into its integer value.
 *
 * If the text starts with a backslash the escape is decoded:
 *   \a \b \f \n \r \t \v	the matching control character
 *   \ooo			octal digits, up to the closing quote
 *   \xhh			hex digits, up to the closing quote
 * Any other escaped character (e.g. \\ or \') stands for itself.
 * Without a leading backslash the first character is returned as is.
 *
 * The closing single quote is optional: when it is missing, the digits
 * run to the end of the string.
 */
int
CharToNum(char *cp)
{
	char	*end;
	int	len;

	if (*cp != '\\') {
		return *cp;
	}
	/* step over the backslash and look at the escape letter itself */
	cp++;
	switch(*cp) {
	case 'a':
		return '\a';
	case 'b':
		return '\b';
	case 'f':
		return '\f';
	case 'r':
		return '\r';
	case 'n':
		return '\n';
	case 't':
		return '\t';
	case 'v':
		return '\v';
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
		end = strchr(cp, '\'');
		len = (end != (char *) NULL) ? (int) (end - cp) : (int) strlen(cp);
		return AsciiToNumber(cp, len, 8);
	case 'x':
		cp++;
		end = strchr(cp, '\'');
		len = (end != (char *) NULL) ? (int) (end - cp) : (int) strlen(cp);
		return AsciiToNumber(cp, len, 16);
	default:
		break;
	}
	/* unknown escape: the escaped character stands for itself */
	return *cp;
}

/*
 * Look up the digit value of `ch' in cv[]: 0-9 for decimal digits,
 * 10-15 for the letters a-f/A-F, and 0 for anything else.
 */
int
CharToDec(unsigned char ch)
{
	int	value;

	value = cv[ch];
	return value;
}

/*************************************************************************/
/* language specific functions */

/*
 * a dumb-string structure - no ref counting, just the gubbins
 *
 * ds_v is declared with a single element, but ds_save() allocates the
 * structure with extra room so that ds_v holds ds_c bytes plus a
 * terminating NUL.
 */
typedef struct dumbstr {
	int	ds_c;		/* length of string in bytes, excluding the NUL */
	char	ds_v[1];	/* the string itself (over-allocated, NUL-terminated) */
} ds_t;

/*
 * this struct describes a language's collation sequences
 *
 * Every member is a dumb string made by ds_save() and released by
 * ds_free() in LangSetSequence().  l_upper and l_lower are used as
 * parallel tables when converting case.
 */
typedef struct langstruct {
	ds_t	*l_lang;	/* language name */
	ds_t	*l_lower;	/* lower case letters, in collation order */
	ds_t	*l_upper;	/* upper case letters, in collation order */
	ds_t	*l_digits;	/* digits */
	ds_t	*l_imports;	/* imported runes */
	ds_t	*l_values;	/* the runes the imported runes stand for */
} Lang_t;

/* NULL until LangSetSequence() has been called successfully */
static Lang_t	*language;	/* current language being used */

#ifndef HAVE_MEMMOVE
/*
 * Replacement memmove for systems without one: copy `nbytes' bytes from
 * `src' to `dst', coping with overlapping regions.  Returns `dst'.
 */
static char *
memmove(char *dst, char *src, int nbytes)
{
	int	i;

	if (dst >= src && dst <= src + nbytes) {
		/* dst starts inside the source region: copy from the top down */
		for (i = nbytes ; i-- > 0 ; ) {
			dst[i] = src[i];
		}
	} else {
		/* no harmful overlap: plain forward copy */
		for (i = 0 ; i < nbytes ; i++) {
			dst[i] = src[i];
		}
	}
	return dst;
}
#endif

/*
 * Build a freshly allocated dumb string holding the first `n' bytes of
 * `s', followed by a NUL.  The process exits with status 1 if memory
 * cannot be obtained, so the result is never NULL.
 */
static ds_t *
ds_save(char *s, int n)
{
	ds_t	*dp;

	dp = (ds_t *) malloc(sizeof(ds_t) + (sizeof(char) * (n + 1)));
	if (dp == (ds_t *) NULL) {
		(void) fprintf(stderr, "Memory problems in strnsave\n");
		exit(1);
	}
	dp->ds_c = n;
	(void) memmove(dp->ds_v, s, n);
	dp->ds_v[n] = 0;
	return dp;
}

/* release a dumb string obtained from ds_save(); NULL is ignored */
void
ds_free(ds_t *dp)
{
	if (dp != (ds_t *) NULL) {
		free(dp);
	}
}

/*
 * Make the given sequences the current language.
 *
 * Any previously installed language is released first - both its
 * strings and the Lang_t structure itself.  Each argument is copied,
 * so the caller keeps ownership of what it passed in.
 *
 * Returns 1 on success.  Returns 0 if the Lang_t cannot be allocated,
 * in which case `language' is left NULL.
 */
static int
LangSetSequence(char *lang, char *lower, char *upper, char *digits, char *imports, char *values)
{
	if (language != (Lang_t *) NULL) {
		ds_free(language->l_lang);
		ds_free(language->l_lower);
		ds_free(language->l_upper);
		ds_free(language->l_digits);
		ds_free(language->l_imports);
		ds_free(language->l_values);
		/* the container must go too, or every switch of language leaks it */
		free(language);
		language = (Lang_t *) NULL;
	}
	if ((language = (Lang_t *) malloc(sizeof(Lang_t))) == (Lang_t *) NULL) {
		return 0;
	}
	language->l_lang = ds_save(lang, utfbytes(lang));
	language->l_lower = ds_save(lower, utfbytes(lower));
	language->l_upper = ds_save(upper, utfbytes(upper));
	language->l_digits = ds_save(digits, utfbytes(digits));
	language->l_imports = ds_save(imports, utfbytes(imports));
	language->l_values = ds_save(values, utfbytes(values));
	return 1;
}

/* name of the collation sequence file that opencollfile() looks for */
#ifndef LANG_COLL_FILE
#define LANG_COLL_FILE	"langcoll.utf"
#endif

/* directory holding the system wide copy of the collation file */
#ifndef ETCDIR
#define ETCDIR	"/usr/local/etc"
#endif

/* language used when none is requested, and named in the fallback message */
#ifndef DEFLANG
#define DEFLANG	"English"
#endif

/*
 * Open the language collation sequence file for reading and return its
 * handle, or NULL if it cannot be found.  The places searched, in order:
 *   1. LANG_COLL_FILE in the current working directory
 *   2. $HOME/LANG_COLL_FILE
 *   3. ETCDIR/LANG_COLL_FILE
 */
static FILE *
opencollfile(void)
{
	FILE	*fp;
	char	buf[BUFSIZ];
	char	*home;

	/* first look for the file in cwd */
	if ((fp = fopen(LANG_COLL_FILE, "r")) != (FILE *) NULL) {
		return fp;
	}
	/*
	 * then look for LANG_COLL_FILE in $HOME - getenv() hands back the
	 * value alone, not "HOME=value", so it is used as it stands
	 */
	if ((home = getenv("HOME")) != (char *) NULL) {
		(void) utf_snprintf(buf, sizeof(buf), "%s/%s", home, LANG_COLL_FILE);
		if ((fp = fopen(buf, "r")) != (FILE *) NULL) {
			return fp;
		}
	}
	/* then look for system wide file */
	(void) utf_snprintf(buf, sizeof(buf), "%s/%s", ETCDIR, LANG_COLL_FILE);
	if ((fp = fopen(buf, "r")) != (FILE *) NULL) {
		return fp;
	}
	/* give up */
	return (FILE *) NULL;
}

/*
 * Return the whole contents of the language collation sequence file as
 * a NUL-terminated, malloc'ed buffer which the caller must free().
 * Returns NULL if the file cannot be opened, sized, or read in full, or
 * if memory runs out.  The file is always closed before returning.
 */
static char *
getcollfile(void)
{
	struct stat	s;
	FILE		*fp;
	char		*cp;
	size_t		size;
	size_t		cc;

	if ((fp = opencollfile()) == (FILE *) NULL) {
		return (char *) NULL;
	}
	if (fstat(fileno(fp), &s) != 0 || s.st_size < 0) {
		(void) fclose(fp);
		return (char *) NULL;
	}
	size = (size_t) s.st_size;
	if ((cp = (char *) malloc(size + 1)) == (char *) NULL) {
		(void) fclose(fp);
		return (char *) NULL;
	}
	/* read through the stdio handle we opened, rather than around it */
	cc = fread(cp, 1, size, fp);
	(void) fclose(fp);
	if (cc != size) {
		free(cp);
		return (char *) NULL;
	}
	cp[cc] = 0;
	return cp;
}

/*
 * Cut the tab-terminated field starting at `field' by overwriting its
 * tab with a NUL.  Returns the start of the following field, or NULL
 * (leaving the text untouched) if `field' contains no tab.
 */
static char *
collfield(char *field, int seplen)
{
	char	*tab;

	if ((tab = utfrune(field, '\t')) == (char *) NULL) {
		return (char *) NULL;
	}
	*tab = 0;
	return tab + seplen;
}

/*
 * Initialise the language collation sequences.
 *
 * `collseq' names the wanted language.  If it is NULL the current
 * language is kept when there is one; otherwise $UTFCOLLSEQ, and
 * failing that DEFLANG, supplies the name.
 *
 * The collation file is scanned line by line.  Blank lines and lines
 * whose first non-blank character is '#' are skipped.  Other lines hold
 * tab-separated fields:
 *	name  lower  upper  digits  [imports  values]
 * Lines without the four leading fields are ignored, and an imports
 * field that has no values field is treated as absent.
 *
 * Returns 1 if the language is already current or was found in the
 * file.  Otherwise a message is printed, the built-in English
 * sequences are installed, and 0 is returned.
 */
int
urecollseq(char *collseq)
{
	Rune	r;
	char	*imports;
	char	*values;
	char	*digits;
	char	*lower;
	char	*upper;
	char	*next;
	char	*buf;
	char	*seq;
	char	*nl;
	char	*cp;
	int	seplen;
	int	rc;

	if (language != (Lang_t *) NULL && collseq == (char *) NULL) {
		return 1;
	}
	if (collseq == (char *) NULL && (collseq = getenv("UTFCOLLSEQ")) == (char *) NULL) {
		collseq = DEFLANG;
	}
	if (language != (Lang_t *) NULL && utfcmp(language->l_lang->ds_v, collseq) == 0) {
		return 1;
	}
	seplen = utfbytes("\t");
	if ((cp = buf = getcollfile()) != (char *) NULL) {
		while ((nl = utfrune(cp, '\n')) != (char *) NULL) {
			next = nl + utfbytes("\n");
			cp += utfspan(cp, " \t", &rc);
			(void) chartorune(&r, cp);
			if (r == '#' || r == '\n') {
				/* comment or blank line */
				cp = next;
				continue;
			}
			/*
			 * end the line here, so that the field searches
			 * below can never wander into the following lines
			 */
			*nl = 0;
			seq = cp;
			if ((lower = collfield(seq, seplen)) != (char *) NULL &&
			    utfcmp(collseq, seq) == 0 &&
			    (upper = collfield(lower, seplen)) != (char *) NULL &&
			    (digits = collfield(upper, seplen)) != (char *) NULL) {
				imports = values = "";
				if ((cp = collfield(digits, seplen)) != (char *) NULL) {
					/* imports are only usable with their values */
					if ((values = collfield(cp, seplen)) != (char *) NULL) {
						imports = cp;
					} else {
						values = "";
					}
				}
				(void) LangSetSequence(seq, lower, upper, digits, imports, values);
				free(buf);
				return 1;
			}
			cp = next;
		}
		free(buf);
	}
	(void) fprintf(stderr, "%s not found - using %s\n", collseq, DEFLANG);
	LangSetSequence("English",
			"abcdefghijklmnopqrstuvwxyz",
			"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
			"0123456789",
			"",
			"");
	return 0;
}

/*
 * Return the collation position of rune `r' within the UTF string `s',
 * or -1 if `r' does not occur before the end of the string.
 *
 * Positions normally advance by one per rune.  A '[' stops the count
 * and the matching ']' restarts it, so every rune inside a bracketed
 * group shares one position.  Because the match is tested before the
 * end-of-string check, asking for rune 0 yields the position of the
 * terminator rather than -1.
 */
int
runesubscript(char *s, Rune r)
{
	Rune	cur;
	int	counting = 1;
	int	pos = 0;

	for (;;) {
		s += chartorune(&cur, s);
		if (cur == r) {
			return pos;
		}
		if (cur == 0) {
			return -1;
		}
		switch(cur) {
		case '[':
			counting = 0;
			break;
		case ']':
			counting = 1;
			break;
		}
		if (counting) {
			pos++;
		}
	}
}

/*
 * Return the `n'th rune in `s', counting from 0.  A negative `n' is
 * treated as 0.  If `s' holds fewer than n + 1 runes, rune 0 is
 * returned instead of reading beyond the end of the string.
 */
Rune
runeutfget(char *s, int n)
{
	Rune	r;

	do {
		s += chartorune(&r, s);
		/* stop at the terminator - there is nothing valid past it */
	} while (r != 0 && n-- > 0);
	return r;
}

/* the case classes which runeord() reports through its `runecase' argument */
enum {
	LowercaseRune = 1,	/* rune found in the lower case sequence */
	UppercaseRune,		/* rune found in the upper case sequence */
	NocaseRune		/* rune found in neither case sequence */
};

/*
 * Return the ordinal number of the rune `r' in the current language,
 * and store its case class in `*runecase'.
 *
 * '[' and ']' are metacharacters and yield -1.  An imported rune is
 * first replaced by its corresponding value.  The result is then the
 * position of the rune in the digits, upper case or lower case
 * sequence, whichever matches first, or the rune itself if none does.
 *
 * `*runecase' is always set: UppercaseRune or LowercaseRune for a
 * letter, NocaseRune for everything else (digits and brackets too).
 */
int
runeord(Rune r, int *runecase)
{
	int	i;

	/* default, so that no return path leaves the caller's variable unset */
	*runecase = NocaseRune;
	if (r == '[' || r == ']') {
		return -1;
	}
	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if ((i = runesubscript(language->l_imports->ds_v, r)) >= 0) {
		r = runeutfget(language->l_values->ds_v, i);
	}
	if ((i = runesubscript(language->l_digits->ds_v, r)) >= 0) {
		return i;
	}
	if ((i = runesubscript(language->l_upper->ds_v, r)) >= 0) {
		*runecase = UppercaseRune;
		return i;
	}
	if ((i = runesubscript(language->l_lower->ds_v, r)) >= 0) {
		*runecase = LowercaseRune;
		return i;
	}
	return r;
}

/*
 * Compare the UTF strings `s1' and `s2' using the collation sequences
 * of `lang' (see urecollseq() for how `lang' is resolved).
 *
 * Returns 0 if the strings are the same.  Otherwise the result is
 * non-zero, with the sign of (s2 - s1): positive when `s2' collates
 * after `s1', or when `s1' is a proper prefix of `s2'.
 *
 * If `icase' is non-zero the case of letters is ignored; otherwise two
 * runes with the same ordinal but a different case class differ.
 */
int
utflangcmp(char *s1, char *s2, char *lang, int icase)
{
	Rune	r1, r2;
	int	rc1, rc2;
	int	i1, i2;

	urecollseq(lang);
	for (;;) {
		s1 += chartorune(&r1, s1);
		s2 += chartorune(&r2, s2);
		if (r1 == 0 || r2 == 0) {
			/*
			 * settle end of string here - the terminator has
			 * no meaningful ordinal to compare
			 */
			return (r2 != 0) - (r1 != 0);
		}
		/* known starting value in case runeord() leaves them alone */
		rc1 = rc2 = 0;
		i1 = runeord(r1, &rc1);
		i2 = runeord(r2, &rc2);
		if (i1 != i2) {
			return i2 - i1;
		}
		if (!icase && rc1 != rc2) {
			return rc2 - rc1;
		}
	}
}

/* structure to describe a range of runes - both bounds are inclusive */
typedef struct rangestruct {
	Rune	r_lower;	/* lower bound of range */
	Rune	r_upper;	/* upper bound of range */
} Range_t;

/*
 * Ranges of runes regarded as digits.  The entries must stay sorted by
 * ascending lower bound: UNICODE_isdigit() stops searching at the first
 * range which starts above the rune being tested.
 */
static Range_t	unicode_digits[15] = {
	{	0x0030,	0x0039	},	/* ISO-Latin-1, and ASCII, digits */
	{	0x0660,	0x0669	},	/* Arabic-Indic digits */
	{	0x06f0,	0x06f9	},	/* Eastern Arabic-Indic digits */
	{	0x0966,	0x096f	},	/* Devanagari digits */
	{	0x09e6,	0x09ef	},	/* Bengali digits */
	{	0x0a66,	0x0a6f	},	/* Gurmukhi digits */
	{	0x0ae6,	0x0aef	},	/* Gujarati digits */
	{	0x0b66,	0x0b6f	},	/* Oriya digits */
	{	0x0be7,	0x0bef	},	/* Tamil digits (only nine, no zero) */
	{	0x0c66,	0x0c6f	},	/* Telugu digits */
	{	0x0ce6,	0x0cef	},	/* Kannada digits */
	{	0x0d66,	0x0d6f	},	/* Malayalam digits */
	{	0x0e50,	0x0e59	},	/* Thai digits */
	{	0x0ed0,	0x0ed9	},	/* Lao digits */
	{	0xff10,	0xff19	}	/* Fullwidth digits */
};

/*
 * Ranges of runes regarded as letters.  The entries must stay sorted by
 * ascending lower bound: UNICODE_isletter() stops searching at the first
 * range which starts above the rune being tested.  Some ranges are broad
 * and also cover digits; UNICODE_isletter() filters those out again.
 */
static Range_t	unicode_letters[13] = {
	{	0x0041,	0x005a	},	/* ISO-Latin-1, ASCII, uppercase */
	{	0x0061,	0x007a	},	/* ISO-Latin-1, ASCII, lowercase */
	{	0x00c0,	0x00d6	},	/* ISO-Latin-1 supplementary letters */
	{	0x00d8,	0x00f6	},	/* ISO-Latin-1 supplementary letters */
	{	0x00f8,	0x00ff	},	/* ISO-Latin-1 supplementary letters */
	{	0x0100,	0x1fff	},	/* Latin Extended-A and the blocks following it */
	{	0x3040,	0x9fff	},	/* Hiragana and the blocks following it */
	{	0xf900,	0xfdff	},	/* CJK compatibility and the blocks following it */
	{	0xfe70,	0xfefe	},	/* Arabic presentation forms */
	{	0xff10,	0xff19	},	/* Fullwidth digits */
	{	0xff21,	0xff3a	},	/* Fullwidth Latin uppercase */
	{	0xff41,	0xff5a	},	/* Fullwidth Latin lowercase */
	{	0xff66,	0xffdc	}	/* Halfwidth katakana/Hangul */
};

/*
 * Returns 1 if ch lies in one of the unicode_digits ranges, 0 if not.
 * The table is sorted, so the scan ends as soon as a range starts
 * above ch.
 */
int
UNICODE_isdigit(Rune ch)
{
	int	nranges = (int) (sizeof(unicode_digits) / sizeof(unicode_digits[0]));
	int	i;

	for (i = 0 ; i < nranges && ch >= unicode_digits[i].r_lower ; i++) {
		if (ch <= unicode_digits[i].r_upper) {
			return 1;
		}
	}
	return 0;
}

/*
 * Returns 1 if ch lies in one of the unicode_letters ranges and is not
 * a digit, 0 otherwise.  The table is sorted, so the scan ends as soon
 * as a range starts above ch.
 */
int
UNICODE_isletter(Rune ch)
{
	int	nranges = (int) (sizeof(unicode_letters) / sizeof(unicode_letters[0]));
	int	i;

	for (i = 0 ; i < nranges && ch >= unicode_letters[i].r_lower ; i++) {
		if (ch <= unicode_letters[i].r_upper) {
			/* the letter ranges are broad enough to cover some digits */
			return !UNICODE_isdigit(ch);
		}
	}
	return 0;
}

/*
 * Returns 1 if ch may appear in an identifier: a Unicode digit, a
 * Unicode letter or an underscore.  Not fast - both range tables may be
 * scanned - but functionality comes first here.
 */
int
UNICODE_IsIdent(Rune ch)
{
	if (UNICODE_isdigit(ch)) {
		return 1;
	}
	if (UNICODE_isletter(ch)) {
		return 1;
	}
	return ch == '_';
}

/*
 * Return ch as lower case (if it's upper case in the current language),
 * or ch itself otherwise.  The language is initialised on first use.
 *
 * The upper and lower case sequences are walked in step, rune by rune,
 * so a pair is matched by its position in the sequence.  A byte offset
 * would go wrong whenever the two forms of a letter differ in encoded
 * length.  Rune 0 and the metacharacters '[' and ']' are never mapped.
 */
Rune
UNICODE_tolower(Rune ch)
{
	Rune	from;
	Rune	to;
	char	*up;
	char	*lo;

	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if (ch == 0 || ch == '[' || ch == ']') {
		return ch;
	}
	up = language->l_upper->ds_v;
	lo = language->l_lower->ds_v;
	for (;;) {
		up += chartorune(&from, up);
		lo += chartorune(&to, lo);
		if (from == 0 || to == 0) {
			/* ran out of either sequence - no mapping */
			return ch;
		}
		if (from == ch) {
			return to;
		}
	}
}

/*
 * Return ch as upper case (if it's lower case in the current language),
 * or ch itself otherwise.  The language is initialised on first use.
 *
 * The lower and upper case sequences are walked in step, rune by rune,
 * so a pair is matched by its position in the sequence.  A byte offset
 * would go wrong whenever the two forms of a letter differ in encoded
 * length.  Rune 0 and the metacharacters '[' and ']' are never mapped.
 */
Rune
UNICODE_toupper(Rune ch)
{
	Rune	from;
	Rune	to;
	char	*lo;
	char	*up;

	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if (ch == 0 || ch == '[' || ch == ']') {
		return ch;
	}
	lo = language->l_lower->ds_v;
	up = language->l_upper->ds_v;
	for (;;) {
		lo += chartorune(&from, lo);
		up += chartorune(&to, up);
		if (from == 0 || to == 0) {
			/* ran out of either sequence - no mapping */
			return ch;
		}
		if (from == ch) {
			return to;
		}
	}
}

/*
 * Returns 1 if ch appears in the current language's upper case
 * sequence, 0 if not.  The metacharacters '[' and ']' never count.
 * The language is initialised on first use.
 */
int
UNICODE_isupper(Rune ch)
{
	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if (ch == '[' || ch == ']') {
		return 0;
	}
	return utfrune(language->l_upper->ds_v, ch) != (char *) NULL;
}

/*
 * Returns 1 if ch appears in the current language's lower case
 * sequence, 0 if not.  The metacharacters '[' and ']' never count.
 * The language is initialised on first use.
 */
int
UNICODE_islower(Rune ch)
{
	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if (ch == '[' || ch == ']') {
		return 0;
	}
	return utfrune(language->l_lower->ds_v, ch) != (char *) NULL;
}

/* returns 1 if c is an upper or lower case letter of the current `alphabet' */
int
UNICODE_isalpha(Rune c)
{
	if (UNICODE_isupper(c)) {
		return 1;
	}
	return UNICODE_islower(c) != 0;
}

/*
 * Returns 1 if c appears in the current language's digit sequence,
 * 0 if not.  The language is initialised on first use.
 */
int
UNICODE_isnumber(Rune c)
{
	char	*found;

	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	found = utfrune(language->l_digits->ds_v, c);
	return found != (char *) NULL;
}

/*
 * Returns 1 if c is a hexadecimal digit: a digit of the current
 * language, or one of the letters a-f or A-F.
 */
int
UNICODE_isxdigit(Rune c)
{
	return UNICODE_isnumber(c) ||
	       utfrune("abcdefABCDEF", c) != (char *) NULL;
}

/*
 * Returns 1 if c is a Unicode digit or a Unicode letter.  This uses the
 * fixed Unicode range tables, not the current language's sequences.
 */
int
UNICODE_isalnum(Rune c)
{
	if (UNICODE_isdigit(c)) {
		return 1;
	}
	return UNICODE_isletter(c) != 0;
}

/*
 * Returns 1 if c is a white space character, 0 otherwise.  The set is
 * the one C's isspace() uses in the "C" locale: space, newline,
 * carriage return, horizontal tab, vertical tab and form feed.
 */
int
UNICODE_isspace(Rune c)
{
	switch(c) {
	case ' ':
	case '\n':
	case '\r':
	case '\t':
	case '\v':
	case '\f':
		return 1;
	default:
		return 0;
	}
}

/* returns 1 if c is a blank (a space or a horizontal tab), 0 otherwise */
int
UNICODE_isblank(Rune c)
{
	switch(c) {
	case ' ':
	case '\t':
		return 1;
	}
	return 0;
}

/*
 * Returns 1 if c is a punctuation character: printable, but neither a
 * space nor a letter or digit.  Returns 0 otherwise.
 */
int
UNICODE_ispunct(Rune c)
{
	/* all three conditions must hold - with `||' every printable rune passed */
	return UNICODE_isprint(c) && c != ' ' && !UNICODE_isalnum(c);
}

/* returns 1 if c is a control character: below 0x20, or DEL (0x7f) */
int
UNICODE_iscntrl(Rune c)
{
	if (c < 0x20) {
		return 1;
	}
	return c == 0x7f;
}

/* returns 1 if c is printable, i.e. anything that isn't a control character */
int
UNICODE_isprint(Rune c)
{
	return UNICODE_iscntrl(c) ? 0 : 1;
}

/*
 * Returns 1 if c is an ASCII character, 0 otherwise.  ASCII covers
 * 0x00 to 0x7f inclusive, so DEL (0x7f) belongs to it.
 */
int
UNICODE_isascii(Rune c)
{
	return c <= 0x7f;
}

/* returns 1 if c is a graphics character: printable, and not a space */
int
UNICODE_isgraph(Rune c)
{
	if (c == ' ') {
		return 0;
	}
	return UNICODE_isprint(c) != 0;
}

/*
 * Return 1 if ch lies between first and last (inclusive) in the
 * current language's ordering, 0 otherwise.
 *
 * The range is looked up in whichever sequence contains `first' - lower
 * case, upper case, then digits.  `last' and `ch' must be in that same
 * sequence.  '[' and ']' are never in range.  The language is
 * initialised on first use.
 */
int
UNICODE_InRange(Rune first, Rune last, Rune ch)
{
	char	*seqs[3];
	char	*alphabet;
	char	*lo;
	char	*hi;
	char	*at;
	int	i;

	if (language == (Lang_t *) NULL) {
		urecollseq(NULL);
	}
	if (ch == '[' || ch == ']') {
		/* not in any alphabet, and could clash with homo-rune metachars */
		return 0;
	}
	/* find the sequence which `first' belongs to */
	seqs[0] = language->l_lower->ds_v;
	seqs[1] = language->l_upper->ds_v;
	seqs[2] = language->l_digits->ds_v;
	alphabet = lo = (char *) NULL;
	for (i = 0 ; i < 3 ; i++) {
		if ((lo = utfrune(seqs[i], first)) != (char *) NULL) {
			alphabet = seqs[i];
			break;
		}
	}
	if (alphabet == (char *) NULL) {
		return 0;
	}
	/* `last' has to come from the same sequence */
	if ((hi = utfrune(alphabet, last)) == (char *) NULL) {
		return 0;
	}
	/* and `first' must not come after it */
	if ((lo - alphabet) > (hi - alphabet) + 1) {
		return 0;
	}
	if ((at = utfrune(alphabet, ch)) == (char *) NULL) {
		return 0;
	}
	return at >= lo && at <= hi;
}

/*************************************************************************/
/* unicode string routines */
/* these routines manipulate arrays of Runes */

/*
 * Compare at most `n' runes of the Rune array `s1' against the runes
 * decoded from the UTF string `s2'.
 *
 * Returns 0 if they match, otherwise the difference between the first
 * pair of runes that differ (rune of s1 minus rune of s2).  The
 * comparison ends at the terminator of `s1'; if `s2' carries on beyond
 * that point the strings differ.
 */
int
UNICODE_mixed_strncmp(Rune *s1, char *s2, int n)
{
	Rune	r;
	int	c;

	for ( ; n-- > 0 ; s1++) {
		s2 += chartorune(&r, s2);
		if ((c = *s1 - r) != 0) {
			return c;
		}
		if (*s1 == 0) {
			/* both strings ended together */
			break;
		}
	}
	return 0;
}

/*
 * Compare two Rune strings, ignoring the case of letters in the current
 * language.  Returns 0 if they are the same, otherwise the difference
 * between the lower case forms of the first pair of runes that differ.
 */
int
UNICODE_strcasecmp(Rune *s1, Rune *s2)
{
	int	c;

	for ( ; ; s1++, s2++) {
		if ((c = UNICODE_tolower(*s1) - UNICODE_tolower(*s2)) != 0) {
			return c;
		}
		if (*s1 == 0 || *s2 == 0) {
			/* reached the end without finding a difference */
			return 0;
		}
	}
}

/* return the number of runes in `s', not counting the terminating 0 */
int
UNICODE_strlen(Rune *s)
{
	int	n = 0;

	while (s[n] != 0) {
		n++;
	}
	return n;
}

/*
 * Append the Rune string `s2', terminator included, to the end of `s1'.
 * The caller must provide enough room in `s1'.  Returns `s1'.
 */
Rune *
UNICODE_strcat(Rune *s1, Rune *s2)
{
	Rune	*dst;

	dst = s1 + UNICODE_strlen(s1);
	do {
		*dst = *s2++;
	} while (*dst++ != 0);
	return s1;
}

/*
 * Return a pointer to the first occurrence of `ch' in `s', or NULL if
 * there is none.  The terminator is not searched, so asking for rune 0
 * always yields NULL.
 */
Rune *
UNICODE_strchr(Rune *s, Rune ch)
{
	while (*s != 0) {
		if (*s == ch) {
			return s;
		}
		s++;
	}
	return (Rune *) NULL;
}

/*
 * Compare two Rune strings.  Returns 0 if they are the same, otherwise
 * the difference between the first pair of runes that differ (rune of
 * s1 minus rune of s2).
 */
int
UNICODE_strcmp(Rune *s1, Rune *s2)
{
	int	c;

	for ( ; ; s1++, s2++) {
		if ((c = *s1 - *s2) != 0) {
			return c;
		}
		if (*s1 == 0) {
			/* equal runes and s1 has ended, so s2 has ended too */
			return 0;
		}
	}
}

/*
 * Copy the Rune string `from', terminator included, into `to'.  The
 * caller must provide enough room.  Returns `to'.
 */
Rune *
UNICODE_strcpy(Rune *to, Rune *from)
{
	Rune	*dst = to;

	do {
		*dst = *from++;
	} while (*dst++ != 0);
	return to;
}

/* return the length of the leading part of `s1' holding no rune from `s2' */
int
UNICODE_strcspn(Rune *s1, Rune *s2)
{
	int	n = 0;

	while (s1[n] != 0 && UNICODE_strchr(s2, s1[n]) == (Rune *) NULL) {
		n++;
	}
	return n;
}

/*
 * Return a newly allocated copy of the Rune string `s', which the
 * caller must free(), or NULL if memory cannot be obtained.
 */
Rune *
UNICODE_strdup(Rune *s)
{
	Rune	*cp;
	int	n;

	n = UNICODE_strlen(s);
	cp = (Rune *) calloc((size_t) n + 1, sizeof(Rune));
	if (cp == (Rune *) NULL) {
		return (Rune *) NULL;
	}
	/* calloc has already zeroed the terminator at cp[n] */
	(void) memcpy(cp, s, n * sizeof(Rune));
	return cp;
}

/*
 * Compare at most `n' runes of two Rune strings, ignoring the case of
 * letters in the current language.  Returns 0 if they match, otherwise
 * the difference between the lower case forms of the first pair of
 * runes that differ.  A string that ends early compares as less than
 * one that continues.
 */
int
UNICODE_strncasecmp(Rune *s1, Rune *s2, int n)
{
	int	c;

	for ( ; n-- > 0 ; s1++, s2++) {
		if ((c = UNICODE_tolower(*s1) - UNICODE_tolower(*s2)) != 0) {
			return c;
		}
		if (*s1 == 0 || *s2 == 0) {
			/* reached the end without finding a difference */
			break;
		}
	}
	return 0;
}

/*
 * Append at most `n' runes of `s2' to the end of `s1', then terminate
 * the result.  As with C's strncat(), `s1' therefore needs room for its
 * own runes, up to `n' more, and the terminator.  Returns `s1'.
 */
Rune *
UNICODE_strncat(Rune *s1, Rune *s2, int n)
{
	Rune	*dst;

	dst = s1 + UNICODE_strlen(s1);
	while (n-- > 0 && *s2 != 0) {
		*dst++ = *s2++;
	}
	/* always terminate, even when `n' ran out before `s2' did */
	*dst = 0;
	return s1;
}

/*
 * Compare at most `n' runes of two Rune strings.  Returns 0 if they
 * match, otherwise the difference between the first pair of runes that
 * differ (rune of s1 minus rune of s2).  A string that ends early
 * compares as less than one that continues.
 */
int
UNICODE_strncmp(Rune *s1, Rune *s2, int n)
{
	int	c;

	for ( ; n-- > 0 ; s1++, s2++) {
		if ((c = *s1 - *s2) != 0) {
			return c;
		}
		if (*s1 == 0) {
			/* equal runes and s1 has ended, so s2 has ended too */
			break;
		}
	}
	return 0;
}

/*
 * Copy at most `n' runes from `from' into `to', stopping once the
 * terminator has been copied.  If `from' holds `n' or more runes the
 * result is NOT terminated, and nothing is padded.  Returns `to'.
 */
Rune *
UNICODE_strncpy(Rune *to, Rune *from, int n)
{
	Rune	*dst = to;

	while (n-- > 0) {
		if ((*dst++ = *from++) == 0) {
			break;
		}
	}
	return to;
}

/*
 * Return a pointer to the first rune in `s1' which also occurs in `s2',
 * or NULL if there is no such rune.
 */
Rune *
UNICODE_strpbrk(Rune *s1, Rune *s2)
{
	while (*s1 != 0) {
		if (UNICODE_strchr(s2, *s1) != (Rune *) NULL) {
			return s1;
		}
		s1++;
	}
	return (Rune *) NULL;
}

/*
 * Return a pointer to the last occurrence of `ch' in `s', or NULL if
 * there is none.  The terminator is not searched, so asking for rune 0
 * always yields NULL.
 */
Rune *
UNICODE_strrchr(Rune *s, Rune ch)
{
	Rune	*last = (Rune *) NULL;

	for ( ; *s != 0 ; s++) {
		if (*s == ch) {
			/* remember it - a later match replaces it */
			last = s;
		}
	}
	return last;
}

/* return the length of the leading part of `s1' made up only of runes from `s2' */
int
UNICODE_strspn(Rune *s1, Rune *s2)
{
	int	n = 0;

	while (s1[n] != 0 && UNICODE_strchr(s2, s1[n]) != (Rune *) NULL) {
		n++;
	}
	return n;
}

/*
 * Return a pointer to the first occurrence of the Rune string `find'
 * within `s', or NULL if it does not occur.  As with C's strstr(), an
 * empty `find' matches at the start of `s'.
 */
Rune *
UNICODE_strstr(Rune *s, Rune *find)
{
	Rune	*cp;
	int	i;

	if (*find == 0) {
		return s;
	}
	/* try every place where the first rune of `find' occurs */
	for (cp = s ; (cp = UNICODE_strchr(cp, *find)) != (Rune *) NULL ; cp++) {
		/* a terminator in cp[] can never equal the non-zero find[i] */
		for (i = 1 ; find[i] != 0 && cp[i] == find[i] ; i++) {
		}
		if (find[i] == 0) {
			return cp;
		}
	}
	return (Rune *) NULL;
}