/*
* Copyright (c) 1997-2005 Kazushi (Jam) Marukawa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice in the documentation and/or other materials provided with
* the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The design of data structure of jless
*
* We use char[] byte data and CHARSET[] character set data to represent
* multilingual text. We defined CHARSET following ISO 2022 technique.
* All characters represented in ISO 2022 can be stored in less without
* any destructive conversion.
*
* For example, less can read text files using JIS C 6226-1978, JIS X
* 0208-1983, and JIS X 0208:1990 character sets and output everything
* using their original character set while searching a character encoded
* by JIS X 0213:2004. Inside of less, it buffers all text files using
* their original character set, unifies them when matching with the
* searching character, and outputs using their original character sets.
*
* If less needs conversions when it outputs internal data, it converts
* them on the fly.
*
* On the other hand, text using SJIS or UJIS are buffered after
* conversion while less is reading input stream.
*
* In addition, UTF-8 is buffered as UTF-8. Less converts it to the
* appropriate character set(s) on the fly. (UTF-8 is not implemented yet.)
*/
/*
* Definition of values to specify the character set,
* plus definitions of some well-known character sets and the types of set.
*/
/*
 * CHARSET is the per-byte tag kept next to each text byte; it says which
 * ISO 2022 character set the byte belongs to.
 *
 * Bit layout:
 *
 *  15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
 * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
 * | r|       IRR       | m| n|         F          |
 * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
 *
 * r:   Set on every byte of a multi-byte character except its first byte.
 * IRR: Identification of Revisions of Registered character sets.  See
 *      ISO 2022 for more details.  Stored values run from 00/01 to 03/15
 *      and correspond to the ISO 2022 codes 04/00 to 07/14; 00/00 means
 *      "no IRR".
 * m:   Set if the byte is part of a multi-byte character.
 * n:   Set for the 96 and 96x96 graphic sets, clear for the 94 and 94x94
 *      graphic sets.
 * F:   Final byte; it selects the graphic set.  Stored values run from
 *      00/00 to 04/14 and correspond to the ISO 2022 codes 03/00 to 07/14.
 */
typedef unsigned short CHARSET;

#define REST_MASK       0x8000  /* the r bit */

/* Non-zero (REST_MASK itself) when cs tags a non-leading byte. */
#define CSISREST(cs)    ((cs) & REST_MASK)
/* 1 when cs tags the first byte of a character, 0 otherwise. */
#define CSISHEAD(cs)    (((cs) & REST_MASK) == 0)
/*
 * IRR field: six bits starting at bit IRR_SHIFT of a CHARSET.
 *   CS2IRR(cs)   extract the IRR value from a CHARSET.
 *   IRR2CS(irr)  place an IRR value into its CHARSET position; bits that
 *                do not fit the field are dropped.
 */
#define IRR_SHIFT       9
#define IRR_MASK        0x7e00
#define CS2IRR(cs)      (((cs) & IRR_MASK) >> IRR_SHIFT)
#define IRR2CS(irr)     (IRR_MASK & ((irr) << IRR_SHIFT))

/*
 * Conversion between a stored IRR value (1 .. 0x3f) and the byte that
 * encodes it in an ISO 2022 stream (0x40 .. 0x7e).
 */
#define CODE_DIFF       0x0040
#define CODE_MASK       0x003f
#define IRR2CODE(irr)   (CODE_DIFF + (((irr) - 1) & CODE_MASK))
#define CODE2IRR(code)  (1 + (((code) - CODE_DIFF) & CODE_MASK))
/*
 * Type field: the m bit (0x0100, multi-byte) and the n bit (0x0080,
 * 96-character set) of a CHARSET.
 */
#define TYPE_MASK               0x0180

#define TYPE_94_CHARSET         0x0000  /* m = 0, n = 0 */
#define TYPE_96_CHARSET         0x0080  /* m = 0, n = 1 */
#define TYPE_94N_CHARSET        0x0100  /* m = 1, n = 0 */
#define TYPE_96N_CHARSET        0x0180  /* m = 1, n = 1 */

/* Both macros simply keep the m and n bits of their argument. */
#define CS2TYPE(cs)             (TYPE_MASK & (cs))
#define TYPE2CS(type)           (TYPE_MASK & (type))
/*
 * F field: the low seven bits of a CHARSET hold the final byte minus
 * FT_DIFF.
 *   CS2FT(cs)  recover the final byte from a CHARSET.
 *   FT2CS(ft)  turn a final byte into its F field value.
 */
#define FT_DIFF         0x0030
#define FT_MASK         0x007f
#define CS2FT(cs)       (FT_DIFF + ((cs) & FT_MASK))
#define FT2CS(ft)       (FT_MASK & ((ft) - FT_DIFF))
/*
 * A character set is identified by its IRR, type and F fields together.
 * CS2CHARSET(cs) keeps exactly those fields, i.e. it strips the r bit.
 */
#define CHARSET_MASK    (IRR_MASK | TYPE_MASK | FT_MASK)
#define CS2CHARSET(cs)  (CHARSET_MASK & (cs))
/*
 * Final byte 07/14 ('~') is a reserved empty set in every type of
 * character set, so the test looks at the final byte alone; comparing
 * with (CS2CHARSET(cs) == WRONGCS) would only catch one of the types.
 */
#define CSISWRONG(cs)   (CS2FT(cs) == '~')
/*
 * List of representative character sets.
 *
 * ASCII is the 94-character set with final byte 'B'.  WRONGCS is the
 * 94-character set with the reserved final byte '~'.
 */
#define ASCII           (TYPE_94_CHARSET | FT2CS('B'))
#define WRONGCS         (TYPE_94_CHARSET | FT2CS('~'))
#if ISO
/* Single-byte 94-character sets. */
#define JISX0201KANA    (TYPE_94_CHARSET | FT2CS('I'))
#define JISX0201ROMAN   (TYPE_94_CHARSET | FT2CS('J'))

/* Single-byte 96-character sets. */
#define LATIN1          (TYPE_96_CHARSET | FT2CS('A'))
#define LATIN2          (TYPE_96_CHARSET | FT2CS('B'))
#define LATIN3          (TYPE_96_CHARSET | FT2CS('C'))
#define LATIN4          (TYPE_96_CHARSET | FT2CS('D'))
#define GREEK           (TYPE_96_CHARSET | FT2CS('F'))
#define ARABIC          (TYPE_96_CHARSET | FT2CS('G'))
#define HEBREW          (TYPE_96_CHARSET | FT2CS('H'))
#define CYRILLIC        (TYPE_96_CHARSET | FT2CS('L'))
#define LATIN5          (TYPE_96_CHARSET | FT2CS('M'))
/*
* JISX0208_78KANJI means JIS C 6226-1978
* JISX0208KANJI means JIS X 0208-1983 (same as JIS C 6226-1983)
* This is similar to JIS C 6226-1978. Several characters are moved
* or exchanged in code space. Conversion table is available in unify.c.
* JISX0208_90KANJI means JIS X 0208:1990 (same as JIS X 0208-1990)
* This is super set of JIS X 0208-1983. Two characters are added from
* JIS X 0208-1983. In addition, this covers JIS X 0208:1997 too.
* They have the same code space. The difference between them is
* historical description. JIS X 0208:1997 defines and describes
* all characters.
* JISX0213KANJI1 means JIS X 0213:2000 plane 1
* This is super set of JIS X 0208:1990 and JIS X 0208:1997. Several
* characters are added.
* JISX02132004KANJI1 means JIS X 0213:2004 plane 1
* This is super set of JIS X 0213:2000. 10 characters are added.
* And, glyph of several characters is modified.
*
* JISX0212KANJISUP means JIS X 0212:1990 (same as JIS X 0212-1990)
* JISX0213KANJI2 means JIS X 0213:2000 plane 2
* JISX02132004KANJI2 means JIS X 0213:2004 plane 2
*
* JISX0201KANA means JIS X 0201:1976 right plane (same as JIS X 0201-1976
* and JIS C 6220-1976 right plane)
* JISX0201ROMAN means JIS X 0201:1976 left plane (same as JIS X 0201-1976
* and JIS C 6220-1976 left plane)
* These cover JIS X 0201:1997 too. They have the same code space.
* The difference between them is historical description.
* JIS X 0201:1997 defines and describes all characters.
*/
/* Multi-byte 94^n character sets, in order of final byte. */
#define JISX0208_78KANJI        (TYPE_94N_CHARSET | FT2CS('@'))
#define GB2312                  (TYPE_94N_CHARSET | FT2CS('A'))
#define JISX0208KANJI           (TYPE_94N_CHARSET | FT2CS('B'))
/* Same final byte as JISX0208KANJI; told apart by IRR 1. */
#define JISX0208_90KANJI        (IRR2CS(1) | TYPE_94N_CHARSET | FT2CS('B'))
#define KSC5601                 (TYPE_94N_CHARSET | FT2CS('C'))
#define JISX0212KANJISUP        (TYPE_94N_CHARSET | FT2CS('D'))
#define JISX0213KANJI1          (TYPE_94N_CHARSET | FT2CS('O'))
#define JISX0213KANJI2          (TYPE_94N_CHARSET | FT2CS('P'))
#define JISX02132004KANJI1      (TYPE_94N_CHARSET | FT2CS('Q'))
/* Expands to the same value as JISX0213KANJI2 (final byte 'P'). */
#define JISX02132004KANJI2      (TYPE_94N_CHARSET | FT2CS('P'))
#if JAPANESE
/*
 * Pseudo character sets for the Japanese code sets.  Only input_set uses
 * these, together with the ordinary definitions above.
 *
 * They put 07/15 (SJIS*), 07/14 (UJIS*) or 07/13 (UTF8) into the F field.
 * A real final byte is never stored as such a value (F only ranges from
 * 00/00 to 04/14), so these numbers are free to mark special sets.  The
 * IRR field distinguishes the plain, 2000 and 2004 variants.
 *
 * SJIS     contains ASCII, JIS X 0201:1976 right plane, and JIS X 0208:1997
 * SJIS2000 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2000
 * SJIS2004 contains ASCII, JIS X 0201:1976 right plane, and JIS X 0213:2004
 * UJIS     contains ASCII, JIS X 0201:1976, and JIS X 0208:1997
 * UJIS2000 contains ASCII, JIS X 0201:1976, JIS X 0213:2000,
 *          and JIS X 0212:1990
 * UJIS2004 contains ASCII, JIS X 0201:1976, JIS X 0213:2004,
 *          and JIS X 0212:1990
 */
#define SJIS            (IRR2CS(0) | TYPE_94N_CHARSET | FT_MASK)
#define SJIS2000        (IRR2CS(1) | TYPE_94N_CHARSET | FT_MASK)
#define SJIS2004        (IRR2CS(2) | TYPE_94N_CHARSET | FT_MASK)

#define UJIS            (IRR2CS(0) | TYPE_94N_CHARSET | (FT_MASK-1))
#define UJIS2000        (IRR2CS(1) | TYPE_94N_CHARSET | (FT_MASK-1))
#define UJIS2004        (IRR2CS(2) | TYPE_94N_CHARSET | (FT_MASK-1))

#define UTF8            (IRR2CS(0) | TYPE_94N_CHARSET | (FT_MASK-2))
/*
 * MAKESUJISCS(mp, su): build the SJIS/UJIS character set number that
 * matches the plane set declared in mp->io.right.
 *
 * SJIS and UJIS use a fixed number of plane sets, so JIS X 0208:1990 and
 * JIS X 0213:2004 cannot be used at the same time; only one of them is
 * active, and MULBUF->io.right says which.  The macro ORs the matching IRR
 * into su:
 *
 *   CJISX0213_2004 set in io.right        -> IRR 2
 *   otherwise CJISX0213_2000 set          -> IRR 1
 *   otherwise                             -> su unchanged
 *
 * mp may be evaluated twice.
 * NOTE(review): CJISX0213_2000 and CJISX0213_2004 are not defined in the
 * visible part of this header -- confirm where they come from.
 *
 * Usage: sjiscs = MAKESUJISCS(mp, SJIS);
 *        ujiscs = MAKESUJISCS(mp, UJIS);
 */
#define MAKESUJISCS(mp, su)                                             \
        ((su) | (((mp)->io.right & CJISX0213_2004) ? IRR2CS(2) :        \
                 (((mp)->io.right & CJISX0213_2000) ? IRR2CS(1) : 0)))
#endif
#endif
/*
 * Special characters and the character set paired with each of them in a
 * string with character sets:
 *
 *   end of string    NULCH with NULLCS
 *   padding          PADCH with NULLCS
 *   binary '\0'      '\0'  with WRONGCS
 *   binary '\1'      '\1'  with WRONGCS
 *
 * Binary 0 and 1 bytes therefore differ from the terminator and padding
 * pairs in their character set.
 */
#define NULCH           ('\0')
#define PADCH           ('\1')
#define NULLCS          (ASCII)

/*
 * Convenience tests.  The r bit is ignored because both go through
 * CS2CHARSET(); since NULLCS is ASCII, the two tests are the same
 * comparison.
 */
#define CSISASCII(cs)   (CS2CHARSET(cs) == ASCII)
#define CSISNULLCS(cs)  (CS2CHARSET(cs) == NULLCS)
/*
 * Definition of values to specify the character set and character.
 *
 * A CHARVAL holds one byte together with its CHARSET in a single int: the
 * byte occupies the low 8 * sizeof(char) bits and the CHARSET sits in the
 * bits above them.
 *
 *   MAKECV(ch, cs)  build a CHARVAL from byte ch and character set cs.
 *                   ch is reduced to its low byte first, so a negative
 *                   (sign-extended) char cannot overwrite the cs bits.
 *                   Both arguments are parenthesised so that any
 *                   expression may be passed.
 *   CV2CH(cv)       extract the byte.
 *   CV2CS(cv)       extract the character set.
 */
typedef int CHARVAL;
#define MAKECV(ch, cs)  (((cs) << 8 * sizeof(char)) | \
                         ((ch) & ((1 << 8 * sizeof(char)) - 1)))
#define CV2CH(cv)       ((cv) & ((1 << 8 * sizeof(char)) - 1))
#define CV2CS(cv)       ((cv) >> 8 * sizeof(char))
/*
* Definition of SETCHARSET.
*
* SETCHARSET represents a set of character sets. This is used to
* specify character sets less accepts.
*
* Although ISO 2022 can accept any character set, the output device
* cannot represent all of them. Therefore, we give less the ability to
* specify the character sets that a user wants to use.
*
* SCSASCII is a value to specify ASCII character set.
* SCSJISX0201_1976..SCSJISX0213_2004 specify Japanese character sets.
* All of these character sets are defined in Japan. However,
* Japanese terminal devices can display only few of them. So, we
* decide to give users the ability to specify character sets that
* their terminal device can display.
* SCSOTHERISO is used to allow all other ISO 2022 character sets.
* There are too many character sets in the world. And the number
* of them is increasing. Therefore, we also decide to give users
* the ability to try all of them. ;-)
*/
typedef int SETCHARSET;                 /* OR of the SCS* flags below */

#define SCSASCII                0x0000
#define SCSJISX0201_1976        0x0001
#define SCSJISC6226_1978        0x0002
#define SCSJISX0208_1983        0x0004
#define SCSJISX0208_1990        0x0008
#define SCSJISX0212_1990        0x0010
#define SCSJISX0213_2000        0x0020
#define SCSJISX0213_2004        0x0040
/* 2nd plane of JIS X 0213:2000 and JIS X 0213:2004 */
#define SCSJISX0213_2ND         0x0080
#define SCSOTHERISO             0x0100
#define SCSUTF8                 0x0200

/*
 * Ready-made combinations of the Japanese flags, each built from the
 * previous one:
 *
 * SCSALLJISTRAD - everything except JIS X 0213 plane 2 and JIS X 0212.
 * SCSALLSJIS    - everything except JIS X 0212.
 * SCSALLJIS     - everything.
 */
#define SCSALLJISTRAD   (SCSJISX0201_1976 | SCSJISC6226_1978 | \
                         SCSJISX0208_1983 | SCSJISX0208_1990 | \
                         SCSJISX0213_2000 | SCSJISX0213_2004)
#define SCSALLSJIS      (SCSALLJISTRAD | SCSJISX0213_2ND)
#define SCSALLJIS       (SCSALLSJIS | SCSJISX0212_1990)
/*
* Definition of ENCSET.
*
* ENCSET represents a set of encoding schemes less accepts. ENCSET is
* used as a triplet like { input, inputr, output }. "input" represents
* a set of encoding schemes for input stream left plane (0x00..0x7f).
* "inputr" represents a set of encoding schemes for input stream right
* plane (0x80..0xff). "output" represents an encoding scheme for output
* stream.
*
* ESNONE has to be used exclusively to specify no-data. This is used
* as only "inputr" to specify no right plane (0x80..0xff) data.
* ESNOCONV has to be used exclusively to specify no-conversion.
* ESISO7 and ESISO8 specify ISO style encoding techniques. ESISO7 can
* be used as "input" or "output". ESISO8 can be used as "inputr" or
* "output".
* ESJIS83, ESSJIS, and ESUJIS specify Japanese encoding techniques.
* Note: As input, users can use any combination of these values.
* However, as output, users need to use only one of them.
* Note: If ESJIS83 is used as "output", less outputs all KANJI
* character sets using only the JIS X 0208-1983 character set (ESC$B) with
* a hope that user's terminal device is using glyph of JIS X 0213:2004
* plane 1 character set as its default glyph. It is hard to update
* terminal device to understand JIS X 0213:2004 completely, but it is
* easy to change the glyph.
* ESUTF8 specifies both an encoding technique and a character set. This
* has to be used exclusively as output.
*/
typedef int ENCSET;             /* OR of the ES* flags below */

#define ESNONE          0x0000  /* no data; used exclusively, as "inputr" */
#define ESNOCONV        0x0001  /* no conversion; used exclusively */
#define ESISO7          0x0002  /* ISO style; "input" or "output" */
#define ESISO8          0x0004  /* ISO style; "inputr" or "output" */
#define ESJIS83         0x0008  /* Japanese encoding technique */
#define ESSJIS          0x0010  /* Japanese encoding technique */
#define ESUJIS          0x0020  /* Japanese encoding technique */
#define ESUTF8          0x0040  /* encoding technique and character set */

/* ESSJIS, ESUJIS and ESUTF8 combined. */
#define ESALLJA         (ESSJIS | ESUJIS | ESUTF8)
/*
 * J_PRIORITY: which encoding scheme (UJIS, SJIS, ...) takes priority when
 * one has to be selected.  The enumerators keep the values 0..3 in
 * declaration order.
 */
typedef enum {
        PUJIS = 0,
        PSJIS = 1,
        PUTF8 = 2,
        PNONE = 3
} J_PRIORITY;
/*
 * M_BUFDATA: the structure used as a return value in multi_parse().
 */
typedef struct {
        char    *cbuf;          /* byte data */
        CHARSET *csbuf;         /* character set data; presumably one entry
                                 * per byte of cbuf -- TODO confirm */
        int     byte;           /* presumably a byte count for the buffers
                                 * -- TODO confirm in multi.c */
} M_BUFDATA;
/*
 * struct multibuf is the internal data structure of multi.c.  Only its
 * name is declared here, so other files can hold MULBUF pointers but
 * cannot look inside the structure.
 */
struct multibuf;
typedef struct multibuf MULBUF;
/*
 * Functions defined in multi.c.
 *
 * They are declared with empty parameter lists (pre-ANSI style), so the
 * compiler does not check the arguments at call sites; see multi.c for
 * the real parameter lists.
 */
extern int         set_planeset();
extern void        init_def_scs_es();
extern void        init_def_priority();
extern void        init_priority();
extern J_PRIORITY  get_priority();
extern void        set_priority();
extern MULBUF     *new_multibuf();
extern void        clear_multibuf();
extern void        init_multibuf();
extern void        multi_start();
extern void        multi_parse();
extern void        multi_flush();
extern void        multi_discard();
extern void        set_codesets();
extern char       *get_icharset_string();
extern char       *outchar();
extern char       *outbuf();
extern int         mwidth();
extern char       *rotate_right_codeset();
extern int         strlen_cs();
extern int         chlen_cs();
extern char       *strdup_cs();
/*
 * Functions defined in unify.c.
 *
 * As with the multi.c declarations, the parameter lists are left empty
 * (pre-ANSI style); see unify.c for the real ones.
 */
extern void  jis78to90();
extern void  chconvert_cs();
extern void  chunify_cs();
extern int   chcmp_cs();
extern int   chisvalid_cs();
/* syntax highlighted by Code2HTML, v. 0.9.1 */