/*
 * Copyright (c) 2002, The Tendra Project <http://www.ten15.org/>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *
 *    		 Crown Copyright (c) 1997
 *
 *    This TenDRA(r) Computer Program is subject to Copyright
 *    owned by the United Kingdom Secretary of State for Defence
 *    acting through the Defence Evaluation and Research Agency
 *    (DERA).  It is made available to Recipients with a
 *    royalty-free licence for its use, reproduction, transfer
 *    to other parties and amendment for any purpose not excluding
 *    product development provided that any such use et cetera
 *    shall be deemed to be acceptance of the following conditions:-
 *
 *        (1) Its Recipients shall ensure that this Notice is
 *        reproduced upon any copies or amended versions of it;
 *
 *        (2) Any amended version of it shall be clearly marked to
 *        show both the nature of and the organisation responsible
 *        for the relevant amendment or amendments;
 *
 *        (3) Its onward transfer from a recipient to another
 *        party shall be deemed to be that party's acceptance of
 *        these conditions;
 *
 *        (4) DERA gives no warranty or assurance as to its
 *        quality or suitability for any purpose and DERA accepts
 *        no liability whatsoever in relation to any use to which
 *        it may be put.
 *
 * $TenDRA: tendra/src/producers/common/parse/lex.c,v 1.12 2005/10/08 17:16:49 stefanf Exp $
 */


#include "config.h"
#include "producer.h"
#include <limits.h>
#if FS_MULTIBYTE
#include <locale.h>
#endif

#include "fmm.h"
#include "msgcat.h"

#include "c_types.h"
#include "exp_ops.h"
#include "hashid_ops.h"
#include "id_ops.h"
#include "member_ops.h"
#include "str_ops.h"
#include "error.h"
#include "catalog.h"
#include "option.h"
#include "buffer.h"
#include "char.h"
#include "constant.h"
#include "file.h"
#include "dump.h"
#include "hash.h"
#include "lex.h"
#include "literal.h"
#include "macro.h"
#include "parse.h"
#include "pragma.h"
#include "preproc.h"
#include "print.h"
#include "syntax.h"
#include "ustring.h"


/*
 *    PARSER OPTIONS
 *
 *    These flags control the behaviour of the parser and determine whether
 *    such features as trigraphs and digraphs are allowed.
 */

/* Gates the '??x' replacement performed by adjust_trigraph */
int allow_trigraphs = 1;
/* NOTE(review): presumably enables digraph recognition; not referenced in
   the visible part of this file - confirm against read_token */
int allow_digraphs = 1;
/* Required by read_unicode before '\u' or '\U' is accepted */
int allow_unicodes = LANGUAGE_CPP;
/* NOTE(review): presumably enables multibyte characters; not referenced in
   the visible part of this file */
int allow_multibyte = 1;
/* Required by START_CPP_COMMENT for a second '/' to open a comment */
int allow_cpp_comments = LANGUAGE_CPP;
/* When set, read_newline and read_eof examine the character following a
   carriage return or char_sub; toggled by set_char_lookup */
int allow_dos_newline = 0;
/* NOTE(review): presumably enables extra lexical symbols; not referenced in
   the visible part of this file */
int allow_extra_symbols = 0;
/* Selects in init_keywords whether the ISO alternative keywords get their
   own token numbers or are merely reserved (lex_unknown) */
int allow_iso_keywords = LANGUAGE_CPP;
/* Initial newline policy for skip_string and read_string; both routines
   override it for character literals and inside preprocessing directives */
int allow_newline_strings = 0;
/* NOTE(review): presumably enables analysis of comment text; not referenced
   in the visible part of this file */
int analyse_comments = 1;
/* NOTE(review): looks like an upper bound on identifier length; the check
   itself is not in the visible part of this file */
unsigned long max_id_length = 1024;


/*
 *    TABLE OF SYMBOLS AND KEYWORDS
 *
 *    This table gives the mapping between lexical token numbers and the
 *    corresponding symbols and keywords.  It is derived from the list of
 *    tokens in symbols.h.
 */

const char *token_names [] = {
	/* Each LEX_TOKEN entry in symbols.h contributes its second field (B) */
#define LEX_TOKEN(A, B, C)		(B),
#include "symbols.h"
#undef LEX_TOKEN
	/* Terminating entry */
	NULL
};


/*
 *    TRANSLATE A LEXICAL TOKEN TO ITS PRIMARY FORM
 *
 *    This routine translates the alternative ISO keywords and digraphs
 *    into their primary form.
 */

int
primary_form(int t)
{
	/*
	 * Each alternative (H2) spelling maps directly onto its primary (H1)
	 * token; any other token is returned unchanged.
	 */
	switch (t) {
	case lex_and_H2 : return (lex_and_H1);
	case lex_and_Heq_H2 : return (lex_and_Heq_H1);
	case lex_close_Hbrace_H2 : return (lex_close_Hbrace_H1);
	case lex_close_Hsquare_H2 : return (lex_close_Hsquare_H1);
	case lex_compl_H2 : return (lex_compl_H1);
	case lex_hash_H2 : return (lex_hash_H1);
	case lex_hash_Hhash_H2 : return (lex_hash_Hhash_H1);
	case lex_logical_Hand_H2 : return (lex_logical_Hand_H1);
	case lex_logical_Hor_H2 : return (lex_logical_Hor_H1);
	case lex_not_H2 : return (lex_not_H1);
	case lex_not_Heq_H2 : return (lex_not_Heq_H1);
	case lex_open_Hbrace_H2 : return (lex_open_Hbrace_H1);
	case lex_open_Hsquare_H2 : return (lex_open_Hsquare_H1);
	case lex_or_H2 : return (lex_or_H1);
	case lex_or_Heq_H2 : return (lex_or_Heq_H1);
	case lex_xor_H2 : return (lex_xor_H1);
	case lex_xor_Heq_H2 : return (lex_xor_Heq_H1);
	default : break;
	}
	return (t);
}


/*
 *    REPORT A DIGRAPH TOKEN
 *
 *    This routine reports the digraph t, returning the primary form of t.
 */

int
get_digraph(int t)
{
	int prim = primary_form (t);

	/* Tokens already in primary form need no diagnostic */
	if (prim == t) {
		return (t);
	}

	/* Report the replacement at the current column */
	update_column ();
	report (crt_loc, ERR_lex_digraph_replace (t, prim));
	return (prim);
}


/*
 *    CREATE A KEYWORD
 *
 *    This routine creates a keyword identifier with name nm and lexical
 *    token number key.  The special case when key is lex_unknown is used
 *    to indicate a reserved identifier; analogously, a key of lex_identifier
 *    indicates a C99 keyword.
 */

IDENTIFIER
make_keyword(HASHID nm, int key, IDENTIFIER id)
{
	PTR (IDENTIFIER) link = hashid_id (nm);
	IDENTIFIER cur;
	int found = 0;

	if (IS_NULL_id (id)) {
		/* No identifier supplied: pick the keyword kind from key */
		unsigned tag;
		if ((key >= FIRST_ISO_KEYWORD && key <= LAST_ISO_KEYWORD) ||
		    (key >= FIRST_SYMBOL && key <= LAST_SYMBOL)) {
			tag = id_iso_keyword_tag;
		} else if (key == lex_unknown) {
			tag = id_reserved_tag;
		} else if (key == lex_identifier) {
			tag = id_c99_keyword_tag;
		} else {
			tag = id_keyword_tag;
		}

		/* Build the identifier and record its token number */
		MAKE_id_keyword_etc (tag, nm, dspec_none, NULL_nspace, crt_loc, id);
		COPY_ulong (id_no (id), (unsigned long) key);
	}

	/* Clear the cached meaning for this name */
	COPY_id (hashid_cache (nm), NULL_id);
	if (do_keyword) {
		dump_declare (id, &crt_loc, 1);
	}

	/*
	 * Follow the alias chain from the hash table entry until a dummy,
	 * keyword or reserved identifier is met, then splice id in front of it.
	 */
	do {
		cur = DEREF_id (link);
		switch (TAG_id (cur)) {
		case id_dummy_tag :
		case id_keyword_tag :
		case id_c99_keyword_tag :
		case id_iso_keyword_tag :
		case id_reserved_tag : {
			found = 1;
			break;
		}
		default : {
			link = id_alias (cur);
			break;
		}
		}
	} while (!found);

	COPY_id (id_alias (id), cur);
	COPY_id (link, id);
	return (id);
}


/*
 *    INITIALISE KEYWORDS
 *
 *    This routine initialises the hash table entries for the keywords.
 */

void
init_keywords(void)
{
	int key;

	/* Set up keyword entries */
	for (key = FIRST_KEYWORD; key <= LAST_KEYWORD; key++) {
		int ext = 0;
		string keyword = token_name (key);
		unsigned long h = hash (keyword);
		if (keyword [0] == char_less) ext = 1;
		KEYWORD (key) = lookup_name (keyword, h, ext, key);
	}

	/* Bring the C keywords into scope */
	for (key = FIRST_C_KEYWORD; key <= LAST_C_KEYWORD; key++) {
		HASHID nm = KEYWORD (key);
		IGNORE make_keyword (nm, key, NULL_id);
	}

	/* Bring the C99 keywords into scope */
	for (key = FIRST_C99_KEYWORD; key <= LAST_C99_KEYWORD; key++) {
		HASHID nm = KEYWORD (key);
		IGNORE make_keyword (nm, lex_identifier, NULL_id);
	}

	/* Bring the C++ keywords into scope */
	for (key = FIRST_CPP_KEYWORD; key <= LAST_CPP_KEYWORD; key++) {
		HASHID nm = KEYWORD (key);
#if LANGUAGE_CPP
		IGNORE make_keyword (nm, key, NULL_id);
#else
		if (key != lex_wchar_Ht) {
			IGNORE make_keyword (nm, lex_unknown, NULL_id);
		}
#endif
	}

	/* Bring the ISO alternative keywords into scope */
	for (key = FIRST_ISO_KEYWORD; key <= LAST_ISO_KEYWORD; key++) {
		HASHID nm = KEYWORD (key);
		if (allow_iso_keywords) {
			IGNORE make_keyword (nm, key, NULL_id);
		} else {
			IGNORE make_keyword (nm, lex_unknown, NULL_id);
		}
	}

	/* Find underlying dummy identifier for 'operator' */
	underlying_op = DEREF_id (hashid_id (KEYWORD (lex_operator)));
	underlying_op = underlying_id (underlying_op);
	return;
}


/*
 *    ADJUST A CHARACTER FOR TRIGRAPHS
 *
 *    This routine is called after a question mark has been read from the
 *    input file to allow for trigraphs.  It returns the trigraph replacement
 *    character or '?' if the following characters do not form a trigraph.
 */

static int
adjust_trigraph(void)
{
	int c;
	int d;

	/* With trigraphs disabled a '?' is always just a '?' */
	if (!allow_trigraphs) {
		return (char_question);
	}

	/* A trigraph needs a second '?' */
	c = next_char ();
	if (c == char_end) c = refill_char ();
	if (c != char_question) {
		unread_char (c);
		return (char_question);
	}

	/* The third character selects the replacement */
	c = next_char ();
	if (c == char_end) c = refill_char ();
	switch (c) {
	case char_close_round : d = char_close_square; break;	/* '\?\?)' */
	case char_equal : d = char_hash; break;			/* '\?\?=' */
	case char_exclaim : d = char_bar; break;		/* '\?\?!' */
	case char_greater : d = char_close_brace; break;	/* '\?\?>' */
	case char_less : d = char_open_brace; break;		/* '\?\?<' */
	case char_minus : d = char_tilde; break;		/* '\?\?-' */
	case char_open_round : d = char_open_square; break;	/* '\?\?(' */
	case char_single_quote : d = char_circum; break;	/* '\?\?\'' */
	case char_slash : d = char_backslash; break;		/* '\?\?/' */
	default : {
		/* Not a trigraph: push back the last two characters read */
		unread_char (c);
		unread_char (char_question);
		return (char_question);
	}
	}

	/* Report the replacement and hand back the mapped character */
	update_column ();
	report (crt_loc, ERR_lex_trigraph_replace (c, d));
	return (d);
}


/*
 *    READ A NEWLINE CHARACTER
 *
 *    This routine is called after each carriage return character, checking
 *    for a following newline character.
 */

static int
read_newline(void)
{
	int nxt;

	/* Without DOS-like rules a carriage return stands for itself */
	if (!allow_dos_newline) {
		return (char_return);
	}

	/* A following newline replaces the carriage return */
	nxt = next_char ();
	if (nxt == char_end) nxt = refill_char ();
	if (nxt != char_newline) {
		unread_char (nxt);
		return (char_return);
	}
	return (nxt);
}


/*
 *    READ AN END OF FILE CHARACTER
 *
 *    This routine is called after each terminate character, checking for
 *    a following end of file character.
 */

static int
read_eof(void)
{
	int nxt;

	/* Without DOS-like rules char_sub stands for itself */
	if (!allow_dos_newline) {
		return (char_sub);
	}

	/* A following end of file replaces the char_sub */
	nxt = next_char ();
	if (nxt == char_end) nxt = refill_char ();
	if (nxt != char_eof) {
		unread_char (nxt);
		return (char_sub);
	}
	return (nxt);
}


/*
 *    READ THE NEXT CHARACTER ALLOWING FOR TRIGRAPHS ETC.
 *
 *    This routine reads the next character from the input file, adjusting
 *    it as necessary for trigraphs and escaped newlines.  This routine
 *    corresponds to phases 1 and 2 of the phases of translation.
 */

static int
read_char(void)
{
	for (;;) {
		int c = next_char ();
		if (c == char_end) c = refill_char ();
		if (c == char_question) c = adjust_trigraph ();

		if (c == char_backslash) {
			/* Look for a newline directly after the backslash */
			int d = next_char ();
			if (d == char_end) d = refill_char ();
			if (d == char_return) d = read_newline ();
			if (d == char_newline) {
				/* Escaped newline: splice the lines and read on */
				crt_loc.line++;
				crt_loc.column = 0;
				input_crt = input_posn;
				continue;
			}
			/* Plain backslash: leave the next character unread */
			unread_char (d);
		}
		return (c);
	}
	/* NOTREACHED */
}


/*
 *    CHARACTER LOOK-UP TABLE
 *
 *    This look-up table gives the various character types.  Note that the
 *    default look-up table is for ASCII, for other codesets the table
 *    needs to be rewritten.  The only really interesting points in the
 *    table itself are that newline has not been classified as a white-space
 *    and that character char_eof (-1) represents end of file.
 */

/* Single-bit masks, one per character property */
#define SPACE_M			0x01
#define ALPHA_M			0x02
#define DIGIT_M			0x04
#define ALNUM_M			0x08
#define PPDIG_M			0x10
#define SYMBL_M			0x20
#define NLINE_M			0x40
#define LEGAL_M			0x80

/* Character classes, built from the masks above.  Every class except ILLEG
   includes LEGAL_M; POINT carries both the pp-digit and the symbol bits. */
#define ILLEG			0x00
#define LEGAL			LEGAL_M
#define SPACE			(SPACE_M | LEGAL_M)
#define ALPHA			(ALPHA_M | ALNUM_M | PPDIG_M | LEGAL_M)
#define DIGIT			(DIGIT_M | ALNUM_M | PPDIG_M | LEGAL_M)
#define SYMBL			(SYMBL_M | LEGAL_M)
#define POINT			(PPDIG_M | SYMBL_M | LEGAL_M)
#define NLINE			(NLINE_M | LEGAL_M)

/* main_characters is offset by one so that index -1 selects the leading
   EOF entry of characters */
#define main_characters		(characters + 1)
/* lookup_char performs no range check on C.  The is_* predicates take a
   value returned by lookup_char and yield the masked bit, not 0 or 1. */
#define lookup_char(C)	((int) main_characters [C])
#define is_white(T)		((T) & SPACE_M)
#define is_alpha(T)		((T) & ALPHA_M)
#define is_digit(T)		((T) & DIGIT_M)
#define is_alphanum(T)	((T) & ALNUM_M)
#define is_ppdigit(T)		((T) & PPDIG_M)
#define is_symbol(T)		((T) & SYMBL_M)
#define is_newline(T)		((T) & NLINE_M)
#define is_legal(T)		((T) & LEGAL_M)

/* The table proper: one leading EOF entry, then the first field (A) of each
   CHAR_DATA entry in char.h, then one trailing dummy entry */
static unsigned char characters [ NO_CHAR + 2 ] = {
	LEGAL,			/* EOF */
#define CHAR_DATA(A, B, C, D)	(A),
#include "char.h"
#undef CHAR_DATA
	ILLEG			/* dummy */
};

/* Table from which set_char_lookup reads the value to copy.  It starts out
   aliasing main_characters.
   NOTE(review): presumably repointed to a pristine copy elsewhere - confirm */
static unsigned char *copy_characters = main_characters;


/*
 *    SET A CHARACTER LOOK-UP
 *
 *    This routine sets the look-up value for character a to be equal to
 *    the underlying value for character b.  As a special case, setting
 *    the look-up for a carriage return to that for newline enables
 *    DOS-like rules on newline and end of file characters.
 */

void
set_char_lookup(int a, int b)
{
	unsigned char t;

	/* Ignore requests naming a character outside the table */
	if (a < 0 || a >= NO_CHAR || b < 0 || b >= NO_CHAR) {
		return;
	}
	t = copy_characters [b];

	if (a == char_return) {
		if (b == char_newline) {
			/* Switch DOS-like newline rules on; the table is untouched */
			allow_dos_newline = 1;
			return;
		}
		if (b == char_return) {
			/* Switch DOS-like newline rules off */
			allow_dos_newline = 0;
		}
	}
	main_characters [a] = t;
	return;
}


/*
 *    SET A NUMBER OF CHARACTER LOOK-UPS
 *
 *    This routine sets the character look-ups for all the elements of the
 *    string or character literal expression a to be equal to that for the
 *    character literal expression b.  If b is the null expression then
 *    the look-up is set to be an illegal character.
 */

void
set_character(EXP a, EXP b)
{
	int val = get_char_value (b);

	if (!IS_exp_string_lit (a)) {
		/* Single character literal */
		int d = get_char_value (a);
		if (d != char_illegal) set_char_lookup (d, val);
		return;
	}

	{
		/* String literal: apply the look-up to each element in turn */
		STRING str = DEREF_str (exp_string_lit_str (a));
		unsigned long len = DEREF_ulong (str_simple_len (str));
		string text = DEREF_string (str_simple_text (str));
		unsigned kind = DEREF_unsigned (str_simple_kind (str));
		unsigned long i;

		if (kind & STRING_MULTI) {
			/* Each element occupies MULTI_WIDTH units */
			for (i = 0; i < len; i++) {
				int ch = CHAR_SIMPLE;
				unsigned long d = get_multi_char (text, &ch);
				if (d < (unsigned long) NO_CHAR) {
					set_char_lookup ((int) d, val);
				}
				text += MULTI_WIDTH;
			}
		} else {
			for (i = 0; i < len; i++) {
				set_char_lookup ((int) text [i], val);
			}
		}
	}
	return;
}


/*
 *    CHECK FOR WHITE SPACE CHARACTERS
 *
 *    This routine checks whether the character a represents a white space.
 *    The newline character constitutes a special case.
 */

int
is_white_char(unsigned long a)
{
	int t;
	if (a >= NO_CHAR) return (0);
	t = lookup_char (a);
	return (is_white (t) || is_newline (t));
}


/*
 *    CHECK FOR ALPHABETIC CHARACTERS
 *
 *    This routine checks whether the character a represents an alphabetic
 *    character.
 */

int
is_alpha_char(unsigned long a)
{
	/* Characters outside the table have no properties */
	int t = 0;
	if (a < NO_CHAR) {
		t = lookup_char (a);
	}
	/* Yields the ALPHA_M bit itself, not a normalised 0 or 1 */
	return (is_alpha (t));
}


/*
 *    CHECK FOR LEGAL CHARACTERS
 *
 *    This routine checks whether the character a represents a legal character.
 */

int
is_legal_char(unsigned long a)
{
	/* Out-of-table characters are illegal; otherwise yield the LEGAL_M bit */
	return (a < NO_CHAR ? is_legal (lookup_char (a)) : 0);
}


/*
 *    PEEK AHEAD ONE CHARACTER
 *
 *    This routine tests whether the next character is a (which will not be
 *    newline).  If so the current character is advanced one, otherwise it
 *    is left unchanged.  legal is set to false if the next character is
 *    not legal.
 */

int
peek_char(int a, int *legal)
{
	int nxt = read_char ();
	ASSERT (a != char_newline);
	if (nxt != a) {
		/* No match: record legality and leave the character unread */
		*legal = is_legal_char ((unsigned long) nxt);
		unread_char (nxt);
		return (0);
	}
	/* Match: the character stays consumed and *legal is not touched */
	return (1);
}


/*
 *    TOKEN BUFFER
 *
 *    This buffer is used by read_token to hold the values of identifiers,
 *    numbers and strings.
 */

/* Filled by read_string, and by skip_comment and skip_cpp_comment when asked
   to keep the comment text; those routines leave posn at the terminating
   zero they write */
BUFFER token_buff = NULL_buff;


/*
 *    TOKEN IDENTIFICATION MACROS
 *
 *    These macros are used to identify the start or end of certain tokens
 *    such as comments and strings.
 */

/* A is the character following a '/': an asterisk opens a C style comment */
#define START_COMMENT(A)\
	((A) == char_asterix)
/* A and B are consecutive characters: an asterisk then a slash closes it */
#define END_COMMENT(A, B)\
	((A) == char_asterix && (B) == char_slash)
/* A is the character following a '/': a second slash opens a C++ style
   comment, but only when allow_cpp_comments is set */
#define START_CPP_COMMENT(A)\
	((A) == char_slash && allow_cpp_comments)
/* A C++ style comment runs up to the next newline */
#define END_CPP_COMMENT(A)\
	((A) == char_newline)
/* Either quote character opens a string or character literal */
#define START_STRING(A)\
	((A) == char_quote || (A) == char_single_quote)
/* Q is the expected closing quote; escapes are the caller's business */
#define END_STRING(A, Q)\
	((A) == (Q))


/*
 *    END OF FILE FLAG
 *
 *    Each source file should end in a newline character, which is not
 *    preceded by a backslash.  This flag is used to indicate whether the
 *    end of the present file has the correct form.
 */

/* Set to 1 by skip_white when end of file directly follows a newline, and by
   the string, comment and skip_to_end routines once they have reported a
   premature end of file themselves */
static int good_eof = 0;


/*
 *    SKIP A STRING
 *
 *    This routine skips a string or character literal.  It is entered after
 *    the initial quote, q, has been read.  Escape sequences are always
 *    allowed.  The routine returns lex_string_Hlit if the string terminates
 *    correctly and lex_eof otherwise.
 */

static int
skip_string(int q)
{
	LOCATION start;
	unsigned newlines = 0;
	int esc_pending = 0;
	int seen_char = 0;
	int multi_line = allow_newline_strings;

	/* Character literals and directive lines never span lines */
	if (q == char_single_quote || in_preproc_dir == 1) multi_line = 0;
	update_column ();
	start = crt_loc;

	for (;;) {
		int c = read_char ();

		if (!esc_pending && END_STRING (c, q)) {
			/* Unescaped closing quote: the literal is complete */
			if (q == char_single_quote && !seen_char) {
				update_column ();
				report (crt_loc, ERR_lex_ccon_empty ());
			}
			if (newlines) {
				report (start, ERR_lex_string_nl (newlines, newlines));
			}
			return (lex_string_Hlit);
		}

		if (c == char_eof) {
			/* End of file inside the literal: nothing further to report */
			report (start, ERR_lex_phases_str_eof ());
			good_eof = 1;
			return (lex_eof);
		}

		if (c == char_newline) {
			if (!multi_line) {
				/* Leave the newline for the caller and give up */
				unread_char (c);
				update_column ();
				report (crt_loc, ERR_lex_string_pp_nl ());
				break;
			}
			/* Count the newline and carry on */
			crt_loc.line++;
			crt_loc.column = 0;
			input_crt = input_posn;
			newlines++;
		}

		/* A backslash escapes exactly the one character after it */
		esc_pending = (!esc_pending && c == char_backslash);
		if (!esc_pending) seen_char = 1;
	}

	if (newlines) {
		/* Report newlines in string */
		report (start, ERR_lex_string_nl (newlines, newlines));
	}
	return (lex_eof);
}


/*
 *    READ THE BODY OF A STRING
 *
 *    This routine reads the body of a string or character literal or of a
 *    header name.  It is entered after the initial quote has been read.
 *    The corresponding close quote is passed in as q.  The esc argument
 *    indicates whether escape sequences are allowed (they are not in
 *    header names for example).  The string itself is built up in
 *    token_buff.  The routine returns lex_string_Hlit if the string
 *    terminates correctly and lex_eof otherwise.  It also sets
 *    token_buff.posn to point to the end of the string.
 */

int
read_string(int q, int esc)
{
	int c;
	int e = q;
	LOCATION loc;
	long posn = -1;
	int escaped = 0;
	unsigned nl = 0;
	int have_char = 0;
	/* s is the write position in token_buff, se the end of its storage */
	string s = token_buff.start;
	string se = token_buff.end;
	int allow_nl = allow_newline_strings;
	update_column ();
	if (e == char_single_quote) {
		/* Remember the input position so that the error recovery below
		   can rewind to the start of the character literal */
		posn = tell_buffer (crt_buff_no);
		allow_nl = 0;
	} else if (in_preproc_dir == 1) {
		/* No multi-line strings inside preprocessing directives */
		allow_nl = 0;
	}
	loc = crt_loc;

	/* Scan the string */
	for (;;) {
		c = read_char ();
		if (END_STRING (c, e) && !escaped) {
			/* Unescaped closing quote: terminate the buffer and succeed */
			if (e == char_single_quote && !have_char) {
				update_column ();
				report (crt_loc, ERR_lex_ccon_empty ());
			}
			if (nl) report (loc, ERR_lex_string_nl (nl, nl));
			token_buff.posn = s;
			*s = 0;
			return (lex_string_Hlit);
		}
		if (c == char_newline) {
			if (allow_nl) {
				/* Report newlines but continue */
				crt_loc.line++;
				crt_loc.column = 0;
				input_crt = input_posn;
				nl++;
			} else {
				/* Leave the newline unread and abandon the literal */
				unread_char (c);
				update_column ();
				if (e == char_greater) {
					/* Header name */
					report (crt_loc, ERR_cpp_include_incompl ());
				} else {
					report (crt_loc, ERR_lex_string_pp_nl ());
				}
				break;
			}
		} else if (c == char_eof) {
			/* End of file inside the literal; clearing nl suppresses the
			   newline report after the loop */
			report (loc, ERR_lex_phases_str_eof ());
			good_eof = 1;
			nl = 0;
			break;
		}
		/* Store the character, growing the buffer when it becomes full */
		*s = (character) c;
		if (++s == se) {
			s = extend_buffer (&token_buff, s);
			se = token_buff.end;
		}
		/* A backslash escapes the next character only when esc is set */
		if (escaped) {
			escaped = 0;
		} else {
			if (c == char_backslash) escaped = esc;
		}
		if (!escaped) have_char = 1;
	}
	if (nl) {
		/* Report newlines in string */
		report (loc, ERR_lex_string_nl (nl, nl));
	} else {
		/* Error recovery */
		if (e == char_single_quote && have_char) {
			/* Rewind to just after the opening quote and keep only the
			   first character, or the first escape pair, as the literal */
			seek_buffer (crt_buff_no, posn, 1);
			crt_loc = loc;
			s = token_buff.start;
			c = read_char ();
			*(s++) = (character) c;
			if (c == char_backslash && esc) {
				c = read_char ();
				*(s++) = (character) c;
			}
		}
	}
	/* Terminate whatever was collected and signal failure */
	token_buff.posn = s;
	*s = 0;
	return (lex_eof);
}


/*
 *    SKIP A C STYLE COMMENT
 *
 *    This routine skips a C style comment, returning lex_ignore_token if
 *    the comment is terminated correctly and lex_eof otherwise.  It is
 *    entered after the first two characters comprising the comment start
 *    have been read.  If keep is true then the comment text is built up
 *    in token_buff, otherwise it is discarded.
 */

static int
skip_comment(int keep)
{
	int c = 0;
	int lastc;
	string s, se;
	LOCATION loc;
	update_column ();
	/* Remember where the comment started for the end of file report */
	loc = crt_loc;
	if (keep) {
		s = token_buff.start;
		se = token_buff.end;
	} else {
		/* A null s disables all buffer writes below */
		s = NULL;
		se = NULL;
	}
	do {
		/* lastc is only updated here, so an escaped newline (which jumps
		   back to read_label) does not separate '*' from '/' */
		lastc = c;
		read_label : {
			/* Inlined version of read_char */
			c = next_char ();
			if (c == char_end) c = refill_char ();
			if (c == char_question) c = adjust_trigraph ();
			if (c == char_backslash) {
				c = next_char ();
				if (c == char_end) c = refill_char ();
				if (c == char_return) c = read_newline ();
				if (c == char_newline) {
					/* Allow for escaped newlines */
					crt_loc.line++;
					crt_loc.column = 0;
					input_crt = input_posn;
					goto read_label;
				}
				/* Plain backslash: leave the following character unread */
				unread_char (c);
				c = char_backslash;
			} else if (c == char_newline) {
				/* New line characters */
				crt_loc.line++;
				crt_loc.column = 0;
				input_crt = input_posn;
				crt_line_changed = 1;
				crt_spaces = 0;
			} else if (c == char_eof) {
				/* End of file characters */
				report (loc, ERR_lex_phases_comm_eof ());
				good_eof = 1;
				if (s) {
					token_buff.posn = s;
					*s = 0;
				}
				return (lex_eof);
			} else if (c == char_asterix && lastc == char_slash) {
				/* Nested comments */
				update_column ();
				report (crt_loc, ERR_lex_comment_nest ());
			}
			if (s) {
				/* Store the character, growing the buffer when full */
				*s = (character) c;
				if (++s == se) {
					s = extend_buffer (&token_buff, s);
					se = token_buff.end;
				}
			}
		}
	} while (!END_COMMENT (lastc, c));
	if (s) {
		/* Drop the two comment end characters that were stored */
		s -= 2;
		token_buff.posn = s;
		*s = 0;
	}
	crt_spaces++;
	return (lex_ignore_token);
}


/*
 *    SKIP A C++ STYLE COMMENT
 *
 *    This routine skips a C++ style comment, returning lex_ignore_token
 *    if the comment terminates correctly and lex_eof otherwise.  It is
 *    entered after the first two characters comprising the comment start
 *    have been read.  The next token read after the comment will be the
 *    terminating newline.  If keep is true then the comment text is built
 *    up in token_buff, otherwise it is discarded.
 */

static int
skip_cpp_comment(int keep)
{
	int c;
	string s, se;
	if (keep) {
		s = token_buff.start;
		se = token_buff.end;
	} else {
		/* A null s disables all buffer writes below */
		s = NULL;
		se = NULL;
	}
	do {
		read_label : {
			/* Inlined version of read_char */
			c = next_char ();
			if (c == char_end) c = refill_char ();
			if (c == char_question) c = adjust_trigraph ();
			if (c == char_backslash) {
				c = next_char ();
				if (c == char_end) c = refill_char ();
				if (c == char_return) c = read_newline ();
				if (c == char_newline) {
					/* Allow for escaped newlines */
					crt_loc.line++;
					crt_loc.column = 0;
					input_crt = input_posn;
					goto read_label;
				}
				/* Plain backslash: leave the following character unread */
				unread_char (c);
				c = char_backslash;
			} else if (c == char_eof) {
				/* End of file characters */
				update_column ();
				report (crt_loc, ERR_lex_phases_comm_eof ());
				good_eof = 1;
				if (s) {
					token_buff.posn = s;
					*s = 0;
				}
				return (lex_eof);
			}
			if (s) {
				/* Store the character, growing the buffer when full */
				*s = (character) c;
				if (++s == se) {
					s = extend_buffer (&token_buff, s);
					se = token_buff.end;
				}
			}
		}
	} while (!END_CPP_COMMENT (c));
	/* The terminating newline is left for the caller, so the line count
	   is not advanced here */
	unread_char (c);
	if (s) {
		/* Drop the newline that was stored */
		s -= 1;
		token_buff.posn = s;
		*s = 0;
	}
	crt_line_changed = 1;
	crt_spaces = 0;
	return (lex_ignore_token);
}


/*
 *    SKIP WHITE-SPACE CHARACTERS
 *
 *    This routine skips any white-space characters (including comments).
 *    Newline characters are treated as white-space only if nl is true.
 *    The result is a bitpattern formed from the components:
 *
 *	WHITE_SPACE		for white-space characters;
 *	WHITE_NEWLINE		for newline characters;
 *	WHITE_ESC_NEWLINE	for escaped newlines;
 *
 *    the result being reset to WHITE_NEWLINE after each newline.  Note that
 *    trigraphs and escaped newlines are treated by hand.  The effect of this
 *    routine is that all non-empty sequences of white-space characters other
 *    than newlines are treated as if they were a single space (the C/C++
 *    specification says that this is implementation-defined).
 */

unsigned long
skip_white(int nl)
{
	int c;
	/* Accumulated WHITE_* bits describing what has been skipped */
	unsigned long sp = 0;
	for (;;) {
		c = next_char ();
		if (c == char_end) c = refill_char ();
		if (c == char_return) c = read_newline ();
		if (c == char_sub) c = read_eof ();
		if (c == char_newline) {
			/* Deal with newline characters */
			if (!nl) break;
			/* Assignment, not |=: earlier bits are discarded */
			sp = WHITE_NEWLINE;
			crt_loc.line++;
			crt_loc.column = 0;
			input_crt = input_posn;
			crt_line_changed = 1;
			crt_spaces = 0;
		} else if (c == char_space) {
			/* Deal with simple spaces */
			sp |= WHITE_SPACE;
			crt_spaces++;
		} else if (c == char_tab) {
			/* Deal with tab characters */
			unsigned long tab = tab_width;
			sp |= WHITE_SPACE;
			/* Advance to the next multiple of the tab width */
			crt_spaces = tab * (crt_spaces / tab + 1);
		} else if (c == char_eof) {
			/* End of file */
			/* The end is well formed only directly after a newline */
			if (sp == WHITE_NEWLINE) good_eof = 1;
			break;
		} else {
			int t;
#if FS_EXTENDED_CHAR
			if (IS_EXTENDED (c)) break;
#endif
			t = lookup_char (c);
			if (is_white (t)) {
				/* Deal with other white space characters */
				sp |= WHITE_SPACE;
				crt_spaces++;
			} else {
				/* A trigraph may stand for the '\\' tested below */
				if (c == char_question) c = adjust_trigraph ();
				if (c == char_slash) {
					/* Deal with comments */
					int b = read_char ();
					if (START_COMMENT (b)) {
						sp |= WHITE_SPACE;
						b = skip_comment (0);
						if (b == lex_eof) return (sp);
					} else if (START_CPP_COMMENT (b)) {
						sp |= WHITE_SPACE;
						b = skip_cpp_comment (0);
						if (b == lex_eof) return (sp);
						/* The comment left its newline unread; stop
						   here when newlines are not to be skipped */
						if (!nl) return (sp);
					} else {
						/* Just a '/': it is unread after the loop */
						unread_char (b);
						break;
					}
				} else if (c == char_backslash) {
					/* Deal with escaped newlines */
					int b = next_char ();
					if (b == char_end) b = refill_char ();
					if (b == char_return) b = read_newline ();
					if (b == char_newline) {
						crt_loc.line++;
						crt_loc.column = 0;
						input_crt = input_posn;
					} else {
						unread_char (b);
						break;
					}
					sp |= WHITE_ESC_NEWLINE;
				} else {
					break;
				}
			}
		}
	}
	/* Leave the character that stopped the scan for the caller */
	unread_char (c);
	return (sp);
}


/*
 *    PATCH UP WHITE-SPACE CHARACTERS
 *
 *    Calling skip_white (1) can mess up the parser as regards spotting
 *    preprocessing directives and valid end of file markers.  This routine
 *    may be called with the return value of skip_white as an argument to
 *    patch up the buffer in order to get the parser back into the right
 *    state.
 */

void
patch_white(unsigned long sp)
{
	/*
	 * sp is the value returned by skip_white.  Nothing needs restoring
	 * unless a newline was consumed.  Characters given to unread_char come
	 * back in the reverse of the order in which they are pushed (as
	 * adjust_trigraph relies on), so everything below is pushed last
	 * character first.
	 */
	if (sp & WHITE_NEWLINE) {
		if (sp & WHITE_SPACE) {
			/* Patch in a space after a newline */
			unsigned long n;
			update_column ();
			n = crt_loc.column;
			while (n) {
				unread_char (char_space);
				/* Never push back beyond the start of the buffer */
				if (input_posn <= input_start) break;
				n--;
			}
		} else if (sp & WHITE_ESC_NEWLINE) {
			/*
			 * Patch in an escaped newline after a newline.  The newline
			 * goes back first and the backslash second, so that they are
			 * re-read as backslash then newline.
			 */
			unread_char (char_newline);
			unread_char (char_backslash);
			crt_loc.line--;
		}
		/* Patch in a newline; pushed last, it is the first to be re-read */
		unread_char (char_newline);
		crt_loc.line--;
		crt_loc.column = 0;
		crt_spaces = 0;
	}
	return;
}


/*
 *    SKIP TO END OF LINE
 *
 *    This routine skips to the end of the current line.  It returns 0 if
 *    only white-space characters are encountered.  It uses skip_white to
 *    jump over white-space (including comments).
 */

int
skip_to_end(void)
{
	int c;
	/* Becomes 1 as soon as anything other than white space is seen */
	int res = 0;
	in_preproc_dir = 0;
	for (;;) {
		/* Jump over white space and comments, stopping at any newline */
		IGNORE skip_white (0);
		read_label : {
			/* Inlined version of read_char */
			c = next_char ();
			if (c == char_end) c = refill_char ();
			if (c == char_question) c = adjust_trigraph ();
			if (c == char_backslash) {
				c = next_char ();
				if (c == char_end) c = refill_char ();
				if (c == char_return) c = read_newline ();
				if (c == char_newline) {
					/* Allow for escaped newlines */
					crt_loc.line++;
					crt_loc.column = 0;
					input_crt = input_posn;
					goto read_label;
				}
				/*
				 * A backslash that does not escape a newline is an
				 * ordinary, non-white-space character: record it and
				 * leave the character after it for the next iteration.
				 */
				unread_char (c);
				res = 1;
			} else if (c == char_newline) {
				/* New line characters */
				crt_loc.line++;
				crt_loc.column = 0;
				input_crt = input_posn;
				crt_line_changed = 1;
				crt_spaces = 0;
				return (res);
			} else if (START_STRING (c)) {
				/* String literals */
				res = 1;
				c = skip_string (c);
				/* skip_string has already reported the problem */
				if (c == lex_eof) return (res);
			} else if (c == char_eof) {
				/* End of file characters */
				break;
			} else {
				res = 1;
			}
		}
	}
	/* End of file reached before the end of the line */
	update_column ();
	report (crt_loc, ERR_lex_phases_eof ());
	good_eof = 1;
	return (res);
}


/*
 *    READ A UNICODE CHARACTER
 *
 *    This routine reads a unicode character.  It is entered after the
 *    initial backslash and the following character, c, have been read.
 *    It assigns the character type to pc and returns the character code.
 */

static unsigned long
read_unicode(int c, int *pc)
{
	unsigned got = 0;
	unsigned want;
	unsigned long code;
	character digits [10];
	ERROR err = NULL_err;
	string p = digits;

	/* Only '\u' and '\U' qualify, and only when allowed */
	if (!allow_unicodes || (c != char_u && c != char_U)) {
		unread_char (c);
		*pc = CHAR_NONE;
		return (0);
	}
	if (c == char_u) {
		/* Read '\uxxxx' */
		*pc = CHAR_UNI4;
		want = 4;
	} else {
		/* Read '\Uxxxxxxxx' */
		*pc = CHAR_UNI8;
		want = 8;
	}

	/* Gather up to want alphanumeric characters */
	while (got < want) {
		int d = read_char ();
		if (d == char_eof) break;
#if FS_EXTENDED_CHAR
		if (IS_EXTENDED (d)) {
			unread_char (d);
			break;
		}
#endif
		if (!is_alphanum (lookup_char (d))) {
			unread_char (d);
			break;
		}
		digits [got] = (character) d;
		got++;
	}
	digits [got] = 0;

	/* Evaluate what was gathered and report any error raised */
	code = eval_unicode (c, want, pc, &p, &err);
	if (!IS_NULL_err (err)) {
		update_column ();
		report (crt_loc, err);
	}
	return (code);
}


/*
 *    READ AN EXTENDED IDENTIFIER
 *
 *    This routine reads an extended identifier name (one including a unicode
 *    character).  It is entered after reading the simple characters in the
 *    token buffer plus the unicode character given by u and ch.
 */

static HASHID
read_extended_id(unsigned long u, int ch)
{
	int c;
	string text;
	unsigned long hv;
	BUFFER *bf = &token_buff;

	for (;;) {
		/* Report unicode characters not allowed in identifiers */
		if (!unicode_alpha (u)) {
			update_column ();
			report (crt_loc, ERR_lex_name_extendid (u));
		}
		print_char (u, ch, 0, bf);

		/* Append the run of simple alphanumeric characters that follows */
		for (c = read_char (); ; c = read_char ()) {
#if FS_EXTENDED_CHAR
			if (IS_EXTENDED (c)) break;
#endif
			if (!is_alphanum (lookup_char (c))) break;
			bfputc (bf, c);
		}

		/* A backslash may introduce another unicode character */
		ch = CHAR_NONE;
		if (c == char_backslash) {
			int nextc = read_char ();
			u = read_unicode (nextc, &ch);
		}
		if (ch == CHAR_NONE) break;
	}

	/* Terminate the name and look it up */
	unread_char (c);
	bfputc (bf, 0);
	text = bf->start;
	hv = hash (text);
	return (lookup_name (text, hv, 1, lex_unknown));
}


/*
 *    HASH VALUE FOR IDENTIFIERS
 *
 *    The hash value for identifiers is built up as the identifier is read.
 *    The hash table entry found using it is then stored in this variable:
 *    read_token assigns it immediately before returning lex_identifier,
 *    using either the result of lookup_name or that of read_extended_id.
 *    The algorithm for calculating the hash value needs to be kept in step
 *    with the routine hash (it is checked by an assertion in lookup_name,
 *    so any errors should be caught quickly if in debug mode).
 */

HASHID token_hashid = NULL_hashid;


/*
 *    MAIN PASS ANALYSER
 *
 *    This routine reads the next preprocessing token from the input file.
 *    It is designed for speed rather than elegance, hence the rather
 *    indiscriminate use of labels.  Trigraphs and escaped newlines
 *    involving the first character are processed by hand.  This routine
 *    corresponds to phase 3 of the phases of translation.  The position
 *    within the line is tracked by column - this is zero at the start of
 *    a line, positive if only white space has been read and negative
 *    otherwise.  preproc keeps track of the last preprocessing directive.
 *
 *    The value returned is the lexical token number: one of the lex_*
 *    values used below, the value returned by read_preproc_dir for a
 *    preprocessing directive, or a non-negative value returned by
 *    lint_comment.  For identifiers the hash table entry is left in
 *    token_hashid.  For numbers the spelling is written into token_buff,
 *    and for unknown characters the character is passed to
 *    add_multi_char together with the start of token_buff.
 *
 *    The routine is organised as a sequence of tests on the class, t, of
 *    the first significant character, c: symbols and punctuation,
 *    identifiers, newlines (which lead into the start of line scan),
 *    pp-numbers, end of file, and finally unknown characters.
 */

int
read_token(void)
{
	int c, t;
	/* Negative means that the current line position is not known to be
	   at the start of a line or after only white space */
	int column = -1;
	int preproc = lex_ignore_token;

	/* Read the next character */
	start_label : {
		c = next_char ();
		if (c == char_end) c = refill_char ();
		restart_label : {
			/* Re-entry point when c has been replaced (after trigraphs) */
#if FS_EXTENDED_CHAR
			if (IS_EXTENDED (c)) {
				goto unknown_label;
			}
#endif
			t = lookup_char (c);
			if (is_white (t)) {
				/* Count the white space character and read another */
				crt_spaces++;
				goto start_label;
			}
		}
		process_label : {
			/* Process the next character */
			/* The start of line scan jumps here with c and t already set */
		}
	}

	/* Check symbols and punctuation */
	if (is_symbol (t)) {
		switch (c) {

	    case char_question : {
			/* Deal with '?' and trigraphs */
			c = adjust_trigraph ();
			if (c == char_question) return (lex_question);
			/* Otherwise c is a different character, so classify it afresh */
			goto restart_label;
	    }

	    case char_backslash : {
			/* Deal with escaped newlines */
			unsigned long u;
			int ch = CHAR_NONE;
			int nextc = next_char ();
			if (nextc == char_end) nextc = refill_char ();
			if (nextc == char_return) nextc = read_newline ();
			if (nextc == char_newline) {
				/* Step onto the next line and continue reading */
				crt_loc.line++;
				crt_loc.column = 0;
				input_crt = input_posn;
				/* At the start of a line this counts as white space */
				if (column == 0) column = 1;
				goto start_label;
			}

			/* Check for unicode characters */
			/* read_unicode pushes nextc back if it is not 'u' or 'U' */
			u = read_unicode (nextc, &ch);
			if (ch != CHAR_NONE) {
				/* An identifier beginning with a unicode character */
				token_buff.posn = token_buff.start;
				token_hashid = read_extended_id (u, ch);
				return (lex_identifier);
			}
			return (lex_backslash);
	    }

	    case char_hash : {
			/* Deal with '#' and '##' */
			c = read_char ();
			if (c == char_hash) return (lex_hash_Hhash_H1);
			unread_char (c);

			/* Return with '#' if not at start of line */
			if (column < 0 || no_preproc_dir) {
				return (lex_hash_H1);
			}

			/* Deal with preprocessing directives */
			preproc_label : {
				/* This label is also the target of the '%:' digraph case */
				unsigned long sp = skip_white (0);
				update_column ();
				/* White space before the '#' */
				if (column) report (crt_loc, ERR_cpp_indent ());
				/* White space or escaped newlines after the '#' */
				if (sp & (WHITE_SPACE | WHITE_ESC_NEWLINE)) {
					report (preproc_loc, ERR_cpp_indent_dir ());
				}
				preproc = read_preproc_dir (1, preproc);
				/* A negative result means there is no token to return, so
				   scanning resumes at the start of a line */
				if (preproc < 0) goto start_line_label;
				/* Push back a newline and step the line number back by one;
				   presumably the pushed back newline is counted again when
				   it is next read */
				unread_char (char_newline);
				crt_loc.line--;
				crt_loc.column = 0;
				return (preproc);
			}
	    }

	    case char_percent : {
			/* Deal with '%', '%=', '%>', '%:' and '%:%:' */
			c = read_char ();
			if (c == char_equal) return (lex_rem_Heq);
			if (c == char_greater && allow_digraphs) {
				return (lex_close_Hbrace_H2);
			}
			if (c == char_colon && allow_digraphs) {
				/* Check for '%:' and '%:%:' */
				c = read_char ();
				if (c == char_percent) {
					int nextc = read_char ();
					if (nextc == char_colon) {
						return (lex_hash_Hhash_H2);
					}
					unread_char (nextc);
				}
				unread_char (c);

				/* Return with '%:' if not at start of line */
				if (column < 0 || no_preproc_dir) {
					return (lex_hash_H2);
				}

				/* Otherwise this is a preprocessing directive */
				/* The result of get_digraph is deliberately discarded */
				IGNORE get_digraph (lex_hash_H2);
				goto preproc_label;
			}
			unread_char (c);
			return (lex_rem);
	    }

	    case char_quote : {
			/* Deal with string literals */
			IGNORE read_string (c, 1);
			return (lex_string_Hlit);
	    }

	    case char_single_quote : {
			/* Deal with character literals */
			IGNORE read_string (c, 1);
			return (lex_char_Hlit);
	    }

	    case char_exclaim : {
			/* Deal with '!' and '!=' */
			c = read_char ();
			if (c == char_equal) return (lex_not_Heq_H1);
			unread_char (c);
			return (lex_not_H1);
	    }

	    case char_ampersand : {
			/* Deal with '&', '&&' and '&=' */
			c = read_char ();
			if (c == char_ampersand) return (lex_logical_Hand_H1);
			if (c == char_equal) return (lex_and_Heq_H1);
			unread_char (c);
			return (lex_and_H1);
	    }

	    case char_asterix : {
			/* Deal with '*' and '*=' */
			c = read_char ();
			if (c == char_equal) return (lex_star_Heq);
			unread_char (c);
			return (lex_star);
	    }

	    case char_plus : {
			/* Deal with '+', '++' and '+=' */
			c = read_char ();
			if (c == char_plus) return (lex_plus_Hplus);
			if (c == char_equal) return (lex_plus_Heq);
			/* '+?' is an extra symbol, only recognised when enabled */
			if (c == char_question && allow_extra_symbols) {
				return (lex_abs);
			}
			unread_char (c);
			return (lex_plus);
	    }

	    case char_minus : {
			/* Deal with '-', '--', '-=', '->' and '->*' */
			c = read_char ();
			if (c == char_minus) return (lex_minus_Hminus);
			if (c == char_equal) return (lex_minus_Heq);
			if (c == char_greater) {
#if LANGUAGE_CPP
				/* '->*' is only allowed in C++ */
				c = read_char ();
				if (c == char_asterix) return (lex_arrow_Hstar);
				unread_char (c);
#endif
				return (lex_arrow);
			}
			unread_char (c);
			return (lex_minus);
	    }

	    case char_dot : {
			/* Deal with '.', '...', '.*' and numbers */
			c = read_char ();
			if (c == char_dot) {
				c = read_char ();
				if (c == char_dot) return (lex_ellipsis);
				/* Only two dots: push back the third character and the
				   second dot, and return a single '.' */
				unread_char (c);
				unread_char (char_dot);
				return (lex_dot);
			}
#if LANGUAGE_CPP
			/* '.*' is only allowed in C++ */
			if (c == char_asterix) return (lex_dot_Hstar);
#endif
#if FS_EXTENDED_CHAR
			if (IS_EXTENDED (c)) {
				unread_char (c);
				return (lex_dot);
			}
#endif
			t = lookup_char (c);
			if (is_digit (t)) {
				/* Indicate a number with first digit '.' */
				t = POINT;
				goto number_label;
			}
			unread_char (c);
			return (lex_dot);
	    }

	    case char_slash : {
			/* Deal with '/', '/=' and comments */
			c = read_char ();
			if (START_COMMENT (c)) {
				/* C style comment; a records whether it is analysed */
				int a = analyse_comments;
				c = skip_comment (a);
				if (c == lex_eof) goto eof_label;
				if (a) {
					/* A non-negative result is returned as a token */
					c = lint_comment ();
					if (c >= 0) return (c);
				}
				/* At the start of a line the comment counts as white space */
				if (column == 0) column = 1;
				goto start_label;
			}
			if (START_CPP_COMMENT (c)) {
				/* C++ style comment, handled as above */
				int a = analyse_comments;
				c = skip_cpp_comment (a);
				if (c == lex_eof) goto eof_label;
				if (a) {
					c = lint_comment ();
					if (c >= 0) return (c);
				}
				/* Discard one character (presumably the newline which ends
				   the comment) and do the newline processing */
				IGNORE read_char ();
				goto newline_label;
			}
			if (c == char_equal) return (lex_div_Heq);
			unread_char (c);
			return (lex_div);
	    }

	    case char_colon : {
			/* Deal with ':', '::' and ':>' */
			c = read_char ();
#if LANGUAGE_CPP
			/* '::' is only allowed in C++ */
			if (c == char_colon) return (lex_colon_Hcolon);
#endif
			if (c == char_greater && allow_digraphs) {
				return (lex_close_Hsquare_H2);
			}
			unread_char (c);
			return (lex_colon);
	    }

	    case char_less : {
			/* Deal with '<', '<=', '<<', '<<=', '<%', '<:' */
			c = read_char ();
			if (c == char_equal) return (lex_less_Heq);
			if (c == char_less) {
				c = read_char ();
				if (c == char_equal) return (lex_lshift_Heq);
				unread_char (c);
				return (lex_lshift);
			}
			if (c == char_percent && allow_digraphs) {
				return (lex_open_Hbrace_H2);
			}
			if (c == char_colon && allow_digraphs) {
				return (lex_open_Hsquare_H2);
			}
			/* '<?' is an extra symbol, only recognised when enabled */
			if (c == char_question && allow_extra_symbols) {
				return (lex_min);
			}
			unread_char (c);
			return (lex_less);
	    }

	    case char_equal : {
			/* Deal with '=' and '==' */
			c = read_char ();
			switch (c) {
		    case char_equal : {
				return (lex_eq);
		    }
		    case char_ampersand :
		    case char_asterix :
		    case char_minus :
		    case char_plus : {
				/* Report '=&', '=*', '=-' and '=+'; the token returned
				   is still a plain '=' */
				update_column ();
				report (crt_loc, ERR_lex_op_old_assign (c, c));
				break;
		    }
			}
			unread_char (c);
			return (lex_assign);
	    }

	    case char_greater : {
			/* Deal with '>', '>=', '>>' and '>>=' */
			c = read_char ();
			if (c == char_equal) return (lex_greater_Heq);
			if (c == char_greater) {
				c = read_char ();
				if (c == char_equal) return (lex_rshift_Heq);
				unread_char (c);
				return (lex_rshift);
			}
			/* '>?' is an extra symbol, only recognised when enabled */
			if (c == char_question && allow_extra_symbols) {
				return (lex_max);
			}
			unread_char (c);
			return (lex_greater);
	    }

	    case char_circum : {
			/* Deal with '^' and '^=' */
			c = read_char ();
			if (c == char_equal) return (lex_xor_Heq_H1);
			unread_char (c);
			return (lex_xor_H1);
	    }

	    case char_bar : {
			/* Deal with '|', '||' and '|=' */
			c = read_char ();
			if (c == char_bar) return (lex_logical_Hor_H1);
			if (c == char_equal) return (lex_or_Heq_H1);
			unread_char (c);
			return (lex_or_H1);
	    }

	    case char_open_round : {
			/* Deal with '(' */
			return (lex_open_Hround);
	    }

	    case char_close_round : {
			/* Deal with ')' */
			return (lex_close_Hround);
	    }

	    case char_comma : {
			/* Deal with ',' */
			return (lex_comma);
	    }

	    case char_semicolon : {
			/* Deal with ';' */
			return (lex_semicolon);
	    }

	    case char_open_square : {
			/* Deal with '[' */
			return (lex_open_Hsquare_H1);
	    }

	    case char_close_square : {
			/* Deal with ']' */
			return (lex_close_Hsquare_H1);
	    }

	    case char_open_brace : {
			/* Deal with '{' */
			return (lex_open_Hbrace_H1);
	    }

	    case char_close_brace : {
			/* Deal with '}' */
			return (lex_close_Hbrace_H1);
	    }

	    case char_tilde : {
			/* Deal with '~' */
			return (lex_compl_H1);
	    }

	    default : {
			/* Anything else is an unknown character */
			goto unknown_label;
	    }
		}
	}

	/* Read an identifier (calculating hash value on fly) */
	if (is_alpha (t)) {
		HASHID nm;
		LOCATION loc;
		BUFFER *bf = &token_buff;
		string s = bf->start;
		string se = bf->end;
		/* The hash value starts as the first character */
		unsigned long h = (unsigned long) c;
		*(s++) = (character) c;

		/* Get the second character */
		update_column ();
		/* Remember the location, which is recorded in the identifier below */
		loc = crt_loc;
		c = read_char ();
#if FS_EXTENDED_CHAR
		t = (IS_EXTENDED (c) ? ILLEG : lookup_char (c));
#else
		t = lookup_char (c);
#endif
		if (is_alphanum (t)) {
			/* Scan the third and subsequent characters */
			do {
				/* Store the current character, then read the next one */
				h = HASH_POWER * h + (unsigned long) c;
				*s = (character) c;
				if (++s == se) {
					/* The token buffer is full, so extend it */
					s = extend_buffer (bf, s);
					se = bf->end;
				}
				c = read_char ();
#if FS_EXTENDED_CHAR
				if (IS_EXTENDED (c)) break;
#endif
				t = lookup_char (c);
			} while (is_alphanum (t));
		} else {
			/* Allow for wide strings and characters */
			/* h still holds just the first character at this point */
			if (h == char_L && is_symbol (t)) {
				if (c == char_quote) {
					IGNORE read_string (c, 1);
					return (lex_wstring_Hlit);
				}
				if (c == char_single_quote) {
					IGNORE read_string (c, 1);
					return (lex_wchar_Hlit);
				}
			}
			/* Identifier of length one */
		}
		if (c == char_backslash) {
			/* Allow for extended identifiers */
			int ch = CHAR_NONE;
			int nextc = read_char ();
			unsigned long u = read_unicode (nextc, &ch);
			if (ch != CHAR_NONE) {
				/* read_extended_id appends to the characters stored so far
				   and does the name look-up itself */
				bf->posn = s;
				nm = read_extended_id (u, ch);
				goto identifier_label;
			}
		}
		/* Push back the terminating character and terminate the name */
		unread_char (c);
		se = s;
		*se = 0;

		/* Look up the symbol in the hash table */
		h %= HASH_SIZE;
		s = bf->start;
		nm = lookup_name (s, h, 0, lex_unknown);
		identifier_label : {
			/* Record the location in the dummy identifier for this name */
			IDENTIFIER id = DEREF_id (hashid_id (nm));
			while (!IS_id_dummy (id)) {
				/* Scan to last hidden value */
				id = DEREF_id (id_alias (id));
			}
			COPY_loc (id_loc (id), loc);
		}
		token_hashid = nm;
		return (lex_identifier);
	}

	/* Read the first token in a line */
	if (c == char_return) c = read_newline ();
	if (c == char_newline) {
		newline_label : {
			/* Re-entry point after C++ style comments */
			crt_loc.line++;
			crt_loc.column = 0;
			input_crt = input_posn;
			crt_line_changed = 1;
			crt_spaces = 0;
			if (in_preproc_dir == 1) {
				/* A newline is returned as a token in this state, which
				   is then cleared */
				in_preproc_dir = 0;
				return (lex_newline);
			}
		}
		start_line_label : {
			/* Re-entry point after preprocessing directives */
			column = 0;
			for (;;) {
				/* Step over any obvious spaces */
				c = next_char ();
				if (c == char_end) c = refill_char ();
				if (c == char_return) c = read_newline ();
				if (c == char_sub) c = read_eof ();
				if (c == char_newline) {
					/* Blank line: same book-keeping as newline_label */
					crt_loc.line++;
					crt_loc.column = 0;
					input_crt = input_posn;
					crt_line_changed = 1;
					crt_spaces = 0;
					column = 0;
				} else if (c == char_eof) {
					/* Check for end of file (should start line) */
					if (column == 0) good_eof = 1;
					goto eof_label;
				} else if (c == char_space) {
					crt_spaces++;
					column = 1;
				} else if (c == char_tab) {
					/* Advance to the next multiple of the tab width */
					unsigned long tab = tab_width;
					crt_spaces = tab * (crt_spaces / tab + 1);
					column = 1;
				} else {
#if FS_EXTENDED_CHAR
					if (IS_EXTENDED (c)) {
						t = ILLEG;
						break;
					}
#endif
					t = lookup_char (c);
					if (is_white (t)) {
						/* Other white space; characters in the newline
						   class leave the column information unchanged */
						if (!is_newline (t)) {
							crt_spaces++;
							column = 1;
						}
					} else {
						break;
					}
				}
			}
			/* c and t now hold the next character */
			goto process_label;
		}
	}

	/* Read a pp-number */
	if (is_digit (t)) {
		number_label : {
			/* Entered from the '.' case with t set to POINT */
			int lastc;
			BUFFER *bf = &token_buff;
			string s = bf->start;
			string se = bf->end;
			if (t == POINT) {
				/* t is set to POINT to indicate an initial '.' */
				*(s++) = char_dot;
			}
			digit_label : {
				/* Step over alphanumeric characters and '.' */
				do {
					*s = (character) c;
					if (++s == se) {
						/* The token buffer is full, so extend it */
						s = extend_buffer (bf, s);
						se = bf->end;
					}
					next_digit_label : {
						/* Entered from below after a unicode character,
						   which has already been written to the buffer */
						lastc = c;
						c = read_char ();
#if FS_EXTENDED_CHAR
						if (IS_EXTENDED (c)) break;
#endif
						t = lookup_char (c);
					}
				} while (is_ppdigit (t));
				if (c == char_plus || c == char_minus) {
					/* A sign is part of the number only after an exponent
					   letter; going back stores the sign and continues */
					/* Allow for [Ee][+-] */
					if (lastc == char_e || lastc == char_E) {
						goto digit_label;
					}
					/* Allow for [Pp][+-] */
					if (lastc == char_p || lastc == char_P) {
						goto digit_label;
					}
				}
				if (c == char_backslash) {
					/* Allow for unicode characters */
					int ch = CHAR_NONE;
					int nextc = read_char ();
					unsigned long u = read_unicode (nextc, &ch);
					if (ch != CHAR_NONE) {
						/* Write the character via the buffer position, then
						   pick up the new position and end of the buffer */
						bf->posn = s;
						print_char (u, ch, 0, bf);
						s = bf->posn;
						se = bf->end;
						goto next_digit_label;
					}
				}
			}
			/* Terminate the spelling and push back the final character */
			*s = 0;
			unread_char (c);
		}
		/* Every pp-number is returned as lex_integer_Hlit */
		return (lex_integer_Hlit);
	}

	/* End of file marker */
	if (c == char_sub) c = read_eof ();
	if (c == char_eof) {
		eof_label : {
			/* Within a preprocessing directive, just return end of file */
			if (in_preproc_dir != 0) return (lex_eof);
			if (!good_eof) {
				/* Report the end of file once only */
				update_column ();
				report (crt_loc, ERR_lex_phases_eof ());
				good_eof = 1;
			}
			if (end_include (preproc)) {
				/* Revert to previous file */
				good_eof = 0;
				preproc = lex_ignore_token;
				goto start_line_label;
			}
		}
		/* End of main file */
		return (lex_eof);
	}

	/* Unknown characters */
	unknown_label : {
		/* Pass the character to add_multi_char along with the buffer start */
		string s = token_buff.start;
		add_multi_char (s, (unsigned long) c, CHAR_SIMPLE);
	}
	return (lex_unknown);
}


/*
 *    INITIALISE INPUT VARIABLES
 *
 *    This routine sets up the character look-up tables and the token
 *    buffer.  The three tables are passed through map_ascii, a newly
 *    allocated copy of main_characters is stored in copy_characters, and
 *    the token buffer is given its initial extension.  If multibyte
 *    characters are allowed the native locale is also selected.
 */

void
init_char(void)
{
	int n;
	unsigned char *copy;
	unsigned char *src;

#if FS_MULTIBYTE
	/* Multibyte characters are interpreted using the native locale */
	if (allow_multibyte) {
		IGNORE setlocale (LC_CTYPE, "");
	}
#endif

	/* Map the tables for non-ASCII codesets */
	map_ascii (main_characters);
	map_ascii (digit_values);
	map_ascii (escape_sequences);

	/* Take a copy of the main character table */
	copy = xmalloc_nof (unsigned char, NO_CHAR);
	src = main_characters;
	copy_characters = copy;
	for (n = 0; n < NO_CHAR; n++) {
		copy [n] = src [n];
	}

	/* Give the token buffer its initial extension */
	token_buff.posn = extend_buffer (&token_buff, token_buff.posn);
}


/*
 *    INITIALISE INPUT FILE READING
 *
 *    This routine prepares the lexical analysis routines for parsing or
 *    preprocessing the current input file.  It selects buffer zero, resets
 *    the per-file state, opens the first start-up file, arranges for the
 *    first character read to be a newline and initialises the parser.
 */

void
init_lex(void)
{
	/* Select and initialise the first input buffer */
	crt_buff_no = 0;
	IGNORE init_buffer (crt_buff_no);

	/* Reset the per-file state */
	start_preproc_if ();
	preproc_loc = crt_loc;
	have_syntax_error = 0;
	if (do_header) {
		dump_start (&crt_loc, NULL);
	}

	/* Open the first start-up file */
	open_startup ();

	/* Push back a newline so that reading starts at the beginning of a
	   line; the line number is reduced by one to match */
	unread_char (char_newline);
	crt_loc.line--;

	/* Finally initialise the parser */
	init_parser (NULL);
}


/*
 *    PARSE INPUT FILE
 *
 *    This routine is the main entry point for parsing the current input
 *    file.  It initialises the lexical analyser, advances the lexer once
 *    and then hands control to parse_file.
 */

void
process_file(void)
{
	/* Prepare the lexer for the current input file */
	init_lex ();

	/* Advance the lexer before parsing begins */
	ADVANCE_LEXER;

	/* Parse the file */
	parse_file (NULL_type, dspec_none);
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */