/* c_lex.c - a standalone C lexical analyser */

/*  Copyright 1991 Mark Russell, University of Kent at Canterbury.
 *
 *  You can do what you like with this source code as long as
 *  you don't try to make money out of it and you include an
 *  unaltered copy of this message (including the copyright).
 */

/*
 *  This standalone version created on Jan 19 1998.
 *  Author : Dibyendu Majumdar
 *  Email  : dibyendu@mazumdar.demon.co.uk  
 *  Website: www.mazumdar.demon.co.uk
 */

/*
 * 21 Jan 2001 Added support for inline, restrict, _Bool, _Complex, and _Imaginary.
 */

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>

#include "c_lex.h"
/* Set once by lex_get_token(): TRUE when LEX_DEBUG is in the environment. */
static bool Want_debugging_output;

/* tokname() is defined with external linkage further down, so its
 * static prototype stays disabled.
 */
/* static const char *tokname (token_t token); */

/* Forward declarations for the file-local helpers defined below. */
static const char *parse_hash_directive (const char *line, lex_env_t *le);
static const char *skip_whitespace (lex_env_t *le, const char *line);
static int get_float_constant (lex_env_t *le, const char *line,
					const char **p_end, constant_t *co);
static const char *getline (lex_env_t *le);
static int get_string (lex_env_t *le, const char *line, constant_t *co);

/*  Reserved-word table: maps the spelling of each keyword to its token.
 *  lex_get_token() searches it linearly, so the ordering is not
 *  significant.
 */
static struct {
	const char *name;	/* keyword spelling */
	token_t token;		/* token returned for that spelling */
	bool need_lexinfo;	/* not read anywhere in this file */
} Keytab[] = {
	{"_Bool",	BOOL,		FALSE},
	{"_Complex",	COMPLEX,	FALSE},
	{"_Imaginary",	IMAGINARY,	FALSE},
	{"auto",	AUTO,		FALSE},
	{"break",	BREAK,		TRUE},
	{"case",	CASE,		FALSE},
	{"char",	CHAR,		FALSE},
	{"const",	CONST,		FALSE},
	{"continue",	CONTINUE,	TRUE},
	{"default",	DEFAULT,	FALSE},
	{"do",		DO,		FALSE},
	{"double",	DOUBLE,		FALSE},
	{"else",	ELSE,		FALSE},
	{"enum",	ENUM,		FALSE},
	{"extern",	EXTERN,		FALSE},
	{"float",	FLOAT,		FALSE},
	{"for",		FOR,		TRUE},
	{"goto",	GOTO,		FALSE},
	{"if",		IF,		FALSE},
	{"inline",	INLINE,		FALSE},
	{"int",		INT,		FALSE},
	{"long",	LONG,		FALSE},
	{"register",	REGISTER,	FALSE},
	{"restrict",	RESTRICT,	FALSE},
	{"return",	RETURN,		TRUE},
	{"short",	SHORT,		FALSE},
	{"signed",	SIGNED,		FALSE},
	{"sizeof",	SIZEOF,		FALSE},
	{"static",	STATIC,		FALSE},
	{"struct",	STRUCT,		FALSE},
	{"switch",	SWITCH,		FALSE},
	{"typedef",	TYPEDEF,	FALSE},
	{"union",	UNION,		FALSE},
	{"unsigned",	UNSIGNED,	FALSE},
	{"void",	VOID,		FALSE},
	{"volatile",	VOLATILE,	FALSE},
	{"while",	WHILE,		FALSE}
};
/* Number of entries in Keytab. */
#define NKEYS (sizeof Keytab / sizeof *Keytab)

/* Lexer environment read by lex_get_token(); when NULL it returns 0. */
lex_env_t *Lex_env;
/* lex_get_token() stores pointers to Identifier / Constant through this. */
lexeme_t *Lexeme;

/* Storage for the most recently lexed constant. */
constant_t Constant;
/* Storage for the most recently lexed identifier. */
identifier_t Identifier;

/* Defined at the end of this file. */
char *string_copy(const char *string, int len);

/*  Write the message `s' to stderr, prefixed with "Error: ".
 *  No newline is appended.
 */
void
lex_error(const char *s)
{
	fprintf(stderr, "Error: %s", s);
}

/*  Handle the remainder of a preprocessor line; `line' points just past
 *  the leading '#'.
 *
 *  "pragma ..." is reported on stderr and otherwise ignored.  Anything
 *  else is parsed as `[line] <number> ["<file>"]': the line number is
 *  stored in le->le_lnum and, when a file name is present, a freshly
 *  allocated copy of it is stored in le->le_filename.
 *
 *  Returns a pointer to the end of the directive text, so the caller
 *  moves on to the next input line.  A malformed directive is reported
 *  on stderr and an empty string is returned.
 */
static const char *
parse_hash_directive(const char *line, lex_env_t *le)
{
	int lnum, nitems;
	char name[256];

	/* <ctype.h> functions need an unsigned char value, not a plain char. */
	while (isspace((unsigned char)*line))
		++line;
	if (*line == '\0')
		return line;

	if (strncmp(line, "pragma", 6) == 0 && isspace((unsigned char)line[6])) {
		size_t plen;

		for (line += 7; isspace((unsigned char)*line); ++line)
			;

		/* Drop the trailing newline, if there is one, from the echo.
		 * The precision argument of %.*s must be an int.
		 */
		plen = strlen(line);
		if (plen > 0 && line[plen - 1] == '\n')
			--plen;
		fprintf(stderr, "#pragma `%.*s' ignored", (int)plen, line);
		return line + strlen(line);
	}
	if (strncmp(line, "line", 4) == 0) {
		line += 4;
	}

	/* The field width keeps an over-long file name inside name[]. */
	nitems = sscanf(line, "%d \"%255[^\"]\"", &lnum, name);
	if (nitems < 1) {
		fprintf(stderr, "Bad # directive \"%s\"", line);
		return "";
	}
	if (nitems == 2) {
		char *buf;
		int len;

		len = (int)strlen(name);
		buf = NEW_ARRAY(char, len + 1);
		(void) memcpy(buf, name, len + 1);
		le->le_filename = buf;
	}

	/*  Subtract 1 because we number internally from 0,
	 *  and 1 because we are just about to bump the
	 *  line number.
	 */
	le->le_lnum = lnum - 2;

	return line + strlen(line);
}



/*  Translate one escape sequence.  `s' points at the character that
 *  follows the backslash; the resulting character value is stored in
 *  *p_res.
 *
 *  Handles the single letter escapes (n t v b r f a), octal escapes of
 *  up to three digits and hexadecimal escapes (\x followed by hex
 *  digits).  Any other character stands for itself.
 *
 *  Returns a pointer to the LAST character of the escape sequence; the
 *  callers in this file step past it themselves.  For "\x" with no hex
 *  digits the value is 0 and the pointer is left on the 'x'.
 */
const char *
ci_translate_escape(const char *s, int *p_res)
{
	static const char hexdigits[] = "0123456789abcdefABCDEF";
	const char *pos, *first;
	int ch;

	switch (*s) {
	case 'n':
		ch = '\n';
		break;
	case 't':
		ch = '\t';
		break;
	case 'v':
		ch = '\v';
		break;
	case 'b':
		ch = '\b';
		break;
	case 'r':
		ch = '\r';
		break;
	case 'f':
		ch = '\f';
		break;
	case 'a':
		ch = '\007';
		break;
	case '0': case '1': case '2': case '3':
	case '4': case '5': case '6': case '7':
		ch = 0;
		for (first = s; *s >= '0' && *s <= '7' && s - first < 3; ++s)
			ch = ch * 8 + (*s - '0');
		--s;		/* back onto the last digit consumed */
		break;
	case 'x': {
		unsigned int val = 0;

		/* Step over the 'x' itself: it is not a hex digit, so starting
		 * the scan on it would never consume anything.
		 */
		++s;
		for (; *s != '\0' && (pos = strchr(hexdigits, *s)) != NULL; ++s) {
			int digit = (int)(pos - hexdigits);

			if (digit >= 16)
				digit -= 6;	/* 'A'-'F' map to 10-15 */
			/* Only the low byte survives the cast below, so keep
			 * just that and avoid overflow on long sequences.
			 */
			val = ((val << 4) | (unsigned int)digit) & 0xffu;
		}
		--s;		/* back onto the last character consumed */
		ch = (int)val;
		break;
	}
	default:
		ch = *s;
		break;
	}
	/* Dibyendu : 11/1/99
	 * Fixed problem of sign extension - '\377' is now -1 and not 255
	 */
	*p_res = (int)(char)ch;

	return s;
}

/*  Based on K&P's hoc follow() function.
 *
 *  If the character at `s' is `ch', advance `s' past it and yield
 *  `ifyes'; otherwise leave `s' alone and yield `ifno'.  `s' is
 *  evaluated more than once and must be a modifiable lvalue.
 */
#define follow(s, ch, ifyes, ifno) ((*(s) == (ch)) ? (++(s), (ifyes)) : (ifno))

/*  Fetch the next input line through the environment's le_getline
 *  callback, bump le_lnum and remember the line in le_line.  Returns
 *  NULL without calling the callback when le_abort_parse is set;
 *  otherwise returns whatever the callback returned.
 *
 *  NOTE(review): POSIX <stdio.h> also declares a function called
 *  getline(); on systems that expose it this name may clash - confirm.
 */
static const char *
getline(lex_env_t *le)
{
	if (le->le_abort_parse) {
		return NULL;
	}

	le->le_lnum++;
	le->le_line = (*le->le_getline)(le->le_getline_arg);
	return le->le_line;
}

/*  Skip white space and comments.
 *
 *  Starting at `line' (or at a freshly read line when `line' is NULL),
 *  step over white space and comments, reading further lines through
 *  getline() as needed.  A line fetched inside the loop that starts
 *  with '#' is handed to parse_hash_directive().
 *
 *  Returns a pointer to the first character that is neither white space
 *  nor part of a comment, or NULL when the input is exhausted.  Running
 *  out of input inside a comment is reported on stderr.
 *
 *  NOTE(review): the line fetched for a NULL `line' argument is not
 *  checked for a leading '#' - confirm whether a directive on the very
 *  first line is meant to reach the tokeniser.
 */
static const char *
skip_whitespace(lex_env_t *le, const char *line)
{
	bool read_another_line;
	bool incomment;

	incomment = FALSE;
	read_another_line = FALSE;

	if (line == NULL) {
		if ((line = getline(le)) == NULL)
			return line;
	}

	for (;;) {
		for (;;) {
			/* <ctype.h> needs an unsigned char value; isspace(0)
			 * is false, so this also stops at the terminator.
			 */
			while (isspace((unsigned char)*line))
				++line;
			if (*line != '\0')
				break;

			if ((line = getline(le)) == NULL)
				break;
			read_another_line = TRUE;
			if (*line == '#')
				line = parse_hash_directive(line + 1, le);
		}
		if (incomment) {
			if (line == NULL) {
				fprintf(stderr, 
						"Hit EOF while in a comment");
				break;
			}
			else if (*line == '*' && line[1] == '/') {
				line += 2;
				incomment = FALSE;
			}
			else
				++line;
		}
		else {
			if (line != NULL && *line == '/' && line[1] == '*') {
				line += 2;
				incomment = TRUE;
			}
			else
				break;
		}
	}

	if (Want_debugging_output && read_another_line) {
		/* `line' is NULL at end of input; never pass NULL to %s. */
		printf("\"%s\", %d: %s\n", le->le_filename, le->le_lnum,
						(line != NULL) ? line : "");
	}
	return line;
}

/* TRUE when `token' is one of the keywords STRUCT, UNION or ENUM. */
static bool
is_aggr_type_specifier(token_t token)
{
	switch (token) {
	case STRUCT:
	case UNION:
	case ENUM:
		return TRUE;
	default:
		return FALSE;
	}
}
    
/*  TRUE when `token' is one of INT, UNSIGNED, SIGNED, LONG, SHORT, VOID,
 *  CHAR, FLOAT or DOUBLE.
 *
 *  NOTE(review): the BOOL, COMPLEX and IMAGINARY tokens produced by this
 *  lexer are not in the set - confirm whether that is intended.
 */
static bool
is_basic_type_specifier(token_t token)
{
	switch (token) {
	case INT:
	case UNSIGNED:
	case SIGNED:
	case LONG:
	case SHORT:
	case VOID:
	case CHAR:
	case FLOAT:
	case DOUBLE:
		return TRUE;
	default:
		return FALSE;
	}
}

/*  TRUE when `token' is one of STATIC, EXTERN, TYPEDEF, AUTO, REGISTER,
 *  CONST or VOLATILE.
 *
 *  NOTE(review): RESTRICT and INLINE, which this lexer also produces,
 *  are not in the set - confirm whether that is intended.
 */
static bool
is_storage_class_or_qualifier(token_t token)
{
	switch (token) {
	case STATIC:
	case EXTERN:
	case TYPEDEF:
	case AUTO:
	case REGISTER:
	case CONST:
	case VOLATILE:
		return TRUE;
	default:
		return FALSE;
	}
}

/*  TRUE when `token' is accepted by is_storage_class_or_qualifier(),
 *  is_basic_type_specifier() or is_aggr_type_specifier(), or is
 *  TYPEDEF_NAME.
 */
static bool
is_decl_specifier(token_t token)
{
	bool found;

	found = is_storage_class_or_qualifier(token);
	if (!found)
		found = is_basic_type_specifier(token);
	if (!found)
		found = is_aggr_type_specifier(token);
	if (!found)
		found = (token == TYPEDEF_NAME);
	return found;
}

/* Token most recently returned through the normal exit of lex_get_token(). */
static token_t Prev_token = 0;

/* Set by lex_get_token() after an identifier: TRUE when the next
 * non-blank character is ':'.
 */
static bool Colon_follows = FALSE;

/* Accessor for Prev_token. */
token_t lex_prev_token(void)
{
	return Prev_token;
}

/* Accessor for Colon_follows. */
bool lex_colon_follows(void)
{
	return Colon_follows;
}

/*  Return the next token from the environment in Lex_env.
 *
 *  Returns 0 when Lex_env is NULL or the input is exhausted.  Keywords
 *  are looked up in Keytab; any other name is copied (truncated to fit)
 *  into Identifier, published through Lexeme->identifier and classified
 *  by the parser supplied name_type().  The spelling of integer,
 *  floating and character constants, and the contents of string
 *  literals, are published through Lexeme->constant.  Unrecognised
 *  input is reported on stderr and yields BADTOK.
 *
 *  On the normal exit path le_lptr is left just past the token and the
 *  token is remembered for lex_prev_token().  After an identifier,
 *  trailing white space is consumed as well and Colon_follows records
 *  whether the next character is ':'.
 *
 *  The first call reads LEX_DEBUG from the environment to decide
 *  whether debugging output is wanted.
 */
token_t
lex_get_token(void)
{
	static int initialised = 0;
	lex_env_t *le;
	token_t token;
	const char *line;

	le = Lex_env;

	if (!initialised) {
		Want_debugging_output = getenv("LEX_DEBUG") != NULL;
		initialised = 1;
	}
	if (le == NULL) {
		if (Want_debugging_output)
			puts("\n");
		return 0;
	}

	if ((line = skip_whitespace(le, le->le_lptr)) == NULL) {
		le->le_lptr = line;
		return 0;	/* EOF */
	}

	switch (*line++) {
	case '_': case '$':

	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
	case 'v': case 'w': case 'x': case 'y': case 'z': 

	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
	case 'V': case 'W': case 'X': case 'Y': case 'Z':
		{
			const char *s;
			int len, i;

			--line;
			for (s = line;
			     isalnum((unsigned char)*s) || *s == '_' || *s == '$';
			     ++s)
				;
			len = (int)(s - line);

			/* strncmp stops at the keyword's terminator, so a name
			 * longer than the keyword never reads past the end of
			 * the keyword literal.
			 */
			for (i = 0; i < (int)NKEYS; ++i)
				if (strncmp(Keytab[i].name, line, (size_t)len) == 0 &&
							Keytab[i].name[len] == '\0')
					break;
			if (i < (int)NKEYS) {
				token = Keytab[i].token;
				line += len;
				break;
			}

			/* Truncate names that do not fit in id_name. */
			if ((size_t)len + 1 > sizeof Identifier.id_name)
				len = sizeof Identifier.id_name - 1;

			memcpy(Identifier.id_name, line, (size_t)len);
			Identifier.id_name[len] = '\0';
			Lexeme->identifier = &Identifier;

			line = skip_whitespace(le, s);

			/* The parser provides the function name_type() which is 
			 * called here to determine whether a name is a potential 
			 * TYPEDEF name. 
			 */
			token = name_type(Identifier.id_name);

			/* skip_whitespace() returns NULL at end of input. */
			Colon_follows = (line != NULL && *line == ':');
		}
		break;
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
		{
			char *end;

			/* Only the extent of the number matters here: the
			 * constant is stored by its spelling, not its value.
			 */
			(void) strtol(line - 1, &end, 0);
			if (end == line - 1) {
				le->le_lptr = line;
				fprintf(stderr,
					"Badly formed integer constant \"%s\"",
					line - 1);
				token = BADTOK;
			}
			else if (*end == 'e' || *end == 'E' || *end == '.') {
				token = get_float_constant(le, line-1, &line,
								&Constant);
				Lexeme->constant = &Constant;
			}
			else {
				/* Take any integer suffix letters with the number. */
				while (*end == 'L' || *end == 'l' || *end == 'u' || *end == 'U')
					++end;
				Constant.co_val = string_copy(line-1, end-(line-1));
				Constant.co_size = end-(line-1);
				line = end;

				Lexeme->constant = &Constant;
				token = INTEGER_CONSTANT;
			}
		}
		break;
	case '!':
		token = follow(line, '=', NOTEQ, NOT);
		break;
	case '=':
		token = follow(line, '=', EQEQ, EQUALS);
		break;
	case '%':
		token = follow(line, '=', PERCENT_EQUALS, PERCENT);
		break;
	case '/':
		token = follow(line, '=', SLASH_EQUALS, SLASH);
		break;
	case '^':
		token = follow(line, '=', XOR_EQUALS, XOR);
		break;
	case '*':
		token = follow(line, '=', STAR_EQUALS, STAR);
		break;
	case '[':
		token = LBRAC;
		break;
	case ']':
		token = RBRAC;
		break;
	case '{':
		token = LBRACE;
		break;
	case '}':
		token = RBRACE;
		break;
	case '(':
		token = LPAREN;
		break;
	case ')':
		token = RPAREN;
		break;
	case ',':
		token = COMMA;
		break;
	case ';':
		token = SEMI;
		break;
	case '?':
		token = QUERY;
		break;
	case ':':
		token = COLON;
		break;
	case '\'': {
		/*  A character constant holding one character or one escape
		 *  sequence.  The translated value is not kept; the constant
		 *  is stored by its spelling, quotes included.
		 */
		int ignored;
		const char *startp = line-1;
		const char *endp;

		if (*line == '\\' && line[1] != '\0')
			line = ci_translate_escape(line + 1, &ignored);
		/* Never step past the end of the line buffer. */
		if (*line != '\0')
			++line;

		if (*line != '\'') {
			le->le_lptr = line;
			fprintf(stderr, "Unterminated char constant");
			token = BADTOK;
		}
		else {
			endp = ++line;
			Constant.co_val = string_copy(startp, endp-startp);
			Constant.co_size = endp-startp;
			Lexeme->constant = &Constant;
			token = CHARACTER_CONSTANT;
		}
		break;
	}
	case '"': {
		/* get_string() leaves le_lptr where lexing should resume. */
		token = get_string(le, line, &Constant);
		Lexeme->constant = &Constant;
		line = le->le_lptr;
		break;
	}
	case '.':
		if (*line == '.' && line[1] == '.') {
			line += 2;
			token = ELLIPSIS;
		}
		else if (isdigit((unsigned char)*line)) {
			token = get_float_constant(le, line-1, &line, &Constant);
			Lexeme->constant = &Constant;
		}
		else
			token = DOT;
		break;
	case '~':
		token = TILDE;
		break;
	case '+':
		if (*line == '+')
			token = PLUSPLUS;
		else if (*line == '=')
			token = PLUS_EQUALS;
		else {
			token = PLUS;
			--line;
		}
		++line;
		break;
	case '-':
		if (*line == '>')
			token = ARROW;
		else if (*line == '-')
			token = MINUSMINUS;
		else if (*line == '=')
			token = MINUS_EQUALS;
		else {
			token = MINUS;
			--line;
		}
		++line;
		break;
	case '|':
		if (*line == '|')
			token = OROR;
		else if (*line == '=')
			token = OR_EQUALS;
		else {
			--line;
			token = OR;
		}
		++line;
		break;
	case '&':
		if (*line == '&')
			token = ANDAND;
		else if (*line == '=')
			token = AND_EQUALS;
		else {
			--line;
			token = AND;
		}
		++line;
		break;
	case '>':
		if (*line == '>') {
			++line;
			token = follow(line, '=', RSHIFT_EQUALS, RSHIFT);
		}
		else if (*line == '=') {
			++line;
			token = GTEQ;
		}
		else
			token = GREATERTHAN;
		break;
	case '<':
		if (*line == '<') {
			++line;
			token = follow(line, '=', LSHIFT_EQUALS, LSHIFT);
		}
		else if (*line == '=') {
			++line;
			token = LESSEQ;
		}
		else
			token = LESSTHAN;
		break;
			
	default:
		le->le_lptr = line;
		/* Print the byte value unsigned so that characters above
		 * 0x7f are not shown sign-extended.
		 */
		fprintf(stderr,
			"Illegal character '%c' (0x%02x)", line[-1],
			(unsigned int)(unsigned char)line[-1]);
		token = BADTOK;
		break;
	}
	le->le_lptr = line;

	Prev_token = token;
	return token;
}

/*  Lex a string literal.  `line' points just past the opening quote.
 *
 *  Escape sequences are translated with ci_translate_escape(), a
 *  backslash at the end of a line continues the literal on the next
 *  line, and adjacent literals separated only by white space or
 *  comments are concatenated.
 *
 *  On success co->co_val points at the text and co->co_size is its
 *  length including the terminating NUL; STRING_CONSTANT is returned.
 *  The text lives in a buffer owned by this function and is overwritten
 *  by the next call.  On failure a message is written to stderr and
 *  BADTOK is returned.  In both cases le->le_lptr is set to where
 *  lexing should resume.
 */
static int
get_string(lex_env_t *le, const char *line, constant_t *co)
{
	static const char badalloc[] =
				"Unable to allocate memory for string constant";
	static char *buf = NULL;
	static int bufsize = 0;
	int opos;
	bool ok;

	if (buf == NULL) {
		/* Record the size only once the buffer really exists, so a
		 * failed first allocation is retried by the next call.
		 */
		if ((buf = malloc(50 + 1)) == NULL) {
			le->le_lptr = line;
			fprintf(stderr, "%s", badalloc);
			return BADTOK;
		}
		bufsize = 50;
	}

	opos = 0;
	ok = FALSE;		/* set to TRUE on success */

	for (;;) {
		int ch;

		/* End of input or end of line before the closing quote. */
		if (line == NULL || *line == '\0' || *line == '\n') {
			le->le_lptr = line;
			fprintf(stderr,
						"Unterminated string constant");
			break;
		}

		if (*line == '"') {
			const char *new_line;

			new_line = skip_whitespace(le, line + 1);
			if (new_line == NULL || *new_line != '"') {
				ok = TRUE;
				le->le_lptr = new_line;
				break;
			}

			/* Adjacent literal: carry on after its opening quote. */
			line = new_line + 1;
			continue;
		}

		if (*line != '\\')
			ch = *line;
		else if (*++line == '\n') {
			/* Line continuation: resume at the start of the next
			 * line, whose first character is examined as usual.
			 */
			line = getline(le);
			continue;
		}
		else if (*line == '\0')
			continue;	/* reported as unterminated above */
		else
			line = ci_translate_escape(line, &ch);

		if (opos == bufsize) {
			/* Keep the old buffer if realloc fails; it is still
			 * needed for the terminator written below.
			 */
			char *grown = realloc(buf, (size_t)bufsize * 2 + 1);

			if (grown == NULL) {
				le->le_lptr = line;
				fprintf(stderr,
							"%s", badalloc);
				break;
			}
			buf = grown;
			bufsize *= 2;
		}
		buf[opos++] = (char)ch;
		++line;
	}
	buf[opos++] = '\0';

	if (!ok)
		return BADTOK;

	co->co_val = buf;
	co->co_size = opos;
	return STRING_CONSTANT;
}

/*  Lex a floating constant that starts at `line'.
 *
 *  strtod() is used only to find where the number ends; a single
 *  trailing suffix letter (f, F, l or L) is taken as part of the
 *  constant.  The spelling is copied with string_copy() into
 *  co->co_val, its length is stored in co->co_size and *p_end is set
 *  just past the constant.
 *
 *  Returns FLOATING_CONSTANT, or BADTOK (after a message on stderr,
 *  with *p_end untouched) when no number could be parsed.
 */
static int
get_float_constant(lex_env_t *le, const char *line, const char **p_end,
							constant_t *co)
{
	char *end;

	(void) strtod(line, &end);

	if (end == line) {
		le->le_lptr = line;
		fprintf(stderr, "Badly formed floating constant \"%s\"", line);
		return BADTOK;
	}

	/* Without this, "1.5f" would lex as a constant followed by the
	 * identifier "f".
	 */
	if (*end == 'f' || *end == 'F' || *end == 'l' || *end == 'L')
		++end;

	co->co_val = string_copy(line, end-line);
	co->co_size = end-line;

	*p_end = end;

	return FLOATING_CONSTANT;
}

/*  Return a printable name for `token'.
 *
 *  Known tokens map to a constant string.  For any other value a
 *  message is formatted into a static buffer, so that result is only
 *  valid until the next such call.
 */
const char *
tokname(token_t token)
{
	static const struct {
		const char *name;
		token_t token;
	} tab[] = {
		{ "IF",                     IF },
		{ "ELSE",                   ELSE },
		{ "WHILE",                  WHILE },
		{ "FOR",                    FOR },
		{ "DO",                     DO },
		{ "GOTO",                   GOTO },
		{ "BREAK",                  BREAK },
		{ "CONTINUE",               CONTINUE },
		{ "RETURN",                 RETURN },
		{ "SWITCH",                 SWITCH },
		{ "CASE",                   CASE },
		{ "DEFAULT",                DEFAULT },
		{ "SIZEOF",                 SIZEOF },
		{ "AUTO",                   AUTO },
		{ "REGISTER",               REGISTER },
		{ "STATIC",                 STATIC },
		{ "EXTERN",                 EXTERN },
		{ "TYPEDEF",                TYPEDEF },
		{ "INLINE",                 INLINE },
		{ "BOOL",                   BOOL },
		{ "COMPLEX",                COMPLEX },
		{ "IMAGINARY",              IMAGINARY },
		{ "VOID",                   VOID },
		{ "CHAR",                   CHAR },
		{ "SHORT",                  SHORT },
		{ "INT",                    INT },
		{ "LONG",                   LONG },
		{ "FLOAT",                  FLOAT },
		{ "DOUBLE",                 DOUBLE },
		{ "SIGNED",                 SIGNED },
		{ "UNSIGNED",               UNSIGNED },
		{ "CONST",                  CONST },
		{ "VOLATILE",               VOLATILE },
		{ "RESTRICT",               RESTRICT },
		{ "STRUCT",                 STRUCT },
		{ "UNION",                  UNION },
		{ "ENUM",                   ENUM },
		{ "AND",                    AND },
		{ "TILDE",                  TILDE },
		{ "NOT",                    NOT },
		{ "LESSTHAN",               LESSTHAN },
		{ "GREATERTHAN",            GREATERTHAN },
		{ "XOR",                    XOR },
		{ "OR",                     OR },
		{ "PLUS",                   PLUS },
		{ "MINUS",                  MINUS },
		{ "SLASH",                  SLASH },
		{ "PERCENT",                PERCENT },
		{ "STAR",                   STAR },
		{ "DOT",                    DOT },
		{ "COLON",                  COLON },
		{ "QUERY",                  QUERY },
		{ "SEMI",                   SEMI },
		{ "COMMA",                  COMMA },
		{ "LPAREN",                 LPAREN },
		{ "RPAREN",                 RPAREN },
		{ "LBRACE",                 LBRACE },
		{ "RBRACE",                 RBRACE },
		{ "LBRAC",                  LBRAC },
		{ "RBRAC",                  RBRAC },
		{ "EQUALS",                 EQUALS },
		{ "STAR_EQUALS",            STAR_EQUALS },
		{ "SLASH_EQUALS",           SLASH_EQUALS },
		{ "PERCENT_EQUALS",         PERCENT_EQUALS },
		{ "PLUS_EQUALS",            PLUS_EQUALS },
		{ "MINUS_EQUALS",           MINUS_EQUALS },
		{ "LSHIFT_EQUALS",          LSHIFT_EQUALS },
		{ "RSHIFT_EQUALS",          RSHIFT_EQUALS },
		{ "AND_EQUALS",             AND_EQUALS },
		{ "XOR_EQUALS",             XOR_EQUALS },
		{ "OR_EQUALS",              OR_EQUALS },
		{ "ANDAND",                 ANDAND },
		{ "OROR",                   OROR },
		{ "EQEQ",                   EQEQ },
		{ "NOTEQ",                  NOTEQ },
		{ "GTEQ",                   GTEQ },
		{ "LESSEQ",                 LESSEQ },
		{ "LSHIFT",                 LSHIFT },
		{ "RSHIFT",                 RSHIFT },
		{ "PLUSPLUS",               PLUSPLUS },
		{ "MINUSMINUS",             MINUSMINUS },
		{ "ARROW",                  ARROW },
		{ "ELLIPSIS",               ELLIPSIS },
		{ "STRING_CONSTANT",        STRING_CONSTANT },
		{ "INTEGER_CONSTANT",       INTEGER_CONSTANT },
		{ "CHARACTER_CONSTANT",     CHARACTER_CONSTANT },
		{ "FLOATING_CONSTANT",      FLOATING_CONSTANT },
		{ "IDENTIFIER",             IDENTIFIER },
		{ "TYPEDEF_NAME",           TYPEDEF_NAME },
		{ "BADTOK",                 BADTOK },
		{ "EOF",                    0 },
	};
	static char buf[100];
	size_t idx;

	/* First matching entry wins. */
	for (idx = 0; idx < sizeof tab / sizeof tab[0]; ++idx) {
		if (tab[idx].token == token)
			return tab[idx].name;
	}

	(void) sprintf(buf, "<unknown token %d>", token);
	return buf;
}


/*  calloc() wrapper that never returns NULL.
 *
 *  Returns zero-initialised storage for `n' objects of `s' bytes each.
 *  If the allocation fails a message is written to stderr and the
 *  program exits with status 1.
 *
 *  A request for zero bytes is rounded up to one byte: calloc() is
 *  allowed to return NULL for a zero-size request, and that must not
 *  be mistaken for running out of memory.
 */
void *safe_calloc(size_t n, size_t s)
{
	void *p;

	if (n == 0 || s == 0)
		n = s = 1;

	p = calloc(n, s);
	if (p == NULL) {
		fprintf(stderr, "Error: out of memory\n");
		exit(1);
	}
	return p;
}

#include "alloc.h"
/* Allocator used by string_copy(); created on the first call. */
static allocator *String_allocator = NULL;

/*  Copy up to `len' characters of `string' into storage obtained from
 *  String_allocator and NUL-terminate the copy.  Returns the copy.
 */
char *string_copy(const char *string, int len)
{
	char *copy;

	if (String_allocator == NULL)
		String_allocator = new_allocator(0, 100);

	copy = allocate(String_allocator, len+1);
	strncpy(copy, string, len);
	copy[len] = '\0';
	return copy;
}