/* c_lex.c - a standalone C lexical analyser */ /* Copyright 1991 Mark Russell, University of Kent at Canterbury. * * You can do what you like with this source code as long as * you don't try to make money out of it and you include an * unaltered copy of this message (including the copyright). */ /* * This standalone version created on Jan 19 1998. * Author : Dibyendu Majumdar * Email : dibyendu@mazumdar.demon.co.uk * Website: www.mazumdar.demon.co.uk */ /* * 21 Jan 2001 Added support for inline, restrict, _Bool, _Complex, and _Imaginary. */ #include #include #include #include #include "c_lex.h" static bool Want_debugging_output; /* static const char *tokname (token_t token); */ static const char *parse_hash_directive (const char *line, lex_env_t *le); static const char *skip_whitespace (lex_env_t *le, const char *line); static int get_float_constant (lex_env_t *le, const char *line, const char **p_end, constant_t *co); static const char *getline (lex_env_t *le); static int get_string (lex_env_t *le, const char *line, constant_t *co); static struct { const char *name; token_t token; bool need_lexinfo; } Keytab[] = { {"_Bool", BOOL, FALSE}, {"_Complex", COMPLEX, FALSE}, {"_Imaginary", IMAGINARY, FALSE}, {"auto", AUTO, FALSE}, {"break", BREAK, TRUE}, {"case", CASE, FALSE}, {"char", CHAR, FALSE}, {"const", CONST, FALSE}, {"continue", CONTINUE, TRUE}, {"default", DEFAULT, FALSE}, {"do", DO, FALSE}, {"double", DOUBLE, FALSE}, {"else", ELSE, FALSE}, {"enum", ENUM, FALSE}, {"extern", EXTERN, FALSE}, {"float", FLOAT, FALSE}, {"for", FOR, TRUE}, {"goto", GOTO, FALSE}, {"if", IF, FALSE}, {"inline", INLINE, FALSE}, {"int", INT, FALSE}, {"long", LONG, FALSE}, {"register", REGISTER, FALSE}, {"restrict", RESTRICT, FALSE}, {"return", RETURN, TRUE}, {"short", SHORT, FALSE}, {"signed", SIGNED, FALSE}, {"sizeof", SIZEOF, FALSE}, {"static", STATIC, FALSE}, {"struct", STRUCT, FALSE}, {"switch", SWITCH, FALSE}, {"typedef", TYPEDEF, FALSE}, {"union", UNION, FALSE}, {"unsigned", UNSIGNED, FALSE}, {"void", VOID, FALSE}, {"volatile", VOLATILE, FALSE}, {"while", WHILE, FALSE} }; #define NKEYS (sizeof Keytab / sizeof *Keytab) lex_env_t *Lex_env; lexeme_t *Lexeme; constant_t Constant; identifier_t Identifier; char *string_copy(const char *string, int len); void lex_error(s) const char *s; { fprintf(stderr, "Error: %s", s); } static const char * parse_hash_directive(line, le) const char *line; lex_env_t *le; { int lnum, nitems; char name[256]; for (; isspace(*line) && *line != '\0'; ++line) ; if (*line == '\0') return line; if (strncmp(line, "pragma", 6) == 0 && isspace(line[6])) { for (line += 7; *line != '\0' && isspace(*line); ++line) ; fprintf(stderr, "#pragma `%.*s' ignored", strlen(line) - 1, line); return line + strlen(line); } if (strncmp(line, "line", 4) == 0) { line += 4; } nitems = sscanf(line, "%d \"%[^\"]\"", &lnum, name); if (nitems < 1) { fprintf(stderr, "Bad # directive \"%s\"", line); return ""; } if (nitems == 2) { char *buf; int len; len = strlen(name); buf = NEW_ARRAY(char, len + 1); (void) memcpy(buf, name, len + 1); le->le_filename = buf; } /* Subtract 1 because we number internally from 0, * and 1 because we are just about to bump the * line number. */ le->le_lnum = lnum - 2; return line + strlen(line); } const char * ci_translate_escape(s, p_res) const char *s; int *p_res; { static const char hexdigits[] = "0123456789abcdefABCDEF"; const char *pos, *save_s; int ch; switch (*s) { case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case 'v': ch = '\v'; break; case 'b': ch = '\b'; break; case 'r': ch = '\r'; break; case 'f': ch = '\f'; break; case 'a': ch = '\007'; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': ch = 0; for (save_s = s; isdigit(*s) && *s < '8' && s - save_s < 3; ++s) ch = ch * 8 + *s - '0'; --s; break; case 'x': ch = 0; for (; *s != '\0' && (pos = strchr(hexdigits, *s)) != NULL; ++s) { if (pos >= hexdigits + 16) pos -= 6; ch = ch * 16 + pos - hexdigits; } break; default: ch = *s; break; } /* Dibyendu : 11/1/99 * Fixed problem of sign extension - '\377' is now -1 and not 255 */ *p_res = (int)(char)ch; /* *p_res = ch; */ return s; } /* Based on K&P's hoc follow() function. */ #define follow(s, ch, ifyes, ifno) ((*(s) == (ch)) ? (++(s), (ifyes)) : (ifno)) static const char * getline(le) lex_env_t *le; { if (le->le_abort_parse) return NULL; ++le->le_lnum; return le->le_line = (*le->le_getline)(le->le_getline_arg); } /* Skip white space and comments. */ static const char * skip_whitespace(le, line) lex_env_t *le; const char *line; { bool read_another_line; bool incomment; incomment = FALSE; read_another_line = FALSE; if (line == NULL) { if ((line = getline(le)) == NULL) return line; } for (;;) { for(;;) { while (*line != '\0' && isspace(*line)) ++line; if (*line != '\0') break; if ((line = getline(le)) == NULL) break; read_another_line = TRUE; if (*line == '#') line = parse_hash_directive(line + 1, le); } if (incomment) { if (line == NULL) { fprintf(stderr, "Hit EOF while in a comment"); break; } else if (*line == '*' && line[1] == '/') { line += 2; incomment = FALSE; } else ++line; } else { if (line != NULL && *line == '/' && line[1] == '*') { line += 2; incomment = TRUE; } else break; } } if (Want_debugging_output && read_another_line) { #if 0 putchar('\n'); printf("\n\"%s\", %d: %s", le->le_filename, le->le_lnum, line); #endif printf("\"%s\", %d: %s\n", le->le_filename, le->le_lnum, line); } return line; } static bool is_aggr_type_specifier(token) token_t token; { if (token == STRUCT || token == UNION || token == ENUM) return TRUE; return FALSE; } static bool is_basic_type_specifier(token) token_t token; { if (token == INT || token == UNSIGNED || token == SIGNED || token == LONG || token == SHORT || token == VOID || token == CHAR || token == FLOAT || token == DOUBLE) return TRUE; return FALSE; } static bool is_storage_class_or_qualifier(token) token_t token; { if (token == STATIC || token == EXTERN || token == TYPEDEF || token == AUTO || token == REGISTER || token == CONST || token == VOLATILE) return TRUE; return FALSE; } static bool is_decl_specifier(token) token_t token; { return is_storage_class_or_qualifier(token) || is_basic_type_specifier(token) || is_aggr_type_specifier(token) || token == TYPEDEF_NAME; } static token_t Prev_token = 0; /* remember last token */ static bool Colon_follows = FALSE; token_t lex_prev_token(void) { return Prev_token; } bool lex_colon_follows(void) { return Colon_follows; } token_t lex_get_token() { static int pos = -1; lex_env_t *le; token_t token; const char *line; le = Lex_env; if (pos == -1) { Want_debugging_output = getenv("LEX_DEBUG") != NULL; pos = 0; } if (le == NULL) { if (Want_debugging_output) puts("\n"); return 0; } if ((line = skip_whitespace(le, le->le_lptr)) == NULL) { le->le_lptr = line; return 0; /* EOF */ } switch (*line++) { case '_': case '$': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': { const char *s; int len, i; --line; for (s = line; isalnum(*s) || *s == '_' || *s == '$'; ++s) ; len = s - line; for (i = 0; i < NKEYS; ++i) if (memcmp(Keytab[i].name, line, len) == 0 && Keytab[i].name[len] == '\0') break; if (i < NKEYS) { token = Keytab[i].token; line += len; break; } if (len+1 > sizeof Identifier.id_name) len = sizeof Identifier.id_name-1; strncpy(Identifier.id_name, line, len+1); Identifier.id_name[len] = '\0'; Lexeme->identifier = &Identifier; line = skip_whitespace(le, s); /* The parser provides the function name_type() which is * called here to determine whether a name is a potential * TYPEDEF name. */ token = name_type(Identifier.id_name); Colon_follows = *line == ':'; } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { char *end; long val; val = strtol(line - 1, &end, 0); if (end == line - 1) { le->le_lptr = line; fprintf(stderr, "Badly formed integer constant \"%s\"", line - 1); token = BADTOK; } else if (*end == 'e' || *end == 'E' || *end == '.') { token = get_float_constant(le, line-1, &line, &Constant); Lexeme->constant = &Constant; } else { while (*end == 'L' || *end == 'l' || *end == 'u' || *end == 'U') ++end; Constant.co_val = string_copy(line-1, end-(line-1)); Constant.co_size = end-(line-1); line = end; Lexeme->constant = &Constant; token = INTEGER_CONSTANT; } } break; case '!': token = follow(line, '=', NOTEQ, NOT); break; case '=': token = follow(line, '=', EQEQ, EQUALS); break; case '%': token = follow(line, '=', PERCENT_EQUALS, PERCENT); break; case '/': token = follow(line, '=', SLASH_EQUALS, SLASH); break; case '^': token = follow(line, '=', XOR_EQUALS, XOR); break; case '*': token = follow(line, '=', STAR_EQUALS, STAR); break; case '[': token = LBRAC; break; case ']': token = RBRAC; break; case '{': token = LBRACE; break; case '}': token = RBRACE; break; case '(': token = LPAREN; break; case ')': token = RPAREN; break; case ',': token = COMMA; break; case ';': token = SEMI; break; case '?': token = QUERY; break; case ':': token = COLON; break; case '\'': { /* BUG: no escapes etc. */ int val; const char *startp = line-1; const char *endp = 0; if (*line == '\\') line = ci_translate_escape(line + 1, &val); else val = *line; ++line; if (*line != '\'') { le->le_lptr = line; fprintf(stderr, "Unterminated char constant"); token = BADTOK; } else { endp = ++line; Constant.co_val = string_copy(startp, endp-startp); Constant.co_size = endp-startp; Lexeme->constant = &Constant; token = CHARACTER_CONSTANT; } break; } case '"': { token = get_string(le, line, &Constant); Lexeme->constant = &Constant; line = le->le_lptr; break; } case '.': if (*line == '.' && line[1] == '.') { line += 2; token = ELLIPSIS; } else if (isdigit(*line)) { token = get_float_constant(le, line-1, &line, &Constant); Lexeme->constant = &Constant; } else token = DOT; break; case '~': token = TILDE; break; case '+': if (*line == '+') token = PLUSPLUS; else if (*line == '=') token = PLUS_EQUALS; else { token = PLUS; --line; } ++line; break; case '-': if (*line == '>') token = ARROW; else if (*line == '-') token = MINUSMINUS; else if (*line == '=') token = MINUS_EQUALS; else { token = MINUS; --line; } ++line; break; case '|': if (*line == '|') token = OROR; else if (*line == '=') token = OR_EQUALS; else { --line; token = OR; } ++line; break; case '&': if (*line == '&') token = ANDAND; else if (*line == '=') token = AND_EQUALS; else { --line; token = AND; } ++line; break; case '>': if (*line == '>') { ++line; token = follow(line, '=', RSHIFT_EQUALS, RSHIFT); } else if (*line == '=') { ++line; token = GTEQ; } else token = GREATERTHAN; break; case '<': if (*line == '<') { ++line; token = follow(line, '=', LSHIFT_EQUALS, LSHIFT); } else if (*line == '=') { ++line; token = LESSEQ; } else token = LESSTHAN; break; default: le->le_lptr = line; /* because we are about to call diagf */ fprintf(stderr, "Illegal character '%c' (0x%02x)", line[-1], line[-1]); token = BADTOK; break; } le->le_lptr = line; #if 0 if (Want_debugging_output) { const char *name; if (pos > 70) { putchar('\n'); pos = 0; } name = tokname(token); printf("%s ", name); pos += strlen(name) + 1; fflush(stdout); } #endif Prev_token = token; return token; } static int get_string(le, line, co) lex_env_t *le; const char *line; constant_t *co; { static const char badalloc[] = "Unable to allocate memory for string constant"; static char *buf; static int bufsize = 0; int opos; bool ok; if (bufsize == 0) { bufsize = 50; if ((buf = malloc(bufsize + 1)) == NULL) { fprintf(stderr, "%s", badalloc); return BADTOK; } } opos = 0; ok = FALSE; /* set to TRUE on success */ for (; *line != '\0'; ++line) { int ch; if (*line == '"') { const char *new_line; new_line = skip_whitespace(le, line + 1); if (new_line == NULL || *new_line != '"') { ok = TRUE; le->le_lptr = new_line; break; } line = new_line; continue; } if (*line != '\\') ch = *line; else if (*++line == '\n') { line = getline(le); ch = (line != NULL) ? *line : '\0'; } else line = ci_translate_escape(line, &ch); if (line == NULL || *line == '\n' || *line == '\0') { le->le_lptr = line; fprintf(stderr, "Unterminated string constant"); break; } if (opos == bufsize) { bufsize *= 2; if ((buf = realloc(buf, bufsize + 1)) == NULL) { le->le_lptr = line; fprintf(stderr, "%s", badalloc); break; } } buf[opos++] = ch; } buf[opos++] = '\0'; if (!ok) return BADTOK; co->co_val = buf; co->co_size = opos; return STRING_CONSTANT; } static int get_float_constant(le, line, p_end, co) lex_env_t *le; const char *line, **p_end; constant_t *co; { double val; char *end; val = strtod(line, &end); if (end == line) { le->le_lptr = line; fprintf(stderr, "Badly formed floating constant \"%s\"", line); return BADTOK; } co->co_val = string_copy(line, end-line); co->co_size = end-line; *p_end = end; return FLOATING_CONSTANT; } /* static */ const char * tokname(token_t token) { static struct { const char *name; token_t token; } tab[] = { "IF", IF, "ELSE", ELSE, "WHILE", WHILE, "FOR", FOR, "DO", DO, "GOTO", GOTO, "BREAK", BREAK, "CONTINUE", CONTINUE, "RETURN", RETURN, "SWITCH", SWITCH, "CASE", CASE, "DEFAULT", DEFAULT, "SIZEOF", SIZEOF, "AUTO", AUTO, "REGISTER", REGISTER, "STATIC", STATIC, "EXTERN", EXTERN, "TYPEDEF", TYPEDEF, "INLINE", INLINE, "BOOL", BOOL, "COMPLEX", COMPLEX, "IMAGINARY", IMAGINARY, "VOID", VOID, "CHAR", CHAR, "SHORT", SHORT, "INT", INT, "LONG", LONG, "FLOAT", FLOAT, "DOUBLE", DOUBLE, "SIGNED", SIGNED, "UNSIGNED", UNSIGNED, "CONST", CONST, "VOLATILE", VOLATILE, "RESTRICT", RESTRICT, "STRUCT", STRUCT, "UNION", UNION, "ENUM", ENUM, "AND", AND, "TILDE", TILDE, "NOT", NOT, "LESSTHAN", LESSTHAN, "GREATERTHAN", GREATERTHAN, "XOR", XOR, "OR", OR, "PLUS", PLUS, "MINUS", MINUS, "SLASH", SLASH, "PERCENT", PERCENT, "STAR", STAR, "DOT", DOT, "COLON", COLON, "QUERY", QUERY, "SEMI", SEMI, "COMMA", COMMA, "LPAREN", LPAREN, "RPAREN", RPAREN, "LBRACE", LBRACE, "RBRACE", RBRACE, "LBRAC", LBRAC, "RBRAC", RBRAC, "EQUALS", EQUALS, "STAR_EQUALS", STAR_EQUALS, "SLASH_EQUALS", SLASH_EQUALS, "PERCENT_EQUALS", PERCENT_EQUALS, "PLUS_EQUALS", PLUS_EQUALS, "MINUS_EQUALS", MINUS_EQUALS, "LSHIFT_EQUALS", LSHIFT_EQUALS, "RSHIFT_EQUALS", RSHIFT_EQUALS, "AND_EQUALS", AND_EQUALS, "XOR_EQUALS", XOR_EQUALS, "OR_EQUALS", OR_EQUALS, "ANDAND", ANDAND, "OROR", OROR, "EQEQ", EQEQ, "NOTEQ", NOTEQ, "GTEQ", GTEQ, "LESSEQ", LESSEQ, "LSHIFT", LSHIFT, "RSHIFT", RSHIFT, "PLUSPLUS", PLUSPLUS, "MINUSMINUS", MINUSMINUS, "ARROW", ARROW, "ELLIPSIS", ELLIPSIS, "STRING_CONSTANT", STRING_CONSTANT, "INTEGER_CONSTANT", INTEGER_CONSTANT, "CHARACTER_CONSTANT", CHARACTER_CONSTANT, "FLOATING_CONSTANT", FLOATING_CONSTANT, "IDENTIFIER", IDENTIFIER, "TYPEDEF_NAME", TYPEDEF_NAME, "BADTOK", BADTOK, "EOF", 0, }; static char buf[100]; int i; for (i = 0; i < sizeof tab / sizeof *tab; ++i) if (tab[i].token == token) return tab[i].name; (void) sprintf(buf, "", token); return buf; } void *safe_calloc(size_t n, size_t s) { void *p = calloc(n,s); if (!p) { fprintf(stderr, "Error: out of memory\n"); exit(1); } return p; } #include "alloc.h" static allocator *String_allocator = 0; char *string_copy(const char *string, int len) { char *p; if (String_allocator == 0) { String_allocator = new_allocator(0, 100); } p = allocate(String_allocator, len+1); strncpy(p, string, len); p[len] = 0; return p; }