// ---------------------------------------------------------------------------
// - Lexer.cpp -
// - afnix engine - lexical analyzer class implementation -
// ---------------------------------------------------------------------------
// - This program is free software; you can redistribute it and/or modify -
// - it provided that this copyright notice is kept intact. -
// - -
// - This program is distributed in the hope that it will be useful, but -
// - without any warranty; without even the implied warranty of -
// - merchantability or fitness for a particular purpose. In no event shall -
// - the copyright holder be liable for any direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software. -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch -
// ---------------------------------------------------------------------------
#include "Lexer.hpp"
#include "Unicode.hpp"
#include "Lexical.hpp"
namespace afnix {
// -------------------------------------------------------------------------
// - private section -
// -------------------------------------------------------------------------
// character codes (as t_quad values) that the lexer state machine compares
// against the characters read from the input stream; the trailing comment
// on each line shows the corresponding character
static const t_quad LEX_CHAR_DQ = 0x00000022; // " (double quote)
static const t_quad LEX_CHAR_DZ = 0x00000023; // # (comment start)
static const t_quad LEX_CHAR_SQ = 0x00000027; // ' (single quote)
static const t_quad LEX_CHAR_LP = 0x00000028; // (
static const t_quad LEX_CHAR_RP = 0x00000029; // )
static const t_quad LEX_CHAR_PS = 0x0000002B; // +
static const t_quad LEX_CHAR_MS = 0x0000002D; // -
static const t_quad LEX_CHAR_SP = 0x0000002E; // . (decimal point)
static const t_quad LEX_CHAR_D0 = 0x00000030; // 0
static const t_quad LEX_CHAR_D1 = 0x00000031; // 1
static const t_quad LEX_CHAR_DP = 0x0000003A; // : (qualified name separator)
static const t_quad LEX_CHAR_LB = 0x0000005B; // [
static const t_quad LEX_CHAR_AS = 0x0000005C; // backslash (anti-slash)
static const t_quad LEX_CHAR_RB = 0x0000005D; // ]
static const t_quad LEX_CHAR_US = 0x0000005F; // _
static const t_quad LEX_CHAR_LC = 0x0000007B; // {
static const t_quad LEX_CHAR_RC = 0x0000007D; // }
static const t_quad LEX_CHAR_BU = 0x00000042; // B
static const t_quad LEX_CHAR_BL = 0x00000062; // b
static const t_quad LEX_CHAR_EU = 0x00000045; // E
static const t_quad LEX_CHAR_EL = 0x00000065; // e
static const t_quad LEX_CHAR_NL = 0x0000006E; // n
static const t_quad LEX_CHAR_RU = 0x00000052; // R
static const t_quad LEX_CHAR_RL = 0x00000072; // r
static const t_quad LEX_CHAR_XU = 0x00000058; // X
static const t_quad LEX_CHAR_XL = 0x00000078; // x
static const t_quad LEX_CHAR_TL = 0x00000074; // t
// -------------------------------------------------------------------------
// - class section -
// -------------------------------------------------------------------------
// build a lexer reading from the given input stream
// @param is the input stream to scan (may be nil, see Lexer::get)
Lexer::Lexer (Input* is) {
  // bind the stream, then register our reference on it
  p_is = is;
  Object::iref (p_is);
  // line numbering starts at 1
  d_lnum = 1;
}
// tear down the lexer by dropping the reference held on the bound
// input stream (the counterpart of the iref done at construction)
Lexer::~Lexer () {
  Object::dref (p_is);
}
// return the next available token
//
// The scanner is a hand written state machine: every label below is a
// state, each state reads one character from the input stream and either
// returns a token, loops on itself or jumps to another state. The text of
// the token is accumulated in d_buffer, which is reset on entry.
//
// @return the next token, tagged with the current line number. An ERROR
//         token is returned when no input stream is bound, or when the
//         input is malformed; in the latter case the rest of the current
//         line is flushed and the line number is incremented.
Token Lexer::get (void) {
  // reset controls
  bool   eflag  = false; // an exponent marker has been read in a real
  bool   esign  = false; // an exponent sign has been read in a real
  bool   eonly  = false; // the exponent marker was just read (no sign/digit yet)
  long   rcount = 0;     // nesting depth of '[' in a regex
  t_quad clast  = nilq;  // last character added while in a qualified name

  // check for nil stream and reset buffer
  if (p_is == nilp) return Token (Token::ERROR,d_lnum);
  d_buffer.reset ();

  // initial state: skip blanks and dispatch on the first character
 s_begin:
  t_quad c = p_is->rduc ();
  switch (c) {
  case blkq:
  case tabq:
  case crlq:
    goto s_begin;
  case eofq:
    return Token (Token::EOF,d_lnum);
  case eolq:
    return Token (Token::EOL,++d_lnum);
  case LEX_CHAR_LP:
    return Token (Token::RFB, d_lnum);
  case LEX_CHAR_RP:
    return Token (Token::RFE, d_lnum);
  case LEX_CHAR_LC:
    return Token (Token::BFB, d_lnum);
  case LEX_CHAR_RC:
    return Token (Token::BFE, d_lnum);
  case LEX_CHAR_DZ:
    goto s_comment;
  case LEX_CHAR_SQ:
    goto s_character;
  case LEX_CHAR_DQ:
    goto s_string;
  case LEX_CHAR_LB:
    rcount++;
    d_buffer.add (c);
    goto s_regex;
  case LEX_CHAR_PS:
  case LEX_CHAR_MS:
    d_buffer.add (c);
    goto s_number;
  case LEX_CHAR_D0:
    d_buffer.add (c);
    goto s_numfmt;
  default:
    d_buffer.add (c);
    if (Unicode::isdigit (c) == true) goto s_integer;
    if (Lexical::valid (c) == true) goto s_lexical;
    break;
  }
  goto s_error;

  // comment: discard everything up to the end of line or end of stream
  // (both exits increment the line number)
 s_comment:
  c = p_is->rduc ();
  if (c == eolq) return Token (Token::EOL,++d_lnum);
  if (c == eofq) return Token (Token::EOF,++d_lnum);
  goto s_comment;

  // a leading sign has been read: it starts a number or is a lexical
 s_number:
  c = p_is->rduc ();
  if (c == LEX_CHAR_D0) {
    d_buffer.add (c);
    goto s_numfmt;
  }
  if (Unicode::isdigit (c) == true) {
    d_buffer.add (c);
    goto s_integer;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum);

  // lexical name: accumulate valid characters, ':' switches to qualified
 s_lexical:
  c = p_is->rduc ();
  if (c == LEX_CHAR_DP) {
    d_buffer.add (clast = c);
    goto s_qualified;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum);

  // qualified name: lexical names separated by single ':' characters
 s_qualified:
  c = p_is->rduc ();
  if ((Lexical::valid (c) == true) || (c == LEX_CHAR_DP)) {
    // two consecutive separators are illegal
    if ((clast == LEX_CHAR_DP) && (c == LEX_CHAR_DP)) {
      d_buffer.add (c);
      goto s_error;
    }
    d_buffer.add (clast = c);
    goto s_qualified;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  if (clast == LEX_CHAR_DP) {
    // a qualified name cannot end with a separator. The terminating
    // character has already been consumed: give it back first so that,
    // when it is the end of line, the error flush stops on it instead
    // of discarding the following line as well
    p_is->pushback (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::QUALIFIED, d_buffer.tostring (), d_lnum);

  // a leading '0' has been read: select the number format
 s_numfmt:
  c = p_is->rduc ();
  if (Unicode::isdigit (c) == true) {
    d_buffer.add (c);
    goto s_integer;
  }
  if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
    goto s_relatif;
  }
  if (c == LEX_CHAR_SP) {
    d_buffer.add (c);
    goto s_real;
  }
  if ((c == LEX_CHAR_XL) || (c == LEX_CHAR_XU)) {
    d_buffer.add (c);
    goto s_hexa;
  }
  if ((c == LEX_CHAR_BL) || (c == LEX_CHAR_BU)) {
    d_buffer.add (c);
    goto s_binary;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

  // decimal integer: digits, optionally turned into a relatif or a real
 s_integer:
  c = p_is->rduc ();
  if (Unicode::isdigit (c) == true) {
    d_buffer.add (c);
    goto s_integer;
  }
  if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
    goto s_relatif;
  }
  if (c == LEX_CHAR_SP) {
    d_buffer.add (c);
    goto s_real;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

  // hexadecimal integer: '_' is accepted as a separator and dropped
 s_hexa:
  c = p_is->rduc ();
  if (c == LEX_CHAR_US) {
    goto s_hexa;
  }
  if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
    goto s_relatif;
  }
  if (Unicode::ishexa (c) == true) {
    d_buffer.add (c);
    goto s_hexa;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

  // binary integer: '_' is accepted as a separator and dropped
 s_binary:
  c = p_is->rduc ();
  if (c == LEX_CHAR_US) {
    goto s_binary;
  }
  if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
    goto s_relatif;
  }
  if ((c == LEX_CHAR_D0) || (c == LEX_CHAR_D1)) {
    d_buffer.add (c);
    goto s_binary;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

  // relatif: reached after an 'r'/'R' suffix, which is not stored in the
  // buffer; only the terminating character is checked here
 s_relatif:
  c = p_is->rduc ();
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::RELATIF, d_buffer.tostring (), d_lnum);

  // real: fractional digits with an optional signed exponent
 s_real:
  c = p_is->rduc ();
  if (Unicode::isdigit (c) == true) {
    // a digit after the exponent marker closes the window for a sign
    if ((eflag == true) && (eonly == true)) eonly = false;
    d_buffer.add (c);
    goto s_real;
  }
  if (((c == LEX_CHAR_EL) || (c == LEX_CHAR_EU)) && (eflag == false)) {
    // the exponent marker is always stored in lower case
    d_buffer.add (LEX_CHAR_EL);
    eflag = true;
    eonly = true;
    goto s_real;
  }
  if (((c == LEX_CHAR_PS) || (c == LEX_CHAR_MS)) &&
      (eflag == true) && (esign == false) && (eonly == true)) {
    // a single sign is accepted right after the exponent marker
    d_buffer.add (c);
    esign = true;
    eonly = false;
    goto s_real;
  }
  if (Lexical::valid (c) == true) {
    d_buffer.add (c);
    goto s_lexical;
  }
  if (Unicode::isncc (c) == false) {
    d_buffer.add (c);
    goto s_error;
  }
  p_is->pushback (c);
  return Token (Token::REAL, d_buffer.tostring (), d_lnum);

  // character literal: accumulate until the closing single quote
 s_character:
  c = p_is->rduc ();
  // an unterminated literal must not spin forever at end of stream
  // (same guard as in the string state)
  if (c == eofq) goto s_error;
  if (c == LEX_CHAR_SQ)
    return Token (Token::CHARACTER, d_buffer.tostring (), d_lnum);
  if (c == LEX_CHAR_AS) goto s_charesc;
  d_buffer.add (c);
  goto s_character;

  // escape sequence inside a character literal
 s_charesc:
  c = p_is->rduc ();
  switch (c) {
  case LEX_CHAR_NL:
    d_buffer.add (eolq);
    break;
  case LEX_CHAR_RL:
    d_buffer.add (crlq);
    break;
  case LEX_CHAR_TL:
    d_buffer.add (tabq);
    break;
  case LEX_CHAR_AS:
    d_buffer.add (LEX_CHAR_AS);
    break;
  case LEX_CHAR_SQ:
    d_buffer.add (LEX_CHAR_SQ);
    break;
  default:
    // unknown escape: keep the backslash and rescan the character
    d_buffer.add (LEX_CHAR_AS);
    p_is->pushback (c);
    break;
  }
  goto s_character;

  // string literal: accumulate until the closing double quote
 s_string:
  c = p_is->rduc ();
  if (c == eofq) goto s_error;
  if (c == LEX_CHAR_DQ) {
    return Token (Token::STRING,d_buffer.tostring (), d_lnum);
  }
  if (c == LEX_CHAR_AS) goto s_stresc;
  d_buffer.add (c);
  goto s_string;

  // escape sequence inside a string literal
 s_stresc:
  c = p_is->rduc ();
  switch (c) {
  case LEX_CHAR_NL:
    d_buffer.add (eolq);
    break;
  case LEX_CHAR_RL:
    d_buffer.add (crlq);
    break;
  case LEX_CHAR_TL:
    d_buffer.add (tabq);
    break;
  case LEX_CHAR_AS:
    d_buffer.add (LEX_CHAR_AS);
    break;
  case LEX_CHAR_DQ:
    d_buffer.add (LEX_CHAR_DQ);
    break;
  default:
    // unknown escape: keep the backslash and rescan the character
    d_buffer.add (LEX_CHAR_AS);
    p_is->pushback (c);
    break;
  }
  goto s_string;

  // regex: accumulate until the bracket opened in s_begin is balanced
 s_regex:
  c = p_is->rduc ();
  switch (c) {
  case eofq:
    // c is a t_quad: test the t_quad end of stream marker, as done in
    // every other state
    goto s_error;
  case LEX_CHAR_LB:
    d_buffer.add (c);
    rcount++;
    goto s_regex;
  case LEX_CHAR_RB:
    d_buffer.add (c);
    if (--rcount == 0)
      return Token (Token::REGEX, d_buffer.tostring (), d_lnum);
    if (rcount < 0) goto s_error;
    goto s_regex;
  default:
    d_buffer.add (c);
    goto s_regex;
  }

  // error: drop the rest of the line and report what was accumulated
 s_error:
  p_is->flush (eolq);
  return Token (Token::ERROR, d_buffer.tostring (), ++d_lnum);
}
// return the lexer line number
long Lexer::getlnum (void) const {
return d_lnum;
}
}
// syntax highlighted by Code2HTML, v. 0.9.1