// --------------------------------------------------------------------------- // - Lexer.cpp - // - afnix engine - lexical analyzer class implementation - // --------------------------------------------------------------------------- // - This program is free software; you can redistribute it and/or modify - // - it provided that this copyright notice is kept intact. - // - - // - This program is distributed in the hope that it will be useful, but - // - without any warranty; without even the implied warranty of - // - merchantability or fitness for a particular purpose. In no event shall - // - the copyright holder be liable for any direct, indirect, incidental or - // - special damages arising in any way out of the use of this software. - // --------------------------------------------------------------------------- // - copyright (c) 1999-2007 amaury darsch - // --------------------------------------------------------------------------- #include "Lexer.hpp" #include "Unicode.hpp" #include "Lexical.hpp" namespace afnix { // ------------------------------------------------------------------------- // - private section - // ------------------------------------------------------------------------- static const t_quad LEX_CHAR_DQ = 0x00000022; // " static const t_quad LEX_CHAR_DZ = 0x00000023; // # static const t_quad LEX_CHAR_SQ = 0x00000027; // ' static const t_quad LEX_CHAR_LP = 0x00000028; // ( static const t_quad LEX_CHAR_RP = 0x00000029; // ) static const t_quad LEX_CHAR_PS = 0x0000002B; // + static const t_quad LEX_CHAR_MS = 0x0000002D; // - static const t_quad LEX_CHAR_SP = 0x0000002E; // . static const t_quad LEX_CHAR_D0 = 0x00000030; // 0 static const t_quad LEX_CHAR_D1 = 0x00000031; // 0 static const t_quad LEX_CHAR_DP = 0x0000003A; // : static const t_quad LEX_CHAR_LB = 0x0000005B; // [ static const t_quad LEX_CHAR_AS = 0x0000005C; // ANTI-SLASH static const t_quad LEX_CHAR_RB = 0x0000005D; // ] static const t_quad LEX_CHAR_US = 0x0000005F; // _ static const t_quad LEX_CHAR_LC = 0x0000007B; // { static const t_quad LEX_CHAR_RC = 0x0000007D; // } static const t_quad LEX_CHAR_BU = 0x00000042; // B static const t_quad LEX_CHAR_BL = 0x00000062; // b static const t_quad LEX_CHAR_EU = 0x00000045; // E static const t_quad LEX_CHAR_EL = 0x00000065; // e static const t_quad LEX_CHAR_NL = 0x0000006E; // n static const t_quad LEX_CHAR_RU = 0x00000052; // R static const t_quad LEX_CHAR_RL = 0x00000072; // r static const t_quad LEX_CHAR_XU = 0x00000058; // X static const t_quad LEX_CHAR_XL = 0x00000078; // x static const t_quad LEX_CHAR_TL = 0x00000074; // t // ------------------------------------------------------------------------- // - class section - // ------------------------------------------------------------------------- // create a new lexer Lexer::Lexer (Input* is) { d_lnum = 1; Object::iref (p_is = is); } // destroy this lexer Lexer::~Lexer (void) { Object::dref (p_is); } // return the next available token Token Lexer::get (void) { // reset controls bool eflag = false; bool esign = false; bool eonly = false; long rcount = 0; t_quad clast = nilq; // check for nil stream and reset buffer if (p_is == nilp) return Token (Token::ERROR,d_lnum); d_buffer.reset (); s_begin: t_quad c = p_is->rduc (); switch (c) { case blkq: case tabq: case crlq: goto s_begin; case eofq: return Token (Token::EOF,d_lnum); case eolq: return Token (Token::EOL,++d_lnum); case LEX_CHAR_LP: return Token (Token::RFB, d_lnum); case LEX_CHAR_RP: return Token (Token::RFE, d_lnum); case LEX_CHAR_LC: return Token (Token::BFB, d_lnum); case LEX_CHAR_RC: return Token (Token::BFE, d_lnum); case LEX_CHAR_DZ: goto s_comment; case LEX_CHAR_SQ: goto s_character; case LEX_CHAR_DQ: goto s_string; case LEX_CHAR_LB: rcount++; d_buffer.add (c); goto s_regex; case LEX_CHAR_PS: case LEX_CHAR_MS: d_buffer.add (c); goto s_number; case LEX_CHAR_D0: d_buffer.add (c); goto s_numfmt; default: d_buffer.add (c); if (Unicode::isdigit (c) == true) goto s_integer; if (Lexical::valid (c) == true) goto s_lexical; break; } goto s_error; s_comment: c = p_is->rduc (); if (c == eolq) return Token (Token::EOL,++d_lnum); if (c == eofq) return Token (Token::EOF,++d_lnum); goto s_comment; s_number: c = p_is->rduc (); if (c == LEX_CHAR_D0) { d_buffer.add (c); goto s_numfmt; } if (Unicode::isdigit (c) == true) { d_buffer.add (c); goto s_integer; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum); s_lexical: c = p_is->rduc (); if (c == LEX_CHAR_DP) { d_buffer.add (clast = c); goto s_qualified; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum); s_qualified: c = p_is->rduc (); if ((Lexical::valid (c) == true) || (c == LEX_CHAR_DP)) { if ((clast == LEX_CHAR_DP) && (c == LEX_CHAR_DP)) { d_buffer.add (c); goto s_error; } d_buffer.add (clast = c); goto s_qualified; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } if (clast == LEX_CHAR_DP) goto s_error; p_is->pushback (c); return Token (Token::QUALIFIED, d_buffer.tostring (), d_lnum); s_numfmt: c = p_is->rduc (); if (Unicode::isdigit (c) == true) { d_buffer.add (c); goto s_integer; } if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) { goto s_relatif; } if (c == LEX_CHAR_SP) { d_buffer.add (c); goto s_real; } if ((c == LEX_CHAR_XL) || (c == LEX_CHAR_XU)) { d_buffer.add (c); goto s_hexa; } if ((c == LEX_CHAR_BL) || (c == LEX_CHAR_BU)) { d_buffer.add (c); goto s_binary; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::INTEGER, d_buffer.tostring (), d_lnum); s_integer: c = p_is->rduc (); if (Unicode::isdigit (c) == true) { d_buffer.add (c); goto s_integer; } if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) { goto s_relatif; } if (c == LEX_CHAR_SP) { d_buffer.add (c); goto s_real; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::INTEGER, d_buffer.tostring (), d_lnum); s_hexa: c = p_is->rduc (); if (c == LEX_CHAR_US) { goto s_hexa; } if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) { goto s_relatif; } if (Unicode::ishexa (c) == true) { d_buffer.add (c); goto s_hexa; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::INTEGER, d_buffer.tostring (), d_lnum); s_binary: c = p_is->rduc (); if (c == LEX_CHAR_US) { goto s_binary; } if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) { goto s_relatif; } if ((c == LEX_CHAR_D0) || (c == LEX_CHAR_D1)) { d_buffer.add (c); goto s_binary; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::INTEGER, d_buffer.tostring (), d_lnum); s_relatif: c = p_is->rduc (); if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::RELATIF, d_buffer.tostring (), d_lnum); s_real: c = p_is->rduc (); if (Unicode::isdigit (c) == true) { if ((eflag == true) && (eonly == true)) eonly = false; d_buffer.add (c); goto s_real; } if (((c == LEX_CHAR_EL) || (c == LEX_CHAR_EU)) && (eflag == false)) { d_buffer.add (LEX_CHAR_EL); eflag = true; eonly = true; goto s_real; } if (((c == LEX_CHAR_PS) || (c == LEX_CHAR_MS)) && (eflag == true) && (esign == false) && (eonly == true)) { d_buffer.add (c); esign = true; eonly = false; goto s_real; } if (Lexical::valid (c) == true) { d_buffer.add (c); goto s_lexical; } if (Unicode::isncc (c) == false) { d_buffer.add (c); goto s_error; } p_is->pushback (c); return Token (Token::REAL, d_buffer.tostring (), d_lnum); s_character: c = p_is->rduc (); if (c == LEX_CHAR_SQ) return Token (Token::CHARACTER, d_buffer.tostring (), d_lnum); if (c == LEX_CHAR_AS) goto s_charesc; d_buffer.add (c); goto s_character; s_charesc: c = p_is->rduc (); switch (c) { case LEX_CHAR_NL: d_buffer.add (eolq); break; case LEX_CHAR_RL: d_buffer.add (crlq); break; case LEX_CHAR_TL: d_buffer.add (tabq); break; case LEX_CHAR_AS: d_buffer.add (LEX_CHAR_AS); break; case LEX_CHAR_SQ: d_buffer.add (LEX_CHAR_SQ); break; default: d_buffer.add (LEX_CHAR_AS); p_is->pushback (c); break; } goto s_character; s_string: c = p_is->rduc (); if (c == eofq) goto s_error; if (c == LEX_CHAR_DQ) { return Token (Token::STRING,d_buffer.tostring (), d_lnum); } if (c == LEX_CHAR_AS) goto s_stresc; d_buffer.add (c); goto s_string; s_stresc: c = p_is->rduc (); switch (c) { case LEX_CHAR_NL: d_buffer.add (eolq); break; case LEX_CHAR_RL: d_buffer.add (crlq); break; case LEX_CHAR_TL: d_buffer.add (tabq); break; case LEX_CHAR_AS: d_buffer.add (LEX_CHAR_AS); break; case LEX_CHAR_DQ: d_buffer.add (LEX_CHAR_DQ); break; default: d_buffer.add (LEX_CHAR_AS); p_is->pushback (c); break; } goto s_string; s_regex: c = p_is->rduc (); switch (c) { case eofc: goto s_error; case LEX_CHAR_LB: d_buffer.add (c); rcount++; goto s_regex; case LEX_CHAR_RB: d_buffer.add (c); if (--rcount == 0) return Token (Token::REGEX, d_buffer.tostring (), d_lnum); if (rcount < 0) goto s_error; goto s_regex; default: d_buffer.add (c); goto s_regex; } s_error: p_is->flush (eolq); return Token (Token::ERROR, d_buffer.tostring (), ++d_lnum); } // return the lexer line number long Lexer::getlnum (void) const { return d_lnum; } }