// ---------------------------------------------------------------------------
// - Lexer.cpp                                                               -
// - afnix engine - lexical analyzer class implementation                    -
// ---------------------------------------------------------------------------
// - This program is free software;  you can redistribute it  and/or  modify -
// - it provided that this copyright notice is kept intact.                  -
// -                                                                         -
// - This program  is  distributed in  the hope  that it will be useful, but -
// - without  any  warranty;  without  even   the   implied    warranty   of -
// - merchantability or fitness for a particular purpose.  In no event shall -
// - the copyright holder be liable for any  direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software.     -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch                                   -
// ---------------------------------------------------------------------------

#include "Lexer.hpp"
#include "Unicode.hpp"
#include "Lexical.hpp"

namespace afnix {

  // -------------------------------------------------------------------------
  // - private section                                                       -
  // -------------------------------------------------------------------------
  
  // code points of the punctuation characters and digits that are tested
  // explicitly by the lexer state machine
  static const t_quad LEX_CHAR_DQ = 0x00000022; // "
  static const t_quad LEX_CHAR_DZ = 0x00000023; // #
  static const t_quad LEX_CHAR_SQ = 0x00000027; // '
  static const t_quad LEX_CHAR_LP = 0x00000028; // (
  static const t_quad LEX_CHAR_RP = 0x00000029; // )
  static const t_quad LEX_CHAR_PS = 0x0000002B; // +
  static const t_quad LEX_CHAR_MS = 0x0000002D; // -
  static const t_quad LEX_CHAR_SP = 0x0000002E; // .
  static const t_quad LEX_CHAR_D0 = 0x00000030; // 0
  static const t_quad LEX_CHAR_D1 = 0x00000031; // 1
  static const t_quad LEX_CHAR_DP = 0x0000003A; // :
  static const t_quad LEX_CHAR_LB = 0x0000005B; // [
  static const t_quad LEX_CHAR_AS = 0x0000005C; // ANTI-SLASH
  static const t_quad LEX_CHAR_RB = 0x0000005D; // ]
  static const t_quad LEX_CHAR_US = 0x0000005F; // _
  static const t_quad LEX_CHAR_LC = 0x0000007B; // {
  static const t_quad LEX_CHAR_RC = 0x0000007D; // }

  // code points of the letters used as number format markers (b, e, r, x
  // in both cases) and as escape codes after an anti-slash (n, r, t)
  static const t_quad LEX_CHAR_BU = 0x00000042; // B
  static const t_quad LEX_CHAR_BL = 0x00000062; // b
  static const t_quad LEX_CHAR_EU = 0x00000045; // E
  static const t_quad LEX_CHAR_EL = 0x00000065; // e
  static const t_quad LEX_CHAR_NL = 0x0000006E; // n
  static const t_quad LEX_CHAR_RU = 0x00000052; // R
  static const t_quad LEX_CHAR_RL = 0x00000072; // r
  static const t_quad LEX_CHAR_XU = 0x00000058; // X
  static const t_quad LEX_CHAR_XL = 0x00000078; // x
  static const t_quad LEX_CHAR_TL = 0x00000074; // t

  // -------------------------------------------------------------------------
  // - class section                                                         -
  // -------------------------------------------------------------------------

  // create a new lexer bound to an input stream
  // @param is the input stream to read from (may be nil)

  Lexer::Lexer (Input* is) {
    // bind the stream first, then register this lexer as one of its users
    p_is = is;
    Object::iref (p_is);
    // line numbering starts at one
    d_lnum = 1;
  }

  // destroy this lexer and release the input stream bound at construction

  Lexer::~Lexer (void) {
    Object::dref (this->p_is);
  }

  // return the next available token
  //
  // The lexer is a hand written state machine: each s_xxx label is a state
  // and each goto is a transition. The token text is accumulated in the
  // lexer buffer, which is reset at the beginning of each call. An error
  // token is returned when the input stream is nil. When the input cannot
  // be tokenized, the stream is flushed with the end of line character and
  // an error token holding the text read so far is returned.

  Token Lexer::get (void) {
    // reset controls
    bool   eflag  = false; // an exponent marker has been read
    bool   esign  = false; // an exponent sign has been read
    bool   eonly  = false; // nothing has been read after the exponent marker
    long   rcount = 0;     // regex bracket nesting depth
    t_quad clast  = nilq;  // last character added to a qualified name

    // check for nil stream and reset buffer
    if (p_is == nilp) return Token (Token::ERROR,d_lnum);
    d_buffer.reset ();

    // initial state: skip the blank characters and dispatch on the first
    // significant character
  s_begin:
    t_quad c = p_is->rduc ();
    switch (c) {
    case blkq:
    case tabq:
    case crlq:
      goto s_begin;
    case eofq: 
      return Token (Token::EOF,d_lnum);
    case eolq: 
      return Token (Token::EOL,++d_lnum);
    case LEX_CHAR_LP:
      return Token (Token::RFB, d_lnum);
    case LEX_CHAR_RP:
      return Token (Token::RFE, d_lnum);
    case LEX_CHAR_LC:
      return Token (Token::BFB, d_lnum);
    case LEX_CHAR_RC:
      return Token (Token::BFE, d_lnum);
    case LEX_CHAR_DZ:
      goto s_comment;
    case LEX_CHAR_SQ:
      goto s_character;
    case LEX_CHAR_DQ:
      goto s_string;
    case LEX_CHAR_LB:
      rcount++;
      d_buffer.add (c);
      goto s_regex;
    case LEX_CHAR_PS:
    case LEX_CHAR_MS:
      d_buffer.add (c);
      goto s_number;
    case LEX_CHAR_D0:
      d_buffer.add (c);
      goto s_numfmt;
    default:
      d_buffer.add (c);
      if (Unicode::isdigit (c) == true) goto s_integer;
      if (Lexical::valid   (c) == true) goto s_lexical;
      break;
    }
    goto s_error;

    // comment state: drop everything up to the end of line or end of file
    // NOTE(review): the line count is also incremented on end of file here
    // while s_begin does not increment it - confirm this is intended
  s_comment:
    c = p_is->rduc ();
    if (c == eolq) return Token (Token::EOL,++d_lnum);
    if (c == eofq) return Token (Token::EOF,++d_lnum);
    goto s_comment;

    // a sign has been read: it starts a number or is a lexical by itself
  s_number:
    c = p_is->rduc ();
    if (c == LEX_CHAR_D0) {
      d_buffer.add (c);
      goto s_numfmt;
    }
    if (Unicode::isdigit (c) == true) {
      d_buffer.add (c);
      goto s_integer;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum);

    // lexical state: accumulate valid characters, a colon turns the name
    // into a qualified name
  s_lexical:
    c = p_is->rduc ();
    if (c == LEX_CHAR_DP) {
      d_buffer.add (clast = c);
      goto s_qualified;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::LEXICAL, d_buffer.tostring (), d_lnum);

    // qualified state: names separated by single colons
  s_qualified:
    c = p_is->rduc ();
    if ((Lexical::valid (c) == true) || (c == LEX_CHAR_DP)) {
      // two consecutive colons are not allowed
      if ((clast == LEX_CHAR_DP) && (c == LEX_CHAR_DP)) {
        d_buffer.add (c);
        goto s_error;
      }
      d_buffer.add (clast = c);
      goto s_qualified;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    if (clast == LEX_CHAR_DP) {
      // a qualified name cannot end with a colon: give the terminating
      // character back first, so that the error flush does not run into
      // the following line when that character is the end of line
      p_is->pushback (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::QUALIFIED, d_buffer.tostring (), d_lnum);

    // a leading zero has been read: select the number format
  s_numfmt:
    c = p_is->rduc ();
    if (Unicode::isdigit (c) == true) {
      d_buffer.add (c);
      goto s_integer;
    }
    // the relatif marker is not added to the buffer
    if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
      goto s_relatif;
    }
    if (c == LEX_CHAR_SP) {
      d_buffer.add (c);
      goto s_real;
    }
    if ((c == LEX_CHAR_XL) || (c == LEX_CHAR_XU)) {
      d_buffer.add (c);
      goto s_hexa;
    }
    if ((c == LEX_CHAR_BL) || (c == LEX_CHAR_BU)) {
      d_buffer.add (c);
      goto s_binary;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

    // integer state: decimal digits, optionally turned into a relatif or
    // a real number
  s_integer:
    c = p_is->rduc ();
    if (Unicode::isdigit (c) == true) {
      d_buffer.add (c);
      goto s_integer;
    }
    if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
      goto s_relatif;
    }
    if (c == LEX_CHAR_SP) {
      d_buffer.add (c);
      goto s_real;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

    // hexadecimal state: underscores are accepted and dropped
  s_hexa:
    c = p_is->rduc ();
    if (c == LEX_CHAR_US) {
      goto s_hexa;
    }
    if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
      goto s_relatif;
    }
    if (Unicode::ishexa (c) == true) {
      d_buffer.add (c);
      goto s_hexa;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

    // binary state: underscores are accepted and dropped
  s_binary:
    c = p_is->rduc ();
    if (c == LEX_CHAR_US) {
      goto s_binary;
    }
    if ((c == LEX_CHAR_RL) || (c == LEX_CHAR_RU)) {
      goto s_relatif;
    }
    if ((c == LEX_CHAR_D0) || (c == LEX_CHAR_D1)) {
      d_buffer.add (c);
      goto s_binary;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::INTEGER, d_buffer.tostring (), d_lnum);

    // relatif state: the marker has been read, the number must end here
  s_relatif:
    c = p_is->rduc ();
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::RELATIF, d_buffer.tostring (), d_lnum);

    // real state: fractional digits with an optional signed exponent
  s_real:
    c = p_is->rduc ();
    if (Unicode::isdigit (c) == true) {
      if ((eflag == true) && (eonly == true)) eonly = false;
      d_buffer.add (c);
      goto s_real;
    }
    // only one exponent marker is accepted, always stored in lower case
    if (((c == LEX_CHAR_EL) || (c == LEX_CHAR_EU)) && (eflag == false)) {
      d_buffer.add (LEX_CHAR_EL);
      eflag = true;
      eonly = true;
      goto s_real;
    }
    // a single sign is accepted, right after the exponent marker
    if (((c == LEX_CHAR_PS) || (c == LEX_CHAR_MS)) && 
        (eflag == true) && (esign == false) && (eonly == true)) {
      d_buffer.add (c);
      esign = true;
      eonly = false;
      goto s_real;
    }
    if (Lexical::valid (c) == true) {
      d_buffer.add (c);
      goto s_lexical;
    }
    if (Unicode::isncc (c) == false) {
      d_buffer.add (c);
      goto s_error;
    }
    p_is->pushback (c);
    return Token (Token::REAL, d_buffer.tostring (), d_lnum);

    // character state: accumulate up to the closing single quote
  s_character:
    c = p_is->rduc ();
    // an unterminated character literal is an error: without this check
    // the state would loop on the end of file marker
    if (c == eofq) goto s_error;
    if (c == LEX_CHAR_SQ) 
      return Token (Token::CHARACTER, d_buffer.tostring (), d_lnum);
    if (c == LEX_CHAR_AS) goto s_charesc;
    d_buffer.add (c);
    goto s_character;
    
    // character escape state: map the escape code after an anti-slash
  s_charesc:
    c = p_is->rduc ();
    switch (c) {
    case LEX_CHAR_NL:
      d_buffer.add (eolq);
      break;
    case LEX_CHAR_RL:
      d_buffer.add (crlq);
      break;
    case LEX_CHAR_TL:
      d_buffer.add (tabq);
      break;
    case LEX_CHAR_AS:
      d_buffer.add (LEX_CHAR_AS);
      break;
    case LEX_CHAR_SQ:
      d_buffer.add (LEX_CHAR_SQ);
      break;
    default:
      // unknown escape: keep the anti-slash and read the character again
      d_buffer.add (LEX_CHAR_AS);
      p_is->pushback (c);
      break;
    }
    goto s_character;
    
    // string state: accumulate up to the closing double quote
  s_string:
    c = p_is->rduc ();
    if (c == eofq) goto s_error;
    if (c == LEX_CHAR_DQ) {
      return Token (Token::STRING,d_buffer.tostring (), d_lnum);
    }
    if (c == LEX_CHAR_AS) goto s_stresc;
    d_buffer.add (c);
    goto s_string;
    
    // string escape state: map the escape code after an anti-slash
  s_stresc:
    c = p_is->rduc ();
    switch (c) {
    case LEX_CHAR_NL:
      d_buffer.add (eolq);
      break;
    case LEX_CHAR_RL:
      d_buffer.add (crlq);
      break;
    case LEX_CHAR_TL:
      d_buffer.add (tabq);
      break;
    case LEX_CHAR_AS:
      d_buffer.add (LEX_CHAR_AS);
      break;
    case LEX_CHAR_DQ:
      d_buffer.add (LEX_CHAR_DQ);
      break;
    default:
      // unknown escape: keep the anti-slash and read the character again
      d_buffer.add (LEX_CHAR_AS);
      p_is->pushback (c);
      break;
    }
    goto s_string;

    // regex state: accumulate up to the bracket that closes the first one,
    // the nesting depth is at least one when this state is entered
  s_regex:
    c = p_is->rduc ();
    switch (c) {
    case eofq:
      goto s_error;
    case LEX_CHAR_LB:
      d_buffer.add (c);
      rcount++;
      goto s_regex;
    case LEX_CHAR_RB:
      d_buffer.add (c);
      if (--rcount == 0) 
        return Token (Token::REGEX, d_buffer.tostring (), d_lnum);
      goto s_regex;
    default:
      d_buffer.add (c);
      goto s_regex;
    }

    // error state: flush the stream with the end of line character and
    // report what has been accumulated so far
  s_error:
    p_is->flush (eolq);
    return Token (Token::ERROR, d_buffer.tostring (), ++d_lnum);
  }

  // return the lexer line number
   
  long Lexer::getlnum (void) const {
    return d_lnum;
  }
}


// syntax highlighted by Code2HTML, v. 0.9.1