///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: SimpleTokenizer.cc 272 2007-01-06 19:37:27Z brian $
//
// Copyright (C) 2007 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include "AbstractTokenReceiver.h"
#include "AbstractCharReader.h"
#include "SimpleTokenizer.h"

// Constructs a tokenizer.
//
//   non_ascii_char - stored as the replacement that currentChar()
//                    substitutes for characters with the high bit set;
//                    a zero value disables the substitution.
//
// The state machine is placed in its initial state through reset().
SimpleTokenizer::SimpleTokenizer(char non_ascii_char)
    : m_nonAsciiCharReplacement(non_ascii_char)
{
    reset();
}

// Destructor. Performs no explicit work of its own.
SimpleTokenizer::~SimpleTokenizer()
{
    // intentionally empty
}

// Returns the character the reader is currently positioned on. When a
// non-zero replacement character was configured and the character has its
// high bit (0x80) set, the replacement is returned instead.
char SimpleTokenizer::currentChar(AbstractCharReader *reader)
{
    const char raw = reader->currentChar();
    const bool substitute = m_nonAsciiCharReplacement && (raw & 0x80);
    return substitute ? m_nonAsciiCharReplacement : raw;
}

// Runs every character supplied by reader through the state machine and
// passes each completed token, together with prefix, to receiver.
//
//   receiver - stored in m_receiver; sendToken() delivers tokens to it
//   reader   - advanced with forward() until that call returns false
//   prefix   - forwarded unchanged with every token
//
// State left over from an earlier call is discarded first. A token that
// is still being accumulated when the reader runs out is sent as well;
// characters still held in m_pending at that point are not.
void SimpleTokenizer::tokenize(AbstractTokenReceiver *receiver,
                               AbstractCharReader *reader,
                               const string &prefix)
{
    reset();
    m_receiver = receiver;

    for (;;) {
        if (!reader->forward()) {
            break;
        }

        const bool token_complete = processCharForState(currentChar(reader));
        if (!token_complete) {
            continue;
        }

        if (is_debug) {
            cerr << "SEND TOKEN: " << m_token << endl;
        }
        sendToken(prefix, m_token);
    }

    // Flush a word that ran right up to the end of the input.
    if (m_token.length() == 0) {
        return;
    }
    sendToken(prefix, m_token);
}

// Puts the state machine back into START and empties both the token under
// construction and the buffer of pending special characters.
void SimpleTokenizer::reset()
{
    m_pending.erase();
    m_token.erase();
    m_state = START;
}

// Reports whether ch may appear inside a word: any character with the
// high bit (0x80) set, any character is_alnum() accepts, or '%'.
// The high-bit test comes first, so is_alnum() is only consulted for
// characters without that bit.
bool SimpleTokenizer::isLetterChar(char ch)
{
    if (ch & 0x80) {
        return true;
    }
    if (is_alnum(ch)) {
        return true;
    }
    return ch == '%';
}

// Reports whether ch is one of the punctuation characters that the
// tokenizer buffers while inside a word:  . - + , _ $
bool SimpleTokenizer::isSpecialChar(char ch)
{
    return ch == '.'
        || ch == '-'
        || ch == '+'
        || ch == ','
        || ch == '_'
        || ch == '$';
}

// Appends ch to s after passing it through to_lower().
void SimpleTokenizer::appendChar(string &s, char ch)
{
    const char lowered = to_lower(ch);
    s += lowered;
}

// START state handler. Clears the token and pending buffers, moves to
// WAITING and lets the WAITING handler examine ch. Returns whatever that
// handler returns.
bool SimpleTokenizer::processStart(char ch)
{
    assert(m_state == START);

    m_pending.erase();
    m_token.erase();

    // processWaiting() asserts on the state, so switch before delegating.
    m_state = WAITING;
    const bool token_ready = processWaiting(ch);
    return token_ready;
}

// WAITING state handler. A '$' or a letter character becomes the first
// character of a new token and moves the machine to IN_WORD; any other
// character is ignored. Always returns false, since a token cannot be
// completed from this state.
bool SimpleTokenizer::processWaiting(char ch)
{
    assert(m_state == WAITING);

    const bool starts_word = (ch == '$') || isLetterChar(ch);
    if (starts_word) {
        m_state = IN_WORD;
        appendChar(m_token, ch);
    }

    return false;
}

// IN_WORD state handler.
//
//   letter character  - appended to the token; state unchanged
//   special character - buffered in m_pending; state becomes PENDING
//   anything else     - ends the word; state becomes START
//
// Returns true only in the last case, i.e. when m_token holds a finished
// token.
bool SimpleTokenizer::processInWord(char ch)
{
    assert(m_state == IN_WORD);

    bool token_complete = false;

    if (isLetterChar(ch)) {
        appendChar(m_token, ch);
    } else if (isSpecialChar(ch)) {
        appendChar(m_pending, ch);
        m_state = PENDING;
    } else {
        m_state = START;
        token_complete = true;
    }

    return token_complete;
}

bool SimpleTokenizer::processPending(char ch)
{
    if (isLetterChar(ch)) {
        m_state = IN_WORD;
        m_token += m_pending;
        appendChar(m_token, ch);
        m_pending.erase();
        return false;
    }

    if (isSpecialChar(ch)) {
        appendChar(m_pending, ch);
        return false;
    }

    m_state = START;
    m_pending.erase();
    return true;
}

// Dispatches ch to the handler for the current state and returns that
// handler's result (true when m_token holds a finished token). An
// unrecognised state trips an assertion, is reported on cerr and yields
// false.
bool SimpleTokenizer::processCharForState(char ch)
{
    //cerr << "PROCESS CHAR '" << ch << "' (" << (int)ch << ") STATE " << m_state
    //     << " TOKEN '" << m_token << "' PENDING '" << m_pending << "'" << endl;
    if (m_state == START) {
        return processStart(ch);
    }
    if (m_state == WAITING) {
        return processWaiting(ch);
    }
    if (m_state == IN_WORD) {
        return processInWord(ch);
    }
    if (m_state == PENDING) {
        return processPending(ch);
    }

    assert(false);
    cerr << "INVALID STATE " << m_state << endl;
    return false;
}

void SimpleTokenizer::sendToken(const string &prefix,
                                string &token)
{
  m_receiver->receiveToken(prefix, m_token);
  m_token.erase();
}


// syntax highlighted by Code2HTML, v. 0.9.1