ports//devel/codeblocks/work/codeblocks-1.0rc2/src/plugins/codecompletion/parser/tokenizer.cpp

/*
* This file is part of Code::Blocks Studio, an open-source cross-platform IDE
* Copyright (C) 2003  Yiannis An. Mandravellos
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact e-mail: Yiannis An. Mandravellos <mandrav@codeblocks.org>
* Program URL   : http://www.codeblocks.org
*
* $Id: tokenizer.cpp,v 1.4.2.1 2005/10/25 07:59:01 mandrav Exp $
* $Date: 2005/10/25 07:59:01 $
*/

#include <sdk.h>
#include "tokenizer.h"
#include <wx/utils.h>
#include <wx/file.h>
#include <wx/msgdlg.h>
#include "../../../sdk/manager.h"
#include <cctype>

Tokenizer::Tokenizer(const wxString& filename)
	: m_Filename(filename),
	m_BufferLen(0),
	m_NestLevel(0),
	m_UndoNestLevel(0),
	m_TokenIndex(0),
	m_UndoTokenIndex(0),
	m_LineNumber(1),
	m_UndoLineNumber(1),
	m_IsOK(false),
	m_IsOperator(false)
{
	//ctor
	m_Options.wantPreprocessor = false;
	if (!m_Filename.IsEmpty())
		Init(m_Filename);
}

Tokenizer::~Tokenizer()
{
	//dtor
}

bool Tokenizer::Init(const wxString& filename)
{
	BaseInit();
	if (filename.IsEmpty())
	{
		if (m_Filename.IsEmpty())
		{
			wxMessageBox(_T("Tokenizer::Init() called without filename..."));
			return false;
		}
	}
	else
		m_Filename = filename;

	if (!wxFileExists(m_Filename))
		return false;

	if (!ReadFile())
	{
		wxMessageBox(_T("File ") + filename + _T(" does not exist..."));
		return false;
	}

	if (!m_BufferLen)
	{
		//wxMessageBox("File is empty!");
		return false;
	}

	m_IsOK = true;
	return true;
}

bool Tokenizer::InitFromBuffer(const wxString& buffer)
{
	BaseInit();
	m_Buffer = buffer;
	m_BufferLen = buffer.Length();
	m_IsOK = true;
	m_Filename.Clear();
	return true;
}

void Tokenizer::BaseInit()
{
	m_TokenIndex = 0;
	m_UndoTokenIndex = 0;
	m_LineNumber = 1;
	m_UndoLineNumber = 1;
	m_NestLevel = 0;
	m_UndoNestLevel = 0;
	m_IsOperator = false;
	m_BufferLen = 0;
	m_LastWasPreprocessor = false;
	m_LastPreprocessor.Clear();
	m_Buffer.Clear();
	m_IsOK = false;
}

bool Tokenizer::ReadFile()
{
    if (!wxFileExists(m_Filename))
        return false;

    // open file
    wxFile file(m_Filename);

    if (!cbRead(file,m_Buffer))
        return false;
	m_BufferLen = m_Buffer.Length();

    return true;
}

wxChar Tokenizer::CurrentChar()
{
	return m_Buffer.GetChar(m_TokenIndex);
}

wxChar Tokenizer::NextChar()
{
	if ((m_TokenIndex + 1) < 0 || (m_TokenIndex + 1) >= m_BufferLen)
		return 0;
	return m_Buffer.GetChar(m_TokenIndex + 1);
}

wxChar Tokenizer::PreviousChar()
{
	if ((m_TokenIndex - 1) < 0 || (m_TokenIndex - 1) >= m_BufferLen)
		return 0;
	return m_Buffer.GetChar(m_TokenIndex - 1);
}

void Tokenizer::AdjustLineNumber()
{
	if (CurrentChar() == '\n')
		++m_LineNumber;
}

bool Tokenizer::MoveToNextChar()
{
	++m_TokenIndex;
	if (!IsEOF())
	{
		AdjustLineNumber();
		return true;
	}
	return false;
}

bool Tokenizer::SkipWhiteSpace()
{
	// skip spaces, tabs, etc.
	while (!IsEOF() && isspace(CurrentChar()))
		MoveToNextChar();
	if (IsEOF())
		return false;
	return true;
}

bool Tokenizer::SkipToChar(const wxChar& ch)
{
	// skip everything until we find ch
	while (1)
	{
		while (!IsEOF() && CurrentChar() != ch)
			MoveToNextChar();
		if (PreviousChar() != '\\')
			break;
		else
		{
			// check for "\\"
			if (m_TokenIndex - 2 >= 0 && m_Buffer.GetChar(m_TokenIndex - 2) == '\\')
				break;
		}
		MoveToNextChar();
	}
	if (IsEOF())
		return false;
	return true;
}

bool Tokenizer::CharInString(const char ch, const char* chars)
{
	int len = strlen(chars);
	for (int i = 0; i < len; ++i)
	{
		if (ch == chars[i])
			return true;
	}
	return false;
}

bool Tokenizer::SkipToOneOfChars(const char* chars, bool supportNesting)
{
	// skip everything until we find any one of chars
	while (1)
	{
		while (!IsEOF() && !CharInString(CurrentChar(), chars))
		{
			if (CurrentChar() == '"' || CurrentChar() == '\'')
			{
				// this is the case that match is inside a string!
				char ch = CurrentChar();
				MoveToNextChar();
				SkipToChar(ch);
			}
			MoveToNextChar();
			if (supportNesting)
			{
				if (CurrentChar() == '{')
					SkipBlock('{');
			}
		}
		if (PreviousChar() != '\\')
			break;
		else
		{
			// check for "\\"
			if (m_TokenIndex - 2 >= 0 && m_Buffer.GetChar(m_TokenIndex - 2) == '\\')
				break;
		}
		MoveToNextChar();
	}
	if (IsEOF())
		return false;
	return true;
}

bool Tokenizer::SkipToEOL()
{
	// skip everything until we find EOL
	while (1)
	{
		while (!IsEOF() && CurrentChar() != '\n')
			MoveToNextChar();
		wxChar last = PreviousChar();
		// if DOS line endings, we 've hit \r and we skip to \n...
		if (last == '\r')
			last = m_Buffer.GetChar(m_TokenIndex - 2);
		if (IsEOF() || last != '\\')
			break;
		else
			MoveToNextChar();
	}
	if (IsEOF())
		return false;
	return true;
}

bool Tokenizer::SkipBlock(const wxChar& ch)
{
	// skip blocks () [] {} <>
	wxChar match;
	switch (ch)
	{
		case '(': match = ')'; break;
		case '[': match = ']'; break;
		case '{': match = '}'; break;
		case '<': match = '>'; break;
		default : return false;
	}

	MoveToNextChar();
	int count = 1; // counter for nested blocks (xxx())
	while (!IsEOF())
	{
		if (CurrentChar() == '"' || CurrentChar() == '\'')
		{
			// this is the case that match is inside a string!
			char ch = CurrentChar();
			MoveToNextChar();
			SkipToChar(ch);
			MoveToNextChar();
		}
		if (CurrentChar() == ch)
			++count;
		else if (CurrentChar() == match)
			--count;
		MoveToNextChar();
		if (count == 0)
			break;
	}
	if (IsEOF())
		return false;
	return true;
}

bool Tokenizer::SkipUnwanted()
{
	while (CurrentChar() == '#' ||
			(!m_IsOperator && CurrentChar() == '=') ||
			(!m_IsOperator && CurrentChar() == '[') ||
			CurrentChar() == '?' ||
			m_Buffer.Mid(m_TokenIndex, 2) == _T("//") ||
			m_Buffer.Mid(m_TokenIndex, 2) == _T("/*"))
	{
		bool skipPreprocessor = false; // used for #include
		while (m_Buffer.Mid(m_TokenIndex, 2) == _T("//") ||
				m_Buffer.Mid(m_TokenIndex, 2) == _T("/*"))
		{
			// C/C++ style comments
			bool cstyle = NextChar() == '*';
			MoveToNextChar();
			MoveToNextChar();
			while (1)
			{
				if (!cstyle)
				{
					if (!SkipToEOL())
						return false;
					MoveToNextChar();
					break;
				}
				else
				{
					if (SkipToChar('/'))
					{
						if (PreviousChar() == '*')
						{
							MoveToNextChar();
							break;
						}
						MoveToNextChar();
					}
					else
						return false;
				}
			}
			if (IsEOF())
				return false;
			if (!SkipWhiteSpace())
				return false;
		}

		while (CurrentChar() == '#')
		{
			// preprocessor directives
			// we only care for #include and #define, for now
			unsigned int backupIdx = m_TokenIndex;
			MoveToNextChar();
			SkipWhiteSpace();
			if ((CurrentChar() == 'i' && NextChar() == 'n') || // in(clude)
				(m_Options.wantPreprocessor && CurrentChar() == 'd' && NextChar() == 'e')) // de(fine)
			{
				// ok, we have something like #in(clude)
				m_LastWasPreprocessor = true;
				m_LastPreprocessor.Clear();
				m_TokenIndex = backupIdx; // keep #
				skipPreprocessor = true;
				break;
			}
			else
			{
				// skip the rest for now...
				SkipToEOL();
				if (!SkipWhiteSpace())
					return false;
			}
			if (skipPreprocessor)
				break;
		}

		while (CurrentChar() == '[')
		{
			// array subscripts
			// skip them for now...
			SkipBlock('[');
			if (!SkipWhiteSpace())
				return false;
		}

		while (CurrentChar() == '=')
		{
			// skip assignments
			// TODO: what happens with operators?
			if (!SkipToOneOfChars(",;}", true))
				return false;
		}

		while (CurrentChar() == '?')
		{
			// skip "condition ? true : false"
			// TODO: what happens with operators?
			if (!SkipToOneOfChars(";}"))
				return false;
		}
		if (skipPreprocessor)
			break;
	}
	return true;
}

wxString Tokenizer::GetToken()
{
	m_UndoTokenIndex = m_TokenIndex;
	m_UndoLineNumber = m_LineNumber;
	m_UndoNestLevel = m_NestLevel;
	return DoGetToken();
}

wxString Tokenizer::PeekToken()
{
	unsigned int undoTokenIndex = m_TokenIndex;
	unsigned int undoLineNumber = m_LineNumber;
	unsigned int undoNestLevel = m_NestLevel;
	wxString peek = DoGetToken();
	m_TokenIndex = undoTokenIndex;
	m_LineNumber = undoLineNumber;
	m_NestLevel = undoNestLevel;
	return peek;
}

void Tokenizer::UngetToken()
{
	m_TokenIndex = m_UndoTokenIndex;
	m_LineNumber = m_UndoLineNumber;
	m_NestLevel = m_UndoNestLevel;
}

wxString Tokenizer::DoGetToken()
{
	if (IsEOF())
		return wxEmptyString;

	if (!SkipWhiteSpace())
		return wxEmptyString;

	if (!SkipUnwanted())
		return wxEmptyString;

	int start = m_TokenIndex;
	wxString m_Str;

	if (isalpha(CurrentChar()) || CurrentChar() == '_')
	{
		// keywords, identifiers, etc.
		while (!IsEOF() &&
				(isalnum(CurrentChar()) ||
				CurrentChar() == '_'))
			MoveToNextChar();
		if (IsEOF())
			return wxEmptyString;
		m_Str = m_Buffer.Mid(start, m_TokenIndex - start);
		m_IsOperator = m_Str.Matches(_T("operator"));
	}
	else if (isdigit(CurrentChar()))
	{
		// numbers
		while (!IsEOF() && CharInString(CurrentChar(), "0123456789.abcdefABCDEFfXxLl"))
			MoveToNextChar();
		if (IsEOF())
			return wxEmptyString;
		m_Str = m_Buffer.Mid(start, m_TokenIndex - start);
		m_IsOperator = false;
	}
	else if (CurrentChar() == '"' ||
			CurrentChar() == '\'')
	{
		// string, char, etc.
		wxChar match = CurrentChar();
		MoveToNextChar();  // skip starting ' or "
		if (!SkipToChar(match))
			return wxEmptyString;
		MoveToNextChar(); // skip ending ' or "
		m_Str = m_Buffer.Mid(start, m_TokenIndex - start);
	}
	else if (CurrentChar() == ':')
	{
		if (NextChar() == ':')
		{
			MoveToNextChar();
			MoveToNextChar();
			m_Str = _T("::");
		}
		else
		{
			MoveToNextChar();
			m_Str = _T(":");
		}
	}
	else if (CurrentChar() == '(')
	{
		m_IsOperator = false;
		// skip blocks () []
		if (!SkipBlock(CurrentChar()))
			return wxEmptyString;
		wxString tmp = m_Buffer.Mid(start, m_TokenIndex - start);
		tmp.Replace(_T("\t"), _T(" ")); // replace tabs with spaces
		tmp.Replace(_T("\n"), _T(" ")); // replace LF with spaces
		tmp.Replace(_T("\r"), _T(" ")); // replace CR with spaces
		// fix-up arguments (remove excessive spaces/tabs/newlines)
		for (unsigned int i = 0; i < tmp.Length() - 1; ++i)
		{
			if (tmp.GetChar(i) == '/' && tmp.GetChar(i + 1) == '*')
			{
				// skip C comments
				i += 2;
				while (i < tmp.Length() - 1)
				{
					if (tmp.GetChar(i) == '*' && tmp.GetChar(i + 1) == '/')
						break;
					++i;
				}
				if (i >= tmp.Length() - 1 || tmp.GetChar(i + 1) != '/')
					continue; // we failed...
				i += 2;
			}
			else if (tmp.GetChar(i) == '=')
			{
				// skip default assignments
				++i;
				int level = 0; // nesting parenthesis
				while (i < tmp.Length())
				{
					if (tmp.GetChar(i) == '(')
						++level;
					else if (tmp.GetChar(i) == ')')
						--level;
					if ((tmp.GetChar(i) == ',' && level == 0) ||
						(tmp.GetChar(i) == ')' && level < 0))
						break;
					++i;
				}
                if (i < tmp.Length() && tmp.GetChar(i) == ',')
                    --i;
				continue; // we are done here
			}

			if (i < tmp.Length() - 1 && tmp.GetChar(i) == ' ' && tmp.GetChar(i + 1) == ' ')
				continue; // skip excessive spaces
			m_Str << tmp.GetChar(i);
		}
		m_Str << _T(')'); // add closing parenthesis (see "i < tmp.Length() - 1" in previous "for")
		m_Str.Replace(_T("  "), _T(" ")); // replace two-spaces with single-space (introduced if it skipped comments or assignments)
		m_Str.Replace(_T("( "), _T("("));
		m_Str.Replace(_T(" )"), _T(")"));
	}
	else
	{
		if (CurrentChar() == '{')
			++m_NestLevel;
		else if (CurrentChar() == '}')
			--m_NestLevel;
		m_Str = CurrentChar();
		MoveToNextChar();
	}

	if (m_LastWasPreprocessor && !m_Str.Matches(_T("#")) && !m_LastPreprocessor.Matches(_T("#")))
	{
		if (!m_LastPreprocessor.Matches(_T("#include")))
		{
			// except for #include, all other preprocessor directives need only
			// one word exactly after the directive, e.g. #define THIS_WORD
			SkipToEOL();
		}
		m_LastWasPreprocessor = false;
		m_LastPreprocessor.Clear();
	}

	if (m_LastWasPreprocessor)
		m_LastPreprocessor << m_Str;

	return m_Str;
}
syntax highlighted by Code2HTML, v. 0.9.1