/**
 * @file tokenize.cpp
 * This file breaks up the text stream into tokens or chunks.
 *
 * Each routine needs to set pc->len and pc->type.
 *
 * @author  Ben Gardner
 * @license GPL v2+
 *
 * $Id: tokenize.cpp 971 2007-10-17 21:26:22Z bengardner $
 */

#include "uncrustify_types.h"
#include "char_table.h"
#include "prototypes.h"
#include "chunk_list.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include <cctype>


static bool parse_string(chunk_t *pc, int quote_idx, bool allow_escape);

#include "d.tokenize.cpp"


/**
 * Figure of the length of the comment at text.
 * The next bit of text starts with a '/', so it might be a comment.
 * There are three types of comments:
 *  - C comments that start with  '/ *' and end with '* /'
 *  - C++ comments that start with //
 *  - D nestable comments '/+' '+/'
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether a comment was parsed
 */
bool parse_comment(chunk_t *pc)
{
   int  len     = 2;
   bool is_d    = (cpd.lang_flags & LANG_D) != 0;
   int  d_level = 0;

   if ((pc->str[0] != '/') ||
       ((pc->str[1] != '*') && (pc->str[1] != '/') &&
        ((pc->str[1] != '+') || !is_d)))
   {
      return(false);
   }

   /* account for opening two chars */
   cpd.column += 2;

   if (pc->str[1] == '/')
   {
      pc->type = CT_COMMENT_CPP;
      while (true)
      {
         while ((pc->str[len] != '\n') &&
                (pc->str[len] != '\r') &&
                (pc->str[len] != 0))
         {
            len++;
         }

         if (pc->str[len - 1] != '\\')
         {
            break;
         }
         len++;
      }
   }
   else if (pc->str[len] == 0)
   {
      /* unexpected end of file */
      return(false);
   }
   else if (pc->str[1] == '+')
   {
      pc->type = CT_COMMENT;
      d_level++;
      while ((d_level > 0) && (pc->str[len + 1] != 0))
      {
         if ((pc->str[len] == '+') && (pc->str[len + 1] == '/'))
         {
            len        += 2;
            cpd.column += 2;
            d_level--;
            continue;
         }

         if ((pc->str[len] == '/') && (pc->str[len + 1] == '+'))
         {
            len        += 2;
            cpd.column += 2;
            d_level++;
            continue;
         }

         if ((pc->str[len] == '\n') || (pc->str[len] == '\r'))
         {
            pc->type = CT_COMMENT_MULTI;
            pc->nl_count++;
            cpd.column = 0;
            cpd.line_number++;

            if (pc->str[len] == '\r')
            {
               if (pc->str[len + 1] == '\n')
               {
                  cpd.le_counts[LE_CRLF]++;
                  len++;
               }
               else
               {
                  cpd.le_counts[LE_CR]++;
               }
            }
            else
            {
               cpd.le_counts[LE_LF]++;
            }
         }
         len++;
         cpd.column++;
      }
   }
   else
   {
      pc->type = CT_COMMENT;
      while (pc->str[len + 1] != 0)
      {
         if ((pc->str[len] == '*') && (pc->str[len + 1] == '/'))
         {
            len        += 2;
            cpd.column += 2;
            break;
         }

         if ((pc->str[len] == '\n') || (pc->str[len] == '\r'))
         {
            pc->type = CT_COMMENT_MULTI;
            pc->nl_count++;
            cpd.column = 0;
            cpd.line_number++;

            if (pc->str[len] == '\r')
            {
               if (pc->str[len + 1] == '\n')
               {
                  cpd.le_counts[LE_CRLF]++;
                  len++;
               }
               else
               {
                  cpd.le_counts[LE_CR]++;
               }
            }
            else
            {
               cpd.le_counts[LE_LF]++;
            }
         }
         len++;
         cpd.column++;
      }
   }
   pc->len = len;
   return(true);
}

/**
 * Count the number of characters in the number.
 * The next bit of text starts with a number (0-9 or '.'), so it is a number.
 * Count the number of characters in the number.
 *
 * This should cover all number formats for all languages.
 * Note that this is not a strict parser. It will happily parse numbers in
 * an invalid format.
 *
 * For example, only D allows underscores in the numbers, but they are
 * allowed in all formats.
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether a number was parsed
 */
static bool parse_number(chunk_t *pc)
{
   int  len;
   int  tmp;
   bool is_float;
   bool did_hex = false;

   /* A number must start with a digit or a dot, followed by a digit */
   if (!isdigit(pc->str[0]) &&
       ((pc->str[0] != '.') || !isdigit(pc->str[1])))
   {
      return(false);
   }
   len = 1;

   is_float = (pc->str[0] == '.');
   if (is_float && (pc->str[1] == '.'))
   {
      return(false);
   }

   /* Check for Hex, Octal, or Binary
    * Note that only D and Pawn support binary, but who cares?
    */
   if (pc->str[0] == '0')
   {
      switch (toupper(pc->str[1]))
      {
      case 'X':               /* hex */
         did_hex = true;
         do
         {
            len++;
         } while (isxdigit(pc->str[len]) || (pc->str[len] == '_'));
         break;

      case 'B':               /* binary */
         do
         {
            len++;
         } while ((pc->str[len] == '0') || (pc->str[len] == '1') ||
                  (pc->str[len] == '_'));
         break;

      case '0':                /* octal */
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
         do
         {
            len++;
         } while (((pc->str[len] >= '0') && (pc->str[len] <= '7')) ||
                  (pc->str[len] == '_'));
         break;

      default:
         /* either just 0 or 0.1 or 0UL, etc */
         break;
      }
   }
   else
   {
      /* Regular int or float */
      while (isdigit(pc->str[len]) || (pc->str[len] == '_'))
      {
         len++;
      }
   }

   /* Check if we stopped on a decimal point & make sure it isn't '..' */
   if ((pc->str[len] == '.') && (pc->str[len + 1] != '.'))
   {
      len++;
      is_float = true;
      if (did_hex)
      {
         while (isxdigit(pc->str[len]) || (pc->str[len] == '_'))
         {
            len++;
         }
      }
      else
      {
         while (isdigit(pc->str[len]) || (pc->str[len] == '_'))
         {
            len++;
         }
      }
   }

   /* Check exponent
    * Valid exponents per language (not that it matters):
    * C/C++/D/Java: eEpP
    * C#/Pawn:      eE
    */
   tmp = toupper(pc->str[len]);
   if ((tmp == 'E') || (tmp == 'P'))
   {
      is_float = true;
      len++;
      if ((pc->str[len] == '+') || (pc->str[len] == '-'))
      {
         len++;
      }
      while (isdigit(pc->str[len]) || (pc->str[len] == '_'))
      {
         len++;
      }
   }

   /* Check the suffixes
    * Valid suffixes per language (not that it matters):
    *        Integer       Float
    * C/C++: uUlL          lLfF
    * C#:    uUlL          fFdDMm
    * D:     uUL           ifFL
    * Java:  lL            fFdD
    * Pawn:  (none)        (none)
    *
    * Note that i, f, d, and m only appear in floats.
    */
   while (1)
   {
      tmp = toupper(pc->str[len]);
      if ((tmp == 'I') || (tmp == 'F') || (tmp == 'D') || (tmp == 'M'))
      {
         is_float = true;
      }
      else if ((tmp != 'L') && (tmp != 'U'))
      {
         break;
      }
      len++;
   }

   pc->len     = len;
   pc->type    = is_float ? CT_NUMBER_FP : CT_NUMBER;
   cpd.column += len;
   return(true);
}

/**
 * Count the number of characters in a quoted string.
 * The next bit of text starts with a quote char " or ' or <.
 * Count the number of characters until the matching character.
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether a string was parsed
 */
static bool parse_string(chunk_t *pc, int quote_idx, bool allow_escape)
{
   bool escaped = 0;
   int  end_ch;
   int  len          = quote_idx;
   char escape_char  = cpd.settings[UO_string_escape_char].n;
   char escape_char2 = cpd.settings[UO_string_escape_char2].n;

   pc->type = CT_STRING;

   end_ch = CharTable::Get(pc->str[len]) & 0xff;
   len++;

   for ( /* nada */; pc->str[len] != 0; len++)
   {
      if (!escaped)
      {
         cpd.column++;
         if (pc->str[len] == escape_char)
         {
            escaped = (escape_char != 0);
         }
         else if ((pc->str[len] == escape_char2) &&
                  (pc->str[len + 1] == end_ch))
         {
            escaped = allow_escape;
         }
         else if (pc->str[len] == '\n')
         {
            cpd.line_number++;
            cpd.column = 1;
            pc->nl_count++;
            pc->type = CT_STRING_MULTI;
         }
         else if (pc->str[len] == end_ch)
         {
            len++;
            break;
         }
      }
      else
      {
         escaped = false;
      }
   }

   /* D can have suffixes */
   if (((cpd.lang_flags & LANG_D) != 0) &&
       ((pc->str[len] == 'c') ||
        (pc->str[len] == 'w') ||
        (pc->str[len] == 'd')))
   {
      len++;
   }
   pc->len = len;
   return(true);
}

/**
 * Literal string, ends with single "
 * Two "" don't end the string.
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether a string was parsed
 */
static bool parse_cs_string(chunk_t *pc)
{
   int len = 2;

   /* go until we hit a zero (end of file) or a single " */
   while (pc->str[len] != 0)
   {
      if ((pc->str[len] == '"') && (pc->str[len + 1] == '"'))
      {
         len += 2;
      }
      else
      {
         len++;
         if (pc->str[len - 1] == '"')
         {
            break;
         }
      }
   }

   pc->len     = len;
   pc->type    = CT_STRING;
   cpd.column += len;
   return(true);
}

/**
 * Count the number of characters in a word.
 * The first character is already valid for a keyword
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether a word was parsed (always true)
 */
bool parse_word(chunk_t *pc, bool skipcheck)
{
   int len = 1;
   const chunk_tag_t *tag;

   while ((pc->str[len] < 127) && CharTable::IsKw2(pc->str[len]))
   {
      len++;
   }
   cpd.column += len;
   pc->len     = len;
   pc->type    = CT_WORD;

   if (skipcheck)
   {
      return(true);
   }

   /* Detect pre-processor functions now */
   if ((cpd.in_preproc == CT_PP_DEFINE) &&
       (cpd.preproc_ncnl_count == 1))
   {
      if (pc->str[len] == '(')
      {
         pc->type = CT_MACRO_FUNC;
      }
      else
      {
         pc->type = CT_MACRO;
      }
   }

   /* Turn it into a keyword now */
   tag = find_keyword(pc->str, len);
   if (tag != NULL)
   {
      pc->type = tag->type;
   }
   return(true);
}

/**
 * Count the number of whitespace characters.
 *
 * @param pc   The structure to update, str is an input.
 * @return     Whether whitespace was parsed
 */
bool parse_whitespace(chunk_t *pc)
{
   int  len          = 0;
   int  nl_count     = 0;
   bool last_was_tab = false;

   while ((pc->str[len] != 0) &&
          ((pc->str[len] <= ' ') || (pc->str[len] >= 127)))
   {
      last_was_tab = false;
      switch (pc->str[len])
      {
      case '\r':
         if (pc->str[len + 1] == '\n')
         {
            /* CRLF ending */
            len++;
            cpd.le_counts[LE_CRLF]++;
         }
         else
         {
            /* CR ending */
            cpd.le_counts[LE_CR]++;
         }
         nl_count++;
         cpd.column = 1;
         cpd.line_number++;
         break;

      case '\n':
         /* LF ending */
         cpd.le_counts[LE_LF]++;
         nl_count++;
         cpd.column = 1;
         cpd.line_number++;
         break;

      case '\t':
         cpd.column = calc_next_tab_column(cpd.column,
                                           cpd.settings[UO_input_tab_size].n);
         last_was_tab = true;
         break;

      case ' ':
         cpd.column++;
         break;

      default:
         break;
      }
      len++;
   }

   if (len > 0)
   {
      pc->nl_count  = nl_count;
      pc->type      = nl_count ? CT_NEWLINE : CT_WHITESPACE;
      pc->len       = len;
      pc->after_tab = last_was_tab;
   }
   return(len != 0);
}

/**
 * Called when we hit a backslash.
 * If there is nothing but whitespace until the newline, then this is a
 * backslash newline
 */
static bool parse_bs_newline(chunk_t *pc)
{
   pc->len = 1;

   while (isspace(pc->str[pc->len]))
   {
      if ((pc->str[pc->len] == '\r') || (pc->str[pc->len] == '\n'))
      {
         if (pc->str[pc->len] == '\n')
         {
            pc->len++;
         }
         else /* it is '\r' */
         {
            pc->len++;
            if (pc->str[pc->len] == '\n')
            {
               pc->len++;
            }
         }
         pc->type     = CT_NL_CONT;
         pc->nl_count = 1;
         return(true);
      }
      pc->len++;
   }
   return(false);
}

/**
 * Skips the next bit of whatever and returns the type of block.
 *
 * pc->str is the input text.
 * pc->len in the output length.
 * pc->type is the output type
 * pc->column is output column
 *
 * @param pc      The structure to update, str is an input.
 * @return        true/false - whether anything was parsed
 */
static bool parse_next(chunk_t *pc)
{
   const chunk_tag_t *punc;

   if ((pc == NULL) || (pc->str == NULL) || (*pc->str == 0))
   {
      //fprintf(stderr, "All done!\n");
      return(false);
   }

   /* Save off the current column */
   pc->orig_line = cpd.line_number;
   pc->column    = cpd.column;
   pc->orig_col  = cpd.column;
   pc->len       = 0;
   pc->type      = CT_NONE;
   pc->nl_count  = 0;
   pc->flags     = 0;

   /**
    *  Parse whitespace
    */
   if (parse_whitespace(pc))
   {
      return(true);
   }

   /**
    *  Handle unknown/unhandled preprocessors
    */
   if ((cpd.in_preproc > CT_PP_BODYCHUNK) &&
       (cpd.in_preproc <= CT_PP_OTHER))
   {
      /* Chunk to a newline or comment */
      pc->type = CT_PREPROC_BODY;
      char last = 0;
      while (pc->str[pc->len] != 0)
      {
         char ch = pc->str[pc->len];

         if ((ch == '\n') || (ch == '\r'))
         {
            /* Back off if this is an escaped newline */
            if (last == '\\')
            {
               pc->len--;
            }
            break;
         }

         /* Quit on a C++ comment start */
         if ((ch == '/') && (pc->str[pc->len + 1] == '/'))
         {
            break;
         }
         last = ch;
         pc->len++;
      }
      if (pc->len > 0)
      {
         return(true);
      }
   }

   /**
    *   Detect backslash-newline
    *
    * REVISIT: does this need to handle other line endings?
    */
   if ((pc->str[0] == '\\') && parse_bs_newline(pc))
   {
      cpd.column = 1;
      cpd.line_number++;
      return(true);
   }

   /**
    *  Parse comments
    */
   if (parse_comment(pc))
   {
      return(true);
   }

   /* Check for C# literal strings, ie @"hello" */
   if (((cpd.lang_flags & LANG_CS) != 0) && (*pc->str == '@'))
   {
      if (pc->str[1] == '"')
      {
         parse_cs_string(pc);
         return(true);
      }
      if (CharTable::IsKw1(pc->str[1]) && parse_word(pc, true))
      {
         return(true);
      }
   }

   /* PAWN specific stuff */
   if ((cpd.lang_flags & LANG_PAWN) != 0)
   {
      /* Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" */
      if ((pc->str[0] == '\\') || (pc->str[0] == '!'))
      {
         if (pc->str[1] == '"')
         {
            parse_string(pc, 1, (*pc->str == '!'));
            return(true);
         }
         else if (((pc->str[1] == '\\') || (pc->str[1] == '!')) &&
                  (pc->str[2] == '"'))
         {
            parse_string(pc, 2, false);
            return(true);
         }
      }
   }

   /**
    *  Parse strings and character constants
    */

   if (parse_number(pc))
   {
      return(true);
   }

   if ((cpd.lang_flags & LANG_D) != 0)
   {
      /* D specific stuff */
      if (d_parse_string(pc))
      {
         return(true);
      }
   }
   else
   {
      /* Not D stuff */

      /* Check for L'a', L"abc", 'a', "abc", <abc> strings */
      if (((*pc->str == 'L') && ((pc->str[1] == '"') || (pc->str[1] == '\''))) ||
          (*pc->str == '"') ||
          (*pc->str == '\'') ||
          ((*pc->str == '<') && (cpd.in_preproc == CT_PP_INCLUDE)))
      {
         parse_string(pc, (*pc->str == 'L') ? 1 : 0, true);
         return(true);
      }
   }

   /* Check for pawn/ObjectiveC identifiers */
   if ((*pc->str == '@') &&
       CharTable::IsKw2(pc->str[1]) &&
       parse_word(pc, false))
   {
      return(true);
   }

   if (CharTable::IsKw1(*pc->str) && parse_word(pc, false))
   {
      return(true);
   }

   if ((punc = find_punctuator(pc->str, cpd.lang_flags)) != NULL)
   {
      pc->type    = punc->type;
      pc->len     = strlen(punc->tag);
      cpd.column += pc->len;
      pc->flags  |= PCF_PUNCTUATOR;
      return(true);
   }


   /* throw away this character */
   pc->type = CT_UNKNOWN;
   pc->len  = 1;

   LOG_FMT(LWARN, "%s:%d Garbage in col %d: %x\n",
           cpd.filename, pc->orig_line, cpd.column, *pc->str);
   cpd.error_count++;
   return(true);
}

/**
 * This function parses or tokenizes the whole buffer into a list.
 * It has to do some tricks to parse preprocessors.
 *
 * If output_text() were called immediately after, two things would happen:
 *  - trailing whitespace are removed.
 *  - leading space & tabs are converted to the appropriate format.
 *
 * All the tokens are inserted before ref. If ref is NULL, they are inserted
 * at the end of the list.  Line numbers are relative to the start of the data.
 */
void tokenize(const char *data, int data_len, chunk_t *ref)
{
   int                idx = 0;
   chunk_t            chunk;
   chunk_t            *pc    = NULL;
   chunk_t            *rprev = NULL;
   chunk_t            *prev  = NULL;
   struct parse_frame frm;
   bool               last_was_tab = false;

   memset(&frm, 0, sizeof(frm));
   memset(&chunk, 0, sizeof(chunk));

   cpd.line_number = 1;
   cpd.column      = 1;

   while (idx < data_len)
   {
      chunk.str = &data[idx];
      if (!parse_next(&chunk))
      {
         LOG_FMT(LERR, "%s:%d Bailed before the end?\n",
                 cpd.filename, cpd.line_number);
         cpd.error_count++;
         break;
      }

      /* Bump up the index */
      idx += chunk.len;

      /* Don't create an entry for whitespace */
      if (chunk.type == CT_WHITESPACE)
      {
         last_was_tab = chunk.after_tab;
         continue;
      }

      if (chunk.type == CT_NEWLINE)
      {
         last_was_tab    = chunk.after_tab;
         chunk.after_tab = false;
         chunk.len       = 0;
      }
      else if (chunk.type == CT_NL_CONT)
      {
         last_was_tab    = chunk.after_tab;
         chunk.after_tab = false;
         chunk.len       = 2;
         chunk.str       = "\\\n";
      }
      else
      {
         chunk.after_tab = last_was_tab;
         last_was_tab    = false;
      }

      /* Strip trailing whitespace (for CPP comments and PP blocks) */
      while ((chunk.len > 0) &&
             ((chunk.str[chunk.len - 1] == ' ') ||
              (chunk.str[chunk.len - 1] == '\t')))
      {
         chunk.len--;
      }

      /* Store off the end column */
      chunk.orig_col_end = cpd.column;

      /* Add the chunk to the list */
      rprev = pc;
      if (!chunk_is_newline(pc) && !chunk_is_comment(pc))
      {
         prev = pc;
      }
      if (rprev != NULL)
      {
         pc->flags |= rprev->flags & PCF_COPY_FLAGS;

         /* a newline can't be in a preprocessor */
         if (pc->type == CT_NEWLINE)
         {
            pc->flags &= ~PCF_IN_PREPROC;
         }
      }
      if (ref != NULL)
      {
         chunk.flags |= PCF_INSERTED;
      }
      else
      {
         chunk.flags &= ~PCF_INSERTED;
      }
      pc = chunk_add_before(&chunk, ref);

      /* A newline marks the end of a preprocessor */
      if ((pc->type == CT_NEWLINE) || (pc->type == CT_COMMENT_MULTI))
      {
         cpd.in_preproc         = CT_NONE;
         cpd.preproc_ncnl_count = 0;
      }

      /* Special handling for preprocessor stuff */
      if (cpd.in_preproc != CT_NONE)
      {
         pc->flags |= PCF_IN_PREPROC;

         /* Count words after the preprocessor */
         if (!chunk_is_comment(pc) && !chunk_is_newline(pc))
         {
            cpd.preproc_ncnl_count++;
         }

         /* Figure out the type of preprocessor for #include parsing */
         if (cpd.in_preproc == CT_PREPROC)
         {
            if ((pc->type < CT_PP_DEFINE) || (pc->type > CT_PP_OTHER))
            {
               pc->type = CT_PP_OTHER;
            }
            cpd.in_preproc = pc->type;
         }
      }
      else
      {
         /* Check for a preprocessor start */
         if ((pc->type == CT_POUND) &&
             ((rprev == NULL) || (rprev->type == CT_NEWLINE)))
         {
            pc->type       = CT_PREPROC;
            pc->flags     |= PCF_IN_PREPROC;
            cpd.in_preproc = CT_PREPROC;
         }
      }
   }

   /* Set the cpd.newline string for this file */
   if ((cpd.settings[UO_newlines].le == LE_LF) ||
       ((cpd.settings[UO_newlines].le == LE_AUTO) &&
        (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CRLF]) &&
        (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CR])))
   {
      /* LF line ends */
      strcpy(cpd.newline, "\n");
      LOG_FMT(LLINEENDS, "Using LF line endings\n");
   }
   else if ((cpd.settings[UO_newlines].le == LE_CRLF) ||
            ((cpd.settings[UO_newlines].le == LE_AUTO) &&
             (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_LF]) &&
             (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_CR])))
   {
      /* CRLF line ends */
      strcpy(cpd.newline, "\r\n");
      LOG_FMT(LLINEENDS, "Using CRLF line endings\n");
   }
   else
   {
      /* CR line ends */
      strcpy(cpd.newline, "\r");
      LOG_FMT(LLINEENDS, "Using CR line endings\n");
   }
}