/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is 
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the NPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the NPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */


#include "nsJISx4501LineBreaker.h"


#include "pratom.h"
#include "nsLWBRKDll.h"
#include "jisx4501class.h"
#define TH_UNICODE
#include "th_char.h"
#include "rulebrk.h"
#include "nsUnicharUtils.h"


/* 

   Simplification of Pair Table in JIS X 4051

   1. The Origion Table - in 4.1.3

   In JIS x 4051. The pair table is defined as below

   Class of
   Leading    Class of Trailing Char Class
   Char        

              1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
                                                 *  #  *  #
        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
        2        X  X  X  X  X                                               X
        3        X  X  X  X  X                                               X
        4        X  X  X  X  X                                               X
        5        X  X  X  X  X                                               X
        6        X  X  X  X  X                                               X
        7        X  X  X  X  X  X                                            X 
        8        X  X  X  X  X                                X              E 
        9        X  X  X  X  X                                               X
       10        X  X  X  X  X                                               X
       11        X  X  X  X  X                                               X
       12        X  X  X  X  X                                               X  
       13        X  X  X  X  X                    X                          X
       14        X  X  X  X  X                          X                    X
       15        X  X  X  X  X        X                       X        X     X 
       16        X  X  X  X  X                                   X     X     X
       17        X  X  X  X  X                                               E 
       18        X  X  X  X  X                                X  X     X     X 
       19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
       20        X  X  X  X  X                                               E

   * Same Char
   # Other Char
    
   X Cannot Break

   2. Simplified by remove the class which we do not care

   However, since we do not care about class 13(Subscript), 14(Ruby), 
   19(split line note begin quote), and 20(split line note end quote) 
   we can simplify this par table into the following 

   Class of
   Leading    Class of Trailing Char Class
   Char        

              1  2  3  4  5  6  7  8  9 10 11 12 15 16 17 18 
                                                 
        1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
        2        X  X  X  X  X                             
        3        X  X  X  X  X                            
        4        X  X  X  X  X                           
        5        X  X  X  X  X                          
        6        X  X  X  X  X                         
        7        X  X  X  X  X  X                      
        8        X  X  X  X  X                    X    
        9        X  X  X  X  X                                   
       10        X  X  X  X  X                                  
       11        X  X  X  X  X                                 
       12        X  X  X  X  X                                
       15        X  X  X  X  X        X           X        X    
       16        X  X  X  X  X                       X     X    
       17        X  X  X  X  X                                  
       18        X  X  X  X  X                    X  X     X    

   3. Simplified by merged classes

   After the 2 simplification, the pair table have some duplication 
   a. class 2, 3, 4, 5, 6,  are the same- we can merged them
   b. class 10, 11, 12, 17  are the same- we can merged them


   Class of
   Leading    Class of Trailing Char Class
   Char        

              1 [a] 7  8  9 [b]15 16 18 
                                     
        1     X  X  X  X  X  X  X  X  X
      [a]        X                             
        7        X  X                      
        8        X              X    
        9        X                                   
      [b]        X                                  
       15        X        X     X     X    
       16        X                 X  X    
       18        X              X  X  X    


   4. We add THAI characters and make it breakable w/ all ther class

   Class of
   Leading    Class of Trailing Char Class
   Char        

              1 [a] 7  8  9 [b]15 16 18 THAI
                                     
        1     X  X  X  X  X  X  X  X  X
      [a]        X                             
        7        X  X                      
        8        X              X    
        9        X                                   
      [b]        X                                  
       15        X        X     X     X    
       16        X                 X  X    
       18        X              X  X  X    
     THAI                                T
      
     T : need special handling

   5. Now we use one bit to encode weather it is breakable, and use 2 bytes
      for one row, then the bit table will look like:

                 18    <-   1
            
       1  0000 0001 1111 1111  = 0x01FF
      [a] 0000 0000 0000 0010  = 0x0002
       7  0000 0000 0000 0110  = 0x0006
       8  0000 0000 0100 0010  = 0x0042
       9  0000 0000 0000 0010  = 0x0002
      [b] 0000 0000 0000 0010  = 0x0002
      15  0000 0001 0101 0010  = 0x0152
      16  0000 0001 1000 0010  = 0x0182
      18  0000 0001 1100 0010  = 0x01C2
    THAI  0000 0000 0000 0000  = 0x0000

   5. Now we map the class to number
      
      0: 1 
      1: [a]- 2, 3, 4, 5, 6
      2: 7
      3: 8
      4: 9
      5: [b]- 10, 11, 12, 17
      6: 15
      7: 16
      8: 18
      9: THAI

*/

#define MAX_CLASSES 10

static const PRUint16 gPair[MAX_CLASSES] = {
  0x01FF, 
  0x0002, 
  0x0006, 
  0x0042, 
  0x0002, 
  0x0002, 
  0x0152, 
  0x0182, 
  0x01C2,
  0x0000
};


static inline int
GETCLASSFROMTABLE(const PRUint32* t, PRUint16 l)
{
  return ((((t)[(l>>3)]) >> ((l & 0x0007)<<2)) & 0x000f);
}

#define CLASS_THAI 9


static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(PRUnichar u)
{
  return ((0xff66 <= (u)) && ((u) <= 0xff70));
}

static inline int
IS_CJK_CHAR(PRUnichar u)
{
  return ((0x1100 <= (u) && (u) <= 0x11ff) ||
          (0x2e80 <= (u) && (u) <= 0xd7ff) ||
          (0xf900 <= (u) && (u) <= 0xfaff) ||
          (0xff00 <= (u) && (u) <= 0xffef) );
}

static inline int
IS_SPACE(PRUnichar u)
{
  return ((u) == 0x0020 || (u) == 0x0009 || (u) == 0x000a || (u) == 0x000d || (u)==0x200b);
}

PRInt8 nsJISx4051LineBreaker::GetClass(PRUnichar u)
{
   PRUint16 h = u & 0xFF00;
   PRUint16 l = u & 0x00ff;
   PRInt8 c;
   
   // Handle 3 range table first
   if( 0x0000 == h)
   {
     c = GETCLASSFROMTABLE(gLBClass00, l);
   } 
   else if(th_isthai(u))
   {
     c = CLASS_THAI;
   }
   else if( 0x2000 == h)
   {
     c = GETCLASSFROMTABLE(gLBClass20, l);
   } 
   else if( 0x2100 == h)
   {
     c = GETCLASSFROMTABLE(gLBClass21, l);
   } 
   else if( 0x3000 == h)
   {
     c = GETCLASSFROMTABLE(gLBClass30, l);
   } 
   else if (  ( ( 0x3200 <= u) && ( u <= 0xA4CF) ) || // CJK and Yi 
              ( ( 0xAC00 <= h) && ( h <= 0xD7FF) ) || // Hangul
              ( ( 0xf900 <= h) && ( h <= 0xfaff) )
             )
   { 
     c = 5; // CJK charcter, Han, and Han Compatability
   } 
   else if( 0xff00 == h)
   {
     if( l < 0x0060) // Fullwidth ASCII variant 
     {
       c = GETCLASSFROMTABLE(gLBClass00, (l+0x20));
     } else if (l < 0x00a0) {
       switch (l)
       {
         case 0x61: c = GetClass(0x3002); break;
         case 0x62: c = GetClass(0x300c); break;
         case 0x63: c = GetClass(0x300d); break;
         case 0x64: c = GetClass(0x3001); break;
         case 0x65: c = GetClass(0x30fb); break;
         case 0x9e: c = GetClass(0x309b); break;
         case 0x9f: c = GetClass(0x309c); break;
         default:
           if(IS_HALFWIDTH_IN_JISx4051_CLASS3(u))
              c = 1; // jis x4051 class 3
           else
              c = 5; // jis x4051 class 11
           break;
       }
       // Halfwidth Katakana variants
     } else if( l < 0x00e0) {
       c = 8; // Halfwidth Hangul variants 
     } else if( l < 0x00f0) {
       static PRUnichar NarrowFFEx[16] = 
       { 
         0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
         0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
       };
       c = GetClass(NarrowFFEx[l - 0x00e0]);
     } else {
       c = 8;
     }
   }
   else if( 0x3100 == h) { 
     if ( l <= 0xbf) {  // Hangul Compatibility Jamo, Bopomofo, Kanbun
                        // XXX: This is per UAX #14, but UAX #14 may change
                        // the line breaking rules about Kanbun and Bopomofo.
       c = 5;
     }
     else if ( l >= 0xf0)
     {            // Katakana small letters for Ainu 
       c = 1;
     }
     else   // unassigned
     {
       c = 8;
     }
   } 
   else {
     c = 8; // others 
   }
   return c;
}

PRBool nsJISx4051LineBreaker::GetPair(PRInt8 c1, PRInt8 c2)
{
  NS_ASSERTION( c1 < MAX_CLASSES ,"illegal classes 1");
  NS_ASSERTION( c2 < MAX_CLASSES ,"illegal classes 2");

  return (0 == ((gPair[c1] >> c2 ) & 0x0001));
}


nsJISx4051LineBreaker::nsJISx4051LineBreaker(
   const PRUnichar* aNoBegin, PRInt32 aNoBeginLen,
   const PRUnichar* aNoEnd, PRInt32 aNoEndLen
)
{
}

nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
{
}

NS_IMPL_ISUPPORTS1(nsJISx4051LineBreaker, nsILineBreaker)

#define U_PERIOD ((PRUnichar) '.')
#define U_COMMA ((PRUnichar) ',')
#define U_SPACE ((PRUnichar) ' ')
#define U_RIGHT_SINGLE_QUOTATION_MARK ((PRUnichar) 0x2019)
#define NEED_CONTEXTUAL_ANALYSIS(c) ((c) == U_PERIOD || \
                                     (c) == U_COMMA || \
                                     (c) == U_RIGHT_SINGLE_QUOTATION_MARK)
#define NUMERIC_CLASS  6 // JIS x4051 class 15 is now map to simplified class 6
#define CHARACTER_CLASS  8 // JIS x4051 class 18 is now map to simplified class 8
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)

PRInt8  nsJISx4051LineBreaker::ContextualAnalysis(
  PRUnichar prev, PRUnichar cur, PRUnichar next
)
{
   if(U_COMMA == cur)
   {
     if(IS_ASCII_DIGIT (prev) && IS_ASCII_DIGIT (next))
       return NUMERIC_CLASS;
   }
   else if(U_PERIOD == cur)
   {
     if((IS_ASCII_DIGIT (prev) || (0x0020 == prev)) && 
         IS_ASCII_DIGIT (next))
       return NUMERIC_CLASS;
 
     // By assigning a full stop  character class only when it's followed by
     // class 6 (numeric), 7, and 8 (character). Note that class 9 (Thai) 
     // doesn't matter, either way, we prevent lines from breaking around 
     // full stop in those cases while  still allowing it to end a line when 
     // followed by CJK  characters. With an additional condition of it being 
     // preceded by  class 0 or class > 5, we make sure that it does not 
     // start a line  (see bug 164759). 
     PRUint8 pc = GetClass(prev);
     if((pc > 5 || pc == 0)  && GetClass(next) > 5)
       return CHARACTER_CLASS;
   }
   else if(U_RIGHT_SINGLE_QUOTATION_MARK == cur)
   {
     // somehow people use this as ' in "it's" sometimes...
     if(U_SPACE != next)
       return CHARACTER_CLASS;
   }
   return this->GetClass(cur);
}


NS_IMETHODIMP nsJISx4051LineBreaker::BreakInBetween(
  const PRUnichar* aText1 , PRUint32 aTextLen1,
  const PRUnichar* aText2 , PRUint32 aTextLen2,
  PRBool *oCanBreak)
{
  NS_ENSURE_TRUE(aText1, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(aText2, NS_ERROR_NULL_POINTER);

  if((0 == aTextLen1) || (0==aTextLen2) ||
     IS_HIGH_SURROGATE(aText1[aTextLen1-1]) && 
     IS_LOW_SURROGATE(aText2[0]) )  //Do not separate a surrogate pair
  {
     *oCanBreak = PR_FALSE;
     return NS_OK;
  }

  //search for CJK characters until a space is found. 
  //if CJK char is found before space, use 4051, otherwise western
  PRInt32 cur;

  for (cur= aTextLen1-1; cur>=0; cur--)
  {
    if (IS_SPACE(aText1[cur]))
      break;
    if (IS_CJK_CHAR(aText1[cur]))
      goto ROUTE_CJK_BETWEEN;
  }

  for (cur= 0; cur < (PRInt32)aTextLen2; cur++)
  {
    if (IS_SPACE(aText2[cur]))
      break;
    if (IS_CJK_CHAR(aText2[cur]))
      goto ROUTE_CJK_BETWEEN;
  }

  //now apply western rule.
  *oCanBreak = IS_SPACE(aText1[aTextLen1-1]) || IS_SPACE(aText2[0]);
  return NS_OK;

ROUTE_CJK_BETWEEN:

  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText1[aTextLen1-1]))
    c1 = this->ContextualAnalysis((aTextLen1>1)?aText1[aTextLen1-2]:0,
                                  aText1[aTextLen1-1],
                                  aText2[0]);
  else 
    c1 = this->GetClass(aText1[aTextLen1-1]);

  if(NEED_CONTEXTUAL_ANALYSIS(aText2[0]))
    c2 = this->ContextualAnalysis(aText1[aTextLen1-1],
                                  aText2[0],
                                  (aTextLen2>1)?aText2[1]:0);
  else 
    c2 = this->GetClass(aText2[0]);

  /* Handle cases for THAI */
  if((CLASS_THAI == c1) && (CLASS_THAI == c2))
  {
     *oCanBreak = (0 == TrbWordBreakPos(aText1, aTextLen1, aText2, aTextLen2));
  }
  else 
  {
     *oCanBreak = GetPair(c1,c2);
  }
  return NS_OK;
}


NS_IMETHODIMP nsJISx4051LineBreaker::Next( 
  const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
  PRUint32* oNext, PRBool *oNeedMoreText) 
{
  NS_ENSURE_TRUE(aText, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(oNext, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(oNeedMoreText, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(aPos <= aLen, NS_ERROR_ILLEGAL_VALUE);

  //forward check for CJK characters until a space is found. 
  //if CJK char is found before space, use 4051, otherwise western
  PRUint32 cur;
  for (cur = aPos; cur < aLen; ++cur)
  {
    if (IS_SPACE(aText[cur]))
    {
      *oNext = cur;
      *oNeedMoreText = PR_FALSE;
      return NS_OK;
    }
    if (IS_CJK_CHAR(aText[cur]))
      goto ROUTE_CJK_NEXT;
  }
  *oNext = aLen;
  *oNeedMoreText = PR_TRUE;
  return NS_OK;

ROUTE_CJK_NEXT:
  PRInt8 c1, c2;
  cur = aPos;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
  {
    c1 = this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
                                  aText[cur],
                                  (cur<(aLen-1)) ?aText[cur+1]:0);
  } else  {
    c1 = this->GetClass(aText[cur]);
  }
  
  if(CLASS_THAI == c1) 
  {
     *oNext = PRUint32(TrbFollowing(aText, aLen, aPos));
     *oNeedMoreText = PR_FALSE;
     return NS_OK;
  }

  for(cur++; cur <aLen; cur++)
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur]))
     {
       c2= this->ContextualAnalysis((cur>0)?aText[cur-1]:0,
                                  aText[cur],
                                  (cur<(aLen-1)) ?aText[cur+1]:0);
     } else {
       c2 = this->GetClass(aText[cur]);
     }

     if(GetPair(c1, c2)) {
       *oNext = cur ;
       *oNeedMoreText = PR_FALSE;
       return NS_OK;
     }
     c1 = c2;
  }
  *oNext = aLen;
  *oNeedMoreText = PR_TRUE;
  return NS_OK;
}

NS_IMETHODIMP nsJISx4051LineBreaker::Prev( 
  const PRUnichar* aText, PRUint32 aLen, PRUint32 aPos,
  PRUint32* oPrev, PRBool *oNeedMoreText) 
{
  NS_ENSURE_TRUE(aText, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(oPrev, NS_ERROR_NULL_POINTER);
  NS_ENSURE_TRUE(oNeedMoreText, NS_ERROR_NULL_POINTER);

  //backward check for CJK characters until a space is found. 
  //if CJK char is found before space, use 4051, otherwise western
  PRUint32 cur;
  for (cur = aPos - 1; cur > 0; --cur)
  {
    if (IS_SPACE(aText[cur]))
    {
      if (cur != aPos - 1) // XXXldb Why?
        ++cur;
      *oPrev = cur;
      *oNeedMoreText = PR_FALSE;
      return NS_OK;
    }
    if (IS_CJK_CHAR(aText[cur]))
      goto ROUTE_CJK_PREV;
  }

  *oPrev = 0;
  *oNeedMoreText = PR_TRUE;
  return NS_OK;

ROUTE_CJK_PREV:
  cur = aPos;
  PRInt8 c1, c2;
  if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
  {
    c2 = this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
                                  aText[cur-1],
                                  (cur<aLen) ?aText[cur]:0);
  } else  {
    c2 = this->GetClass(aText[cur-1]);
  }
  // To Do: 
  //
  // Should handle CLASS_THAI here
  //
  for(cur--; cur > 0; cur--)
  {
     if(NEED_CONTEXTUAL_ANALYSIS(aText[cur-1]))
     {
       c1= this->ContextualAnalysis(((cur-1)>0)?aText[cur-2]:0,
                                  aText[cur-1],
                                  (cur<aLen) ?aText[cur]:0);
     } else {
       c1 = this->GetClass(aText[cur-1]);
     }

     if(GetPair(c1, c2)) {
       *oPrev = cur;
       *oNeedMoreText = PR_FALSE;
       return NS_OK;
     }
     c2 = c1;
  }
  *oPrev = 0;
  *oNeedMoreText = PR_TRUE;
  return NS_OK;
}