/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: NPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Pierre Phaneuf
 *
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the NPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the NPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nsIFactory.h"
#include "nsISupports.h"
#include "nsClassicCharDetDll.h"
#include "pratom.h"

#include "nsICharsetDetector.h"
#include "nsICharsetDetectionObserver.h"
#include "nsIStringCharsetDetector.h"
#include "nsClassicDetectors.h"

//==========================================================
/* values for EUC shift chars */
#define SS2 0x8E /* Single Shift 2 */
#define SS3 0x8F /* Single Shift 3 */

/* Byte-class predicates used by the scanners below. */
#define IsRoman(c)            ((c) < 0x80)
#define IsSJIS2ndByte(c)      (((c) > 0x3F) && ((c) < 0xFD))
#define IsLoSJIS2ndByte(c)    (((c) > 0x3F) && ((c) < 0xA1))
#define IsHiSJIS2ndByte(c)    (((c) > 0xA0) && ((c) < 0xFD))
#define IsEUCJPKana(b1)       (((b1) > 0xA0) && ((b1) < 0xE0))
#define IsEUCJPKanji(b1or2)   (((b1or2) > 0xA0) && ((b1or2) < 0xFF))

/* Tri-state verdicts returned by isSJIS() / isEUCJP(). */
#define YES   1
#define NO    0
#define MAYBE (-1)

/**
 * Scans a buffer for evidence for or against Shift_JIS.
 *
 * @param cp  first byte of the buffer to examine
 * @param len number of bytes available at cp
 * @return NO    as soon as a byte sequence that this scanner rejects is seen,
 *         YES   as soon as a two-byte sequence is seen whose lead byte is in
 *               0x81..0x9F and which is either not led by SS2/SS3 or has a
 *               trail byte <= 0xA0 (such a trail byte fails both
 *               IsEUCJPKana and IsEUCJPKanji),
 *         MAYBE when the buffer ends (possibly in the middle of a two-byte
 *               character) without either of the above happening.
 */
static int isSJIS(const unsigned char *cp, PRInt32 len)
{
  while (len > 0) {
    if (IsRoman(*cp)) {
      cp++, len--;
    } else if (*cp == 0x80) {           /* illegal SJIS 1st byte */
      return NO;
    } else if (*cp < 0xA0) {            /* byte 1 of 2byte SJIS 1st range */
      if (len < 2)
        break;                          /* buffer ended w/1of2 byte SJIS */
      if (!IsSJIS2ndByte(cp[1]))
        return NO;                      /* invalid SJIS 2nd byte */
      if ((*cp != SS2 && *cp != SS3) || (cp[1] <= 0xA0))
        return YES;
      cp += 2, len -= 2;                /* valid 2 byte SJIS */
    } else if (*cp == 0xA0) {           /* illegal EUCJP byte */
#if ALLOW_NBSP
      cp++, len--;                      /* allow nbsp */
#else
      /* 0xA0 is neither a lead byte nor half-width kana for this scanner.
       * Without this return the branch consumed nothing and the loop never
       * terminated; rejecting the byte matches what isEUCJP() does. */
      return NO;
#endif
    } else if (*cp < 0xE0) {            /* SJIS half-width kana */
      cp++, len--;
    } else if (*cp < 0xF0) {            /* byte 1 of 2byte SJIS 2nd range */
      if (len < 2)
        break;                          /* buffer ended w/1of2 byte SJIS */
      if (!IsSJIS2ndByte(cp[1]))
        return NO;                      /* invalid SJIS */
      cp += 2, len -= 2;                /* valid 2 byte SJIS */
    } else {
      return NO;                        /* invalid SJIS 1st byte */
    }
  }
  return MAYBE;                         /* No illegal SJIS values found */
}

/**
 * Scans a buffer for evidence for or against EUC-JP.
 *
 * @param cp  first byte of the buffer to examine
 * @param len number of bytes available at cp
 * @return NO    as soon as a byte sequence that this scanner rejects is seen,
 *         YES   as soon as a two-byte character with a lead byte in
 *               0xF0..0xFE and a valid trail byte is seen (isSJIS() rejects
 *               lead bytes >= 0xF0),
 *         MAYBE when the buffer ends (possibly in the middle of a multi-byte
 *               character) without either of the above happening.
 */
static int isEUCJP(const unsigned char *cp, PRInt32 len)
{
  while (len > 0) {
    if (IsRoman(*cp)) {                 /* Roman */
      cp++, len--;
    } else if (*cp == SS2) {            /* EUCJP JIS201 half-width kana */
      if (len < 2)
        break;                          /* buffer ended w/1of2 byte EUC */
      if (!IsEUCJPKana(cp[1]))
        return NO;                      /* invalid 2of2 byte EUC */
      cp += 2, len -= 2;                /* valid half-width kana */
    } else if (*cp == SS3) {            /* EUCJP JIS212 */
      if (len < 2)
        break;                          /* buffer ended w/1of3 byte EUCJP */
      if (!IsEUCJPKanji(cp[1]))
        return NO;                      /* invalid 2of3 byte EUCJP */
      if (len < 3)
        break;                          /* buffer ended w/2of3 byte EUCJP */
      if (!IsEUCJPKanji(cp[2]))
        return NO;                      /* invalid 3of3 byte EUCJP */
      /* All three bytes were validated, so skip all three.  Advancing by
       * only two left the scanner pointing at the third byte, which was
       * then misread as the lead byte of a new character. */
      cp += 3, len -= 3;                /* valid 3 byte EUCJP */
    } else if (*cp == 0xA0) {           /* illegal EUCJP byte */
#if ALLOW_NBSP
      cp++, len--;                      /* allow nbsp */
#else
      return NO;
#endif
    } else if (*cp < 0xF0) {            /* EUCJP JIS208 (overlaps SJIS) */
      if (len < 2)
        break;                          /* buffer ended w/1of2 byte EUCJP */
      if (!IsEUCJPKanji(cp[1]))
        return NO;                      /* invalid 2of2 byte EUCJP */
      cp += 2, len -= 2;                /* valid 2 byte EUCJP */
    } else if (*cp < 0xFF) {            /* EUCJP JIS208 only: */
      if (len < 2)
        break;                          /* buffer ended w/1of2 byte EUCJP */
      if (IsEUCJPKanji(cp[1]))
        return YES;                     /* valid 2 byte EUCJP, invalid SJIS */
      return NO;                        /* invalid 2of2 byte EUCJP */
    } else {
      return NO;                        /* invalid EUCJP 1st byte: 0xFF */
    }
  }
  return MAYBE;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
/**
 * Guesses the charset of a buffer and writes its name into aCharset.
 *
 * The result is one of "ISO-8859-1" (pure 7-bit input, or 8-bit input that
 * both scanners rejected), "ISO-2022-JP" (an ESC byte, 0x1B, was found),
 * "EUC-JP" or "Shift_JIS".
 *
 * @param aBuffer  bytes to examine
 * @param aLen     number of bytes in aBuffer
 * @param aCharset output buffer; must have room for the longest name written
 *                 here, "ISO-2022-JP", plus its terminator (12 bytes)
 * @return always NS_OK
 */
static nsresult JA_AutoCharsetDetectBuffer(const char* aBuffer, const PRInt32 aLen, char* aCharset)
{
  PRBool hasEsc = PR_FALSE;
  PRBool asciiOnly = PR_TRUE;

  PL_strcpy(aCharset, "ISO-8859-1");

  // check 8 bit or ESC; an ESC settles the question, so stop at the first one
  for (PRInt32 i = 0; i < aLen; i++) {
    if (aBuffer[i] == 0x1B) {
      hasEsc = PR_TRUE;
      break;
    }
    if ((unsigned char) aBuffer[i] > 127) {
      asciiOnly = PR_FALSE;
    }
  }

  if (hasEsc) {
    PL_strcpy(aCharset, "ISO-2022-JP");
  } else if (!asciiOnly) {
    // use old japanese auto detect code
    const int euc  = isEUCJP((unsigned char *) aBuffer, aLen);
    const int sjis = isSJIS((unsigned char *) aBuffer, aLen);

    if (YES == euc) {
      PL_strcpy(aCharset, "EUC-JP");
    } else if (YES == sjis) {
      PL_strcpy(aCharset, "Shift_JIS");
    } else if (MAYBE == euc) {
      // sjis is NO or MAYBE here; EUC-JP wins in both cases
      PL_strcpy(aCharset, "EUC-JP");
    } else if (MAYBE == sjis) {
      // euc is NO here
      PL_strcpy(aCharset, "Shift_JIS");
    }
    // both scanners said NO: keep the "ISO-8859-1" default
  }
  return NS_OK;
}

//==========================================================
NS_IMPL_ISUPPORTS1(nsClassicDetector, nsICharsetDetector)

//----------------------------------------------------------
/**
 * Creates a detector for the given language tag and clears the observer.
 *
 * NOTE(review): the copy into mLanguage is unbounded; the size of mLanguage
 * is declared outside this file's visible source -- confirm that every
 * caller passes a tag that fits.
 */
nsClassicDetector::nsClassicDetector(const char* language)
{
  mObserver = nsnull;
  PL_strcpy(mLanguage, language);
}

//----------------------------------------------------------
nsClassicDetector::~nsClassicDetector()
{
}

//----------------------------------------------------------
/**
 * Stores the observer that DoIt() reports to.
 *
 * @param aObserver observer to notify; must not be null
 * @return NS_ERROR_ILLEGAL_VALUE if aObserver is null, otherwise NS_OK
 */
NS_IMETHODIMP nsClassicDetector::Init(
  nsICharsetDetectionObserver* aObserver)
{
  NS_ASSERTION(mObserver == nsnull , "Init twice");
  if (nsnull == aObserver)
    return NS_ERROR_ILLEGAL_VALUE;

  mObserver = aObserver;
  return NS_OK;
}

//----------------------------------------------------------
/**
 * Runs detection on one buffer and reports the outcome to the observer.
 *
 * When the detector's language is "ja" (compared case-insensitively) the
 * observer is notified with the detected charset and eBestAnswer; for any
 * other language it is notified with "" and eNoAnswerMatch.
 *
 * @param aBuf        bytes to examine; must not be null
 * @param aLen        number of bytes in aBuf
 * @param oDontFeedMe must not be null; always set to PR_TRUE on success
 * @return NS_ERROR_ILLEGAL_VALUE if aBuf or oDontFeedMe is null, else NS_OK
 *
 * NOTE(review): mObserver is only checked by an assertion before it is
 * dereferenced -- confirm that Init() is always called first.
 */
NS_IMETHODIMP nsClassicDetector::DoIt(
  const char* aBuf, PRUint32 aLen, PRBool* oDontFeedMe)
{
  NS_ASSERTION(mObserver != nsnull , "have not init yet");

  if ((nsnull == aBuf) || (nsnull == oDontFeedMe))
    return NS_ERROR_ILLEGAL_VALUE;

  if (!PL_strcasecmp("ja", mLanguage) &&
      NS_SUCCEEDED(JA_AutoCharsetDetectBuffer(aBuf, (PRInt32) aLen, mCharset))) {
    mObserver->Notify(mCharset, eBestAnswer);
  } else {
    mObserver->Notify("", eNoAnswerMatch);
  }

  *oDontFeedMe = PR_TRUE;
  return NS_OK;
}

//----------------------------------------------------------
/**
 * Does no work beyond asserting that Init() has been called.
 *
 * @return always NS_OK
 */
NS_IMETHODIMP nsClassicDetector::Done()
{
  NS_ASSERTION(mObserver != nsnull , "have not init yet");
  return NS_OK;
}

//==========================================================
NS_IMPL_ISUPPORTS1(nsClassicStringDetector, nsIStringCharsetDetector)

//----------------------------------------------------------
/**
 * Creates a string detector for the given language tag.
 *
 * NOTE(review): the copy into mLanguage is unbounded; the size of mLanguage
 * is declared outside this file's visible source -- confirm that every
 * caller passes a tag that fits.
 */
nsClassicStringDetector::nsClassicStringDetector(const char* language)
{
  PL_strcpy(mLanguage, language);
}
//---------------------------------------------------------- nsClassicStringDetector::~nsClassicStringDetector() { } //---------------------------------------------------------- NS_IMETHODIMP nsClassicStringDetector::DoIt(const char* aBuf, PRUint32 aLen, const char** oCharset, nsDetectionConfident &oConfident) { oConfident = eNoAnswerMatch; *oCharset = ""; if (!PL_strcasecmp("ja", mLanguage) && NS_SUCCEEDED(JA_AutoCharsetDetectBuffer(aBuf, (PRInt32) aLen, mCharset))) { *oCharset = mCharset; oConfident = eBestAnswer; } return NS_OK; }