/* This file is part of the KDE libraries Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 1999 Lars Knoll (knoll@kde.org) (C) 1999 Antti Koivisto (koivisto@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) Copyright (C) 2004 Apple Computer, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ //---------------------------------------------------------------------------- // // KDE HTML Widget - Tokenizers //#define TOKEN_DEBUG 1 //#define TOKEN_DEBUG 2 #ifdef HAVE_CONFIG_H #include "config.h" #endif //#include #include "html/htmltokenizer.h" #include "html/html_documentimpl.h" #include "html/htmlparser.h" #include "html/dtd.h" #include "misc/loader.h" #include "misc/htmlhashes.h" #include "khtmlview.h" #include "khtml_part.h" #include "xml/dom_docimpl.h" #include "css/csshelper.h" #include "ecma/kjs_proxy.h" #include #include #include #include #include #include #include // turn off inlining to void warning with newer gcc #undef __inline #define __inline #include "kentities.c" #undef __inline using namespace khtml; static const char commentStart [] = " as a close comment, even though it's // not technically valid. 
// NOTE(review): The line above is damaged. Several #include directives have lost their header
// names, and the initializer of commentStart runs straight into a remark about accepting a
// non-standard comment terminator, so the string literal is never closed and the signature of
// the routine that the next line finishes is not visible here. Presumably that routine is the
// comment parser (parseComment is called further down) - confirm against the upstream file.
// Because the original line breaks are gone, every line remark also hides the code after it.
//
// Next line, in order:
//  - Tail of the comment-scanning loop. The visible part of its condition tests whether
//    scriptCode holds dash, dash, exclamation mark at offsets -4, -3 and -2 from scriptCodeSize.
//    On a match it steps past the current character, clears the comment flag and returns. Unless
//    script, xmp, textarea or style is set it also resets scriptCodeSize to 0 and, only when
//    COMMENTS_IN_DOM is defined, first emits an ID_COMMENT token followed by its close token.
//  - parseServer(src): appends every character to scriptCode. When a greater-than sign arrives
//    and the previously stored character is a percent sign, it consumes that character, clears
//    the server flag, resets scriptCodeSize to 0 and returns.
//  - parseProcessingInstruction(src): toggles tquote on single and double quote characters and
//    stops at a greater-than sign that is either outside quotes or directly preceded by a
//    question mark. On stopping it clears processingInstruction, consumes the character and sets
//    discard to LFDiscard. The trailing remark that mentions a comment is about the end of the
//    processing instruction.
//  - Start of parseText(src), which continues on the following line.
( scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '!' ) ) ) { ++src; if ( !( script || xmp || textarea || style) ) { #ifdef COMMENTS_IN_DOM checkScriptBuffer(); scriptCode[ scriptCodeSize ] = 0; scriptCode[ scriptCodeSize + 1 ] = 0; currToken.id = ID_COMMENT; processListing(DOMStringIt(scriptCode, scriptCodeSize - 2)); processToken(); currToken.id = ID_COMMENT + ID_CLOSE_TAG; processToken(); #endif scriptCodeSize = 0; } comment = false; return; // Finished parsing comment } ++src; } } void HTMLTokenizer::parseServer(DOMStringIt &src) { checkScriptBuffer(src.length()); while ( src.length() ) { scriptCode[ scriptCodeSize++ ] = *src; if (src->unicode() == '>' && scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') { ++src; server = false; scriptCodeSize = 0; return; // Finished parsing server include } ++src; } } void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src) { char oldchar = 0; while ( src.length() ) { unsigned char chbegin = src->latin1(); if(chbegin == '\'') { tquote = tquote == SingleQuote ? NoQuote : SingleQuote; } else if(chbegin == '\"') { tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; } // Look for '?>' // some crappy sites omit the "?" before it, so // we look for an unquoted '>' instead. (IE compatible) else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) ) { // We got a '?>' sequence processingInstruction = false; ++src; discard=LFDiscard; return; // Finished parsing comment! } ++src; oldchar = chbegin; } } void HTMLTokenizer::parseText(DOMStringIt &src) { while ( src.length() ) { // do we need to enlarge the buffer? 
// Next line, in order:
//  - Remainder of parseText(src). Each iteration calls checkBuffer() and then handles one
//    character. A carriage return is written to dest as a newline and sets skipLF so that a line
//    feed immediately after it is dropped; a lone line feed is written as a newline; any other
//    character is copied to dest and passed through fixUpChar.
//  - First part of parseEntity(src, dest, start). When start is true the state is reset:
//    cBufferPos = 0, Entity = SearchEntity, EntityUnicodeValue = 0. The switch on Entity then
//    acts as a state machine: SearchEntity moves to NumericSearch on a hash sign and to
//    EntityName otherwise; NumericSearch moves to Hexadecimal on x or X, to Decimal on a digit
//    and to SearchSemicolon otherwise; Hexadecimal accumulates at most 8 hex digits into
//    EntityUnicodeValue and then moves to SearchSemicolon.
//    The Hexadecimal case is split across the line break in the middle of an expression.
checkBuffer(); // ascii is okay because we only do ascii comparisons unsigned char chbegin = src->latin1(); if (skipLF && ( chbegin != '\n' )) { skipLF = false; } if (skipLF) { skipLF = false; ++src; } else if (( chbegin == '\n' ) || ( chbegin == '\r' )) { if (chbegin == '\r') skipLF = true; *dest++ = '\n'; ++src; } else { *dest = *src; fixUpChar(*dest); ++dest; ++src; } } } void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start) { if( start ) { cBufferPos = 0; Entity = SearchEntity; EntityUnicodeValue = 0; } while( src.length() ) { ushort cc = src->unicode(); switch(Entity) { case NoEntity: assert(Entity != NoEntity); return; case SearchEntity: if(cc == '#') { cBuffer[cBufferPos++] = cc; ++src; Entity = NumericSearch; } else Entity = EntityName; break; case NumericSearch: if(cc == 'x' || cc == 'X') { cBuffer[cBufferPos++] = cc; ++src; Entity = Hexadecimal; } else if(cc >= '0' && cc <= '9') Entity = Decimal; else Entity = SearchSemicolon; break; case Hexadecimal: { int ll = kMin(src.length(), 8); while(ll--) { QChar csrc(src->lower()); cc = csrc.cell(); if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) { break; } EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? 
// Next line, in order (all still inside parseEntity):
//  - End of the Hexadecimal case.
//  - Decimal: accumulates decimal digits until a non-digit appears or cBufferPos reaches 9.
//  - EntityName: collects ASCII letters and digits into cBuffer under the same limit of 9. Once
//    the state has moved to SearchSemicolon, a name longer than one character is looked up with
//    findEntity. Inside a tag, a value above 255 that is not followed by a semicolon is reset to
//    0 (marked IE compatible in the code). In that state the case has no break and falls through
//    into the next one; otherwise it breaks out of the switch.
//  - SearchSemicolon: accepts values in 1..0xD7FF and 0xE000..0x1FFFFF, swallows an optional
//    semicolon and pushes the decoded character, or a surrogate pair for values above 0xFFFF,
//    back onto src with src.push (presumably so it is re-read as ordinary input - confirm).
// NOTE(review): the high surrogate is computed as 0xD800 | (((v >> 16) - 1) << 6) | ..., which
// only stays below 0xDC00 for v up to 0x10FFFF. The range check admits values up to 0x1FFFFF,
// so 0x110000..0x1FFFFF would produce a first code unit outside the high surrogate range.
// Consider tightening the upper bound to 0x10FFFF.
'0' : 'a' - 10)); cBuffer[cBufferPos++] = cc; ++src; } Entity = SearchSemicolon; break; } case Decimal: { int ll = kMin(src.length(), 9-cBufferPos); while(ll--) { cc = src->cell(); if(src->row() || !(cc >= '0' && cc <= '9')) { Entity = SearchSemicolon; break; } EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0'); cBuffer[cBufferPos++] = cc; ++src; } if(cBufferPos == 9) Entity = SearchSemicolon; break; } case EntityName: { int ll = kMin(src.length(), 9-cBufferPos); while(ll--) { QChar csrc = *src; cc = csrc.cell(); if(csrc.row() || !((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) { Entity = SearchSemicolon; break; } cBuffer[cBufferPos++] = cc; ++src; } if(cBufferPos == 9) Entity = SearchSemicolon; if(Entity == SearchSemicolon) { if(cBufferPos > 1) { const entity *e = findEntity(cBuffer, cBufferPos); if(e) EntityUnicodeValue = e->code; // be IE compatible if(tag && EntityUnicodeValue > 255 && *src != ';') EntityUnicodeValue = 0; } } else break; } case SearchSemicolon: //kdDebug( 6036 ) << "ENTITY " << EntityUnicodeValue << ", " << res << endl; // Don't allow surrogate code points, or values that are more than 21 bits. if ((EntityUnicodeValue > 0 && EntityUnicodeValue < 0xD800) || (EntityUnicodeValue >= 0xE000 && EntityUnicodeValue <= 0x1FFFFF)) { if (*src == ';') ++src; if (EntityUnicodeValue <= 0xFFFF) { QChar c(EntityUnicodeValue); fixUpChar(c); checkBuffer(); src.push(c); } else { // Convert to UTF-16, using surrogate code points. QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F)); QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF)); checkBuffer(2); src.push(c1); src.push(c2); } } else { #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "unknown entity!" 
// Next line, in order:
//  - End of parseEntity. For an unrecognised entity checkBuffer(10) is called and the ampersand
//    plus the cBufferPos buffered characters are written to dest as plain text; when pre is set,
//    prePos advances by cBufferPos+1. Either way Entity returns to NoEntity and the routine
//    returns.
//  - Start of parseTag(src): asserts that no entity is being parsed, then loops over src with a
//    switch on tag. Only the NoTag case and the opening of the TagName case (the searchCount
//    match against commentStart) are intact.
// NOTE(review): from the remark that begins with Found, the text on the next line is damaged.
// Most of parseTag and the start of the token-processing routine (presumably processToken,
// which is called earlier in this file) are missing, and the line ends inside what looks like
// the TOKEN_DEBUG dump of the current token. Do not rely on that region.
<< endl; #endif checkBuffer(10); // ignore the sequence, add it to the buffer as plaintext *dest++ = '&'; for(unsigned int i = 0; i < cBufferPos; i++) dest[i] = cBuffer[i]; dest += cBufferPos; if (pre) prePos += cBufferPos+1; } Entity = NoEntity; return; } } } void HTMLTokenizer::parseTag(DOMStringIt &src) { assert(!Entity ); while ( src.length() ) { checkBuffer(); #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 uint l = 0; while(l < src.length() && (*(src.current()+l)).latin1() != '>') l++; qDebug("src is now: *%s*, tquote: %d", QConstString((QChar*)src.current(), l).string().latin1(), tquote); #endif switch(tag) { case NoTag: { return; } case TagName: { #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("TagName"); #endif if (searchCount > 0) { if (*src == commentStart[searchCount]) { searchCount++; if (searchCount == 4) { #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Found comment" << endl; #endif // Found ' as a valid comment, since both mozilla and IE on windows // can handle this case. Only do this in quirks mode. -dwh if (*src == '>' && parser->doc()->inCompatMode()) { comment = false; ++src; cBuffer[cBufferPos++] = src->cell(); } else parseComment(src); return; // Finished parsing tag! } // cuts of high part, is okay cBuffer[cBufferPos++] = src->cell(); ++src; break; } else searchCount = 0; // Stop looking for ' searchCount = 1; // Look for ' " << name << " id = " << currToken.id << endl; if (currToken.flat) kdDebug( 6036 ) << "Token is FLAT!" << endl; if(!text.isNull()) kdDebug( 6036 ) << "text: \"" << text << "\"" << endl; unsigned long l = currToken.attrs ? 
// Next line, in order:
//  - Rest of the debug dump (attribute count and each attribute id, name and value), then the
//    end of the routine that started in the damaged region: it hands currToken to
//    parser->parseToken, resets currToken and, when jsProxy is set, calls
//    jsProxy->setEventHandlerLineno(0).
//  - Destructor: asserts that inWrite is false, calls reset() and deletes parser.
//  - enlargeBuffer(len): grows buffer to the larger of size*2 and size+len with realloc and
//    re-bases dest at the same offset in the new allocation.
//  - enlargeScriptBuffer(len): the same growth rule for scriptCode and scriptCodeMaxSize.
//  - First part of notifyFinished(): asserts that cachedScript is not empty, then, while the
//    head of the queue is loaded, dequeues it, copies its source, clears the tokenizer source
//    with setSrc(QString::null), copies the URL, drops the reference with deref(this) and runs
//    the script through scriptExecution.
// NOTE(review): both enlarge routines assign the realloc result straight back to the only
// pointer to the old storage and never test it for null, so an allocation failure would lose the
// old buffer and the following pointer arithmetic would start from null.
currToken.attrs->length() : 0; if(l) { kdDebug( 6036 ) << "Attributes: " << l << endl; for (unsigned long i = 0; i < l; ++i) { AttributeImpl* c = currToken.attrs->attributeItem(i); kdDebug( 6036 ) << " " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string() << "=\"" << c->value().string() << "\"" << endl; } } kdDebug( 6036 ) << endl; #endif // pass the token over to the parser, the parser DOES NOT delete the token parser->parseToken(&currToken); currToken.reset(); if (jsProxy) jsProxy->setEventHandlerLineno(0); } HTMLTokenizer::~HTMLTokenizer() { assert(!inWrite); reset(); delete parser; } void HTMLTokenizer::enlargeBuffer(int len) { int newsize = kMax(size*2, size+len); int oldoffs = (dest - buffer); buffer = (QChar*)realloc(buffer, newsize*sizeof(QChar)); dest = buffer + oldoffs; size = newsize; } void HTMLTokenizer::enlargeScriptBuffer(int len) { int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len); scriptCode = (QChar*)realloc(scriptCode, newsize*sizeof(QChar)); scriptCodeMaxSize = newsize; } void HTMLTokenizer::notifyFinished(CachedObject */*finishedObj*/) { assert(!cachedScript.isEmpty()); bool finished = false; while (!finished && cachedScript.head()->isLoaded()) { #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Finished loading an external script" << endl; #endif CachedScript* cs = cachedScript.dequeue(); DOMString scriptSource = cs->script(); #ifdef TOKEN_DEBUG kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl; #endif setSrc(QString::null); // make sure we forget about the script before we execute the new one // infinite recursion might happen otherwise QString cachedScriptUrl( cs->url().string() ); cs->deref(this); scriptExecution( scriptSource.string(), cachedScriptUrl ); // cachedScript.isEmpty() can change inside the scriptExecution() call above, // so don't test it until afterwards. 
// Next line, in order:
//  - Remainder of the notifyFinished() loop. finished is sampled only after scriptExecution
//    because, per the existing remark, the queue can change during that call. When the queue is
//    empty loadingExtScript is cleared. When script is false, pendingSrc is copied into a local,
//    cleared and fed to write(rest, false).
//  - isWaitingForScripts(): returns loadingExtScript.
//  - setSrc(source): adds src.lineCount() to lineno, stores source in _src and points src at it.
//  - setOnHold(_onHold): returns early when the flag is unchanged; when switching to on hold it
//    calls setSrc with a QString built from src.current() and src.length().
// NOTE(review): the existing remark says the object may already be deleted once write() returns.
// The loop condition is evaluated again after that call; it short-circuits on the local finished
// flag, but when finished is false it reads cachedScript. Confirm that deletion can only happen
// when the queue is empty.
finished = cachedScript.isEmpty(); if (finished) loadingExtScript = false; // 'script' is true when we are called synchronously from // parseScript(). In that case parseScript() will take care // of 'scriptOutput'. if ( !script ) { QString rest = pendingSrc; pendingSrc = ""; write(rest, false); // we might be deleted at this point, do not // access any members. } } } bool HTMLTokenizer::isWaitingForScripts() { return loadingExtScript; } void HTMLTokenizer::setSrc(const QString &source) { lineno += src.lineCount(); _src = source; src = DOMStringIt(_src); } void HTMLTokenizer::setOnHold(bool _onHold) { if (onHold == _onHold) return; onHold = _onHold; if (onHold) setSrc(QString(src.current(), src.length())); // ### deep copy }