khtml Library API Documentation

htmltokenizer.cpp

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
00005               (C) 1997 Torben Weis (weis@kde.org)
00006               (C) 1998 Waldo Bastian (bastian@kde.org)
00007               (C) 1999 Lars Knoll (knoll@kde.org)
00008               (C) 1999 Antti Koivisto (koivisto@kde.org)
00009               (C) 2001-2003 Dirk Mueller (mueller@kde.org)
00010               (C) 2002 Apple Computer, Inc.
00011 
00012     This library is free software; you can redistribute it and/or
00013     modify it under the terms of the GNU Library General Public
00014     License as published by the Free Software Foundation; either
00015     version 2 of the License, or (at your option) any later version.
00016 
00017     This library is distributed in the hope that it will be useful,
00018     but WITHOUT ANY WARRANTY; without even the implied warranty of
00019     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020     Library General Public License for more details.
00021 
00022     You should have received a copy of the GNU Library General Public License
00023     along with this library; see the file COPYING.LIB.  If not, write to
00024     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00025     Boston, MA 02111-1307, USA.
00026 */
00027 //----------------------------------------------------------------------------
00028 //
00029 // KDE HTML Widget - Tokenizers
00030 // $Id: htmltokenizer.cpp,v 1.248.2.3 2003/01/13 22:04:48 mueller Exp $
00031 
00032 //#define TOKEN_DEBUG 1
00033 //#define TOKEN_DEBUG 2
00034 
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038 
00039 //#include <string.h>
00040 #include "html/htmltokenizer.h"
00041 #include "html/html_documentimpl.h"
00042 #include "html/htmlparser.h"
00043 #include "html/dtd.h"
00044 
00045 #include "misc/loader.h"
00046 #include "misc/htmlhashes.h"
00047 
00048 #include "khtmlview.h"
00049 #include "khtml_part.h"
00050 #include "xml/dom_docimpl.h"
00051 #include "css/csshelper.h"
00052 #include "ecma/kjs_proxy.h"
00053 #include <kcharsets.h>
00054 #include <kglobal.h>
00055 #include <ctype.h>
00056 #include <assert.h>
00057 #include <qvariant.h>
00058 #include <kdebug.h>
00059 #include <stdlib.h>
00060 
00061 #include "kentities.c"
00062 
00063 using namespace khtml;
00064 
00065 static const QChar commentStart [] = { '<','!','-','-', QChar::null };
00066 
00067 static const char scriptEnd [] = "</script";
00068 static const char xmpEnd [] = "</xmp";
00069 static const char styleEnd [] =  "</style";
00070 static const char textareaEnd [] = "</textarea";
00071 static const char titleEnd [] = "</title";
00072 
00073 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
00074 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) P = realloc(p, sizeof(QChar)*( N ))
00075 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
00076 
00077 // Full support for MS Windows extensions to Latin-1.
00078 // Technically these extensions should only be activated for pages
00079 // marked "windows-1252" or "cp1252", but
00080 // in the standard Microsoft way, these extensions infect hundreds of thousands
00081 // of web pages.  Note that people with non-latin-1 Microsoft extensions
00082 // are SOL.
00083 //
00084 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
00085 //      http://www.bbsinc.com/iso8859.html
00086 //      http://www.obviously.com/
00087 //
00088 // There may be better equivalents
00089 #if 0
00090 #define fixUpChar(x)
00091 #else
00092 #define fixUpChar(x) \
00093             if (!(x).row() ) { \
00094                 switch ((x).cell()) \
00095                 { \
00096                 /* ALL of these should be changed to Unicode SOON */ \
00097                 case 0x80: (x) = 0x20ac; break; \
00098                 case 0x82: (x) = ',';    break; \
00099                 case 0x83: (x) = 0x0192; break; \
00100                 case 0x84: (x) = '"';    break; \
00101                 case 0x85: (x) = 0x2026; break; \
00102                 case 0x86: (x) = 0x2020; break; \
00103                 case 0x87: (x) = 0x2021; break; \
00104                 case 0x88: (x) = 0x02C6; break; \
00105                 case 0x89: (x) = 0x2030; break; \
00106                 case 0x8A: (x) = 0x0160; break; \
00107                 case 0x8b: (x) = '<';    break; \
00108                 case 0x8C: (x) = 0x0152; break; \
00109 \
00110                 case 0x8E: (x) = 0x017D; break; \
00111 \
00112 \
00113                 case 0x91: (x) = '\'';   break; \
00114                 case 0x92: (x) = '\'';   break; \
00115                 case 0x93: (x) = '"';    break; \
00116                 case 0x94: (x) = '"';    break; \
00117                 case 0x95: (x) = '*';    break; \
00118                 case 0x96: (x) = '-';    break; \
00119                 case 0x97: (x) = '-';    break; \
00120                 case 0x98: (x) = '~';    break; \
00121                 case 0x99: (x) = 0x2122; break; \
00122                 case 0x9A: (x) = 0x0161; break; \
00123                 case 0x9b: (x) = '>';    break; \
00124                 case 0x9C: (x) = 0x0153; break; \
00125 \
00126                 case 0x9E: (x) = 0x017E; break; \
00127                 case 0x9F: (x) = 0x0178; break; \
00128                 /* This one should die */ \
00129                 case 0xb7: (x) = '*';    break; \
00130                 default: break; \
00131                 } \
00132             } \
00133             else { \
00134                 /* These should all die sooner rather than later */ \
00135                 switch( (x).unicode() ) { \
00136                 case 0x2013: (x) = '-'; break; \
00137                 case 0x2014: (x) = '-'; break; \
00138                 case 0x2018: (x) = '\''; break; \
00139                 case 0x2019: (x) = '\''; break; \
00140                 case 0x201c: (x) = '"'; break; \
00141                 case 0x201d: (x) = '"'; break; \
00142                 case 0x2022: (x) = '*'; break; \
00143                 case 0x2122: (x) = 0x2122; break; \
00144                 default: break; \
00145                 } \
00146             }
00147 #endif
00148 
00149 // ----------------------------------------------------------------------------
00150 
00151 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view)
00152 {
00153     view = _view;
00154     buffer = 0;
00155     scriptCode = 0;
00156     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00157     charsets = KGlobal::charsets();
00158     parser = new KHTMLParser(_view, _doc);
00159     m_executingScript = 0;
00160     onHold = false;
00161 
00162     reset();
00163 }
00164 
00165 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i)
00166 {
00167     view = 0;
00168     buffer = 0;
00169     scriptCode = 0;
00170     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00171     charsets = KGlobal::charsets();
00172     parser = new KHTMLParser( i, _doc );
00173     m_executingScript = 0;
00174     onHold = false;
00175 
00176     reset();
00177 }
00178 
00179 void HTMLTokenizer::reset()
00180 {
00181     assert(m_executingScript == 0);
00182     assert(onHold == false);
00183 
00184     while (!cachedScript.isEmpty())
00185         cachedScript.dequeue()->deref(this);
00186 
00187     if ( buffer )
00188         KHTML_DELETE_QCHAR_VEC(buffer);
00189     buffer = dest = 0;
00190     size = 0;
00191 
00192     if ( scriptCode )
00193         KHTML_DELETE_QCHAR_VEC(scriptCode);
00194     scriptCode = 0;
00195     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00196 
00197     currToken.reset();
00198 }
00199 
00200 void HTMLTokenizer::begin()
00201 {
00202     m_executingScript = 0;
00203     onHold = false;
00204     reset();
00205     size = 254;
00206     buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
00207     dest = buffer;
00208     tag = NoTag;
00209     pending = NonePending;
00210     discard = NoneDiscard;
00211     pre = false;
00212     prePos = 0;
00213     plaintext = false;
00214     xmp = false;
00215     processingInstruction = false;
00216     script = false;
00217     escaped = false;
00218     style = false;
00219     skipLF = false;
00220     select = false;
00221     comment = false;
00222     server = false;
00223     textarea = false;
00224     title = false;
00225     startTag = false;
00226     tquote = NoQuote;
00227     searchCount = 0;
00228     Entity = NoEntity;
00229     noMoreData = false;
00230     brokenComments = false;
00231     brokenServer = false;
00232     lineno = 0;
00233     scriptStartLineno = 0;
00234     tagStartLineno = 0;
00235 }
00236 
00237 void HTMLTokenizer::processListing(DOMStringIt list)
00238 {
00239     bool old_pre = pre;
00240 
00241     // This function adds the listing 'list' as
00242     // preformatted text-tokens to the token-collection
00243     // thereby converting TABs.
00244     if(!style) pre = true;
00245     prePos = 0;
00246 
00247     while ( list.length() )
00248     {
00249         checkBuffer(3*TAB_SIZE);
00250 
00251         if (skipLF && ( *list != '\n' ))
00252         {
00253             skipLF = false;
00254         }
00255 
00256         if (skipLF)
00257         {
00258             skipLF = false;
00259             ++list;
00260         }
00261         else if (( *list == '\n' ) || ( *list == '\r' ))
00262         {
00263             if (discard == LFDiscard)
00264             {
00265                 // Ignore this LF
00266                 discard = NoneDiscard; // We have discarded 1 LF
00267             }
00268             else
00269             {
00270                 // Process this LF
00271                 if (pending)
00272                     addPending();
00273                 pending = LFPending;
00274             }
00275             /* Check for MS-DOS CRLF sequence */
00276             if (*list == '\r')
00277             {
00278                 skipLF = true;
00279             }
00280             ++list;
00281         }
00282         else if (( *list == ' ' ) || ( *list == '\t'))
00283         {
00284             if (pending)
00285                 addPending();
00286             if (*list == ' ')
00287                 pending = SpacePending;
00288             else
00289                 pending = TabPending;
00290 
00291             ++list;
00292         }
00293         else
00294         {
00295             discard = NoneDiscard;
00296             if (pending)
00297                 addPending();
00298 
00299             prePos++;
00300             *dest++ = *list;
00301             ++list;
00302         }
00303 
00304     }
00305 
00306     if ((pending == SpacePending) || (pending == TabPending))
00307         addPending();
00308     else
00309         pending = NonePending;
00310 
00311     prePos = 0;
00312     pre = old_pre;
00313 }
00314 
00315 void HTMLTokenizer::parseSpecial(DOMStringIt &src)
00316 {
00317     assert( textarea || title || !Entity );
00318     assert( !tag );
00319     assert( xmp+textarea+title+style+script == 1 );
00320     if (script)
00321         scriptStartLineno = lineno+src.lineCount();
00322 
00323     if ( comment ) parseComment( src );
00324 
00325     while ( src.length() ) {
00326         checkScriptBuffer();
00327         unsigned char ch = src->latin1();
00328         if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
00329             comment = true;
00330             parseComment( src );
00331             continue;
00332         }
00333         if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
00334             ++src;
00335             scriptCodeSize = scriptCodeResync-1;
00336             scriptCodeResync = 0;
00337             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
00338             if ( script )
00339                 scriptHandler();
00340             else {
00341                 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00342                 processToken();
00343                 if ( style )         { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
00344                 else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
00345                 else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
00346                 else if ( xmp )  { currToken.id = ID_XMP + ID_CLOSE_TAG; }
00347                 processToken();
00348                 style = script = style = textarea = title = xmp = false;
00349                 tquote = NoQuote;
00350                 scriptCodeSize = scriptCodeResync = 0;
00351             }
00352             return;
00353         }
00354         // possible end of tagname, lets check.
00355         if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
00356              scriptCodeSize >= searchStopperLen &&
00357              !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
00358             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
00359             tquote = NoQuote;
00360             continue;
00361         }
00362         if ( scriptCodeResync && !escaped ) {
00363             if(ch == '\"')
00364                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
00365             else if(ch == '\'')
00366                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
00367             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
00368                 tquote = NoQuote;
00369         }
00370         escaped = ( !escaped && ch == '\\' );
00371         if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
00372             QChar *scriptCodeDest = scriptCode+scriptCodeSize;
00373             ++src;
00374             parseEntity(src,scriptCodeDest,true);
00375             scriptCodeSize = scriptCodeDest-scriptCode;
00376         }
00377         else {
00378             scriptCode[ scriptCodeSize++ ] = *src;
00379             ++src;
00380         }
00381     }
00382 }
00383 
00384 void HTMLTokenizer::scriptHandler()
00385 {
00386     QString currentScriptSrc = scriptSrc;
00387     scriptSrc = QString::null;
00388 
00389     processListing(DOMStringIt(scriptCode, scriptCodeSize));
00390     QString exScript( buffer, dest-buffer );
00391 
00392     processToken();
00393     currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
00394     processToken();
00395 
00396     QString prependingSrc;
00397 
00398     if ( !parser->skipMode() ) {
00399         CachedScript* cs = 0;
00400 
00401         // forget what we just got, load from src url instead
00402         if ( !currentScriptSrc.isEmpty() &&
00403              (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) ))
00404             cachedScript.enqueue(cs);
00405 
00406         if (cs) {
00407             pendingSrc.prepend( QString(src.current(), src.length() ) );
00408             setSrc(QString::null);
00409             scriptCodeSize = scriptCodeResync = 0;
00410             cs->ref(this);
00411 
00412         }
00413         else if (currentScriptSrc.isEmpty() && view && javascript ) {
00414             if ( !m_executingScript )
00415                 pendingSrc.prepend( QString( src.current(), src.length() ) ); // deep copy - again
00416             else
00417                 prependingSrc = QString( src.current(), src.length() ); // deep copy
00418 
00419             setSrc(QString::null);
00420             scriptCodeSize = scriptCodeResync = 0;
00421             scriptExecution( exScript, QString::null, tagStartLineno /*scriptStartLineno*/ );
00422         }
00423     }
00424 
00425     script = false;
00426     scriptCodeSize = scriptCodeResync = 0;
00427 
00428     if ( !m_executingScript && cachedScript.isEmpty() ) {
00429         // kdDebug( 6036 ) << "adding pending Output to parsed string" << endl;
00430         QString newStr = QString(src.current(), src.length());
00431         newStr += pendingSrc;
00432         setSrc(newStr);
00433         pendingSrc = QString::null;
00434     }
00435     else if ( !prependingSrc.isEmpty() )
00436         write( prependingSrc, false );
00437 }
00438 
00439 void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
00440                                      int baseLine)
00441 {
00442     bool oldscript = script;
00443     m_executingScript++;
00444     script = false;
00445     QString url;
00446     if (scriptURL.isNull())
00447       url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL();
00448     else
00449       url = scriptURL;
00450 
00451     view->part()->executeScript(url,baseLine,Node(),str);
00452     m_executingScript--;
00453     script = oldscript;
00454 }
00455 
00456 void HTMLTokenizer::parseComment(DOMStringIt &src)
00457 {
00458     checkScriptBuffer(src.length());
00459     while ( src.length() ) {
00460         scriptCode[ scriptCodeSize++ ] = *src;
00461 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00462         qDebug("comment is now: *%s*",
00463                QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
00464 #endif
00465         if (src->unicode() == '>' &&
00466             ( ( brokenComments && !( script || style ) ) ||
00467               ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
00468                 scriptCode[scriptCodeSize-2] == '-' ) ) ) {
00469             ++src;
00470             if ( !( script || xmp || textarea || style) ) {
00471 #ifdef COMMENTS_IN_DOM
00472                 checkScriptBuffer();
00473                 scriptCode[ scriptCodeSize ] = 0;
00474                 scriptCode[ scriptCodeSize + 1 ] = 0;
00475                 currToken.id = ID_COMMENT;
00476                 processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
00477                 processToken();
00478                 currToken.id = ID_COMMENT + ID_CLOSE_TAG;
00479                 processToken();
00480 #endif
00481                 scriptCodeSize = 0;
00482             }
00483             comment = false;
00484             return; // Finished parsing comment
00485         }
00486         ++src;
00487     }
00488 }
00489 
00490 void HTMLTokenizer::parseServer(DOMStringIt &src)
00491 {
00492     checkScriptBuffer(src.length());
00493     while ( src.length() ) {
00494         scriptCode[ scriptCodeSize++ ] = *src;
00495         if (src->unicode() == '>' &&
00496             scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
00497             ++src;
00498             server = false;
00499             scriptCodeSize = 0;
00500             return; // Finished parsing server include
00501         }
00502         ++src;
00503     }
00504 }
00505 
00506 void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src)
00507 {
00508     char oldchar = 0;
00509     while ( src.length() )
00510     {
00511         unsigned char chbegin = src->latin1();
00512         if(chbegin == '\'') {
00513             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
00514         }
00515         else if(chbegin == '\"') {
00516             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
00517         }
00518         // Look for '?>'
00519         // some crappy sites omit the "?" before it, so
00520         // we look for an unquoted '>' instead. (IE compatible)
00521         else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
00522         {
00523             // We got a '?>' sequence
00524             processingInstruction = false;
00525             ++src;
00526             discard=LFDiscard;
00527             return; // Finished parsing comment!
00528         }
00529         ++src;
00530         oldchar = chbegin;
00531     }
00532 }
00533 
00534 void HTMLTokenizer::parseText(DOMStringIt &src)
00535 {
00536     while ( src.length() )
00537     {
00538         // do we need to enlarge the buffer?
00539         checkBuffer();
00540 
00541         // ascii is okay because we only do ascii comparisons
00542         unsigned char chbegin = src->latin1();
00543 
00544         if (skipLF && ( chbegin != '\n' ))
00545         {
00546             skipLF = false;
00547         }
00548 
00549         if (skipLF)
00550         {
00551             skipLF = false;
00552             ++src;
00553         }
00554         else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
00555         {
00556             if (chbegin == '\r')
00557                 skipLF = true;
00558 
00559             *dest++ = '\n';
00560             ++src;
00561         }
00562         else {
00563             *dest++ = *src;
00564             ++src;
00565         }
00566     }
00567 }
00568 
00569 
00570 void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start)
00571 {
00572     if( start )
00573     {
00574         cBufferPos = 0;
00575         Entity = SearchEntity;
00576     }
00577 
00578     while( src.length() )
00579     {
00580         ushort cc = src->unicode();
00581         switch(Entity) {
00582         case NoEntity:
00583             return;
00584 
00585             break;
00586         case SearchEntity:
00587             if(cc == '#') {
00588                 cBuffer[cBufferPos++] = cc;
00589                 ++src;
00590                 Entity = NumericSearch;
00591             }
00592             else
00593                 Entity = EntityName;
00594 
00595             break;
00596 
00597         case NumericSearch:
00598             if(cc == 'x' || cc == 'X') {
00599                 cBuffer[cBufferPos++] = cc;
00600                 ++src;
00601                 Entity = Hexadecimal;
00602             }
00603             else if(cc >= '0' && cc <= '9')
00604                 Entity = Decimal;
00605             else
00606                 Entity = SearchSemicolon;
00607 
00608             break;
00609 
00610         case Hexadecimal:
00611         {
00612             int uc = EntityChar.unicode();
00613             int ll = kMin(src.length(), 9-cBufferPos);
00614             while(ll--) {
00615                 QChar csrc(src->lower());
00616                 cc = csrc.cell();
00617 
00618                 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
00619                     Entity = SearchSemicolon;
00620                     break;
00621                 }
00622                 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
00623                 cBuffer[cBufferPos++] = cc;
00624                 ++src;
00625             }
00626             EntityChar = QChar(uc);
00627             if(cBufferPos == 9) Entity = SearchSemicolon;
00628             break;
00629         }
00630         case Decimal:
00631         {
00632             int uc = EntityChar.unicode();
00633             int ll = kMin(src.length(), 9-cBufferPos);
00634             while(ll--) {
00635                 cc = src->cell();
00636 
00637                 if(src->row() || !(cc >= '0' && cc <= '9')) {
00638                     Entity = SearchSemicolon;
00639                     break;
00640                 }
00641 
00642                 uc = uc * 10 + (cc - '0');
00643                 cBuffer[cBufferPos++] = cc;
00644                 ++src;
00645             }
00646             EntityChar = QChar(uc);
00647             if(cBufferPos == 9)  Entity = SearchSemicolon;
00648             break;
00649         }
00650         case EntityName:
00651         {
00652             int ll = kMin(src.length(), 9-cBufferPos);
00653             while(ll--) {
00654                 QChar csrc = *src;
00655                 cc = csrc.cell();
00656 
00657                 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
00658                                    (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
00659                     Entity = SearchSemicolon;
00660                     break;
00661                 }
00662 
00663                 cBuffer[cBufferPos++] = cc;
00664                 ++src;
00665             }
00666             if(cBufferPos == 9) Entity = SearchSemicolon;
00667             if(Entity == SearchSemicolon) {
00668                 if(cBufferPos > 1) {
00669                     const entity *e = findEntity(cBuffer, cBufferPos);
00670                     if(e)
00671                         EntityChar = e->code;
00672 
00673                     // be IE compatible
00674                     if(tag && EntityChar.unicode() > 255 && *src != ';')
00675                         EntityChar = QChar::null;
00676                 }
00677             }
00678             else
00679                 break;
00680         }
00681         case SearchSemicolon:
00682 
00683             //kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << ", " << res << endl;
00684 
00685             fixUpChar(EntityChar);
00686 
00687             if ( EntityChar != QChar::null ) {
00688                 checkBuffer();
00689                 // Just insert it
00690                 if (*src == ';')
00691                     ++src;
00692 
00693                 src.push( EntityChar );
00694             } else {
00695 #ifdef TOKEN_DEBUG
00696                 kdDebug( 6036 ) << "unknown entity!" << endl;
00697 #endif
00698                 checkBuffer(10);
00699                 // ignore the sequence, add it to the buffer as plaintext
00700                 *dest++ = '&';
00701                 for(unsigned int i = 0; i < cBufferPos; i++)
00702                     dest[i] = cBuffer[i];
00703                 dest += cBufferPos;
00704                 Entity = NoEntity;
00705                 if (pre)
00706                     prePos += cBufferPos+1;
00707             }
00708 
00709             Entity = NoEntity;
00710             EntityChar = QChar::null;
00711             return;
00712         };
00713     }
00714 }
00715 
00716 void HTMLTokenizer::parseTag(DOMStringIt &src)
00717 {
00718     assert(!Entity );
00719 
00720     while ( src.length() )
00721     {
00722         checkBuffer();
00723 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00724         uint l = 0;
00725         while(l < src.length() && (*(src.current()+l)).latin1() != '>')
00726             l++;
00727         qDebug("src is now: *%s*, tquote: %d",
00728                QConstString((QChar*)src.current(), l).string().latin1(), tquote);
00729 #endif
00730         switch(tag) {
00731         case NoTag:
00732         {
00733             return;
00734         }
00735         case TagName:
00736         {
00737 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
00738             qDebug("TagName");
00739 #endif
00740             if (searchCount > 0)
00741             {
00742                 if (*src == commentStart[searchCount])
00743                 {
00744                     searchCount++;
00745                     if (searchCount == 4)
00746                     {
00747 #ifdef TOKEN_DEBUG
00748                         kdDebug( 6036 ) << "Found comment" << endl;
00749 #endif
00750                         // Found '<!--' sequence
00751                         ++src;
00752                         dest = buffer; // ignore the previous part of this tag
00753                         tag = NoTag;
00754 
00755                         comment = true;
00756                         // push what we parsed so far upon the stack. helps for <!-->
00757                         checkScriptBuffer();
00758                         scriptCode[0] = scriptCode[1] = '-';
00759                         scriptCodeSize = 2;
00760                         parseComment(src);
00761                         return; // Finished parsing tag!
00762                     }
00763                     // cuts of high part, is okay
00764                     cBuffer[cBufferPos++] = src->cell();
00765                     ++src;
00766                     break;
00767                 }
00768                 else
00769                     searchCount = 0; // Stop looking for '<!--' sequence
00770             }
00771 
00772             bool finish = false;
00773             unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00774             while(ll--) {
00775                 ushort curchar = *src;
00776                 if(curchar <= ' ' || curchar == '>' ) {
00777                     finish = true;
00778                     break;
00779                 }
00780                 // this is a nasty performance trick. will work for the A-Z
00781                 // characters, but not for others. if it contains one,
00782                 // we fail anyway
00783                 char cc = curchar;
00784                 cBuffer[cBufferPos++] = cc | 0x20;
00785                 ++src;
00786             }
00787 
00788             // Disadvantage: we add the possible rest of the tag
00789             // as attribute names. ### judge if this causes problems
00790             if(finish || CBUFLEN == cBufferPos) {
00791                 bool beginTag;
00792                 char* ptr = cBuffer;
00793                 unsigned int len = cBufferPos;
00794                 cBuffer[cBufferPos] = '\0';
00795                 if ((cBufferPos > 0) && (*ptr == '/'))
00796                 {
00797                     // End Tag
00798                     beginTag = false;
00799                     ptr++;
00800                     len--;
00801                 }
00802                 else
00803                     // Start Tag
00804                     beginTag = true;
00805                 // Accept empty xml tags like <br/>
00806                 if(len > 1 && ptr[len-1] == '/' ) {
00807                     ptr[--len] = '\0';
00808                     // if its like <br/> and not like <input/ value=foo>, take it as flat
00809                     if (*src == '>')
00810                         currToken.flat = true;
00811                 }
00812 
00813                 uint tagID = khtml::getTagID(ptr, len);
00814                 if (!tagID) {
00815 #ifdef TOKEN_DEBUG
00816                     QCString tmp(ptr, len+1);
00817                     kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
00818 #endif
00819                     dest = buffer;
00820                 }
00821                 else
00822                 {
00823 #ifdef TOKEN_DEBUG
00824                     QCString tmp(ptr, len+1);
00825                     kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
00826 #endif
00827                     currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
00828                     dest = buffer;
00829                 }
00830                 tag = SearchAttribute;
00831                 cBufferPos = 0;
00832             }
00833             break;
00834         }
00835         case SearchAttribute:
00836         {
00837 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00838                 qDebug("SearchAttribute");
00839 #endif
00840             bool atespace = false;
00841             ushort curchar;
00842             while(src.length()) {
00843                 curchar = *src;
00844                 if(curchar > ' ') {
00845                     if(curchar == '>')
00846                         tag = SearchEnd;
00847                     else if(atespace && (curchar == '\'' || curchar == '"'))
00848                     {
00849                         tag = SearchValue;
00850                         *dest++ = 0;
00851                         attrName = QString::null;
00852                     }
00853                     else
00854                         tag = AttributeName;
00855 
00856                     cBufferPos = 0;
00857                     break;
00858                 }
00859                 atespace = true;
00860                 ++src;
00861             }
00862             break;
00863         }
00864         case AttributeName:
00865         {
00866 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00867                 qDebug("AttributeName");
00868 #endif
00869             ushort curchar;
00870             int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00871 
00872             while(ll--) {
00873                 curchar = *src;
00874                 if(curchar <= '>') {
00875                     if(curchar <= ' ' || curchar == '=' || curchar == '>') {
00876                         unsigned int a;
00877                         cBuffer[cBufferPos] = '\0';
00878                         a = khtml::getAttrID(cBuffer, cBufferPos);
00879                         if ( !a )
00880                             attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00881 
00882                         dest = buffer;
00883                         *dest++ = a;
00884 #ifdef TOKEN_DEBUG
00885                         if (!a || (cBufferPos && *cBuffer == '!'))
00886                             kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
00887                         else
00888                             kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
00889 #endif
00890                         // did we just get />
00891                         if (!a && cBufferPos == 1 && *cBuffer == '/' && curchar == '>')
00892                             currToken.flat = true;
00893 
00894                         tag = SearchEqual;
00895                         break;
00896                     }
00897                 }
00898                 cBuffer[cBufferPos++] = (char) curchar | 0x20;
00899                 ++src;
00900             }
00901             if ( cBufferPos == CBUFLEN ) {
00902                 cBuffer[cBufferPos] = '\0';
00903                 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00904                 dest = buffer;
00905                 *dest++ = 0;
00906                 tag = SearchEqual;
00907             }
00908             break;
00909         }
00910         case SearchEqual:
00911         {
00912 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00913                 qDebug("SearchEqual");
00914 #endif
00915             ushort curchar;
00916             bool atespace = false;
00917             while(src.length()) {
00918                 curchar = src->unicode();
00919                 if(curchar > ' ') {
00920                     if(curchar == '=') {
00921 #ifdef TOKEN_DEBUG
00922                         kdDebug(6036) << "found equal" << endl;
00923 #endif
00924                         tag = SearchValue;
00925                         ++src;
00926                     }
00927                     else if(atespace && (curchar == '\'' || curchar == '"'))
00928                     {
00929                         tag = SearchValue;
00930                         *dest++ = 0;
00931                         attrName = QString::null;
00932                     }
00933                     else {
00934                         DOMString v("");
00935                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00936                         dest = buffer;
00937                         tag = SearchAttribute;
00938                     }
00939                     break;
00940                 }
00941                 atespace = true;
00942                 ++src;
00943             }
00944             break;
00945         }
00946         case SearchValue:
00947         {
00948             ushort curchar;
00949             while(src.length()) {
00950                 curchar = src->unicode();
00951                 if(curchar > ' ') {
00952                     if(( curchar == '\'' || curchar == '\"' )) {
00953                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
00954                         tag = QuotedValue;
00955                         ++src;
00956                     } else
00957                         tag = Value;
00958 
00959                     break;
00960                 }
00961                 ++src;
00962             }
00963             break;
00964         }
00965         case QuotedValue:
00966         {
00967 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00968                 qDebug("QuotedValue");
00969 #endif
00970             ushort curchar;
00971             while(src.length()) {
00972                 checkBuffer();
00973 
00974                 curchar = src->unicode();
00975                 if(curchar <= '\'' && !src.escaped()) {
00976                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
00977                     if ( curchar == '&' )
00978                     {
00979                         ++src;
00980                         parseEntity(src, dest, true);
00981                         break;
00982                     }
00983                     else if ( (tquote == SingleQuote && curchar == '\'') ||
00984                               (tquote == DoubleQuote && curchar == '\"') )
00985                     {
00986                         // some <input type=hidden> rely on trailing spaces. argh
00987                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
00988                             dest--; // remove trailing newlines
00989                         DOMString v(buffer+1, dest-buffer-1);
00990                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00991 
00992                         dest = buffer;
00993                         tag = SearchAttribute;
00994                         tquote = NoQuote;
00995                         ++src;
00996                         break;
00997                     }
00998                 }
00999                 *dest++ = *src;
01000                 ++src;
01001             }
01002             break;
01003         }
01004         case Value:
01005         {
01006 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01007             qDebug("Value");
01008 #endif
01009             ushort curchar;
01010             while(src.length()) {
01011                 checkBuffer();
01012                 curchar = src->unicode();
01013                 if(curchar <= '>' && !src.escaped()) {
01014                     // parse Entities
01015                     if ( curchar == '&' )
01016                     {
01017                         ++src;
01018                         parseEntity(src, dest, true);
01019                         break;
01020                     }
01021                     // no quotes. Every space means end of value
01022                     // '/' does not delimit in IE!
01023                     if ( curchar <= ' ' || curchar == '>' )
01024                     {
01025                         DOMString v(buffer+1, dest-buffer-1);
01026                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
01027                         dest = buffer;
01028                         tag = SearchAttribute;
01029                         break;
01030                     }
01031                 }
01032 
01033                 *dest++ = *src;
01034                 ++src;
01035             }
01036             break;
01037         }
01038         case SearchEnd:
01039         {
01040 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01041                 qDebug("SearchEnd");
01042 #endif
01043             while(src.length()) {
01044                 if(*src == '>')
01045                     break;
01046 
01047                 if (*src == '/')
01048                     currToken.flat = true;
01049 
01050                 ++src;
01051             }
01052             if(!src.length() && *src != '>') break;
01053 
01054             searchCount = 0; // Stop looking for '<!--' sequence
01055             tag = NoTag;
01056             tquote = NoQuote;
01057             ++src;
01058 
01059             if ( !currToken.id ) //stop if tag is unknown
01060                 return;
01061 
01062             uint tagID = currToken.id;
01063 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
01064             kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
01065 #endif
01066             bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
01067 
01068             if(tagID >= ID_CLOSE_TAG)
01069                 tagID -= ID_CLOSE_TAG;
01070             else if ( beginTag && tagID == ID_SCRIPT ) {
01071                 AttributeImpl* a = 0;
01072                 scriptSrc = scriptSrcCharset = QString::null;
01073                 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
01074                      parser->doc()->view()->part()->jScriptEnabled() && /* jscript allowed at all? */
01075                      view /* are we a regular tokenizer or just for innerHTML ? */
01076                     ) {
01077                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
01078                         scriptSrc = parser->doc()->completeURL(khtml::parseURL( a->value() ).string() );
01079                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
01080                         scriptSrcCharset = a->value().string().stripWhiteSpace();
01081                     if ( scriptSrcCharset.isEmpty() )
01082                         scriptSrcCharset = parser->doc()->view()->part()->encoding();
01083                     if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE )))
01084                         a = currToken.attrs->getAttributeItem(ATTR_TYPE);
01085                 }
01086                 javascript = true;
01087                 if( a ) {
01088                     QString lang = a->value().string();
01089                     lang = lang.lower();
01090                     if( !lang.contains("javascript") &&
01091                         !lang.contains("ecmascript") &&
01092                         !lang.contains("livescript") &&
01093                         !lang.contains("jscript") )
01094                         javascript = false;
01095                 }
01096             }
01097 
01098             processToken();
01099 
01100             // lets see if we're still in parsing mood for spaces
01101             pre = parser->preMode();
01102 
01103             switch( tagID ) {
01104             case ID_PRE:
01105                 prePos = 0;
01106                 break;
01107             case ID_SCRIPT:
01108                 if (beginTag) {
01109                     searchStopper = scriptEnd;
01110                     searchStopperLen = 8;
01111                     script = true;
01112                     parseSpecial(src);
01113                 }
01114                 break;
01115             case ID_STYLE:
01116                 if (beginTag) {
01117                     searchStopper = styleEnd;
01118                     searchStopperLen = 7;
01119                     style = true;
01120                     parseSpecial(src);
01121                 }
01122                 break;
01123             case ID_TEXTAREA:
01124                 if(beginTag) {
01125                     searchStopper = textareaEnd;
01126                     searchStopperLen = 10;
01127                     textarea = true;
01128                     discard = AllDiscard;
01129                     parseSpecial(src);
01130                 }
01131                 break;
01132             case ID_TITLE:
01133                 if (beginTag) {
01134                     searchStopper = titleEnd;
01135                     searchStopperLen = 7;
01136                     title = true;
01137                     parseSpecial(src);
01138                 }
01139                 break;
01140             case ID_XMP:
01141                 if (beginTag) {
01142                     searchStopper = xmpEnd;
01143                     searchStopperLen = 5;
01144                     xmp = true;
01145                     parseSpecial(src);
01146                 }
01147                 break;
01148             case ID_SELECT:
01149                 select = beginTag;
01150                 break;
01151             case ID_PLAINTEXT:
01152                 plaintext = beginTag;
01153                 break;
01154             }
01155             return; // Finished parsing tag!
01156         }
01157         } // end switch
01158     }
01159     return;
01160 }
01161 
01162 void HTMLTokenizer::addPending()
01163 {
01164     if ( select && !(comment || script))
01165     {
01166         *dest++ = ' ';
01167     }
01168     else if ( textarea )
01169     {
01170         switch(pending) {
01171         case LFPending:  *dest++ = '\n'; prePos = 0; break;
01172         case SpacePending: *dest++ = ' '; ++prePos; break;
01173         case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
01174         case NonePending:
01175             assert(0);
01176         }
01177     }
01178     else if ( pre )
01179     {
01180         int p;
01181 
01182         switch (pending)
01183         {
01184         case SpacePending:
01185             // Insert a breaking space
01186             *dest++ = QChar(' ');
01187             prePos++;
01188             break;
01189 
01190         case LFPending:
01191             *dest = '\n';
01192             dest++;
01193             prePos = 0;
01194             break;
01195 
01196         case TabPending:
01197             p = TAB_SIZE - ( prePos % TAB_SIZE );
01198             for ( int x = 0; x < p; x++ )
01199                 *dest++ = QChar(' ');
01200             prePos += p;
01201             break;
01202 
01203         case NonePending:
01204             assert(0);
01205             break;
01206         }
01207     }
01208     else
01209     {
01210         *dest++ = ' ';
01211     }
01212 
01213     pending = NonePending;
01214 }
01215 
01216 void HTMLTokenizer::write( const QString &str, bool appendData )
01217 {
01218 #ifdef TOKEN_DEBUG
01219     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
01220 #endif
01221 
01222     if ( !buffer )
01223         return;
01224 
01225     if ( ( m_executingScript && appendData ) ||
01226          ( !m_executingScript && cachedScript.count() ) ) {
01227         // don't parse; we will do this later
01228         pendingSrc += str;
01229         return;
01230     }
01231 
01232     if ( onHold ) {
01233         QString rest = QString( src.current(), src.length() );
01234         rest += str;
01235         setSrc(rest);
01236         return;
01237     }
01238     else
01239         setSrc(str);
01240 
01241 //     if (Entity)
01242 //         parseEntity(src, dest);
01243 
01244     while ( src.length() )
01245     {
01246         // do we need to enlarge the buffer?
01247         checkBuffer();
01248 
01249         ushort cc = src->unicode();
01250 
01251         if (skipLF && (cc != '\n'))
01252             skipLF = false;
01253 
01254         if (skipLF) {
01255             skipLF = false;
01256             ++src;
01257         }
01258         else if ( Entity )
01259             parseEntity( src, dest );
01260         else if ( plaintext )
01261             parseText( src );
01262         else if (script)
01263             parseSpecial(src);
01264         else if (style)
01265             parseSpecial(src);
01266         else if (xmp)
01267             parseSpecial(src);
01268         else if (textarea)
01269             parseSpecial(src);
01270         else if (title)
01271             parseSpecial(src);
01272         else if (comment)
01273             parseComment(src);
01274         else if (server)
01275             parseServer(src);
01276         else if (processingInstruction)
01277             parseProcessingInstruction(src);
01278         else if (tag)
01279             parseTag(src);
01280         else if ( startTag )
01281         {
01282             startTag = false;
01283 
01284             switch(cc) {
01285             case '/':
01286                 break;
01287             case '!':
01288             {
01289                 // <!-- comment -->
01290                 searchCount = 1; // Look for '<!--' sequence to start comment
01291 
01292                 break;
01293             }
01294             case '?':
01295             {
01296                 // xml processing instruction
01297                 processingInstruction = true;
01298                 tquote = NoQuote;
01299                 parseProcessingInstruction(src);
01300                 continue;
01301 
01302                 break;
01303             }
01304             case '%':
01305                 if (!brokenServer) {
01306                     // <% server stuff, handle as comment %>
01307                     server = true;
01308                     tquote = NoQuote;
01309                     parseServer(src);
01310                     continue;
01311                 }
01312                 // else fall through
01313             default:
01314             {
01315                 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
01316                 {
01317                     // Start of a Start-Tag
01318                 }
01319                 else
01320                 {
01321                     // Invalid tag
01322                     // Add as is
01323                     if (pending)
01324                         addPending();
01325                     *dest = '<';
01326                     dest++;
01327                     continue;
01328                 }
01329             }
01330             }; // end case
01331 
01332             if ( pending ) {
01333                 // pre context always gets its spaces/linefeeds
01334                 if ( pre )
01335                     addPending();
01336                 // only add in existing inline context or if
01337                 // we just started one, i.e. we're about to insert real text
01338                 else if ( !parser->selectMode() &&
01339                           ( !parser->noSpaces() || dest > buffer )) {
01340                     addPending();
01341             discard = AllDiscard;
01342                 }
01343                 // just forget it
01344                 else
01345                     pending = NonePending;
01346             }
01347 
01348             processToken();
01349 
01350             cBufferPos = 0;
01351             tag = TagName;
01352             parseTag(src);
01353         }
01354         else if ( cc == '&' && !src.escaped())
01355         {
01356             ++src;
01357             if ( pending )
01358                 addPending();
01359             parseEntity(src, dest, true);
01360         }
01361         else if ( cc == '<' && !src.escaped())
01362         {
01363             tagStartLineno = lineno+src.lineCount();
01364             ++src;
01365             startTag = true;
01366         }
01367         else if (( cc == '\n' ) || ( cc == '\r' ))
01368         {
01369             if ( pre || textarea)
01370             {
01371                 if (discard == LFDiscard || discard == AllDiscard)
01372                 {
01373                     // Ignore this LF
01374                     discard = NoneDiscard; // We have discarded 1 LF
01375                 }
01376                 else
01377                 {
01378                     // Process this LF
01379                     if (pending)
01380                         addPending();
01381                     pending = LFPending;
01382                 }
01383             }
01384             else
01385             {
01386                 if (discard == LFDiscard)
01387                 {
01388                     // Ignore this LF
01389                     discard = NoneDiscard; // We have discarded 1 LF
01390                 }
01391                 else if(discard == AllDiscard)
01392                 {
01393                 }
01394                 else
01395                 {
01396                     // Process this LF
01397                     if (pending == NonePending)
01398                         pending = LFPending;
01399                 }
01400             }
01401             /* Check for MS-DOS CRLF sequence */
01402             if (cc == '\r')
01403             {
01404                 skipLF = true;
01405             }
01406             ++src;
01407         }
01408         else if (( cc == ' ' ) || ( cc == '\t' ))
01409         {
01410             if ( pre || textarea)
01411             {
01412                 if (discard == SpaceDiscard || discard == AllDiscard)
01413                 {
01414                     // Ignore this LF
01415                     discard = NoneDiscard; // We have discarded 1 LF
01416                 }
01417                 else {
01418                     if (pending)
01419                         addPending();
01420                     if (cc == ' ')
01421                         pending = SpacePending;
01422                     else
01423                         pending = TabPending;
01424                 }
01425             }
01426             else
01427             {
01428                 if(discard == SpaceDiscard)
01429                     discard = NoneDiscard;
01430                 else if(discard == AllDiscard)
01431                 { }
01432                 else
01433                     pending = SpacePending;
01434             }
01435             ++src;
01436         }
01437         else
01438         {
01439             if (pending)
01440                 addPending();
01441 
01442             discard = NoneDiscard;
01443             if ( pre )
01444             {
01445                 prePos++;
01446             }
01447             *dest = *src;
01448             fixUpChar( *dest );
01449             ++dest;
01450             ++src;
01451         }
01452     }
01453     _src = QString::null;
01454 
01455     if (noMoreData && cachedScript.isEmpty() && !m_executingScript )
01456         end(); // this actually causes us to be deleted
01457 }
01458 
01459 void HTMLTokenizer::end()
01460 {
01461     if ( buffer == 0 ) {
01462         emit finishedParsing();
01463         return;
01464     }
01465 
01466     // parseTag is using the buffer for different matters
01467     if ( !tag )
01468         processToken();
01469 
01470     if(buffer)
01471         KHTML_DELETE_QCHAR_VEC(buffer);
01472 
01473     if(scriptCode)
01474         KHTML_DELETE_QCHAR_VEC(scriptCode);
01475 
01476     scriptCode = 0;
01477     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01478     buffer = 0;
01479     emit finishedParsing();
01480 }
01481 
01482 void HTMLTokenizer::finish()
01483 {
01484     // do this as long as we don't find matching comment ends
01485     while((comment || server) && scriptCode && scriptCodeSize)
01486     {
01487         // we've found an unmatched comment start
01488         if (comment)
01489             brokenComments = true;
01490         else
01491             brokenServer = true;
01492         checkScriptBuffer();
01493         scriptCode[ scriptCodeSize ] = 0;
01494         scriptCode[ scriptCodeSize + 1 ] = 0;
01495         int pos;
01496         QString food;
01497         if (script || style) {
01498             food.setUnicode(scriptCode, scriptCodeSize);
01499         }
01500         else if (server) {
01501             food = "<";
01502             food += QString(scriptCode, scriptCodeSize);
01503         }
01504         else {
01505             pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
01506             food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
01507         }
01508         KHTML_DELETE_QCHAR_VEC(scriptCode);
01509         scriptCode = 0;
01510         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01511         comment = server = false;
01512         if ( !food.isEmpty() )
01513             write(food, true);
01514     }
01515     // this indicates we will not recieve any more data... but if we are waiting on
01516     // an external script to load, we can't finish parsing until that is done
01517     noMoreData = true;
01518     if (cachedScript.isEmpty() && !m_executingScript && !onHold)
01519         end(); // this actually causes us to be deleted
01520 }
01521 
01522 void HTMLTokenizer::processToken()
01523 {
01524     KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
01525     if (jsProxy)
01526         jsProxy->setEventHandlerLineno(tagStartLineno);
01527     if ( dest > buffer )
01528     {
01529 #ifdef TOKEN_DEBUG
01530         if(currToken.id) {
01531             qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
01532             assert(0);
01533         }
01534 
01535 #endif
01536         currToken.text = new DOMStringImpl( buffer, dest - buffer );
01537         currToken.text->ref();
01538         currToken.id = ID_TEXT;
01539     }
01540     else if(!currToken.id) {
01541         currToken.reset();
01542         if (jsProxy)
01543             jsProxy->setEventHandlerLineno(lineno+src.lineCount());
01544         return;
01545     }
01546 
01547     dest = buffer;
01548 
01549 #ifdef TOKEN_DEBUG
01550     QString name = getTagName(currToken.id).string();
01551     QString text;
01552     if(currToken.text)
01553         text = QConstString(currToken.text->s, currToken.text->l).string();
01554 
01555     kdDebug( 6036 ) << "Token --> " << name << "   id = " << currToken.id << endl;
01556     if (currToken.flat)
01557         kdDebug( 6036 ) << "Token is FLAT!" << endl;
01558     if(!text.isNull())
01559         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
01560     unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
01561     if(l) {
01562         kdDebug( 6036 ) << "Attributes: " << l << endl;
01563         for (unsigned long i = 0; i < l; ++i) {
01564             AttributeImpl* c = currToken.attrs->attributeItem(i);
01565             kdDebug( 6036 ) << "    " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string()
01566                             << "=\"" << c->value().string() << "\"" << endl;
01567         }
01568     }
01569     kdDebug( 6036 ) << endl;
01570 #endif
01571     // pass the token over to the parser, the parser DOES NOT delete the token
01572     parser->parseToken(&currToken);
01573 
01574     if ( currToken.flat && currToken.id != ID_TEXT && !parser->noSpaces() )
01575     discard = NoneDiscard;
01576     else if ( parser->selectMode() )
01577         discard = AllDiscard;
01578 
01579     currToken.reset();
01580     if (jsProxy)
01581         jsProxy->setEventHandlerLineno(0);
01582 }
01583 
01584 
01585 HTMLTokenizer::~HTMLTokenizer()
01586 {
01587     reset();
01588     delete parser;
01589 }
01590 
01591 
01592 void HTMLTokenizer::enlargeBuffer(int len)
01593 {
01594     int newsize = kMax(size*2, size+len);
01595     int oldoffs = (dest - buffer);
01596 
01597     buffer = (QChar*)realloc(buffer, newsize*sizeof(QChar));
01598     dest = buffer + oldoffs;
01599     size = newsize;
01600 }
01601 
01602 void HTMLTokenizer::enlargeScriptBuffer(int len)
01603 {
01604     int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
01605     scriptCode = (QChar*)realloc(scriptCode, newsize*sizeof(QChar));
01606     scriptCodeMaxSize = newsize;
01607 }
01608 
01609 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
01610 {
01611     assert(!cachedScript.isEmpty());
01612     bool done = false;
01613     while (!done && cachedScript.head()->isLoaded()) {
01614 #ifdef TOKEN_DEBUG
01615         kdDebug( 6036 ) << "Finished loading an external script" << endl;
01616 #endif
01617         CachedScript* cs = cachedScript.dequeue();
01618         done = cachedScript.isEmpty();
01619         DOMString scriptSource = cs->script();
01620 #ifdef TOKEN_DEBUG
01621         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
01622 #endif
01623         setSrc(QString::null);
01624 
01625         // make sure we forget about the script before we execute the new one
01626         // infinite recursion might happen otherwise
01627         QString cachedScriptUrl( cs->url().string() );
01628         cs->deref(this);
01629 
01630     scriptExecution( scriptSource.string(), cachedScriptUrl );
01631 
01632         // 'script' is true when we are called synchronously from
01633         // parseScript(). In that case parseScript() will take care
01634         // of 'scriptOutput'.
01635         if ( !script ) {
01636             QString rest = pendingSrc;
01637             pendingSrc = QString::null;
01638             write(rest, false);
01639             // we might be deleted at this point, do not
01640             // access any members.
01641         }
01642     }
01643 }
01644 
01645 void HTMLTokenizer::setSrc(const QString& source)
01646 {
01647     lineno += src.lineCount();
01648     _src = source;
01649     src = DOMStringIt(_src);
01650 }
01651 
01652 void HTMLTokenizer::setOnHold(bool _onHold)
01653 {
01654     if (onHold == _onHold) return;
01655     onHold = _onHold;
01656     if (onHold)
01657         setSrc(QString(src.current(), src.length())); // ### deep copy
01658 }
01659 
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.0.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Wed Oct 8 12:22:38 2003 by doxygen 1.2.18 written by Dimitri van Heesch, © 1997-2001