khtml Library API Documentation

htmltokenizer.h

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
00005               (C) 1997 Torben Weis (weis@kde.org)
00006               (C) 1998 Waldo Bastian (bastian@kde.org)
00007               (C) 2001 Dirk Mueller (mueller@kde.org)
00008 
00009     This library is free software; you can redistribute it and/or
00010     modify it under the terms of the GNU Library General Public
00011     License as published by the Free Software Foundation; either
00012     version 2 of the License, or (at your option) any later version.
00013 
00014     This library is distributed in the hope that it will be useful,
00015     but WITHOUT ANY WARRANTY; without even the implied warranty of
00016     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017     Library General Public License for more details.
00018 
00019     You should have received a copy of the GNU Library General Public License
00020     along with this library; see the file COPYING.LIB.  If not, write to
00021     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00022     Boston, MA 02111-1307, USA.
00023 */
00024 //----------------------------------------------------------------------------
00025 //
00026 // KDE HTML Widget -- Tokenizers
00027 // $Id: htmltokenizer.h,v 1.62.2.2 2003/01/13 22:04:48 mueller Exp $
00028 
00029 #ifndef HTMLTOKENIZER_H
00030 #define HTMLTOKENIZER_H
00031 
00032 #include <qstring.h>
00033 #include <qobject.h>
00034 #include <qptrqueue.h>
00035 
00036 #include "misc/loader_client.h"
00037 #include "misc/htmltags.h"
00038 #include "misc/stringit.h"
00039 #include "xml/dom_stringimpl.h"
00040 #include "xml/xml_tokenizer.h"
00041 #include "xml/dom_elementimpl.h"
00042 #include "xml/dom_docimpl.h"
00043 
00044 class KCharsets;
00045 class KHTMLView;
00046 
00047 namespace DOM {
00048     class DocumentPtr;
00049     class DocumentFragmentImpl;
00050 }
00051 
00052 namespace khtml {
00053     class CachedScript;
00054     class KHTMLParser;
00055 
00062     class Token
00063     {
          // Holder for one token: a numeric id, a flat flag, an optional attribute
          // map (attrs) and an optional text string (text). HTMLTokenizer keeps one
          // instance of it as its currToken member.
          //
          // attrs and text are raw pointers whose lifetime is managed through
          // ref()/deref() calls. Token only ever calls deref() on text, never ref(),
          // so the code that assigns text is expected to have taken the reference.
          //
          // NOTE(review): no copy constructor or assignment operator is declared, so
          // the compiler-generated ones would copy attrs and text without calling
          // ref(), and both copies would later deref() the same objects.
00064     public:
              // Creates an empty token: id 0, no attribute map, no text, flat false.
00065         Token() {
00066             id = 0;
00067             attrs = 0;
00068             text = 0;
00069             flat = false;
00070             //qDebug("new token, creating %08lx", attrs);
00071         }
              // Drops the references held on attrs and text, if either is set.
00072         ~Token() {
00073             if(attrs) attrs->deref();
00074             if(text) text->deref();
00075         }
              // Adds one attribute with value v to this token.
              //
              //   doc      - document asked for the attribute id of attrName
              //   buffer   - only its first QChar is read; when that is non-zero its
              //              unicode() value is passed as the first AttributeImpl
              //              constructor argument, in the position where the other
              //              branch passes the result of doc->attrId() (presumably an
              //              already resolved attribute id -- TODO confirm)
              //   attrName - attribute name, consulted only when *buffer is zero
              //   v        - attribute value
              //
              // Nothing is added when *buffer is zero and attrName is either empty
              // or consists of a single slash.
00076         void addAttribute(DocumentImpl* doc, QChar* buffer, const QString& attrName, const DOMString& v)
00077         {
00078             AttributeImpl* a = 0;
00079             if(buffer->unicode())
00080                 a = new AttributeImpl(buffer->unicode(), v.implementation());
00081             else if ( !attrName.isEmpty() && attrName != "/" )
00082                 a = new AttributeImpl(doc->attrId(0, DOMString(attrName).implementation(), false),
00083                                       v.implementation());
00084 
00085             if (a) {
                      // The attribute map is created lazily, on the first attribute,
                      // and ref()ed once; reset() and the destructor release it.
00086                 if(!attrs) {
00087                     attrs = new DOM::NamedAttrMapImpl(0);
00088                     attrs->ref();
00089                 }
00090                 attrs->insertAttribute(a);
00091             }
00092         }
              // Puts the token back into the state the constructor leaves it in,
              // calling deref() on attrs and text first if they are set.
00093         void reset()
00094         {
00095             if(attrs) {
00096                 attrs->deref();
00097                 attrs = 0;
00098             }
00099             id = 0;
00100             if(text) {
00101                 text->deref();
00102                 text = 0;
00103             }
00104             flat = false;
00105         }
              // Attribute map; 0 until addAttribute() stores the first attribute.
00106         DOM::NamedAttrMapImpl* attrs;
              // Text string; 0 when there is none. Never assigned inside this class.
00107         DOMStringImpl* text;
              // Token id; 0 after construction and reset(). Presumably one of the ids
              // from misc/htmltags.h -- TODO confirm.
00108         ushort id;
              // Cleared by the constructor and reset(), never set in this class; its
              // meaning is defined by the code that sets it.
00109         bool flat;
00110     };
00111 
00112 // The count of spaces used for each tab.
      // Not referenced anywhere else in this header; presumably used by the
      // tokenizer implementation when tracking columns -- TODO confirm.
00113 #define TAB_SIZE 8
00114 
00115 //-----------------------------------------------------------------------------
00116 
00117 class HTMLTokenizer : public Tokenizer, public CachedObjectClient
00118 {
          // HTML tokenizer. Derives from Tokenizer and from CachedObjectClient (see
          // notifyFinished() below). All parsing state lives in the protected data
          // members further down. Only checkBuffer() and checkScriptBuffer() have
          // bodies in this header; every other member function is declared only.
00119 public:
          // Construct for a document, either with an optional view (default 0) or
          // with a document fragment.
00120     HTMLTokenizer(DOM::DocumentPtr *, KHTMLView * = 0);
00121     HTMLTokenizer(DOM::DocumentPtr *, DOM::DocumentFragmentImpl *frag);
00122     virtual ~HTMLTokenizer();
00123 
          // Public driving interface. The behaviour of these calls is defined in the
          // implementation file, not in this header.
00124     void begin();
00125     void write( const QString &str, bool appendData );
00126     void end();
00127     void finish();
          // Presumably updates the onHold flag declared below -- TODO confirm.
00128     virtual void setOnHold(bool _onHold);
00129 
00130 protected:
00131     void reset();
00132     void addPending();
00133     void processToken();
00134     void processListing(khtml::DOMStringIt list);
00135 
          // One parse routine per construct. Each takes the input iterator by
          // reference.
00136     void parseComment(khtml::DOMStringIt &str);
00137     void parseServer(khtml::DOMStringIt &str);
00138     void parseText(khtml::DOMStringIt &str);
00139     void parseListing(khtml::DOMStringIt &str);
00140     void parseSpecial(khtml::DOMStringIt &str);
00141     void parseTag(khtml::DOMStringIt &str);
00142     void parseEntity(khtml::DOMStringIt &str, QChar *&dest, bool start = false);
00143     void parseProcessingInstruction(khtml::DOMStringIt &str);
00144     void scriptHandler();
00145     void scriptExecution(const QString& script, QString scriptURL = QString(),
00146                          int baseLine = 0);
00147     void setSrc(const QString& source);
00148 
00149     // Make sure the internal buffer can take len more QChars:
00150     // enlargeBuffer(len) is called when fewer than len remain free after dest.
00151     inline void checkBuffer(int len = 10)
00152     {
00153         if ( (dest - buffer) > size-len )
00154             enlargeBuffer(len);
00155     }
          // Same idea for the script buffer: enlargeScriptBuffer(len) is called when
          // scriptCodeSize + len would reach or pass scriptCodeMaxSize.
00156     inline void checkScriptBuffer(int len = 10)
00157     {
00158         if ( scriptCodeSize + len >= scriptCodeMaxSize )
00159             enlargeScriptBuffer(len);
00160     }
00161 
00162     void enlargeBuffer(int len);
00163     void enlargeScriptBuffer(int len);
00164 
00165     // from CachedObjectClient
00166     void notifyFinished(khtml::CachedObject *finishedObj);
00167 protected:
00168     // Internal buffers
          // buffer is the start of the internal buffer and dest a position inside
          // it; checkBuffer() measures dest - buffer against size.
00170     QChar *buffer;
00171     QChar *dest;
00172 
          // The token instance owned by this tokenizer.
00173     khtml::Token currToken;
00174 
00175     // the size of buffer
00176     int size;
00177 
00178     // Tokenizer flags
00180     // are we in quotes within a html tag
00181     enum
00182     {
00183         NoQuote = 0,
00184         SingleQuote,
00185         DoubleQuote
00186     } tquote;
00187 
          // Kind of pending whitespace, if any. Presumably consumed by addPending()
          // -- TODO confirm.
00188     enum
00189     {
00190         NonePending = 0,
00191         SpacePending,
00192         LFPending,
00193         TabPending
00194     } pending;
00195 
00196     // Discard line breaks immediately after start-tags
00197     // Discard spaces after '=' within tags
00198     enum
00199     {
00200         NoneDiscard = 0,
00201         SpaceDiscard,
00202         LFDiscard,
00203         AllDiscard  // discard all spaces, LF's etc until next non white char
00204     } discard;
00205 
00206     // Discard the LF part of CRLF sequence
00207     bool skipLF;
00208 
00209     // Flag to say that we have the '<' but not the character following it.
00210     bool startTag;
00211 
00212     // Flag to say, we are just parsing a tag, meaning, we are in the middle
00213     // of <tag...
00214     enum {
00215         NoTag = 0,
00216         TagName,
00217         SearchAttribute,
00218         AttributeName,
00219         SearchEqual,
00220         SearchValue,
00221         QuotedValue,
00222         Value,
00223         SearchEnd
00224     } tag;
00225 
00226     // Are we in a &... character entity description?
00227     enum {
00228         NoEntity = 0,
00229         SearchEntity,
00230         NumericSearch,
00231         Hexadecimal,
00232         Decimal,
00233         EntityName,
00234         SearchSemicolon
00235     } Entity;
00236 
00237     // are we in a <script> ... </script> block
00238     bool script;
00239 
          // NOTE(review): undocumented; presumably the character value accumulated
          // while an entity is being parsed -- TODO confirm.
00240     QChar EntityChar;
00241 
00242     // Are we in a <pre> ... </pre> block
00243     bool pre;
00244 
00245     // if 'pre == true' we track in which column we are
00246     int prePos;
00247 
00248     // Are we in a <style> ... </style> block
00249     bool style;
00250 
00251     // Are we in a <select> ... </select> block
00252     bool select;
00253 
00254     // Are we in a <xmp> ... </xmp> block
00255     bool xmp;
00256 
00257     // Are we in a <title> ... </title> block
00258     bool title;
00259 
00260     // Are we in plain textmode ?
00261     bool plaintext;
00262 
00263     // XML processing instructions. Ignored at the moment
00264     bool processingInstruction;
00265 
00266     // Are we in a <!-- comment --> block
00267     bool comment;
00268 
00269     // Are we in a <textarea> ... </textarea> block
00270     bool textarea;
00271 
00272     // was the previous character escaped ?
00273     bool escaped;
00274 
00275     // are we in a server includes statement?
00276     bool server;
00277 
          // NOTE(review): undocumented companion of the server flag; meaning is not
          // visible in this header.
00278     bool brokenServer;
00279 
00280     // name of an unknown attribute
00281     QString attrName;
00282 
00283     // Used to store the code of a scripting sequence
00284     QChar *scriptCode;
00285     // Size of the script sequence stored in @ref #scriptCode
00286     int scriptCodeSize;
00287     // Maximal size that can be stored in @ref #scriptCode
00288     int scriptCodeMaxSize;
00289     // resync point of script code size
00290     int scriptCodeResync;
00291 
00292     // Stores characters if we are scanning for a string like "</script>"
00293     QChar searchBuffer[ 10 ];
00294     // Counts where we are in the string we are scanning for
00295     int searchCount;
00296     // The string we are searching for
00297     const QChar *searchFor;
00298     // the stopper string
00299     const char* searchStopper;
00300     // the stopper len
00301     int searchStopperLen;
00302     // if no more data is coming, just parse what we have (including ext scripts that
00303     // may be still downloading) and finish
00304     bool noMoreData;
00305     // URL to get source code of script from
00306     QString scriptSrc;
          // Presumably the charset declared for the script at scriptSrc -- TODO
          // confirm.
00307     QString scriptSrcCharset;
          // NOTE(review): undocumented; presumably whether the current script is
          // JavaScript -- TODO confirm.
00308     bool javascript;
00309     // the HTML code we will parse after the external script we are waiting for has loaded
00310     QString pendingSrc;
00311     // true if we are executing a script while parsing a document. This causes the parsing of
00312     // the output of the script to be postponed until after the script has finished executing
          // NOTE(review): declared as int although described as a flag; presumably a
          // nesting count -- confirm in the implementation.
00313     int m_executingScript;
          // Queue of CachedScript objects. Presumably the external scripts still
          // being waited for (see notifyFinished()) -- TODO confirm.
00314     QPtrQueue<khtml::CachedScript> cachedScript;
00315     // you can pause the tokenizer if you need to display a dialog or something
00316     bool onHold;
00317 
00318     // if we found one broken comment, there are most likely others as well
00319     // store a flag to get rid of the O(n^2) behaviour in such a case.
00320     bool brokenComments;
00321     // current line number
00322     int lineno;
00323     // line number at which the current <script> started
00324     int scriptStartLineno;
          // Presumably the line number at which the current tag started, by analogy
          // with scriptStartLineno -- TODO confirm.
00325     int tagStartLineno;
00326 
          // Small char buffer of CBUFLEN + 2 = 16 bytes, with cBufferPos presumably
          // its current fill position; its use is not visible in this header.
00327 #define CBUFLEN 14
00328     char cBuffer[CBUFLEN+2];
00329     unsigned int cBufferPos;
00330 
          // _src is a string and src an iterator; presumably src walks over _src
          // (see setSrc()) -- TODO confirm.
00331     QString _src;
00332     khtml::DOMStringIt src;
00333 
          // Collaborators held by pointer; ownership is not visible in this header.
00334     KCharsets *charsets;
00335     KHTMLParser *parser;
00336 
00337     KHTMLView *view;
00338 };
00339 
00340 }; // namespace
00341 
00342 #endif // HTMLTOKENIZER_H
00343 
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.0.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Wed Oct 8 12:22:38 2003 by doxygen 1.2.18 written by Dimitri van Heesch, © 1997-2001