khtml Library API Documentation

decoder.cpp

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00019     Boston, MA 02111-1307, USA.
00020 */
00021 //----------------------------------------------------------------------------
00022 //
00023 // KDE HTML Widget -- decoder for input stream
00024 // $Id: decoder.cpp,v 1.59 2002/11/22 03:05:38 mueller Exp $
00025 
00026 #undef DECODE_DEBUG
00027 //#define DECODE_DEBUG
00028 
00029 #include <assert.h>
00030 
00031 #include "decoder.h"
00032 using namespace khtml;
00033 
00034 #include "htmlhashes.h"
00035 
00036 #include <qregexp.h>
00037 #include <qtextcodec.h>
00038 
00039 #include <kglobal.h>
00040 #include <kcharsets.h>
00041 
00042 #include <ctype.h>
00043 #include <kdebug.h>
00044 #include <klocale.h>
00045 
/**
 * Heuristic helper used to guess which Japanese encoding a byte string uses.
 *
 * All members are static; the class only serves as a namespace for the
 * detector (judge()), its byte classification table and a few constants.
 */
class KanjiCode
{
public:
    /** Possible results of the detection. */
    enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };

    /** Escape byte; judge() looks for it at the start of JIS sequences. */
    static const int ESC;
    /** Byte value 0x8e, marked as SS2 in the EUC-JP notes of this file. */
    static const int _SS2_;
    /**
     * Per-byte flags, indexed by byte value:
     * bit 0 (value 1) is tested by ISkanji(), bit 1 (value 2) by ISkana().
     */
    static const unsigned char kanji_map_sjis[];

    /** Guess the encoding of the NUL-terminated byte string @p str. */
    static enum Type judge(const char *str);

    /**
     * @return non-zero (1) if @p code is below 0x100 and has the kanji bit
     *         set in kanji_map_sjis, 0 otherwise.
     */
    static int ISkanji(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 1) : 0;
    }

    /**
     * @return non-zero (2) if @p code is below 0x100 and has the kana bit
     *         set in kanji_map_sjis, 0 otherwise.
     */
    static int ISkana(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 2) : 0;
    }
};
00069 
00070 const int KanjiCode::ESC = 0x1b;
00071 const int KanjiCode::_SS2_ = 0x8e;
00072 
00073 const unsigned char KanjiCode::kanji_map_sjis[] =
00074 {
00075     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00076     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00077     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00078     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00079     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00080     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00081     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00082     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00083     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00084     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00085     0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00086     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00087     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00088     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00089     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00090     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
00091 };
00092 
00093 /*
00094  * EUC-JP is
00095  *     [0xa1 - 0xfe][0xa1 - 0xfe]
00096  *     0x8e[0xa1 - 0xfe](SS2)
00097  *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
00098  *
00099  * Shift_Jis is
00100  *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
00101  *
00102  * Shift_Jis Hankaku Kana is
00103  *     [0xa1 - 0xdf]
00104  */
00105 
00106 /*
00107  * KanjiCode::judge() is based on judge_jcode() from jvim
00108  *     http://hp.vector.co.jp/authors/VA003457/vim/
00109  *
00110  * Special Thanks to Kenichi Tsuchida
00111  */
00112 
00113 /*
00114  * We could use QTextCodec::heuristicContentMatch() instead,
00115  * but it fails to detect the encoding, so it is not useful here.
00116  */
00117 
00118 enum KanjiCode::Type KanjiCode::judge(const char *str)
00119 {
00120     enum Type code;
00121     int i;
00122     int bfr = FALSE;        /* Kana Moji */
00123     int bfk = 0;        /* EUC Kana */
00124     int sjis = 0;
00125     int euc = 0;
00126 
00127     const unsigned char *ptr = (const unsigned char *) str;
00128     int size = strlen(str);
00129 
00130     code = ASCII;
00131 
00132     i = 0;
00133     while (i < size) {
00134     if (ptr[i] == ESC && (size - i >= 3)) {
00135         if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
00136         || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
00137         code = JIS;
00138         goto breakBreak;
00139         } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
00140             || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
00141         code = JIS;
00142         goto breakBreak;
00143         } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
00144         code = JIS;
00145         i += 3;
00146         } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
00147         code = JIS;
00148         i += 3;
00149         } else {
00150         i++;
00151         }
00152         bfr = FALSE;
00153         bfk = 0;
00154     } else {
00155         if (ptr[i] < 0x20) {
00156         bfr = FALSE;
00157         bfk = 0;
00158         /* ?? check kudokuten ?? && ?? hiragana ?? */
00159         if ((i >= 2) && (ptr[i - 2] == 0x81)
00160             && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
00161             code = SJIS;
00162             sjis += 100;    /* kudokuten */
00163         } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
00164             && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
00165             code = EUC;
00166             euc += 100;     /* kudokuten */
00167         } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
00168             sjis += 40;     /* hiragana */
00169         } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
00170             euc += 40;  /* hiragana */
00171         }
00172         } else {
00173         /* ?? check hiragana or katana ?? */
00174         if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
00175             sjis++; /* hiragana */
00176         } else if ((size - i > 1) && (ptr[i] == 0x83)
00177              && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
00178             sjis++; /* katakana */
00179         } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
00180             euc++;  /* hiragana */
00181         } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
00182             euc++;  /* katakana */
00183         }
00184         if (bfr) {
00185             if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
00186             code = SJIS;
00187             goto breakBreak;
00188             } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
00189             code = SJIS;
00190             goto breakBreak;
00191             } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
00192             code = EUC;
00193             goto breakBreak;
00194             } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
00195             code = EUC;
00196             goto breakBreak;
00197             } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
00198             code = SJIS;
00199             goto breakBreak;
00200             } else if (ptr[i] <= 0x7f) {
00201             code = SJIS;
00202             goto breakBreak;
00203             } else {
00204             if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
00205                 euc++;  /* sjis hankaku kana kigo */
00206             } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
00207                 ;   /* sjis hankaku kana */
00208             } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
00209                 euc++;
00210             } else if (0x8e == ptr[i]) {
00211                 euc++;
00212             } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
00213                 sjis++;
00214             }
00215             bfr = FALSE;
00216             bfk = 0;
00217             }
00218         } else if (0x8e == ptr[i]) {
00219             if (size - i <= 1) {
00220             ;
00221             } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
00222             /* EUC KANA or SJIS KANJI */
00223             if (bfk == 1) {
00224                 euc += 100;
00225             }
00226             bfk++;
00227             i++;
00228             } else {
00229             /* SJIS only */
00230             code = SJIS;
00231             goto breakBreak;
00232             }
00233         } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
00234             /* SJIS only */
00235             code = SJIS;
00236             if ((size - i >= 1)
00237                 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
00238                 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
00239             goto breakBreak;
00240             }
00241         } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
00242             /* EUC only */
00243             code = EUC;
00244             if ((size - i >= 1)
00245                 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
00246             goto breakBreak;
00247             }
00248         } else if (ptr[i] <= 0x7f) {
00249             ;
00250         } else {
00251             bfr = TRUE;
00252             bfk = 0;
00253         }
00254         }
00255         i++;
00256     }
00257     }
00258     if (code == ASCII) {
00259     if (sjis > euc) {
00260         code = SJIS;
00261     } else if (sjis < euc) {
00262         code = EUC;
00263     }
00264     }
00265 breakBreak:
00266     return (code);
00267 }
00268 
00269 Decoder::Decoder()
00270 {
00271     // latin1
00272     m_codec = QTextCodec::codecForMib(4);
00273     m_decoder = m_codec->makeDecoder();
00274     enc = 0;
00275     body = false;
00276     beginning = true;
00277     visualRTL = false;
00278     haveEncoding = false;
00279 }
00280 Decoder::~Decoder()
00281 {
00282     delete m_decoder;
00283 }
00284 
00285 void Decoder::setEncoding(const char *_encoding, bool force)
00286 {
00287 #ifdef DECODE_DEBUG
00288     kdDebug(6005) << "setEncoding " << _encoding << " " << force << endl;
00289 #endif
00290     enc = _encoding;
00291 
00292     QTextCodec *old = m_codec;
00293 #ifdef DECODE_DEBUG
00294     kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;
00295 #endif
00296     enc = enc.lower();
00297 #ifdef DECODE_DEBUG
00298     kdDebug(6005) << "requesting:" << enc << endl;
00299 #endif
00300     if(enc.isNull() || enc.isEmpty())
00301         return;
00302     if(enc == "visual") // hebrew visually ordered
00303         enc = "iso8859-8";
00304     bool b;
00305     m_codec = KGlobal::charsets()->codecForName(enc, b);
00306     if(m_codec->mibEnum() == 11)  {
00307         // iso8859-8 (visually ordered)
00308         m_codec = QTextCodec::codecForName("iso8859-8-i");
00309         visualRTL = true;
00310     }
00311     if( !b ) // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs)
00312     m_codec = old;
00313     else
00314     haveEncoding = force;
00315     delete m_decoder;
00316     m_decoder = m_codec->makeDecoder();
00317     if (m_codec->mibEnum() == 1000) // utf 16
00318         haveEncoding = false; // force auto detection
00319 #ifdef DECODE_DEBUG
00320     kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;
00321 #endif
00322 }
00323 
00324 const char *Decoder::encoding() const
00325 {
00326     return enc;
00327 }
00328 
00329 QString Decoder::decode(const char *data, int len)
00330 {
00331     // this is not completely efficient, since the function might go
00332     // through the html head several times...
00333 
00334     if(!haveEncoding && !body) {
00335 #ifdef DECODE_DEBUG
00336         kdDebug(6005) << "looking for charset definition" << endl;
00337 #endif
00338         // check for UTF-16
00339         uchar * uchars = (uchar *) data;
00340         if( uchars[0] == 0xfe && uchars[1] == 0xff ||
00341             uchars[0] == 0xff && uchars[1] == 0xfe ) {
00342             enc = "ISO-10646-UCS-2";
00343             haveEncoding = true;
00344             m_codec = QTextCodec::codecForMib(1000);
00345             delete m_decoder;
00346             m_decoder = m_codec->makeDecoder();
00347         } else {
00348 
00349             if(m_codec->mibEnum() != 1000) {  // utf16
00350                 // replace '\0' by spaces, for buggy pages
00351                 char *d = const_cast<char *>(data);
00352                 int i = len - 1;
00353                 while(i >= 0) {
00354                     if(d[i] == 0) d[i] = ' ';
00355                     i--;
00356                 }
00357             }
00358             buffer += QCString(data, len+1);
00359 
00360             // we still don't have an encoding, and are in the head
00361             // the following tags are allowed in <head>:
00362             // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
00363 
00364             const char *ptr = buffer.data();
00365             while(*ptr != '\0')
00366             {
00367                 if(*ptr == '<') {
00368                     bool end = false;
00369                     ptr++;
00370                     if(*ptr == '/') ptr++, end=true;
00371                     char tmp[20];
00372                     int len = 0;
00373                     while (
00374                         ((*ptr >= 'a') && (*ptr <= 'z') ||
00375                          (*ptr >= 'A') && (*ptr <= 'Z') ||
00376                          (*ptr >= '0') && (*ptr <= '9'))
00377                         && len < 19 )
00378                     {
00379                         tmp[len] = tolower( *ptr );
00380                         ptr++;
00381                         len++;
00382                     }
00383             tmp[len] = 0;
00384                     int id = khtml::getTagID(tmp, len);
00385                     if(end) id += ID_CLOSE_TAG;
00386 
00387                     switch( id ) {
00388                     case ID_META:
00389                     {
00390                         // found a meta tag...
00391                         //ptr += 5;
00392                         const char * end = ptr;
00393                         while(*end != '>' && *end != '\0') end++;
00394                         if ( *end == '\0' ) break;
00395                         QCString str( ptr, (end-ptr)+1);
00396                         str = str.lower();
00397                         int pos = 0;
00398                         //if( (pos = str.find("http-equiv", pos)) == -1) break;
00399                         //if( (pos = str.find("content-type", pos)) == -1) break;
00400             while( pos < ( int ) str.length() ) {
00401                 if( (pos = str.find("charset", pos)) == -1) break;
00402                 pos += 7;
00403                             // skip whitespace..
00404                 while(  pos < (int)str.length() && str[pos] <= ' ' ) pos++;
00405                             if ( pos == ( int )str.length()) break;
00406                             if ( str[pos++] != '=' ) continue;
00407                             while ( pos < ( int )str.length() &&
00408                                     ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
00409                 pos++;
00410 
00411                             // end ?
00412                             if ( pos == ( int )str.length() ) break;
00413                 uint endpos = pos;
00414                 while( endpos < str.length() &&
00415                                    (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
00416                                     && str[endpos] != ';' && str[endpos] != '>') )
00417                 endpos++;
00418                 enc = str.mid(pos, endpos-pos);
00419 #ifdef DECODE_DEBUG
00420                 kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl;
00421 #endif
00422                 setEncoding(enc, true);
00423                 if( haveEncoding ) goto found;
00424 
00425                             if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;
00426 
00427                 pos = endpos + 1;
00428             }
00429             }
00430                     case ID_SCRIPT:
00431                     case (ID_SCRIPT+ID_CLOSE_TAG):
00432                     case ID_NOSCRIPT:
00433                     case (ID_NOSCRIPT+ID_CLOSE_TAG):
00434                     case ID_STYLE:
00435                     case (ID_STYLE+ID_CLOSE_TAG):
00436                     case ID_LINK:
00437                     case (ID_LINK+ID_CLOSE_TAG):
00438                     case ID_OBJECT:
00439                     case (ID_OBJECT+ID_CLOSE_TAG):
00440                     case ID_TITLE:
00441                     case (ID_TITLE+ID_CLOSE_TAG):
00442                     case ID_BASE:
00443                     case (ID_BASE+ID_CLOSE_TAG):
00444                     case ID_HTML:
00445                     case ID_HEAD:
00446                     case 0:
00447                     case (0 + ID_CLOSE_TAG ):
00448                         break;
00449                     default:
00450                         body = true;
00451 #ifdef DECODE_DEBUG
00452             kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
00453 #endif
00454                         goto found;
00455                     }
00456                 }
00457                 else
00458                     ptr++;
00459             }
00460             return QString::null;
00461         }
00462     }
00463 
00464  found:
00465     if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
00466 #ifdef DECODE_DEBUG
00467     kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
00468 #endif
00469     switch ( KanjiCode::judge( data ) ) {
00470     case KanjiCode::JIS:
00471         enc = "jis7";
00472         break;
00473     case KanjiCode::EUC:
00474         enc = "eucjp";
00475         break;
00476     case KanjiCode::SJIS:
00477         enc = "sjis";
00478         break;
00479     default:
00480         enc = NULL;
00481         break;
00482     }
00483 #ifdef DECODE_DEBUG
00484     kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
00485 #endif
00486     if (!enc.isEmpty()) {
00487         setEncoding(enc, true);
00488     }
00489     }
00490 
00491     // if we still haven't found an encoding latin1 will be used...
00492     // this is according to HTML4.0 specs
00493     if (!m_codec)
00494     {
00495         if(enc.isEmpty()) enc = "iso8859-1";
00496         m_codec = QTextCodec::codecForName(enc);
00497         // be sure not to crash
00498         if(!m_codec) {
00499             m_codec = QTextCodec::codecForMib(4);
00500             enc = "iso8859-1";
00501         }
00502         delete m_decoder;
00503         m_decoder = m_codec->makeDecoder();
00504     }
00505     QString out;
00506 
00507     if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {
00508         out = m_decoder->toUnicode(buffer, buffer.length());
00509         buffer = "";
00510     } else {
00511         if(m_codec->mibEnum() != 1000) // utf16
00512         {
00513             // ### hack for a bug in QTextCodec. It cut's the input stream
00514             // in case there are \0 in it. ZDNET has them inside... :-(
00515             char *d = const_cast<char *>(data);
00516             int i = len - 1;
00517             while(i >= 0) {
00518                 if(*(d+i) == 0) *(d+i) = ' ';
00519                 i--;
00520             }
00521         }
00522         out = m_decoder->toUnicode(data, len);
00523     }
00524 
00525     // the hell knows, why the output does sometimes have a QChar::null at
00526     // the end...
00527     if(out[out.length()-1] == QChar::null)
00528         assert(0);
00529     return out;
00530 }
00531 
00532 QString Decoder::flush() const
00533 {
00534     return m_decoder->toUnicode(buffer, buffer.length());
00535 }
00536 
00537 // -----------------------------------------------------------------------------
00538 #undef DECODE_DEBUG
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.0.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Wed Oct 8 12:22:35 2003 by doxygen 1.2.18 written by Dimitri van Heesch, © 1997-2001