src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
// Trace mask passed to wxLogTrace() by this file's debug output (see the
// "Unrecognized HTML entity" message in wxHtmlEntitiesParser::Parse()).
const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(const wxString::const_iterator& start,
  52                     const wxString::const_iterator& end)
  53         : m_start(start), m_end(end) {}
  54     wxString::const_iterator m_start, m_end;
  55 };
  56
// Ordered collection of the text pieces found while building the DOM tree
// (filled by wxHtmlParser::CreateDOMSubTree(), consumed by DoParsing()).
//
// NB: this is an empty class and not typedef because of forward declaration
class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
{
};
  61
// Snapshot of the parser's per-source state, created by
// wxHtmlParser::SetSourceAndSaveState() and consumed by RestoreState().
// Snapshots form a singly linked stack through m_nextState.
class wxHtmlParserState
{
public:
    wxHtmlTag         *m_curTag;        // saved wxHtmlParser::m_CurTag
    wxHtmlTag         *m_tags;          // saved wxHtmlParser::m_Tags (tree root)
    wxHtmlTextPieces  *m_textPieces;    // saved wxHtmlParser::m_TextPieces
    int                m_curTextPiece;  // saved wxHtmlParser::m_CurTextPiece
    const wxString    *m_source;        // saved wxHtmlParser::m_Source
    wxHtmlParserState *m_nextState;     // previously saved state, or NULL
};
  72
  73 //-----------------------------------------------------------------------------
  74 // wxHtmlParser
  75 //-----------------------------------------------------------------------------
  76
  77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  78
  79 wxHtmlParser::wxHtmlParser()
  80     : wxObject(), m_HandlersHash(wxKEY_STRING),
  81       m_FS(NULL), m_HandlersStack(NULL)
  82 {
  83     m_Source = NULL;
  84     m_entitiesParser = new wxHtmlEntitiesParser;
  85     m_Tags = NULL;
  86     m_CurTag = NULL;
  87     m_TextPieces = NULL;
  88     m_CurTextPiece = 0;
  89     m_SavedStates = NULL;
  90 }
  91
  92 wxHtmlParser::~wxHtmlParser()
  93 {
  94     while (RestoreState()) {}
  95     DestroyDOMTree();
  96
  97     if (m_HandlersStack)
  98     {
  99         wxList& tmp = *m_HandlersStack;
 100         wxList::iterator it, en;
 101         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
 102             delete (wxHashTable*)*it;
 103         tmp.clear();
 104     }
 105     delete m_HandlersStack;
 106     m_HandlersHash.Clear();
 107     WX_CLEAR_LIST(wxList, m_HandlersList);
 108     delete m_entitiesParser;
 109     delete m_Source;
 110 }
 111
 112 wxObject* wxHtmlParser::Parse(const wxString& source)
 113 {
 114     InitParser(source);
 115     DoParsing();
 116     wxObject *result = GetProduct();
 117     DoneParser();
 118     return result;
 119 }
 120
// Prepares the parser for a new document: stores a copy of source, builds its
// DOM tree (via SetSource()) and clears the "stop parsing" request flag.
void wxHtmlParser::InitParser(const wxString& source)
{
    SetSource(source);
    m_stopParsing = false;
}
 126
// Called when parsing is finished; frees the DOM tree and the text pieces.
// The source string itself is kept.
void wxHtmlParser::DoneParser()
{
    DestroyDOMTree();
}
 131
 132 void wxHtmlParser::SetSource(const wxString& src)
 133 {
 134     DestroyDOMTree();
 135     // NB: This is allocated on heap because wxHtmlTag uses iterators and
 136     //     making a copy of m_Source string in SetSourceAndSaveState() and
 137     //     RestoreState() would invalidate them (because wxString::m_impl's
 138     //     memory would change completely twice and iterators use pointers
 139     //     into it). So instead, we keep the string object intact and only
 140     //     store/restore pointer to it, for which we need it to be allocated
 141     //     on the heap.
 142     delete m_Source;
 143     m_Source = new wxString(src);
 144     CreateDOMTree();
 145     m_CurTag = NULL;
 146     m_CurTextPiece = 0;
 147 }
 148
// Builds the tag tree and the list of text pieces for the whole of *m_Source.
// Allocates a fresh m_TextPieces container; callers in this file call
// DestroyDOMTree() first so that the previous one is not leaked.
void wxHtmlParser::CreateDOMTree()
{
    // cache is only needed while the tree is being constructed
    wxHtmlTagsCache cache(*m_Source);
    m_TextPieces = new wxHtmlTextPieces;
    CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
    m_CurTextPiece = 0;
}
 156
 157 extern bool wxIsCDATAElement(const wxString& tag);
 158
 159 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 160                                     const wxString::const_iterator& begin_pos,
 161                                     const wxString::const_iterator& end_pos,
 162                                     wxHtmlTagsCache *cache)
 163 {
 164     if (end_pos <= begin_pos)
 165         return;
 166
 167     wxChar c;
 168     wxString::const_iterator i = begin_pos;
 169     wxString::const_iterator textBeginning = begin_pos;
 170
 171     // If the tag contains CDATA text, we include the text between beginning
 172     // and ending tag verbosely. Setting i=end_pos will skip to the very
 173     // end of this function where text piece is added, bypassing any child
 174     // tags parsing (CDATA element can't have child elements by definition):
 175     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 176     {
 177         i = end_pos;
 178     }
 179
 180     while (i < end_pos)
 181     {
 182         c = *i;
 183
 184         if (c == wxT('<'))
 185         {
 186             // add text to m_TextPieces:
 187             if (i > textBeginning)
 188                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 189
 190             // if it is a comment, skip it:
 191             if ( SkipCommentTag(i, m_Source->end()) )
 192             {
 193                 textBeginning = i = i + 1; // skip closing '>' too
 194             }
 195
 196             // add another tag to the tree:
 197             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 198             {
 199                 wxHtmlTag *chd;
 200                 if (cur)
 201                     chd = new wxHtmlTag(cur, m_Source,
 202                                         i, end_pos, cache, m_entitiesParser);
 203                 else
 204                 {
 205                     chd = new wxHtmlTag(NULL, m_Source,
 206                                         i, end_pos, cache, m_entitiesParser);
 207                     if (!m_Tags)
 208                     {
 209                         // if this is the first tag to be created make the root
 210                         // m_Tags point to it:
 211                         m_Tags = chd;
 212                     }
 213                     else
 214                     {
 215                         // if there is already a root tag add this tag as
 216                         // the last sibling:
 217                         chd->m_Prev = m_Tags->GetLastSibling();
 218                         chd->m_Prev->m_Next = chd;
 219                     }
 220                 }
 221
 222                 if (chd->HasEnding())
 223                 {
 224                     CreateDOMSubTree(chd,
 225                                      chd->GetBeginIter(), chd->GetEndIter1(),
 226                                      cache);
 227                     i = chd->GetEndIter2();
 228                 }
 229                 else
 230                     i = chd->GetBeginIter();
 231
 232                 textBeginning = i;
 233             }
 234
 235             // ... or skip ending tag:
 236             else
 237             {
 238                 while (i < end_pos && *i != wxT('>')) ++i;
 239                 textBeginning = i+1;
 240             }
 241         }
 242         else ++i;
 243     }
 244
 245     // add remaining text to m_TextPieces:
 246     if (end_pos > textBeginning)
 247         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 248 }
 249
 250 void wxHtmlParser::DestroyDOMTree()
 251 {
 252     wxHtmlTag *t1, *t2;
 253     t1 = m_Tags;
 254     while (t1)
 255     {
 256         t2 = t1->GetNextSibling();
 257         delete t1;
 258         t1 = t2;
 259     }
 260     m_Tags = m_CurTag = NULL;
 261
 262     delete m_TextPieces;
 263     m_TextPieces = NULL;
 264 }
 265
// Parses the whole current source: rewinds the current tag and text piece
// positions to the start and processes the range [begin, end) of *m_Source.
// A source must have been set before calling this (m_Source is dereferenced).
void wxHtmlParser::DoParsing()
{
    m_CurTag = m_Tags;
    m_CurTextPiece = 0;
    DoParsing(m_Source->begin(), m_Source->end());
}
 272
// Parses the part of the source in [begin_pos_, end_pos): at every step,
// whichever of "next text piece" and "next tag" starts first is handled --
// text goes to AddText() after entity decoding, tags go to AddTag().
// Returns early as soon as m_stopParsing is set after a tag was handled.
void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
                             const wxString::const_iterator& end_pos)
{
    // local, advancing copy of the range start
    wxString::const_iterator begin_pos(begin_pos_);

    if (end_pos <= begin_pos)
        return;

    wxHtmlTextPieces& pieces = *m_TextPieces;
    size_t piecesCnt = pieces.size();

    while (begin_pos < end_pos)
    {
        // skip tags and text pieces that start before the current position
        while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
            m_CurTag = m_CurTag->GetNextTag();
        while (m_CurTextPiece < piecesCnt &&
               pieces[m_CurTextPiece].m_start < begin_pos)
            m_CurTextPiece++;

        if (m_CurTextPiece < piecesCnt &&
            (!m_CurTag ||
             pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
        {
            // Add text:
            AddText(GetEntitiesParser()->Parse(
                       wxString(pieces[m_CurTextPiece].m_start,
                                pieces[m_CurTextPiece].m_end)));
            begin_pos = pieces[m_CurTextPiece].m_end;
            m_CurTextPiece++;
        }
        else if (m_CurTag)
        {
            // continue after the tag's closing tag if it has one
            if (m_CurTag->HasEnding())
                begin_pos = m_CurTag->GetEndIter2();
            else
                begin_pos = m_CurTag->GetBeginIter();
            // m_CurTag is advanced before AddTag() because AddTag() may call
            // DoParsing() recursively, which reads and updates m_CurTag
            wxHtmlTag *t = m_CurTag;
            m_CurTag = m_CurTag->GetNextTag();
            AddTag(*t);
            if (m_stopParsing)
                return;
        }
        else break; // neither text nor tags left
    }
}
 318
 319 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 320 {
 321     wxHtmlTagHandler *h;
 322     bool inner = false;
 323
 324     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
 325     if (h)
 326     {
 327         inner = h->HandleTag(tag);
 328         if (m_stopParsing)
 329             return;
 330     }
 331     if (!inner)
 332     {
 333         if (tag.HasEnding())
 334             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 335     }
 336 }
 337
 338 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 339 {
 340     wxString s(handler->GetSupportedTags());
 341     wxStringTokenizer tokenizer(s, wxT(", "));
 342
 343     while (tokenizer.HasMoreTokens())
 344         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
 345
 346     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
 347         m_HandlersList.Append(handler);
 348
 349     handler->SetParser(this);
 350 }
 351
 352 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 353 {
 354     wxStringTokenizer tokenizer(tags, wxT(", "));
 355     wxString key;
 356
 357     if (m_HandlersStack == NULL)
 358     {
 359         m_HandlersStack = new wxList;
 360     }
 361
 362     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
 363
 364     while (tokenizer.HasMoreTokens())
 365     {
 366         key = tokenizer.GetNextToken();
 367         m_HandlersHash.Delete(key);
 368         m_HandlersHash.Put(key, handler);
 369     }
 370 }
 371
 372 void wxHtmlParser::PopTagHandler()
 373 {
 374     wxList::compatibility_iterator first;
 375
 376     if ( !m_HandlersStack ||
 377 #if wxUSE_STL
 378          !(first = m_HandlersStack->GetFirst())
 379 #else // !wxUSE_STL
 380          ((first = m_HandlersStack->GetFirst()) == NULL)
 381 #endif // wxUSE_STL/!wxUSE_STL
 382         )
 383     {
 384         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
 385         return;
 386     }
 387     m_HandlersHash = *((wxHashTable*) first->GetData());
 388     delete (wxHashTable*) first->GetData();
 389     m_HandlersStack->Erase(first);
 390 }
 391
 392 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 393 {
 394     wxHtmlParserState *s = new wxHtmlParserState;
 395
 396     s->m_curTag = m_CurTag;
 397     s->m_tags = m_Tags;
 398     s->m_textPieces = m_TextPieces;
 399     s->m_curTextPiece = m_CurTextPiece;
 400     s->m_source = m_Source;
 401
 402     s->m_nextState = m_SavedStates;
 403     m_SavedStates = s;
 404
 405     m_CurTag = NULL;
 406     m_Tags = NULL;
 407     m_TextPieces = NULL;
 408     m_CurTextPiece = 0;
 409     m_Source = NULL;
 410
 411     SetSource(src);
 412 }
 413
 414 bool wxHtmlParser::RestoreState()
 415 {
 416     if (!m_SavedStates) return false;
 417
 418     DestroyDOMTree();
 419     delete m_Source;
 420
 421     wxHtmlParserState *s = m_SavedStates;
 422     m_SavedStates = s->m_nextState;
 423
 424     m_CurTag = s->m_curTag;
 425     m_Tags = s->m_tags;
 426     m_TextPieces = s->m_textPieces;
 427     m_CurTextPiece = s->m_curTextPiece;
 428     m_Source = s->m_source;
 429
 430     delete s;
 431     return true;
 432 }
 433
// Returns a copy of the source text in the range
// [tag.GetBeginIter(), tag.GetEndIter1()), i.e. the same range AddTag()
// parses as the tag's inner content.
wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
{
    return wxString(tag.GetBeginIter(), tag.GetEndIter1());
}
 438
 439 //-----------------------------------------------------------------------------
 440 // wxHtmlTagHandler
 441 //-----------------------------------------------------------------------------
 442
 443 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 444
// Parses an arbitrary piece of markup with the handler's parser without
// losing the parser's position in the document currently being parsed.
void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
{
    // It is safe to temporarily change the source being parsed,
    // provided we restore the state back after parsing
    m_Parser->SetSourceAndSaveState(source);
    m_Parser->DoParsing();
    m_Parser->RestoreState();
}
 453
 454
 455 //-----------------------------------------------------------------------------
 456 // wxHtmlEntitiesParser
 457 //-----------------------------------------------------------------------------
 458
 459 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 460
// In non-Unicode builds with wchar_t support the parser starts with no
// converter and the system encoding; in other builds there is nothing to
// initialize.
wxHtmlEntitiesParser::wxHtmlEntitiesParser()
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
#endif
{
}
 467
// Frees the converter created by SetEncoding(), if the build has one.
wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    delete m_conv;
#endif
}
 474
 475 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 476 {
 477 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 478     if (encoding == m_encoding)
 479         return;
 480
 481     delete m_conv;
 482
 483     m_encoding = encoding;
 484     if (m_encoding == wxFONTENCODING_SYSTEM)
 485         m_conv = NULL;
 486     else
 487         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 488 #else
 489     (void) encoding;
 490 #endif
 491 }
 492
 493 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 494 {
 495     wxString output;
 496
 497     const wxString::const_iterator end(input.end());
 498     wxString::const_iterator c(input.begin());
 499     wxString::const_iterator last(c);
 500
 501     for ( ; c < end; ++c )
 502     {
 503         if (*c == wxT('&'))
 504         {
 505             if ( output.empty() )
 506                 output.reserve(input.length());
 507
 508             if (c - last > 0)
 509                 output.append(last, c);
 510             if ( ++c == end )
 511                 break;
 512
 513             wxString entity;
 514             const wxString::const_iterator ent_s = c;
 515             wxChar entity_char;
 516
 517             for ( ; c != end; ++c )
 518             {
 519                 wxChar ch = *c;
 520                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 521                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 522                        (ch >= wxT('0') && ch <= wxT('9')) ||
 523                         ch == wxT('_') || ch == wxT('#')) )
 524                     break;
 525             }
 526
 527             entity.append(ent_s, c);
 528             if (c == end || *c != wxT(';')) --c;
 529             last = c+1;
 530             entity_char = GetEntityChar(entity);
 531             if (entity_char)
 532                 output << entity_char;
 533             else
 534             {
 535                 output.append(ent_s-1, c+1);
 536                 wxLogTrace(wxTRACE_HTML_DEBUG,
 537                            "Unrecognized HTML entity: '%s'",
 538                            entity);
 539             }
 540         }
 541     }
 542     if ( last == input.begin() ) // common case: no entity
 543         return input;
 544     if ( last != end )
 545         output.append(last, end);
 546     return output;
 547 }
 548
#if !wxUSE_UNICODE
// Non-Unicode builds only: maps the character code of an entity to a single
// wxChar. With wchar_t support the code is converted with m_conv (or
// wxConvLocal if no converter is set) and the first byte of the result is
// returned, or '?' if the conversion fails; without it, codes below 256 are
// returned as is and everything else becomes '?'.
wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
{
#if wxUSE_WCHAR_T
    char buf[2];
    wchar_t wbuf[2];
    // one-character, NUL-terminated wide string holding the code
    wbuf[0] = (wchar_t)code;
    wbuf[1] = 0;
    wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
    if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
        return '?';
    return buf[0];
#else
    return (code < 256) ? (wxChar)code : '?';
#endif
}
#endif
 566
// One row of the named-entity table used by
// wxHtmlEntitiesParser::GetEntityChar().
struct wxHtmlEntityInfo
{
    const wxStringCharType *name;   // entity name without '&' and ';'
    unsigned code;                  // character code the entity stands for
};
 572
 573 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 574 {
 575 #if wxUSE_UNICODE_UTF8
 576     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 577 #else
 578     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 579 #endif
 580 }
 581
 582 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 583 {
 584     unsigned code = 0;
 585
 586     if (entity[0] == wxT('#'))
 587     {
 588         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 589         //     representation is the same for numbers
 590         const wxStringCharType *ent_s = entity.wx_str();
 591         const wxStringCharType *format;
 592
 593         if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
 594         {
 595             format = wxSTRING_TEXT("%x");
 596             ent_s++;
 597         }
 598         else
 599             format = wxSTRING_TEXT("%u");
 600         ent_s++;
 601
 602         if (wxSscanf(ent_s, format, &code) != 1)
 603             code = 0;
 604     }
 605     else
 606     {
 607         // store the literals in wx's internal representation (either char*
 608         // in UTF-8 or wchar_t*) for best performance:
 609         #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
 610
 611         static wxHtmlEntityInfo substitutions[] = {
 612             ENTITY("AElig", 198),
 613             ENTITY("Aacute", 193),
 614             ENTITY("Acirc", 194),
 615             ENTITY("Agrave", 192),
 616             ENTITY("Alpha", 913),
 617             ENTITY("Aring", 197),
 618             ENTITY("Atilde", 195),
 619             ENTITY("Auml", 196),
 620             ENTITY("Beta", 914),
 621             ENTITY("Ccedil", 199),
 622             ENTITY("Chi", 935),
 623             ENTITY("Dagger", 8225),
 624             ENTITY("Delta", 916),
 625             ENTITY("ETH", 208),
 626             ENTITY("Eacute", 201),
 627             ENTITY("Ecirc", 202),
 628             ENTITY("Egrave", 200),
 629             ENTITY("Epsilon", 917),
 630             ENTITY("Eta", 919),
 631             ENTITY("Euml", 203),
 632             ENTITY("Gamma", 915),
 633             ENTITY("Iacute", 205),
 634             ENTITY("Icirc", 206),
 635             ENTITY("Igrave", 204),
 636             ENTITY("Iota", 921),
 637             ENTITY("Iuml", 207),
 638             ENTITY("Kappa", 922),
 639             ENTITY("Lambda", 923),
 640             ENTITY("Mu", 924),
 641             ENTITY("Ntilde", 209),
 642             ENTITY("Nu", 925),
 643             ENTITY("OElig", 338),
 644             ENTITY("Oacute", 211),
 645             ENTITY("Ocirc", 212),
 646             ENTITY("Ograve", 210),
 647             ENTITY("Omega", 937),
 648             ENTITY("Omicron", 927),
 649             ENTITY("Oslash", 216),
 650             ENTITY("Otilde", 213),
 651             ENTITY("Ouml", 214),
 652             ENTITY("Phi", 934),
 653             ENTITY("Pi", 928),
 654             ENTITY("Prime", 8243),
 655             ENTITY("Psi", 936),
 656             ENTITY("Rho", 929),
 657             ENTITY("Scaron", 352),
 658             ENTITY("Sigma", 931),
 659             ENTITY("THORN", 222),
 660             ENTITY("Tau", 932),
 661             ENTITY("Theta", 920),
 662             ENTITY("Uacute", 218),
 663             ENTITY("Ucirc", 219),
 664             ENTITY("Ugrave", 217),
 665             ENTITY("Upsilon", 933),
 666             ENTITY("Uuml", 220),
 667             ENTITY("Xi", 926),
 668             ENTITY("Yacute", 221),
 669             ENTITY("Yuml", 376),
 670             ENTITY("Zeta", 918),
 671             ENTITY("aacute", 225),
 672             ENTITY("acirc", 226),
 673             ENTITY("acute", 180),
 674             ENTITY("aelig", 230),
 675             ENTITY("agrave", 224),
 676             ENTITY("alefsym", 8501),
 677             ENTITY("alpha", 945),
 678             ENTITY("amp", 38),
 679             ENTITY("and", 8743),
 680             ENTITY("ang", 8736),
 681             ENTITY("aring", 229),
 682             ENTITY("asymp", 8776),
 683             ENTITY("atilde", 227),
 684             ENTITY("auml", 228),
 685             ENTITY("bdquo", 8222),
 686             ENTITY("beta", 946),
 687             ENTITY("brvbar", 166),
 688             ENTITY("bull", 8226),
 689             ENTITY("cap", 8745),
 690             ENTITY("ccedil", 231),
 691             ENTITY("cedil", 184),
 692             ENTITY("cent", 162),
 693             ENTITY("chi", 967),
 694             ENTITY("circ", 710),
 695             ENTITY("clubs", 9827),
 696             ENTITY("cong", 8773),
 697             ENTITY("copy", 169),
 698             ENTITY("crarr", 8629),
 699             ENTITY("cup", 8746),
 700             ENTITY("curren", 164),
 701             ENTITY("dArr", 8659),
 702             ENTITY("dagger", 8224),
 703             ENTITY("darr", 8595),
 704             ENTITY("deg", 176),
 705             ENTITY("delta", 948),
 706             ENTITY("diams", 9830),
 707             ENTITY("divide", 247),
 708             ENTITY("eacute", 233),
 709             ENTITY("ecirc", 234),
 710             ENTITY("egrave", 232),
 711             ENTITY("empty", 8709),
 712             ENTITY("emsp", 8195),
 713             ENTITY("ensp", 8194),
 714             ENTITY("epsilon", 949),
 715             ENTITY("equiv", 8801),
 716             ENTITY("eta", 951),
 717             ENTITY("eth", 240),
 718             ENTITY("euml", 235),
 719             ENTITY("euro", 8364),
 720             ENTITY("exist", 8707),
 721             ENTITY("fnof", 402),
 722             ENTITY("forall", 8704),
 723             ENTITY("frac12", 189),
 724             ENTITY("frac14", 188),
 725             ENTITY("frac34", 190),
 726             ENTITY("frasl", 8260),
 727             ENTITY("gamma", 947),
 728             ENTITY("ge", 8805),
 729             ENTITY("gt", 62),
 730             ENTITY("hArr", 8660),
 731             ENTITY("harr", 8596),
 732             ENTITY("hearts", 9829),
 733             ENTITY("hellip", 8230),
 734             ENTITY("iacute", 237),
 735             ENTITY("icirc", 238),
 736             ENTITY("iexcl", 161),
 737             ENTITY("igrave", 236),
 738             ENTITY("image", 8465),
 739             ENTITY("infin", 8734),
 740             ENTITY("int", 8747),
 741             ENTITY("iota", 953),
 742             ENTITY("iquest", 191),
 743             ENTITY("isin", 8712),
 744             ENTITY("iuml", 239),
 745             ENTITY("kappa", 954),
 746             ENTITY("lArr", 8656),
 747             ENTITY("lambda", 955),
 748             ENTITY("lang", 9001),
 749             ENTITY("laquo", 171),
 750             ENTITY("larr", 8592),
 751             ENTITY("lceil", 8968),
 752             ENTITY("ldquo", 8220),
 753             ENTITY("le", 8804),
 754             ENTITY("lfloor", 8970),
 755             ENTITY("lowast", 8727),
 756             ENTITY("loz", 9674),
 757             ENTITY("lrm", 8206),
 758             ENTITY("lsaquo", 8249),
 759             ENTITY("lsquo", 8216),
 760             ENTITY("lt", 60),
 761             ENTITY("macr", 175),
 762             ENTITY("mdash", 8212),
 763             ENTITY("micro", 181),
 764             ENTITY("middot", 183),
 765             ENTITY("minus", 8722),
 766             ENTITY("mu", 956),
 767             ENTITY("nabla", 8711),
 768             ENTITY("nbsp", 160),
 769             ENTITY("ndash", 8211),
 770             ENTITY("ne", 8800),
 771             ENTITY("ni", 8715),
 772             ENTITY("not", 172),
 773             ENTITY("notin", 8713),
 774             ENTITY("nsub", 8836),
 775             ENTITY("ntilde", 241),
 776             ENTITY("nu", 957),
 777             ENTITY("oacute", 243),
 778             ENTITY("ocirc", 244),
 779             ENTITY("oelig", 339),
 780             ENTITY("ograve", 242),
 781             ENTITY("oline", 8254),
 782             ENTITY("omega", 969),
 783             ENTITY("omicron", 959),
 784             ENTITY("oplus", 8853),
 785             ENTITY("or", 8744),
 786             ENTITY("ordf", 170),
 787             ENTITY("ordm", 186),
 788             ENTITY("oslash", 248),
 789             ENTITY("otilde", 245),
 790             ENTITY("otimes", 8855),
 791             ENTITY("ouml", 246),
 792             ENTITY("para", 182),
 793             ENTITY("part", 8706),
 794             ENTITY("permil", 8240),
 795             ENTITY("perp", 8869),
 796             ENTITY("phi", 966),
 797             ENTITY("pi", 960),
 798             ENTITY("piv", 982),
 799             ENTITY("plusmn", 177),
 800             ENTITY("pound", 163),
 801             ENTITY("prime", 8242),
 802             ENTITY("prod", 8719),
 803             ENTITY("prop", 8733),
 804             ENTITY("psi", 968),
 805             ENTITY("quot", 34),
 806             ENTITY("rArr", 8658),
 807             ENTITY("radic", 8730),
 808             ENTITY("rang", 9002),
 809             ENTITY("raquo", 187),
 810             ENTITY("rarr", 8594),
 811             ENTITY("rceil", 8969),
 812             ENTITY("rdquo", 8221),
 813             ENTITY("real", 8476),
 814             ENTITY("reg", 174),
 815             ENTITY("rfloor", 8971),
 816             ENTITY("rho", 961),
 817             ENTITY("rlm", 8207),
 818             ENTITY("rsaquo", 8250),
 819             ENTITY("rsquo", 8217),
 820             ENTITY("sbquo", 8218),
 821             ENTITY("scaron", 353),
 822             ENTITY("sdot", 8901),
 823             ENTITY("sect", 167),
 824             ENTITY("shy", 173),
 825             ENTITY("sigma", 963),
 826             ENTITY("sigmaf", 962),
 827             ENTITY("sim", 8764),
 828             ENTITY("spades", 9824),
 829             ENTITY("sub", 8834),
 830             ENTITY("sube", 8838),
 831             ENTITY("sum", 8721),
 832             ENTITY("sup", 8835),
 833             ENTITY("sup1", 185),
 834             ENTITY("sup2", 178),
 835             ENTITY("sup3", 179),
 836             ENTITY("supe", 8839),
 837             ENTITY("szlig", 223),
 838             ENTITY("tau", 964),
 839             ENTITY("there4", 8756),
 840             ENTITY("theta", 952),
 841             ENTITY("thetasym", 977),
 842             ENTITY("thinsp", 8201),
 843             ENTITY("thorn", 254),
 844             ENTITY("tilde", 732),
 845             ENTITY("times", 215),
 846             ENTITY("trade", 8482),
 847             ENTITY("uArr", 8657),
 848             ENTITY("uacute", 250),
 849             ENTITY("uarr", 8593),
 850             ENTITY("ucirc", 251),
 851             ENTITY("ugrave", 249),
 852             ENTITY("uml", 168),
 853             ENTITY("upsih", 978),
 854             ENTITY("upsilon", 965),
 855             ENTITY("uuml", 252),
 856             ENTITY("weierp", 8472),
 857             ENTITY("xi", 958),
 858             ENTITY("yacute", 253),
 859             ENTITY("yen", 165),
 860             ENTITY("yuml", 255),
 861             ENTITY("zeta", 950),
 862             ENTITY("zwj", 8205),
 863             ENTITY("zwnj", 8204),
 864             {NULL, 0}};
 865         #undef ENTITY
 866         static size_t substitutions_cnt = 0;
 867
 868         if (substitutions_cnt == 0)
 869             while (substitutions[substitutions_cnt].code != 0)
 870                 substitutions_cnt++;
 871
 872         wxHtmlEntityInfo *info = NULL;
 873 #ifdef __WXWINCE__
 874         // bsearch crashes under WinCE for some reason
 875         size_t i;
 876         for (i = 0; i < substitutions_cnt; i++)
 877         {
 878             if (entity == substitutions[i].name)
 879             {
 880                 info = & substitutions[i];
 881                 break;
 882             }
 883         }
 884 #else
 885         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 886                                            substitutions_cnt,
 887                                            sizeof(wxHtmlEntityInfo),
 888                                            wxHtmlEntityCompare);
 889 #endif
 890         if (info)
 891             code = info->code;
 892     }
 893
 894     if (code == 0)
 895         return 0;
 896     else
 897         return GetCharForCode(code);
 898 }
 899
 900 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 901                                 const wxString& url) const
 902 {
 903     return m_FS ? m_FS->OpenFile(url) : NULL;
 904
 905 }
 906
 907
 908 //-----------------------------------------------------------------------------
 909 // wxHtmlParser::ExtractCharsetInformation
 910 //-----------------------------------------------------------------------------
 911
// Minimal parser used by wxHtmlParser::ExtractCharsetInformation(): it
// produces no object and discards all text, so only the tag handlers added
// to it have any effect.
class wxMetaTagParser : public wxHtmlParser
{
public:
    wxMetaTagParser() { }

    // this parser has no product
    wxObject* GetProduct() { return NULL; }

protected:
    // text is irrelevant when only tags are being examined
    virtual void AddText(const wxString& WXUNUSED(txt)) {}

    DECLARE_NO_COPY_CLASS(wxMetaTagParser)
};
 924
// Handler for META and BODY tags that looks for the charset given in a
// <meta http-equiv="Content-Type" content="text/html; charset=..."> tag and
// writes it to the string passed to the constructor.
class wxMetaTagHandler : public wxHtmlTagHandler
{
public:
    // retval receives the charset name if one is found; the handler only
    // stores the pointer
    wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
    wxString GetSupportedTags() { return wxT("META,BODY"); }
    bool HandleTag(const wxHtmlTag& tag);

private:
    wxString *m_retval;     // where to store the result

    DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
};
 937
 938 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 939 {
 940     if (tag.GetName() == _T("BODY"))
 941     {
 942         m_Parser->StopParsing();
 943         return false;
 944     }
 945
 946     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 947         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 948         tag.HasParam(_T("CONTENT")))
 949     {
 950         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 951         if (content.Left(19) == _T("text/html; charset="))
 952         {
 953             *m_retval = content.Mid(19);
 954             m_Parser->StopParsing();
 955         }
 956     }
 957     return false;
 958 }
 959
 960
 961 /*static*/
 962 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 963 {
 964     wxString charset;
 965     wxMetaTagParser *parser = new wxMetaTagParser();
 966     if(parser)
 967     {
 968         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 969         parser->Parse(markup);
 970         delete parser;
 971     }
 972     return charset;
 973 }
 974
 975 /* static */
 976 bool
 977 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 978                              wxString::const_iterator end)
 979 {
 980     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 981
 982     wxString::const_iterator p = start;
 983
 984     // comments begin with "<!--" in HTML 4.0
 985     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 986     {
 987         // not a comment at all
 988         return false;
 989     }
 990
 991     // skip the start of the comment tag in any case, if we don't find the
 992     // closing tag we should ignore broken markup
 993     start = p;
 994
 995     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 996     // comment delimiter and the closing tag character (section 3.2.4 of
 997     // http://www.w3.org/TR/html401/)
 998     int dashes = 0;
 999     while ( ++p < end )
1000     {
1001         const wxChar c = *p;
1002
1003         if ( (c == wxT(' ') || c == wxT('\n') ||
1004               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
1005         {
1006             // ignore white space before potential tag end
1007             continue;
1008         }
1009
1010         if ( c == wxT('>') && dashes >= 2 )
1011         {
1012             // found end of comment
1013             start = p;
1014             break;
1015         }
1016
1017         if ( c == wxT('-') )
1018             dashes++;
1019         else
1020             dashes = 0;
1021     }
1022
1023     return true;
1024 }
1025
#endif // wxUSE_HTML && wxUSE_STREAMS