src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
  41 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(const wxString::const_iterator& start,
  52                     const wxString::const_iterator& end)
  53         : m_start(start), m_end(end) {}
  54     wxString::const_iterator m_start, m_end;
  55 };
  56
  57 // NB: this is an empty class and not typedef because of forward declaration
  58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
  59 {
  60 };
  61
  62 class wxHtmlParserState
  63 {
  64 public:
  65     wxHtmlTag         *m_curTag;
  66     wxHtmlTag         *m_tags;
  67     wxHtmlTextPieces  *m_textPieces;
  68     int                m_curTextPiece;
  69     const wxString    *m_source;
  70     wxHtmlParserState *m_nextState;
  71 };
  72
  73 //-----------------------------------------------------------------------------
  74 // wxHtmlParser
  75 //-----------------------------------------------------------------------------
  76
  77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  78
  79 wxHtmlParser::wxHtmlParser()
  80     : wxObject(),
  81       m_FS(NULL)
  82 {
  83     m_Source = NULL;
  84     m_entitiesParser = new wxHtmlEntitiesParser;
  85     m_Tags = NULL;
  86     m_CurTag = NULL;
  87     m_TextPieces = NULL;
  88     m_CurTextPiece = 0;
  89     m_SavedStates = NULL;
  90 }
  91
  92 wxHtmlParser::~wxHtmlParser()
  93 {
  94     while (RestoreState()) {}
  95     DestroyDOMTree();
  96
  97     WX_CLEAR_ARRAY(m_HandlersStack);
  98     WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
  99     delete m_entitiesParser;
 100     delete m_Source;
 101 }
 102
 103 wxObject* wxHtmlParser::Parse(const wxString& source)
 104 {
 105     InitParser(source);
 106     DoParsing();
 107     wxObject *result = GetProduct();
 108     DoneParser();
 109     return result;
 110 }
 111
 112 void wxHtmlParser::InitParser(const wxString& source)
 113 {
 114     SetSource(source);
 115     m_stopParsing = false;
 116 }
 117
 118 void wxHtmlParser::DoneParser()
 119 {
 120     DestroyDOMTree();
 121 }
 122
 123 void wxHtmlParser::SetSource(const wxString& src)
 124 {
 125     DestroyDOMTree();
 126     // NB: This is allocated on heap because wxHtmlTag uses iterators and
 127     //     making a copy of m_Source string in SetSourceAndSaveState() and
 128     //     RestoreState() would invalidate them (because wxString::m_impl's
 129     //     memory would change completely twice and iterators use pointers
 130     //     into it). So instead, we keep the string object intact and only
 131     //     store/restore pointer to it, for which we need it to be allocated
 132     //     on the heap.
 133     delete m_Source;
 134     m_Source = new wxString(src);
 135     CreateDOMTree();
 136     m_CurTag = NULL;
 137     m_CurTextPiece = 0;
 138 }
 139
 140 void wxHtmlParser::CreateDOMTree()
 141 {
 142     wxHtmlTagsCache cache(*m_Source);
 143     m_TextPieces = new wxHtmlTextPieces;
 144     CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
 145     m_CurTextPiece = 0;
 146 }
 147
 148 extern bool wxIsCDATAElement(const wxString& tag);
 149
 150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 151                                     const wxString::const_iterator& begin_pos,
 152                                     const wxString::const_iterator& end_pos,
 153                                     wxHtmlTagsCache *cache)
 154 {
 155     if (end_pos <= begin_pos)
 156         return;
 157
 158     wxChar c;
 159     wxString::const_iterator i = begin_pos;
 160     wxString::const_iterator textBeginning = begin_pos;
 161
 162     // If the tag contains CDATA text, we include the text between beginning
 163     // and ending tag verbosely. Setting i=end_pos will skip to the very
 164     // end of this function where text piece is added, bypassing any child
 165     // tags parsing (CDATA element can't have child elements by definition):
 166     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 167     {
 168         i = end_pos;
 169     }
 170
 171     while (i < end_pos)
 172     {
 173         c = *i;
 174
 175         if (c == wxT('<'))
 176         {
 177             // add text to m_TextPieces:
 178             if (i > textBeginning)
 179                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 180
 181             // if it is a comment, skip it:
 182             if ( SkipCommentTag(i, m_Source->end()) )
 183             {
 184                 textBeginning = i = i + 1; // skip closing '>' too
 185             }
 186
 187             // add another tag to the tree:
 188             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 189             {
 190                 wxHtmlTag *chd;
 191                 if (cur)
 192                     chd = new wxHtmlTag(cur, m_Source,
 193                                         i, end_pos, cache, m_entitiesParser);
 194                 else
 195                 {
 196                     chd = new wxHtmlTag(NULL, m_Source,
 197                                         i, end_pos, cache, m_entitiesParser);
 198                     if (!m_Tags)
 199                     {
 200                         // if this is the first tag to be created make the root
 201                         // m_Tags point to it:
 202                         m_Tags = chd;
 203                     }
 204                     else
 205                     {
 206                         // if there is already a root tag add this tag as
 207                         // the last sibling:
 208                         chd->m_Prev = m_Tags->GetLastSibling();
 209                         chd->m_Prev->m_Next = chd;
 210                     }
 211                 }
 212
 213                 if (chd->HasEnding())
 214                 {
 215                     CreateDOMSubTree(chd,
 216                                      chd->GetBeginIter(), chd->GetEndIter1(),
 217                                      cache);
 218                     i = chd->GetEndIter2();
 219                 }
 220                 else
 221                     i = chd->GetBeginIter();
 222
 223                 textBeginning = i;
 224             }
 225
 226             // ... or skip ending tag:
 227             else
 228             {
 229                 while (i < end_pos && *i != wxT('>')) ++i;
 230                 textBeginning = i+1;
 231             }
 232         }
 233         else ++i;
 234     }
 235
 236     // add remaining text to m_TextPieces:
 237     if (end_pos > textBeginning)
 238         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 239 }
 240
 241 void wxHtmlParser::DestroyDOMTree()
 242 {
 243     wxHtmlTag *t1, *t2;
 244     t1 = m_Tags;
 245     while (t1)
 246     {
 247         t2 = t1->GetNextSibling();
 248         delete t1;
 249         t1 = t2;
 250     }
 251     m_Tags = m_CurTag = NULL;
 252
 253     delete m_TextPieces;
 254     m_TextPieces = NULL;
 255 }
 256
 257 void wxHtmlParser::DoParsing()
 258 {
 259     m_CurTag = m_Tags;
 260     m_CurTextPiece = 0;
 261     DoParsing(m_Source->begin(), m_Source->end());
 262 }
 263
 264 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
 265                              const wxString::const_iterator& end_pos)
 266 {
 267     wxString::const_iterator begin_pos(begin_pos_);
 268
 269     if (end_pos <= begin_pos)
 270         return;
 271
 272     wxHtmlTextPieces& pieces = *m_TextPieces;
 273     size_t piecesCnt = pieces.size();
 274
 275     while (begin_pos < end_pos)
 276     {
 277         while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
 278             m_CurTag = m_CurTag->GetNextTag();
 279         while (m_CurTextPiece < piecesCnt &&
 280                pieces[m_CurTextPiece].m_start < begin_pos)
 281             m_CurTextPiece++;
 282
 283         if (m_CurTextPiece < piecesCnt &&
 284             (!m_CurTag ||
 285              pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
 286         {
 287             // Add text:
 288             AddText(GetEntitiesParser()->Parse(
 289                        wxString(pieces[m_CurTextPiece].m_start,
 290                                 pieces[m_CurTextPiece].m_end)));
 291             begin_pos = pieces[m_CurTextPiece].m_end;
 292             m_CurTextPiece++;
 293         }
 294         else if (m_CurTag)
 295         {
 296             if (m_CurTag->HasEnding())
 297                 begin_pos = m_CurTag->GetEndIter2();
 298             else
 299                 begin_pos = m_CurTag->GetBeginIter();
 300             wxHtmlTag *t = m_CurTag;
 301             m_CurTag = m_CurTag->GetNextTag();
 302             AddTag(*t);
 303             if (m_stopParsing)
 304                 return;
 305         }
 306         else break;
 307     }
 308 }
 309
 310 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 311 {
 312     bool inner = false;
 313
 314     wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
 315     if (h != m_HandlersHash.end())
 316     {
 317         inner = h->second->HandleTag(tag);
 318         if (m_stopParsing)
 319             return;
 320     }
 321     if (!inner)
 322     {
 323         if (tag.HasEnding())
 324             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 325     }
 326 }
 327
 328 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 329 {
 330     wxString s(handler->GetSupportedTags());
 331     wxStringTokenizer tokenizer(s, wxT(", "));
 332
 333     while (tokenizer.HasMoreTokens())
 334         m_HandlersHash[tokenizer.GetNextToken()] = handler;
 335
 336     m_HandlersSet.insert(handler);
 337
 338     handler->SetParser(this);
 339 }
 340
 341 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 342 {
 343     wxStringTokenizer tokenizer(tags, wxT(", "));
 344     wxString key;
 345
 346     m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
 347
 348     while (tokenizer.HasMoreTokens())
 349     {
 350         key = tokenizer.GetNextToken();
 351         m_HandlersHash[key] = handler;
 352     }
 353 }
 354
 355 void wxHtmlParser::PopTagHandler()
 356 {
 357     wxCHECK_RET( !m_HandlersStack.empty(),
 358                  "attempt to remove HTML tag handler from empty stack" );
 359
 360     wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
 361     m_HandlersStack.pop_back();
 362     m_HandlersHash = *prev;
 363     delete prev;
 364 }
 365
 366 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 367 {
 368     wxHtmlParserState *s = new wxHtmlParserState;
 369
 370     s->m_curTag = m_CurTag;
 371     s->m_tags = m_Tags;
 372     s->m_textPieces = m_TextPieces;
 373     s->m_curTextPiece = m_CurTextPiece;
 374     s->m_source = m_Source;
 375
 376     s->m_nextState = m_SavedStates;
 377     m_SavedStates = s;
 378
 379     m_CurTag = NULL;
 380     m_Tags = NULL;
 381     m_TextPieces = NULL;
 382     m_CurTextPiece = 0;
 383     m_Source = NULL;
 384
 385     SetSource(src);
 386 }
 387
 388 bool wxHtmlParser::RestoreState()
 389 {
 390     if (!m_SavedStates) return false;
 391
 392     DestroyDOMTree();
 393     delete m_Source;
 394
 395     wxHtmlParserState *s = m_SavedStates;
 396     m_SavedStates = s->m_nextState;
 397
 398     m_CurTag = s->m_curTag;
 399     m_Tags = s->m_tags;
 400     m_TextPieces = s->m_textPieces;
 401     m_CurTextPiece = s->m_curTextPiece;
 402     m_Source = s->m_source;
 403
 404     delete s;
 405     return true;
 406 }
 407
 408 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 409 {
 410     return wxString(tag.GetBeginIter(), tag.GetEndIter1());
 411 }
 412
 413 //-----------------------------------------------------------------------------
 414 // wxHtmlTagHandler
 415 //-----------------------------------------------------------------------------
 416
 417 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 418
 419 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 420 {
 421     // It is safe to temporarily change the source being parsed,
 422     // provided we restore the state back after parsing
 423     m_Parser->SetSourceAndSaveState(source);
 424     m_Parser->DoParsing();
 425     m_Parser->RestoreState();
 426 }
 427
 428
 429 //-----------------------------------------------------------------------------
 430 // wxHtmlEntitiesParser
 431 //-----------------------------------------------------------------------------
 432
 433 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 434
 435 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 436 #if !wxUSE_UNICODE
 437     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 438 #endif
 439 {
 440 }
 441
 442 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 443 {
 444 #if !wxUSE_UNICODE
 445     delete m_conv;
 446 #endif
 447 }
 448
 449 #if !wxUSE_UNICODE
 450 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 451 {
 452     if (encoding == m_encoding)
 453         return;
 454
 455     delete m_conv;
 456
 457     m_encoding = encoding;
 458     if (m_encoding == wxFONTENCODING_SYSTEM)
 459         m_conv = NULL;
 460     else
 461         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 462 }
 463 #endif // !wxUSE_UNICODE
 464
 465 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 466 {
 467     wxString output;
 468
 469     const wxString::const_iterator end(input.end());
 470     wxString::const_iterator c(input.begin());
 471     wxString::const_iterator last(c);
 472
 473     for ( ; c < end; ++c )
 474     {
 475         if (*c == wxT('&'))
 476         {
 477             if ( output.empty() )
 478                 output.reserve(input.length());
 479
 480             if (c - last > 0)
 481                 output.append(last, c);
 482             if ( ++c == end )
 483                 break;
 484
 485             wxString entity;
 486             const wxString::const_iterator ent_s = c;
 487             wxChar entity_char;
 488
 489             for ( ; c != end; ++c )
 490             {
 491                 wxChar ch = *c;
 492                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 493                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 494                        (ch >= wxT('0') && ch <= wxT('9')) ||
 495                         ch == wxT('_') || ch == wxT('#')) )
 496                     break;
 497             }
 498
 499             entity.append(ent_s, c);
 500             if (c == end || *c != wxT(';')) --c;
 501             last = c+1;
 502             entity_char = GetEntityChar(entity);
 503             if (entity_char)
 504                 output << entity_char;
 505             else
 506             {
 507                 output.append(ent_s-1, c+1);
 508                 wxLogTrace(wxTRACE_HTML_DEBUG,
 509                            "Unrecognized HTML entity: '%s'",
 510                            entity);
 511             }
 512         }
 513     }
 514     if ( last == input.begin() ) // common case: no entity
 515         return input;
 516     if ( last != end )
 517         output.append(last, end);
 518     return output;
 519 }
 520
 521 #if !wxUSE_UNICODE
 522 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 523 {
 524     char buf[2];
 525     wchar_t wbuf[2];
 526     wbuf[0] = (wchar_t)code;
 527     wbuf[1] = 0;
 528     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 529     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 530         return '?';
 531     return buf[0];
 532 }
 533 #endif
 534
 535 struct wxHtmlEntityInfo
 536 {
 537     const wxStringCharType *name;
 538     unsigned code;
 539 };
 540
 541 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 542 {
 543 #if wxUSE_UNICODE_UTF8
 544     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 545 #else
 546     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 547 #endif
 548 }
 549
 550 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 551 {
 552     unsigned code = 0;
 553
 554     if (entity.empty())
 555       return 0; // invalid entity reference
 556
 557     if (entity[0] == wxT('#'))
 558     {
 559         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 560         //     representation is the same for numbers
 561         const wxStringCharType *ent_s = entity.wx_str();
 562         const wxStringCharType *format;
 563
 564         if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
 565         {
 566             format = wxS("%x");
 567             ent_s++;
 568         }
 569         else
 570             format = wxS("%u");
 571         ent_s++;
 572
 573         if (wxSscanf(ent_s, format, &code) != 1)
 574             code = 0;
 575     }
 576     else
 577     {
 578         // store the literals in wx's internal representation (either char*
 579         // in UTF-8 or wchar_t*) for best performance:
 580         #define ENTITY(name, code) { wxS(name), code }
 581
 582         static wxHtmlEntityInfo substitutions[] = {
 583             ENTITY("AElig", 198),
 584             ENTITY("Aacute", 193),
 585             ENTITY("Acirc", 194),
 586             ENTITY("Agrave", 192),
 587             ENTITY("Alpha", 913),
 588             ENTITY("Aring", 197),
 589             ENTITY("Atilde", 195),
 590             ENTITY("Auml", 196),
 591             ENTITY("Beta", 914),
 592             ENTITY("Ccedil", 199),
 593             ENTITY("Chi", 935),
 594             ENTITY("Dagger", 8225),
 595             ENTITY("Delta", 916),
 596             ENTITY("ETH", 208),
 597             ENTITY("Eacute", 201),
 598             ENTITY("Ecirc", 202),
 599             ENTITY("Egrave", 200),
 600             ENTITY("Epsilon", 917),
 601             ENTITY("Eta", 919),
 602             ENTITY("Euml", 203),
 603             ENTITY("Gamma", 915),
 604             ENTITY("Iacute", 205),
 605             ENTITY("Icirc", 206),
 606             ENTITY("Igrave", 204),
 607             ENTITY("Iota", 921),
 608             ENTITY("Iuml", 207),
 609             ENTITY("Kappa", 922),
 610             ENTITY("Lambda", 923),
 611             ENTITY("Mu", 924),
 612             ENTITY("Ntilde", 209),
 613             ENTITY("Nu", 925),
 614             ENTITY("OElig", 338),
 615             ENTITY("Oacute", 211),
 616             ENTITY("Ocirc", 212),
 617             ENTITY("Ograve", 210),
 618             ENTITY("Omega", 937),
 619             ENTITY("Omicron", 927),
 620             ENTITY("Oslash", 216),
 621             ENTITY("Otilde", 213),
 622             ENTITY("Ouml", 214),
 623             ENTITY("Phi", 934),
 624             ENTITY("Pi", 928),
 625             ENTITY("Prime", 8243),
 626             ENTITY("Psi", 936),
 627             ENTITY("Rho", 929),
 628             ENTITY("Scaron", 352),
 629             ENTITY("Sigma", 931),
 630             ENTITY("THORN", 222),
 631             ENTITY("Tau", 932),
 632             ENTITY("Theta", 920),
 633             ENTITY("Uacute", 218),
 634             ENTITY("Ucirc", 219),
 635             ENTITY("Ugrave", 217),
 636             ENTITY("Upsilon", 933),
 637             ENTITY("Uuml", 220),
 638             ENTITY("Xi", 926),
 639             ENTITY("Yacute", 221),
 640             ENTITY("Yuml", 376),
 641             ENTITY("Zeta", 918),
 642             ENTITY("aacute", 225),
 643             ENTITY("acirc", 226),
 644             ENTITY("acute", 180),
 645             ENTITY("aelig", 230),
 646             ENTITY("agrave", 224),
 647             ENTITY("alefsym", 8501),
 648             ENTITY("alpha", 945),
 649             ENTITY("amp", 38),
 650             ENTITY("and", 8743),
 651             ENTITY("ang", 8736),
 652             ENTITY("apos", 39),
 653             ENTITY("aring", 229),
 654             ENTITY("asymp", 8776),
 655             ENTITY("atilde", 227),
 656             ENTITY("auml", 228),
 657             ENTITY("bdquo", 8222),
 658             ENTITY("beta", 946),
 659             ENTITY("brvbar", 166),
 660             ENTITY("bull", 8226),
 661             ENTITY("cap", 8745),
 662             ENTITY("ccedil", 231),
 663             ENTITY("cedil", 184),
 664             ENTITY("cent", 162),
 665             ENTITY("chi", 967),
 666             ENTITY("circ", 710),
 667             ENTITY("clubs", 9827),
 668             ENTITY("cong", 8773),
 669             ENTITY("copy", 169),
 670             ENTITY("crarr", 8629),
 671             ENTITY("cup", 8746),
 672             ENTITY("curren", 164),
 673             ENTITY("dArr", 8659),
 674             ENTITY("dagger", 8224),
 675             ENTITY("darr", 8595),
 676             ENTITY("deg", 176),
 677             ENTITY("delta", 948),
 678             ENTITY("diams", 9830),
 679             ENTITY("divide", 247),
 680             ENTITY("eacute", 233),
 681             ENTITY("ecirc", 234),
 682             ENTITY("egrave", 232),
 683             ENTITY("empty", 8709),
 684             ENTITY("emsp", 8195),
 685             ENTITY("ensp", 8194),
 686             ENTITY("epsilon", 949),
 687             ENTITY("equiv", 8801),
 688             ENTITY("eta", 951),
 689             ENTITY("eth", 240),
 690             ENTITY("euml", 235),
 691             ENTITY("euro", 8364),
 692             ENTITY("exist", 8707),
 693             ENTITY("fnof", 402),
 694             ENTITY("forall", 8704),
 695             ENTITY("frac12", 189),
 696             ENTITY("frac14", 188),
 697             ENTITY("frac34", 190),
 698             ENTITY("frasl", 8260),
 699             ENTITY("gamma", 947),
 700             ENTITY("ge", 8805),
 701             ENTITY("gt", 62),
 702             ENTITY("hArr", 8660),
 703             ENTITY("harr", 8596),
 704             ENTITY("hearts", 9829),
 705             ENTITY("hellip", 8230),
 706             ENTITY("iacute", 237),
 707             ENTITY("icirc", 238),
 708             ENTITY("iexcl", 161),
 709             ENTITY("igrave", 236),
 710             ENTITY("image", 8465),
 711             ENTITY("infin", 8734),
 712             ENTITY("int", 8747),
 713             ENTITY("iota", 953),
 714             ENTITY("iquest", 191),
 715             ENTITY("isin", 8712),
 716             ENTITY("iuml", 239),
 717             ENTITY("kappa", 954),
 718             ENTITY("lArr", 8656),
 719             ENTITY("lambda", 955),
 720             ENTITY("lang", 9001),
 721             ENTITY("laquo", 171),
 722             ENTITY("larr", 8592),
 723             ENTITY("lceil", 8968),
 724             ENTITY("ldquo", 8220),
 725             ENTITY("le", 8804),
 726             ENTITY("lfloor", 8970),
 727             ENTITY("lowast", 8727),
 728             ENTITY("loz", 9674),
 729             ENTITY("lrm", 8206),
 730             ENTITY("lsaquo", 8249),
 731             ENTITY("lsquo", 8216),
 732             ENTITY("lt", 60),
 733             ENTITY("macr", 175),
 734             ENTITY("mdash", 8212),
 735             ENTITY("micro", 181),
 736             ENTITY("middot", 183),
 737             ENTITY("minus", 8722),
 738             ENTITY("mu", 956),
 739             ENTITY("nabla", 8711),
 740             ENTITY("nbsp", 160),
 741             ENTITY("ndash", 8211),
 742             ENTITY("ne", 8800),
 743             ENTITY("ni", 8715),
 744             ENTITY("not", 172),
 745             ENTITY("notin", 8713),
 746             ENTITY("nsub", 8836),
 747             ENTITY("ntilde", 241),
 748             ENTITY("nu", 957),
 749             ENTITY("oacute", 243),
 750             ENTITY("ocirc", 244),
 751             ENTITY("oelig", 339),
 752             ENTITY("ograve", 242),
 753             ENTITY("oline", 8254),
 754             ENTITY("omega", 969),
 755             ENTITY("omicron", 959),
 756             ENTITY("oplus", 8853),
 757             ENTITY("or", 8744),
 758             ENTITY("ordf", 170),
 759             ENTITY("ordm", 186),
 760             ENTITY("oslash", 248),
 761             ENTITY("otilde", 245),
 762             ENTITY("otimes", 8855),
 763             ENTITY("ouml", 246),
 764             ENTITY("para", 182),
 765             ENTITY("part", 8706),
 766             ENTITY("permil", 8240),
 767             ENTITY("perp", 8869),
 768             ENTITY("phi", 966),
 769             ENTITY("pi", 960),
 770             ENTITY("piv", 982),
 771             ENTITY("plusmn", 177),
 772             ENTITY("pound", 163),
 773             ENTITY("prime", 8242),
 774             ENTITY("prod", 8719),
 775             ENTITY("prop", 8733),
 776             ENTITY("psi", 968),
 777             ENTITY("quot", 34),
 778             ENTITY("rArr", 8658),
 779             ENTITY("radic", 8730),
 780             ENTITY("rang", 9002),
 781             ENTITY("raquo", 187),
 782             ENTITY("rarr", 8594),
 783             ENTITY("rceil", 8969),
 784             ENTITY("rdquo", 8221),
 785             ENTITY("real", 8476),
 786             ENTITY("reg", 174),
 787             ENTITY("rfloor", 8971),
 788             ENTITY("rho", 961),
 789             ENTITY("rlm", 8207),
 790             ENTITY("rsaquo", 8250),
 791             ENTITY("rsquo", 8217),
 792             ENTITY("sbquo", 8218),
 793             ENTITY("scaron", 353),
 794             ENTITY("sdot", 8901),
 795             ENTITY("sect", 167),
 796             ENTITY("shy", 173),
 797             ENTITY("sigma", 963),
 798             ENTITY("sigmaf", 962),
 799             ENTITY("sim", 8764),
 800             ENTITY("spades", 9824),
 801             ENTITY("sub", 8834),
 802             ENTITY("sube", 8838),
 803             ENTITY("sum", 8721),
 804             ENTITY("sup", 8835),
 805             ENTITY("sup1", 185),
 806             ENTITY("sup2", 178),
 807             ENTITY("sup3", 179),
 808             ENTITY("supe", 8839),
 809             ENTITY("szlig", 223),
 810             ENTITY("tau", 964),
 811             ENTITY("there4", 8756),
 812             ENTITY("theta", 952),
 813             ENTITY("thetasym", 977),
 814             ENTITY("thinsp", 8201),
 815             ENTITY("thorn", 254),
 816             ENTITY("tilde", 732),
 817             ENTITY("times", 215),
 818             ENTITY("trade", 8482),
 819             ENTITY("uArr", 8657),
 820             ENTITY("uacute", 250),
 821             ENTITY("uarr", 8593),
 822             ENTITY("ucirc", 251),
 823             ENTITY("ugrave", 249),
 824             ENTITY("uml", 168),
 825             ENTITY("upsih", 978),
 826             ENTITY("upsilon", 965),
 827             ENTITY("uuml", 252),
 828             ENTITY("weierp", 8472),
 829             ENTITY("xi", 958),
 830             ENTITY("yacute", 253),
 831             ENTITY("yen", 165),
 832             ENTITY("yuml", 255),
 833             ENTITY("zeta", 950),
 834             ENTITY("zwj", 8205),
 835             ENTITY("zwnj", 8204),
 836             {NULL, 0}};
 837         #undef ENTITY
 838         static size_t substitutions_cnt = 0;
 839
 840         if (substitutions_cnt == 0)
 841             while (substitutions[substitutions_cnt].code != 0)
 842                 substitutions_cnt++;
 843
 844         wxHtmlEntityInfo *info;
 845 #ifdef __WXWINCE__
 846         // bsearch crashes under WinCE for some reason
 847         info = NULL;
 848         size_t i;
 849         for (i = 0; i < substitutions_cnt; i++)
 850         {
 851             if (entity == substitutions[i].name)
 852             {
 853                 info = & substitutions[i];
 854                 break;
 855             }
 856         }
 857 #else
 858         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 859                                            substitutions_cnt,
 860                                            sizeof(wxHtmlEntityInfo),
 861                                            wxHtmlEntityCompare);
 862 #endif
 863         if (info)
 864             code = info->code;
 865     }
 866
 867     if (code == 0)
 868         return 0;
 869     else
 870         return GetCharForCode(code);
 871 }
 872
 873 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 874                                 const wxString& url) const
 875 {
 876     return m_FS ? m_FS->OpenFile(url) : NULL;
 877
 878 }
 879
 880
 881 //-----------------------------------------------------------------------------
 882 // wxHtmlParser::ExtractCharsetInformation
 883 //-----------------------------------------------------------------------------
 884
 885 class wxMetaTagParser : public wxHtmlParser
 886 {
 887 public:
 888     wxMetaTagParser() { }
 889
 890     wxObject* GetProduct() { return NULL; }
 891
 892 protected:
 893     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 894
 895     wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
 896 };
 897
 898 class wxMetaTagHandler : public wxHtmlTagHandler
 899 {
 900 public:
 901     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 902     wxString GetSupportedTags() { return wxT("META,BODY"); }
 903     bool HandleTag(const wxHtmlTag& tag);
 904
 905 private:
 906     wxString *m_retval;
 907
 908     wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
 909 };
 910
 911 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 912 {
 913     if (tag.GetName() == wxT("BODY"))
 914     {
 915         m_Parser->StopParsing();
 916         return false;
 917     }
 918
 919     if (tag.HasParam(wxT("HTTP-EQUIV")) &&
 920         tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
 921         tag.HasParam(wxT("CONTENT")))
 922     {
 923         wxString content = tag.GetParam(wxT("CONTENT")).Lower();
 924         if (content.Left(19) == wxT("text/html; charset="))
 925         {
 926             *m_retval = content.Mid(19);
 927             m_Parser->StopParsing();
 928         }
 929     }
 930     return false;
 931 }
 932
 933
 934 /*static*/
 935 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 936 {
 937     wxString charset;
 938     wxMetaTagParser *parser = new wxMetaTagParser();
 939     if(parser)
 940     {
 941         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 942         parser->Parse(markup);
 943         delete parser;
 944     }
 945     return charset;
 946 }
 947
 948 /* static */
 949 bool
 950 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 951                              wxString::const_iterator end)
 952 {
 953     wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
 954
 955     wxString::const_iterator p = start;
 956
 957     // comments begin with "<!--" in HTML 4.0
 958     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 959     {
 960         // not a comment at all
 961         return false;
 962     }
 963
 964     // skip the start of the comment tag in any case, if we don't find the
 965     // closing tag we should ignore broken markup
 966     start = p;
 967
 968     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 969     // comment delimiter and the closing tag character (section 3.2.4 of
 970     // http://www.w3.org/TR/html401/)
 971     int dashes = 0;
 972     while ( ++p < end )
 973     {
 974         const wxChar c = *p;
 975
 976         if ( (c == wxT(' ') || c == wxT('\n') ||
 977               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 978         {
 979             // ignore white space before potential tag end
 980             continue;
 981         }
 982
 983         if ( c == wxT('>') && dashes >= 2 )
 984         {
 985             // found end of comment
 986             start = p;
 987             break;
 988         }
 989
 990         if ( c == wxT('-') )
 991             dashes++;
 992         else
 993             dashes = 0;
 994     }
 995
 996     return true;
 997 }
 998
 999 #endif // wxUSE_HTML