src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
// DLL options compatibility check:
WX_CHECK_BUILD_OPTIONS("wxHTML")

// Trace mask passed to wxLogTrace() by the HTML code; in this file it is used
// to report unrecognized entities.
const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(const wxString::const_iterator& start,
  52                     const wxString::const_iterator& end)
  53         : m_start(start), m_end(end) {}
  54     wxString::const_iterator m_start, m_end;
  55 };
  56
// Collection of the text pieces created while building the DOM tree.
//
// NB: this is an empty class and not a typedef because it needs to be
//     forward declared
class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
{
};
  61
// Snapshot of the parser state taken by wxHtmlParser::SetSourceAndSaveState()
// and put back by wxHtmlParser::RestoreState(). Saved states form a singly
// linked list through m_nextState with the most recently saved one first.
class wxHtmlParserState
{
public:
    wxHtmlTag         *m_curTag;        // saved wxHtmlParser::m_CurTag
    wxHtmlTag         *m_tags;          // saved wxHtmlParser::m_Tags
    wxHtmlTextPieces  *m_textPieces;    // saved wxHtmlParser::m_TextPieces
    int                m_curTextPiece;  // saved wxHtmlParser::m_CurTextPiece
    const wxString    *m_source;        // saved wxHtmlParser::m_Source
    wxHtmlParserState *m_nextState;     // state saved before this one, if any
};
  72
  73 //-----------------------------------------------------------------------------
  74 // wxHtmlParser
  75 //-----------------------------------------------------------------------------
  76
  77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  78
  79 wxHtmlParser::wxHtmlParser()
  80     : wxObject(),
  81       m_FS(NULL)
  82 {
  83     m_Source = NULL;
  84     m_entitiesParser = new wxHtmlEntitiesParser;
  85     m_Tags = NULL;
  86     m_CurTag = NULL;
  87     m_TextPieces = NULL;
  88     m_CurTextPiece = 0;
  89     m_SavedStates = NULL;
  90 }
  91
  92 wxHtmlParser::~wxHtmlParser()
  93 {
  94     while (RestoreState()) {}
  95     DestroyDOMTree();
  96
  97     WX_CLEAR_ARRAY(m_HandlersStack);
  98     WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
  99     delete m_entitiesParser;
 100     delete m_Source;
 101 }
 102
 103 wxObject* wxHtmlParser::Parse(const wxString& source)
 104 {
 105     InitParser(source);
 106     DoParsing();
 107     wxObject *result = GetProduct();
 108     DoneParser();
 109     return result;
 110 }
 111
// Prepares the parser for parsing source: SetSource() builds the DOM tree for
// it and the "stop parsing" flag is cleared.
void wxHtmlParser::InitParser(const wxString& source)
{
    SetSource(source);
    m_stopParsing = false;
}
 117
// Called by Parse() once the product has been retrieved: frees the DOM tree
// and the text pieces (the source string itself is kept).
void wxHtmlParser::DoneParser()
{
    DestroyDOMTree();
}
 122
 123 void wxHtmlParser::SetSource(const wxString& src)
 124 {
 125     DestroyDOMTree();
 126     // NB: This is allocated on heap because wxHtmlTag uses iterators and
 127     //     making a copy of m_Source string in SetSourceAndSaveState() and
 128     //     RestoreState() would invalidate them (because wxString::m_impl's
 129     //     memory would change completely twice and iterators use pointers
 130     //     into it). So instead, we keep the string object intact and only
 131     //     store/restore pointer to it, for which we need it to be allocated
 132     //     on the heap.
 133     delete m_Source;
 134     m_Source = new wxString(src);
 135     CreateDOMTree();
 136     m_CurTag = NULL;
 137     m_CurTextPiece = 0;
 138 }
 139
 140 void wxHtmlParser::CreateDOMTree()
 141 {
 142     wxHtmlTagsCache cache(*m_Source);
 143     m_TextPieces = new wxHtmlTextPieces;
 144     CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
 145     m_CurTextPiece = 0;
 146 }
 147
 148 extern bool wxIsCDATAElement(const wxString& tag);
 149
 150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 151                                     const wxString::const_iterator& begin_pos,
 152                                     const wxString::const_iterator& end_pos,
 153                                     wxHtmlTagsCache *cache)
 154 {
 155     if (end_pos <= begin_pos)
 156         return;
 157
 158     wxChar c;
 159     wxString::const_iterator i = begin_pos;
 160     wxString::const_iterator textBeginning = begin_pos;
 161
 162     // If the tag contains CDATA text, we include the text between beginning
 163     // and ending tag verbosely. Setting i=end_pos will skip to the very
 164     // end of this function where text piece is added, bypassing any child
 165     // tags parsing (CDATA element can't have child elements by definition):
 166     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 167     {
 168         i = end_pos;
 169     }
 170
 171     while (i < end_pos)
 172     {
 173         c = *i;
 174
 175         if (c == wxT('<'))
 176         {
 177             // add text to m_TextPieces:
 178             if (i > textBeginning)
 179                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 180
 181             // if it is a comment, skip it:
 182             if ( SkipCommentTag(i, m_Source->end()) )
 183             {
 184                 textBeginning = i = i + 1; // skip closing '>' too
 185             }
 186
 187             // add another tag to the tree:
 188             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 189             {
 190                 wxHtmlTag *chd;
 191                 if (cur)
 192                     chd = new wxHtmlTag(cur, m_Source,
 193                                         i, end_pos, cache, m_entitiesParser);
 194                 else
 195                 {
 196                     chd = new wxHtmlTag(NULL, m_Source,
 197                                         i, end_pos, cache, m_entitiesParser);
 198                     if (!m_Tags)
 199                     {
 200                         // if this is the first tag to be created make the root
 201                         // m_Tags point to it:
 202                         m_Tags = chd;
 203                     }
 204                     else
 205                     {
 206                         // if there is already a root tag add this tag as
 207                         // the last sibling:
 208                         chd->m_Prev = m_Tags->GetLastSibling();
 209                         chd->m_Prev->m_Next = chd;
 210                     }
 211                 }
 212
 213                 if (chd->HasEnding())
 214                 {
 215                     CreateDOMSubTree(chd,
 216                                      chd->GetBeginIter(), chd->GetEndIter1(),
 217                                      cache);
 218                     i = chd->GetEndIter2();
 219                 }
 220                 else
 221                     i = chd->GetBeginIter();
 222
 223                 textBeginning = i;
 224             }
 225
 226             // ... or skip ending tag:
 227             else
 228             {
 229                 while (i < end_pos && *i != wxT('>')) ++i;
 230                 textBeginning = i+1;
 231             }
 232         }
 233         else ++i;
 234     }
 235
 236     // add remaining text to m_TextPieces:
 237     if (end_pos > textBeginning)
 238         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 239 }
 240
 241 void wxHtmlParser::DestroyDOMTree()
 242 {
 243     wxHtmlTag *t1, *t2;
 244     t1 = m_Tags;
 245     while (t1)
 246     {
 247         t2 = t1->GetNextSibling();
 248         delete t1;
 249         t1 = t2;
 250     }
 251     m_Tags = m_CurTag = NULL;
 252
 253     delete m_TextPieces;
 254     m_TextPieces = NULL;
 255 }
 256
// Parses the whole source: rewinds to the first tag and the first text piece
// and then processes the range from the beginning to the end of *m_Source.
void wxHtmlParser::DoParsing()
{
    m_CurTag = m_Tags;
    m_CurTextPiece = 0;
    DoParsing(m_Source->begin(), m_Source->end());
}
 263
 264 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
 265                              const wxString::const_iterator& end_pos)
 266 {
 267     wxString::const_iterator begin_pos(begin_pos_);
 268
 269     if (end_pos <= begin_pos)
 270         return;
 271
 272     wxHtmlTextPieces& pieces = *m_TextPieces;
 273     size_t piecesCnt = pieces.size();
 274
 275     while (begin_pos < end_pos)
 276     {
 277         while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
 278             m_CurTag = m_CurTag->GetNextTag();
 279         while (m_CurTextPiece < piecesCnt &&
 280                pieces[m_CurTextPiece].m_start < begin_pos)
 281             m_CurTextPiece++;
 282
 283         if (m_CurTextPiece < piecesCnt &&
 284             (!m_CurTag ||
 285              pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
 286         {
 287             // Add text:
 288             AddText(GetEntitiesParser()->Parse(
 289                        wxString(pieces[m_CurTextPiece].m_start,
 290                                 pieces[m_CurTextPiece].m_end)));
 291             begin_pos = pieces[m_CurTextPiece].m_end;
 292             m_CurTextPiece++;
 293         }
 294         else if (m_CurTag)
 295         {
 296             if (m_CurTag->HasEnding())
 297                 begin_pos = m_CurTag->GetEndIter2();
 298             else
 299                 begin_pos = m_CurTag->GetBeginIter();
 300             wxHtmlTag *t = m_CurTag;
 301             m_CurTag = m_CurTag->GetNextTag();
 302             AddTag(*t);
 303             if (m_stopParsing)
 304                 return;
 305         }
 306         else break;
 307     }
 308 }
 309
 310 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 311 {
 312     bool inner = false;
 313
 314     wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
 315     if (h != m_HandlersHash.end())
 316     {
 317         inner = h->second->HandleTag(tag);
 318         if (m_stopParsing)
 319             return;
 320     }
 321     if (!inner)
 322     {
 323         if (tag.HasEnding())
 324             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 325     }
 326 }
 327
 328 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 329 {
 330     wxString s(handler->GetSupportedTags());
 331     wxStringTokenizer tokenizer(s, wxT(", "));
 332
 333     while (tokenizer.HasMoreTokens())
 334         m_HandlersHash[tokenizer.GetNextToken()] = handler;
 335
 336     m_HandlersSet.insert(handler);
 337
 338     handler->SetParser(this);
 339 }
 340
 341 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 342 {
 343     wxStringTokenizer tokenizer(tags, wxT(", "));
 344     wxString key;
 345
 346     m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
 347
 348     while (tokenizer.HasMoreTokens())
 349     {
 350         key = tokenizer.GetNextToken();
 351         m_HandlersHash[key] = handler;
 352     }
 353 }
 354
 355 void wxHtmlParser::PopTagHandler()
 356 {
 357     wxCHECK_RET( !m_HandlersStack.empty(),
 358                  "attempt to remove HTML tag handler from empty stack" );
 359
 360     wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
 361     m_HandlersStack.pop_back();
 362     m_HandlersHash = *prev;
 363     delete prev;
 364 }
 365
 366 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 367 {
 368     wxHtmlParserState *s = new wxHtmlParserState;
 369
 370     s->m_curTag = m_CurTag;
 371     s->m_tags = m_Tags;
 372     s->m_textPieces = m_TextPieces;
 373     s->m_curTextPiece = m_CurTextPiece;
 374     s->m_source = m_Source;
 375
 376     s->m_nextState = m_SavedStates;
 377     m_SavedStates = s;
 378
 379     m_CurTag = NULL;
 380     m_Tags = NULL;
 381     m_TextPieces = NULL;
 382     m_CurTextPiece = 0;
 383     m_Source = NULL;
 384
 385     SetSource(src);
 386 }
 387
 388 bool wxHtmlParser::RestoreState()
 389 {
 390     if (!m_SavedStates) return false;
 391
 392     DestroyDOMTree();
 393     delete m_Source;
 394
 395     wxHtmlParserState *s = m_SavedStates;
 396     m_SavedStates = s->m_nextState;
 397
 398     m_CurTag = s->m_curTag;
 399     m_Tags = s->m_tags;
 400     m_TextPieces = s->m_textPieces;
 401     m_CurTextPiece = s->m_curTextPiece;
 402     m_Source = s->m_source;
 403
 404     delete s;
 405     return true;
 406 }
 407
// Returns a copy of the source text in the range
// [tag.GetBeginIter(), tag.GetEndIter1()), i.e. the same range AddTag() parses
// as the contents of the tag.
wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
{
    return wxString(tag.GetBeginIter(), tag.GetEndIter1());
}
 412
 413 //-----------------------------------------------------------------------------
 414 // wxHtmlTagHandler
 415 //-----------------------------------------------------------------------------
 416
 417 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 418
// Parses source with the parser this handler is attached to, as a nested
// document: the parser's current state is saved, source is parsed from its
// beginning to its end and the saved state is then restored.
void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
{
    // It is safe to temporarily change the source being parsed,
    // provided we restore the state back after parsing
    m_Parser->SetSourceAndSaveState(source);
    m_Parser->DoParsing();
    m_Parser->RestoreState();
}
 427
 428
 429 //-----------------------------------------------------------------------------
 430 // wxHtmlEntitiesParser
 431 //-----------------------------------------------------------------------------
 432
 433 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 434
// In builds with wchar_t support but without Unicode the parser starts with
// no conversion object and the system encoding; in all other builds there is
// nothing to initialize.
wxHtmlEntitiesParser::wxHtmlEntitiesParser()
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
#endif
{
}
 441
// Frees the conversion object created by SetEncoding(), if any (it only
// exists in builds with wchar_t support but without Unicode).
wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    delete m_conv;
#endif
}
 448
 449 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 450 {
 451 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 452     if (encoding == m_encoding)
 453         return;
 454
 455     delete m_conv;
 456
 457     m_encoding = encoding;
 458     if (m_encoding == wxFONTENCODING_SYSTEM)
 459         m_conv = NULL;
 460     else
 461         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 462 #else
 463     (void) encoding;
 464 #endif
 465 }
 466
 467 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 468 {
 469     wxString output;
 470
 471     const wxString::const_iterator end(input.end());
 472     wxString::const_iterator c(input.begin());
 473     wxString::const_iterator last(c);
 474
 475     for ( ; c < end; ++c )
 476     {
 477         if (*c == wxT('&'))
 478         {
 479             if ( output.empty() )
 480                 output.reserve(input.length());
 481
 482             if (c - last > 0)
 483                 output.append(last, c);
 484             if ( ++c == end )
 485                 break;
 486
 487             wxString entity;
 488             const wxString::const_iterator ent_s = c;
 489             wxChar entity_char;
 490
 491             for ( ; c != end; ++c )
 492             {
 493                 wxChar ch = *c;
 494                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 495                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 496                        (ch >= wxT('0') && ch <= wxT('9')) ||
 497                         ch == wxT('_') || ch == wxT('#')) )
 498                     break;
 499             }
 500
 501             entity.append(ent_s, c);
 502             if (c == end || *c != wxT(';')) --c;
 503             last = c+1;
 504             entity_char = GetEntityChar(entity);
 505             if (entity_char)
 506                 output << entity_char;
 507             else
 508             {
 509                 output.append(ent_s-1, c+1);
 510                 wxLogTrace(wxTRACE_HTML_DEBUG,
 511                            "Unrecognized HTML entity: '%s'",
 512                            entity);
 513             }
 514         }
 515     }
 516     if ( last == input.begin() ) // common case: no entity
 517         return input;
 518     if ( last != end )
 519         output.append(last, end);
 520     return output;
 521 }
 522
 523 #if !wxUSE_UNICODE
 524 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 525 {
 526 #if wxUSE_WCHAR_T
 527     char buf[2];
 528     wchar_t wbuf[2];
 529     wbuf[0] = (wchar_t)code;
 530     wbuf[1] = 0;
 531     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 532     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 533         return '?';
 534     return buf[0];
 535 #else
 536     return (code < 256) ? (wxChar)code : '?';
 537 #endif
 538 }
 539 #endif
 540
// Element of the table of named entities used by
// wxHtmlEntitiesParser::GetEntityChar().
struct wxHtmlEntityInfo
{
    const wxStringCharType *name;   // entity name without '&' and ';'
    unsigned code;                  // code of the corresponding character
};
 546
 547 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 548 {
 549 #if wxUSE_UNICODE_UTF8
 550     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 551 #else
 552     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 553 #endif
 554 }
 555
 556 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 557 {
 558     unsigned code = 0;
 559
 560     if (entity.empty())
 561       return 0; // invalid entity reference
 562
 563     if (entity[0] == wxT('#'))
 564     {
 565         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 566         //     representation is the same for numbers
 567         const wxStringCharType *ent_s = entity.wx_str();
 568         const wxStringCharType *format;
 569
 570         if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
 571         {
 572             format = wxS("%x");
 573             ent_s++;
 574         }
 575         else
 576             format = wxS("%u");
 577         ent_s++;
 578
 579         if (wxSscanf(ent_s, format, &code) != 1)
 580             code = 0;
 581     }
 582     else
 583     {
 584         // store the literals in wx's internal representation (either char*
 585         // in UTF-8 or wchar_t*) for best performance:
 586         #define ENTITY(name, code) { wxS(name), code }
 587
 588         static wxHtmlEntityInfo substitutions[] = {
 589             ENTITY("AElig", 198),
 590             ENTITY("Aacute", 193),
 591             ENTITY("Acirc", 194),
 592             ENTITY("Agrave", 192),
 593             ENTITY("Alpha", 913),
 594             ENTITY("Aring", 197),
 595             ENTITY("Atilde", 195),
 596             ENTITY("Auml", 196),
 597             ENTITY("Beta", 914),
 598             ENTITY("Ccedil", 199),
 599             ENTITY("Chi", 935),
 600             ENTITY("Dagger", 8225),
 601             ENTITY("Delta", 916),
 602             ENTITY("ETH", 208),
 603             ENTITY("Eacute", 201),
 604             ENTITY("Ecirc", 202),
 605             ENTITY("Egrave", 200),
 606             ENTITY("Epsilon", 917),
 607             ENTITY("Eta", 919),
 608             ENTITY("Euml", 203),
 609             ENTITY("Gamma", 915),
 610             ENTITY("Iacute", 205),
 611             ENTITY("Icirc", 206),
 612             ENTITY("Igrave", 204),
 613             ENTITY("Iota", 921),
 614             ENTITY("Iuml", 207),
 615             ENTITY("Kappa", 922),
 616             ENTITY("Lambda", 923),
 617             ENTITY("Mu", 924),
 618             ENTITY("Ntilde", 209),
 619             ENTITY("Nu", 925),
 620             ENTITY("OElig", 338),
 621             ENTITY("Oacute", 211),
 622             ENTITY("Ocirc", 212),
 623             ENTITY("Ograve", 210),
 624             ENTITY("Omega", 937),
 625             ENTITY("Omicron", 927),
 626             ENTITY("Oslash", 216),
 627             ENTITY("Otilde", 213),
 628             ENTITY("Ouml", 214),
 629             ENTITY("Phi", 934),
 630             ENTITY("Pi", 928),
 631             ENTITY("Prime", 8243),
 632             ENTITY("Psi", 936),
 633             ENTITY("Rho", 929),
 634             ENTITY("Scaron", 352),
 635             ENTITY("Sigma", 931),
 636             ENTITY("THORN", 222),
 637             ENTITY("Tau", 932),
 638             ENTITY("Theta", 920),
 639             ENTITY("Uacute", 218),
 640             ENTITY("Ucirc", 219),
 641             ENTITY("Ugrave", 217),
 642             ENTITY("Upsilon", 933),
 643             ENTITY("Uuml", 220),
 644             ENTITY("Xi", 926),
 645             ENTITY("Yacute", 221),
 646             ENTITY("Yuml", 376),
 647             ENTITY("Zeta", 918),
 648             ENTITY("aacute", 225),
 649             ENTITY("acirc", 226),
 650             ENTITY("acute", 180),
 651             ENTITY("aelig", 230),
 652             ENTITY("agrave", 224),
 653             ENTITY("alefsym", 8501),
 654             ENTITY("alpha", 945),
 655             ENTITY("amp", 38),
 656             ENTITY("and", 8743),
 657             ENTITY("ang", 8736),
 658             ENTITY("aring", 229),
 659             ENTITY("asymp", 8776),
 660             ENTITY("atilde", 227),
 661             ENTITY("auml", 228),
 662             ENTITY("bdquo", 8222),
 663             ENTITY("beta", 946),
 664             ENTITY("brvbar", 166),
 665             ENTITY("bull", 8226),
 666             ENTITY("cap", 8745),
 667             ENTITY("ccedil", 231),
 668             ENTITY("cedil", 184),
 669             ENTITY("cent", 162),
 670             ENTITY("chi", 967),
 671             ENTITY("circ", 710),
 672             ENTITY("clubs", 9827),
 673             ENTITY("cong", 8773),
 674             ENTITY("copy", 169),
 675             ENTITY("crarr", 8629),
 676             ENTITY("cup", 8746),
 677             ENTITY("curren", 164),
 678             ENTITY("dArr", 8659),
 679             ENTITY("dagger", 8224),
 680             ENTITY("darr", 8595),
 681             ENTITY("deg", 176),
 682             ENTITY("delta", 948),
 683             ENTITY("diams", 9830),
 684             ENTITY("divide", 247),
 685             ENTITY("eacute", 233),
 686             ENTITY("ecirc", 234),
 687             ENTITY("egrave", 232),
 688             ENTITY("empty", 8709),
 689             ENTITY("emsp", 8195),
 690             ENTITY("ensp", 8194),
 691             ENTITY("epsilon", 949),
 692             ENTITY("equiv", 8801),
 693             ENTITY("eta", 951),
 694             ENTITY("eth", 240),
 695             ENTITY("euml", 235),
 696             ENTITY("euro", 8364),
 697             ENTITY("exist", 8707),
 698             ENTITY("fnof", 402),
 699             ENTITY("forall", 8704),
 700             ENTITY("frac12", 189),
 701             ENTITY("frac14", 188),
 702             ENTITY("frac34", 190),
 703             ENTITY("frasl", 8260),
 704             ENTITY("gamma", 947),
 705             ENTITY("ge", 8805),
 706             ENTITY("gt", 62),
 707             ENTITY("hArr", 8660),
 708             ENTITY("harr", 8596),
 709             ENTITY("hearts", 9829),
 710             ENTITY("hellip", 8230),
 711             ENTITY("iacute", 237),
 712             ENTITY("icirc", 238),
 713             ENTITY("iexcl", 161),
 714             ENTITY("igrave", 236),
 715             ENTITY("image", 8465),
 716             ENTITY("infin", 8734),
 717             ENTITY("int", 8747),
 718             ENTITY("iota", 953),
 719             ENTITY("iquest", 191),
 720             ENTITY("isin", 8712),
 721             ENTITY("iuml", 239),
 722             ENTITY("kappa", 954),
 723             ENTITY("lArr", 8656),
 724             ENTITY("lambda", 955),
 725             ENTITY("lang", 9001),
 726             ENTITY("laquo", 171),
 727             ENTITY("larr", 8592),
 728             ENTITY("lceil", 8968),
 729             ENTITY("ldquo", 8220),
 730             ENTITY("le", 8804),
 731             ENTITY("lfloor", 8970),
 732             ENTITY("lowast", 8727),
 733             ENTITY("loz", 9674),
 734             ENTITY("lrm", 8206),
 735             ENTITY("lsaquo", 8249),
 736             ENTITY("lsquo", 8216),
 737             ENTITY("lt", 60),
 738             ENTITY("macr", 175),
 739             ENTITY("mdash", 8212),
 740             ENTITY("micro", 181),
 741             ENTITY("middot", 183),
 742             ENTITY("minus", 8722),
 743             ENTITY("mu", 956),
 744             ENTITY("nabla", 8711),
 745             ENTITY("nbsp", 160),
 746             ENTITY("ndash", 8211),
 747             ENTITY("ne", 8800),
 748             ENTITY("ni", 8715),
 749             ENTITY("not", 172),
 750             ENTITY("notin", 8713),
 751             ENTITY("nsub", 8836),
 752             ENTITY("ntilde", 241),
 753             ENTITY("nu", 957),
 754             ENTITY("oacute", 243),
 755             ENTITY("ocirc", 244),
 756             ENTITY("oelig", 339),
 757             ENTITY("ograve", 242),
 758             ENTITY("oline", 8254),
 759             ENTITY("omega", 969),
 760             ENTITY("omicron", 959),
 761             ENTITY("oplus", 8853),
 762             ENTITY("or", 8744),
 763             ENTITY("ordf", 170),
 764             ENTITY("ordm", 186),
 765             ENTITY("oslash", 248),
 766             ENTITY("otilde", 245),
 767             ENTITY("otimes", 8855),
 768             ENTITY("ouml", 246),
 769             ENTITY("para", 182),
 770             ENTITY("part", 8706),
 771             ENTITY("permil", 8240),
 772             ENTITY("perp", 8869),
 773             ENTITY("phi", 966),
 774             ENTITY("pi", 960),
 775             ENTITY("piv", 982),
 776             ENTITY("plusmn", 177),
 777             ENTITY("pound", 163),
 778             ENTITY("prime", 8242),
 779             ENTITY("prod", 8719),
 780             ENTITY("prop", 8733),
 781             ENTITY("psi", 968),
 782             ENTITY("quot", 34),
 783             ENTITY("rArr", 8658),
 784             ENTITY("radic", 8730),
 785             ENTITY("rang", 9002),
 786             ENTITY("raquo", 187),
 787             ENTITY("rarr", 8594),
 788             ENTITY("rceil", 8969),
 789             ENTITY("rdquo", 8221),
 790             ENTITY("real", 8476),
 791             ENTITY("reg", 174),
 792             ENTITY("rfloor", 8971),
 793             ENTITY("rho", 961),
 794             ENTITY("rlm", 8207),
 795             ENTITY("rsaquo", 8250),
 796             ENTITY("rsquo", 8217),
 797             ENTITY("sbquo", 8218),
 798             ENTITY("scaron", 353),
 799             ENTITY("sdot", 8901),
 800             ENTITY("sect", 167),
 801             ENTITY("shy", 173),
 802             ENTITY("sigma", 963),
 803             ENTITY("sigmaf", 962),
 804             ENTITY("sim", 8764),
 805             ENTITY("spades", 9824),
 806             ENTITY("sub", 8834),
 807             ENTITY("sube", 8838),
 808             ENTITY("sum", 8721),
 809             ENTITY("sup", 8835),
 810             ENTITY("sup1", 185),
 811             ENTITY("sup2", 178),
 812             ENTITY("sup3", 179),
 813             ENTITY("supe", 8839),
 814             ENTITY("szlig", 223),
 815             ENTITY("tau", 964),
 816             ENTITY("there4", 8756),
 817             ENTITY("theta", 952),
 818             ENTITY("thetasym", 977),
 819             ENTITY("thinsp", 8201),
 820             ENTITY("thorn", 254),
 821             ENTITY("tilde", 732),
 822             ENTITY("times", 215),
 823             ENTITY("trade", 8482),
 824             ENTITY("uArr", 8657),
 825             ENTITY("uacute", 250),
 826             ENTITY("uarr", 8593),
 827             ENTITY("ucirc", 251),
 828             ENTITY("ugrave", 249),
 829             ENTITY("uml", 168),
 830             ENTITY("upsih", 978),
 831             ENTITY("upsilon", 965),
 832             ENTITY("uuml", 252),
 833             ENTITY("weierp", 8472),
 834             ENTITY("xi", 958),
 835             ENTITY("yacute", 253),
 836             ENTITY("yen", 165),
 837             ENTITY("yuml", 255),
 838             ENTITY("zeta", 950),
 839             ENTITY("zwj", 8205),
 840             ENTITY("zwnj", 8204),
 841             {NULL, 0}};
 842         #undef ENTITY
 843         static size_t substitutions_cnt = 0;
 844
 845         if (substitutions_cnt == 0)
 846             while (substitutions[substitutions_cnt].code != 0)
 847                 substitutions_cnt++;
 848
 849         wxHtmlEntityInfo *info;
 850 #ifdef __WXWINCE__
 851         // bsearch crashes under WinCE for some reason
 852         info = NULL;
 853         size_t i;
 854         for (i = 0; i < substitutions_cnt; i++)
 855         {
 856             if (entity == substitutions[i].name)
 857             {
 858                 info = & substitutions[i];
 859                 break;
 860             }
 861         }
 862 #else
 863         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 864                                            substitutions_cnt,
 865                                            sizeof(wxHtmlEntityInfo),
 866                                            wxHtmlEntityCompare);
 867 #endif
 868         if (info)
 869             code = info->code;
 870     }
 871
 872     if (code == 0)
 873         return 0;
 874     else
 875         return GetCharForCode(code);
 876 }
 877
 878 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 879                                 const wxString& url) const
 880 {
 881     return m_FS ? m_FS->OpenFile(url) : NULL;
 882
 883 }
 884
 885
 886 //-----------------------------------------------------------------------------
 887 // wxHtmlParser::ExtractCharsetInformation
 888 //-----------------------------------------------------------------------------
 889
// Minimal parser used by wxHtmlParser::ExtractCharsetInformation(): it
// ignores all the text and produces nothing, the only result of parsing is
// what the tag handler attached to it collects.
class wxMetaTagParser : public wxHtmlParser
{
public:
    wxMetaTagParser() { }

    // there is no product
    wxObject* GetProduct() { return NULL; }

protected:
    // text is ignored
    virtual void AddText(const wxString& WXUNUSED(txt)) {}

    wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
};
 902
// Handler for the META and BODY tags which stores the charset found in a META
// tag in the string passed to its constructor (see HandleTag()).
class wxMetaTagHandler : public wxHtmlTagHandler
{
public:
    // retval is where the charset is written to; only the pointer is stored
    wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
    wxString GetSupportedTags() { return wxT("META,BODY"); }
    bool HandleTag(const wxHtmlTag& tag);

private:
    wxString *m_retval;     // not owned by the handler

    wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
};
 915
 916 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 917 {
 918     if (tag.GetName() == _T("BODY"))
 919     {
 920         m_Parser->StopParsing();
 921         return false;
 922     }
 923
 924     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 925         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 926         tag.HasParam(_T("CONTENT")))
 927     {
 928         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 929         if (content.Left(19) == _T("text/html; charset="))
 930         {
 931             *m_retval = content.Mid(19);
 932             m_Parser->StopParsing();
 933         }
 934     }
 935     return false;
 936 }
 937
 938
 939 /*static*/
 940 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 941 {
 942     wxString charset;
 943     wxMetaTagParser *parser = new wxMetaTagParser();
 944     if(parser)
 945     {
 946         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 947         parser->Parse(markup);
 948         delete parser;
 949     }
 950     return charset;
 951 }
 952
 953 /* static */
 954 bool
 955 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 956                              wxString::const_iterator end)
 957 {
 958     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 959
 960     wxString::const_iterator p = start;
 961
 962     // comments begin with "<!--" in HTML 4.0
 963     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 964     {
 965         // not a comment at all
 966         return false;
 967     }
 968
 969     // skip the start of the comment tag in any case, if we don't find the
 970     // closing tag we should ignore broken markup
 971     start = p;
 972
 973     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 974     // comment delimiter and the closing tag character (section 3.2.4 of
 975     // http://www.w3.org/TR/html401/)
 976     int dashes = 0;
 977     while ( ++p < end )
 978     {
 979         const wxChar c = *p;
 980
 981         if ( (c == wxT(' ') || c == wxT('\n') ||
 982               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 983         {
 984             // ignore white space before potential tag end
 985             continue;
 986         }
 987
 988         if ( c == wxT('>') && dashes >= 2 )
 989         {
 990             // found end of comment
 991             start = p;
 992             break;
 993         }
 994
 995         if ( c == wxT('-') )
 996             dashes++;
 997         else
 998             dashes = 0;
 999     }
1000
1001     return true;
1002 }
1003
1004 #endif // wxUSE_HTML