src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
  41 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(const wxString::const_iterator& start,
  52                     const wxString::const_iterator& end)
  53         : m_start(start), m_end(end) {}
  54     wxString::const_iterator m_start, m_end;
  55 };
  56
  57 // NB: this is an empty class and not typedef because of forward declaration
  58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
  59 {
  60 };
  61
  62 class wxHtmlParserState
  63 {
  64 public:
  65     wxHtmlTag         *m_curTag;
  66     wxHtmlTag         *m_tags;
  67     wxHtmlTextPieces  *m_textPieces;
  68     int                m_curTextPiece;
  69     const wxString    *m_source;
  70     wxHtmlParserState *m_nextState;
  71 };
  72
  73 //-----------------------------------------------------------------------------
  74 // wxHtmlParser
  75 //-----------------------------------------------------------------------------
  76
  77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  78
  79 wxHtmlParser::wxHtmlParser()
  80     : wxObject(),
  81       m_FS(NULL)
  82 {
  83     m_Source = NULL;
  84     m_entitiesParser = new wxHtmlEntitiesParser;
  85     m_Tags = NULL;
  86     m_CurTag = NULL;
  87     m_TextPieces = NULL;
  88     m_CurTextPiece = 0;
  89     m_SavedStates = NULL;
  90 }
  91
  92 wxHtmlParser::~wxHtmlParser()
  93 {
  94     while (RestoreState()) {}
  95     DestroyDOMTree();
  96
  97     WX_CLEAR_ARRAY(m_HandlersStack);
  98     WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
  99     delete m_entitiesParser;
 100     delete m_Source;
 101 }
 102
 103 wxObject* wxHtmlParser::Parse(const wxString& source)
 104 {
 105     InitParser(source);
 106     DoParsing();
 107     wxObject *result = GetProduct();
 108     DoneParser();
 109     return result;
 110 }
 111
 112 void wxHtmlParser::InitParser(const wxString& source)
 113 {
 114     SetSource(source);
 115     m_stopParsing = false;
 116 }
 117
 118 void wxHtmlParser::DoneParser()
 119 {
 120     DestroyDOMTree();
 121 }
 122
 123 void wxHtmlParser::SetSource(const wxString& src)
 124 {
 125     DestroyDOMTree();
 126     // NB: This is allocated on heap because wxHtmlTag uses iterators and
 127     //     making a copy of m_Source string in SetSourceAndSaveState() and
 128     //     RestoreState() would invalidate them (because wxString::m_impl's
 129     //     memory would change completely twice and iterators use pointers
 130     //     into it). So instead, we keep the string object intact and only
 131     //     store/restore pointer to it, for which we need it to be allocated
 132     //     on the heap.
 133     delete m_Source;
 134     m_Source = new wxString(src);
 135     CreateDOMTree();
 136     m_CurTag = NULL;
 137     m_CurTextPiece = 0;
 138 }
 139
 140 void wxHtmlParser::CreateDOMTree()
 141 {
 142     wxHtmlTagsCache cache(*m_Source);
 143     m_TextPieces = new wxHtmlTextPieces;
 144     CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
 145     m_CurTextPiece = 0;
 146 }
 147
 148 extern bool wxIsCDATAElement(const wxString& tag);
 149
 150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 151                                     const wxString::const_iterator& begin_pos,
 152                                     const wxString::const_iterator& end_pos,
 153                                     wxHtmlTagsCache *cache)
 154 {
 155     if (end_pos <= begin_pos)
 156         return;
 157
 158     wxChar c;
 159     wxString::const_iterator i = begin_pos;
 160     wxString::const_iterator textBeginning = begin_pos;
 161
 162     // If the tag contains CDATA text, we include the text between beginning
 163     // and ending tag verbosely. Setting i=end_pos will skip to the very
 164     // end of this function where text piece is added, bypassing any child
 165     // tags parsing (CDATA element can't have child elements by definition):
 166     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 167     {
 168         i = end_pos;
 169     }
 170
 171     while (i < end_pos)
 172     {
 173         c = *i;
 174
 175         if (c == wxT('<'))
 176         {
 177             // add text to m_TextPieces:
 178             if (i > textBeginning)
 179                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 180
 181             // if it is a comment, skip it:
 182             if ( SkipCommentTag(i, m_Source->end()) )
 183             {
 184                 textBeginning = i = i + 1; // skip closing '>' too
 185             }
 186
 187             // add another tag to the tree:
 188             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 189             {
 190                 wxHtmlTag *chd;
 191                 if (cur)
 192                     chd = new wxHtmlTag(cur, m_Source,
 193                                         i, end_pos, cache, m_entitiesParser);
 194                 else
 195                 {
 196                     chd = new wxHtmlTag(NULL, m_Source,
 197                                         i, end_pos, cache, m_entitiesParser);
 198                     if (!m_Tags)
 199                     {
 200                         // if this is the first tag to be created make the root
 201                         // m_Tags point to it:
 202                         m_Tags = chd;
 203                     }
 204                     else
 205                     {
 206                         // if there is already a root tag add this tag as
 207                         // the last sibling:
 208                         chd->m_Prev = m_Tags->GetLastSibling();
 209                         chd->m_Prev->m_Next = chd;
 210                     }
 211                 }
 212
 213                 if (chd->HasEnding())
 214                 {
 215                     CreateDOMSubTree(chd,
 216                                      chd->GetBeginIter(), chd->GetEndIter1(),
 217                                      cache);
 218                     i = chd->GetEndIter2();
 219                 }
 220                 else
 221                     i = chd->GetBeginIter();
 222
 223                 textBeginning = i;
 224             }
 225
 226             // ... or skip ending tag:
 227             else
 228             {
 229                 while (i < end_pos && *i != wxT('>')) ++i;
 230                 textBeginning = i < end_pos ? i+1 : i;
 231             }
 232         }
 233         else ++i;
 234     }
 235
 236     // add remaining text to m_TextPieces:
 237     if (end_pos > textBeginning)
 238         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 239 }
 240
 241 void wxHtmlParser::DestroyDOMTree()
 242 {
 243     wxHtmlTag *t1, *t2;
 244     t1 = m_Tags;
 245     while (t1)
 246     {
 247         t2 = t1->GetNextSibling();
 248         delete t1;
 249         t1 = t2;
 250     }
 251     m_Tags = m_CurTag = NULL;
 252
 253     wxDELETE(m_TextPieces);
 254 }
 255
 256 void wxHtmlParser::DoParsing()
 257 {
 258     m_CurTag = m_Tags;
 259     m_CurTextPiece = 0;
 260     DoParsing(m_Source->begin(), m_Source->end());
 261 }
 262
 263 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
 264                              const wxString::const_iterator& end_pos)
 265 {
 266     wxString::const_iterator begin_pos(begin_pos_);
 267
 268     if (end_pos <= begin_pos)
 269         return;
 270
 271     wxHtmlTextPieces& pieces = *m_TextPieces;
 272     size_t piecesCnt = pieces.size();
 273
 274     while (begin_pos < end_pos)
 275     {
 276         while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
 277             m_CurTag = m_CurTag->GetNextTag();
 278         while (m_CurTextPiece < piecesCnt &&
 279                pieces[m_CurTextPiece].m_start < begin_pos)
 280             m_CurTextPiece++;
 281
 282         if (m_CurTextPiece < piecesCnt &&
 283             (!m_CurTag ||
 284              pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
 285         {
 286             // Add text:
 287             AddText(GetEntitiesParser()->Parse(
 288                        wxString(pieces[m_CurTextPiece].m_start,
 289                                 pieces[m_CurTextPiece].m_end)));
 290             begin_pos = pieces[m_CurTextPiece].m_end;
 291             m_CurTextPiece++;
 292         }
 293         else if (m_CurTag)
 294         {
 295             if (m_CurTag->HasEnding())
 296                 begin_pos = m_CurTag->GetEndIter2();
 297             else
 298                 begin_pos = m_CurTag->GetBeginIter();
 299             wxHtmlTag *t = m_CurTag;
 300             m_CurTag = m_CurTag->GetNextTag();
 301             AddTag(*t);
 302             if (m_stopParsing)
 303                 return;
 304         }
 305         else break;
 306     }
 307 }
 308
 309 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 310 {
 311     bool inner = false;
 312
 313     wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
 314     if (h != m_HandlersHash.end())
 315     {
 316         inner = h->second->HandleTag(tag);
 317         if (m_stopParsing)
 318             return;
 319     }
 320     if (!inner)
 321     {
 322         if (tag.HasEnding())
 323             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 324     }
 325 }
 326
 327 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 328 {
 329     wxString s(handler->GetSupportedTags());
 330     wxStringTokenizer tokenizer(s, wxT(", "));
 331
 332     while (tokenizer.HasMoreTokens())
 333         m_HandlersHash[tokenizer.GetNextToken()] = handler;
 334
 335     m_HandlersSet.insert(handler);
 336
 337     handler->SetParser(this);
 338 }
 339
 340 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 341 {
 342     wxStringTokenizer tokenizer(tags, wxT(", "));
 343     wxString key;
 344
 345     m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
 346
 347     while (tokenizer.HasMoreTokens())
 348     {
 349         key = tokenizer.GetNextToken();
 350         m_HandlersHash[key] = handler;
 351     }
 352 }
 353
 354 void wxHtmlParser::PopTagHandler()
 355 {
 356     wxCHECK_RET( !m_HandlersStack.empty(),
 357                  "attempt to remove HTML tag handler from empty stack" );
 358
 359     wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
 360     m_HandlersStack.pop_back();
 361     m_HandlersHash = *prev;
 362     delete prev;
 363 }
 364
 365 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 366 {
 367     wxHtmlParserState *s = new wxHtmlParserState;
 368
 369     s->m_curTag = m_CurTag;
 370     s->m_tags = m_Tags;
 371     s->m_textPieces = m_TextPieces;
 372     s->m_curTextPiece = m_CurTextPiece;
 373     s->m_source = m_Source;
 374
 375     s->m_nextState = m_SavedStates;
 376     m_SavedStates = s;
 377
 378     m_CurTag = NULL;
 379     m_Tags = NULL;
 380     m_TextPieces = NULL;
 381     m_CurTextPiece = 0;
 382     m_Source = NULL;
 383
 384     SetSource(src);
 385 }
 386
 387 bool wxHtmlParser::RestoreState()
 388 {
 389     if (!m_SavedStates) return false;
 390
 391     DestroyDOMTree();
 392     delete m_Source;
 393
 394     wxHtmlParserState *s = m_SavedStates;
 395     m_SavedStates = s->m_nextState;
 396
 397     m_CurTag = s->m_curTag;
 398     m_Tags = s->m_tags;
 399     m_TextPieces = s->m_textPieces;
 400     m_CurTextPiece = s->m_curTextPiece;
 401     m_Source = s->m_source;
 402
 403     delete s;
 404     return true;
 405 }
 406
 407 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 408 {
 409     return wxString(tag.GetBeginIter(), tag.GetEndIter1());
 410 }
 411
 412 //-----------------------------------------------------------------------------
 413 // wxHtmlTagHandler
 414 //-----------------------------------------------------------------------------
 415
 416 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 417
 418 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 419 {
 420     // It is safe to temporarily change the source being parsed,
 421     // provided we restore the state back after parsing
 422     m_Parser->SetSourceAndSaveState(source);
 423     m_Parser->DoParsing();
 424     m_Parser->RestoreState();
 425 }
 426
 427
 428 //-----------------------------------------------------------------------------
 429 // wxHtmlEntitiesParser
 430 //-----------------------------------------------------------------------------
 431
 432 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 433
 434 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 435 #if !wxUSE_UNICODE
 436     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 437 #endif
 438 {
 439 }
 440
 441 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 442 {
 443 #if !wxUSE_UNICODE
 444     delete m_conv;
 445 #endif
 446 }
 447
 448 #if !wxUSE_UNICODE
 449 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 450 {
 451     if (encoding == m_encoding)
 452         return;
 453
 454     delete m_conv;
 455
 456     m_encoding = encoding;
 457     if (m_encoding == wxFONTENCODING_SYSTEM)
 458         m_conv = NULL;
 459     else
 460         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 461 }
 462 #endif // !wxUSE_UNICODE
 463
 464 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 465 {
 466     wxString output;
 467
 468     const wxString::const_iterator end(input.end());
 469     wxString::const_iterator c(input.begin());
 470     wxString::const_iterator last(c);
 471
 472     for ( ; c < end; ++c )
 473     {
 474         if (*c == wxT('&'))
 475         {
 476             if ( output.empty() )
 477                 output.reserve(input.length());
 478
 479             if (c - last > 0)
 480                 output.append(last, c);
 481             if ( ++c == end )
 482                 break;
 483
 484             wxString entity;
 485             const wxString::const_iterator ent_s = c;
 486             wxChar entity_char;
 487
 488             for ( ; c != end; ++c )
 489             {
 490                 wxChar ch = *c;
 491                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 492                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 493                        (ch >= wxT('0') && ch <= wxT('9')) ||
 494                         ch == wxT('_') || ch == wxT('#')) )
 495                     break;
 496             }
 497
 498             entity.append(ent_s, c);
 499             if (c == end || *c != wxT(';')) --c;
 500             last = c+1;
 501             entity_char = GetEntityChar(entity);
 502             if (entity_char)
 503                 output << entity_char;
 504             else
 505             {
 506                 output.append(ent_s-1, c+1);
 507                 wxLogTrace(wxTRACE_HTML_DEBUG,
 508                            "Unrecognized HTML entity: '%s'",
 509                            entity);
 510             }
 511         }
 512     }
 513     if ( last == input.begin() ) // common case: no entity
 514         return input;
 515     if ( last != end )
 516         output.append(last, end);
 517     return output;
 518 }
 519
 520 #if !wxUSE_UNICODE
 521 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 522 {
 523     char buf[2];
 524     wchar_t wbuf[2];
 525     wbuf[0] = (wchar_t)code;
 526     wbuf[1] = 0;
 527     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 528     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 529         return '?';
 530     return buf[0];
 531 }
 532 #endif
 533
 534 struct wxHtmlEntityInfo
 535 {
 536     const wxStringCharType *name;
 537     unsigned code;
 538 };
 539
 540 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 541 {
 542 #if wxUSE_UNICODE_UTF8
 543     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 544 #else
 545     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 546 #endif
 547 }
 548
 549 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 550 {
 551     unsigned code = 0;
 552
 553     if (entity.empty())
 554       return 0; // invalid entity reference
 555
 556     if (entity[0] == wxT('#'))
 557     {
 558         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 559         //     representation is the same for numbers
 560         const wxStringCharType *ent_s = entity.wx_str();
 561         const wxStringCharType *format;
 562
 563         if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
 564         {
 565             format = wxS("%x");
 566             ent_s++;
 567         }
 568         else
 569             format = wxS("%u");
 570         ent_s++;
 571
 572         if (wxSscanf(ent_s, format, &code) != 1)
 573             code = 0;
 574     }
 575     else
 576     {
 577         // store the literals in wx's internal representation (either char*
 578         // in UTF-8 or wchar_t*) for best performance:
 579         #define ENTITY(name, code) { wxS(name), code }
 580
 581         static wxHtmlEntityInfo substitutions[] = {
 582             ENTITY("AElig", 198),
 583             ENTITY("Aacute", 193),
 584             ENTITY("Acirc", 194),
 585             ENTITY("Agrave", 192),
 586             ENTITY("Alpha", 913),
 587             ENTITY("Aring", 197),
 588             ENTITY("Atilde", 195),
 589             ENTITY("Auml", 196),
 590             ENTITY("Beta", 914),
 591             ENTITY("Ccedil", 199),
 592             ENTITY("Chi", 935),
 593             ENTITY("Dagger", 8225),
 594             ENTITY("Delta", 916),
 595             ENTITY("ETH", 208),
 596             ENTITY("Eacute", 201),
 597             ENTITY("Ecirc", 202),
 598             ENTITY("Egrave", 200),
 599             ENTITY("Epsilon", 917),
 600             ENTITY("Eta", 919),
 601             ENTITY("Euml", 203),
 602             ENTITY("Gamma", 915),
 603             ENTITY("Iacute", 205),
 604             ENTITY("Icirc", 206),
 605             ENTITY("Igrave", 204),
 606             ENTITY("Iota", 921),
 607             ENTITY("Iuml", 207),
 608             ENTITY("Kappa", 922),
 609             ENTITY("Lambda", 923),
 610             ENTITY("Mu", 924),
 611             ENTITY("Ntilde", 209),
 612             ENTITY("Nu", 925),
 613             ENTITY("OElig", 338),
 614             ENTITY("Oacute", 211),
 615             ENTITY("Ocirc", 212),
 616             ENTITY("Ograve", 210),
 617             ENTITY("Omega", 937),
 618             ENTITY("Omicron", 927),
 619             ENTITY("Oslash", 216),
 620             ENTITY("Otilde", 213),
 621             ENTITY("Ouml", 214),
 622             ENTITY("Phi", 934),
 623             ENTITY("Pi", 928),
 624             ENTITY("Prime", 8243),
 625             ENTITY("Psi", 936),
 626             ENTITY("Rho", 929),
 627             ENTITY("Scaron", 352),
 628             ENTITY("Sigma", 931),
 629             ENTITY("THORN", 222),
 630             ENTITY("Tau", 932),
 631             ENTITY("Theta", 920),
 632             ENTITY("Uacute", 218),
 633             ENTITY("Ucirc", 219),
 634             ENTITY("Ugrave", 217),
 635             ENTITY("Upsilon", 933),
 636             ENTITY("Uuml", 220),
 637             ENTITY("Xi", 926),
 638             ENTITY("Yacute", 221),
 639             ENTITY("Yuml", 376),
 640             ENTITY("Zeta", 918),
 641             ENTITY("aacute", 225),
 642             ENTITY("acirc", 226),
 643             ENTITY("acute", 180),
 644             ENTITY("aelig", 230),
 645             ENTITY("agrave", 224),
 646             ENTITY("alefsym", 8501),
 647             ENTITY("alpha", 945),
 648             ENTITY("amp", 38),
 649             ENTITY("and", 8743),
 650             ENTITY("ang", 8736),
 651             ENTITY("apos", 39),
 652             ENTITY("aring", 229),
 653             ENTITY("asymp", 8776),
 654             ENTITY("atilde", 227),
 655             ENTITY("auml", 228),
 656             ENTITY("bdquo", 8222),
 657             ENTITY("beta", 946),
 658             ENTITY("brvbar", 166),
 659             ENTITY("bull", 8226),
 660             ENTITY("cap", 8745),
 661             ENTITY("ccedil", 231),
 662             ENTITY("cedil", 184),
 663             ENTITY("cent", 162),
 664             ENTITY("chi", 967),
 665             ENTITY("circ", 710),
 666             ENTITY("clubs", 9827),
 667             ENTITY("cong", 8773),
 668             ENTITY("copy", 169),
 669             ENTITY("crarr", 8629),
 670             ENTITY("cup", 8746),
 671             ENTITY("curren", 164),
 672             ENTITY("dArr", 8659),
 673             ENTITY("dagger", 8224),
 674             ENTITY("darr", 8595),
 675             ENTITY("deg", 176),
 676             ENTITY("delta", 948),
 677             ENTITY("diams", 9830),
 678             ENTITY("divide", 247),
 679             ENTITY("eacute", 233),
 680             ENTITY("ecirc", 234),
 681             ENTITY("egrave", 232),
 682             ENTITY("empty", 8709),
 683             ENTITY("emsp", 8195),
 684             ENTITY("ensp", 8194),
 685             ENTITY("epsilon", 949),
 686             ENTITY("equiv", 8801),
 687             ENTITY("eta", 951),
 688             ENTITY("eth", 240),
 689             ENTITY("euml", 235),
 690             ENTITY("euro", 8364),
 691             ENTITY("exist", 8707),
 692             ENTITY("fnof", 402),
 693             ENTITY("forall", 8704),
 694             ENTITY("frac12", 189),
 695             ENTITY("frac14", 188),
 696             ENTITY("frac34", 190),
 697             ENTITY("frasl", 8260),
 698             ENTITY("gamma", 947),
 699             ENTITY("ge", 8805),
 700             ENTITY("gt", 62),
 701             ENTITY("hArr", 8660),
 702             ENTITY("harr", 8596),
 703             ENTITY("hearts", 9829),
 704             ENTITY("hellip", 8230),
 705             ENTITY("iacute", 237),
 706             ENTITY("icirc", 238),
 707             ENTITY("iexcl", 161),
 708             ENTITY("igrave", 236),
 709             ENTITY("image", 8465),
 710             ENTITY("infin", 8734),
 711             ENTITY("int", 8747),
 712             ENTITY("iota", 953),
 713             ENTITY("iquest", 191),
 714             ENTITY("isin", 8712),
 715             ENTITY("iuml", 239),
 716             ENTITY("kappa", 954),
 717             ENTITY("lArr", 8656),
 718             ENTITY("lambda", 955),
 719             ENTITY("lang", 9001),
 720             ENTITY("laquo", 171),
 721             ENTITY("larr", 8592),
 722             ENTITY("lceil", 8968),
 723             ENTITY("ldquo", 8220),
 724             ENTITY("le", 8804),
 725             ENTITY("lfloor", 8970),
 726             ENTITY("lowast", 8727),
 727             ENTITY("loz", 9674),
 728             ENTITY("lrm", 8206),
 729             ENTITY("lsaquo", 8249),
 730             ENTITY("lsquo", 8216),
 731             ENTITY("lt", 60),
 732             ENTITY("macr", 175),
 733             ENTITY("mdash", 8212),
 734             ENTITY("micro", 181),
 735             ENTITY("middot", 183),
 736             ENTITY("minus", 8722),
 737             ENTITY("mu", 956),
 738             ENTITY("nabla", 8711),
 739             ENTITY("nbsp", 160),
 740             ENTITY("ndash", 8211),
 741             ENTITY("ne", 8800),
 742             ENTITY("ni", 8715),
 743             ENTITY("not", 172),
 744             ENTITY("notin", 8713),
 745             ENTITY("nsub", 8836),
 746             ENTITY("ntilde", 241),
 747             ENTITY("nu", 957),
 748             ENTITY("oacute", 243),
 749             ENTITY("ocirc", 244),
 750             ENTITY("oelig", 339),
 751             ENTITY("ograve", 242),
 752             ENTITY("oline", 8254),
 753             ENTITY("omega", 969),
 754             ENTITY("omicron", 959),
 755             ENTITY("oplus", 8853),
 756             ENTITY("or", 8744),
 757             ENTITY("ordf", 170),
 758             ENTITY("ordm", 186),
 759             ENTITY("oslash", 248),
 760             ENTITY("otilde", 245),
 761             ENTITY("otimes", 8855),
 762             ENTITY("ouml", 246),
 763             ENTITY("para", 182),
 764             ENTITY("part", 8706),
 765             ENTITY("permil", 8240),
 766             ENTITY("perp", 8869),
 767             ENTITY("phi", 966),
 768             ENTITY("pi", 960),
 769             ENTITY("piv", 982),
 770             ENTITY("plusmn", 177),
 771             ENTITY("pound", 163),
 772             ENTITY("prime", 8242),
 773             ENTITY("prod", 8719),
 774             ENTITY("prop", 8733),
 775             ENTITY("psi", 968),
 776             ENTITY("quot", 34),
 777             ENTITY("rArr", 8658),
 778             ENTITY("radic", 8730),
 779             ENTITY("rang", 9002),
 780             ENTITY("raquo", 187),
 781             ENTITY("rarr", 8594),
 782             ENTITY("rceil", 8969),
 783             ENTITY("rdquo", 8221),
 784             ENTITY("real", 8476),
 785             ENTITY("reg", 174),
 786             ENTITY("rfloor", 8971),
 787             ENTITY("rho", 961),
 788             ENTITY("rlm", 8207),
 789             ENTITY("rsaquo", 8250),
 790             ENTITY("rsquo", 8217),
 791             ENTITY("sbquo", 8218),
 792             ENTITY("scaron", 353),
 793             ENTITY("sdot", 8901),
 794             ENTITY("sect", 167),
 795             ENTITY("shy", 173),
 796             ENTITY("sigma", 963),
 797             ENTITY("sigmaf", 962),
 798             ENTITY("sim", 8764),
 799             ENTITY("spades", 9824),
 800             ENTITY("sub", 8834),
 801             ENTITY("sube", 8838),
 802             ENTITY("sum", 8721),
 803             ENTITY("sup", 8835),
 804             ENTITY("sup1", 185),
 805             ENTITY("sup2", 178),
 806             ENTITY("sup3", 179),
 807             ENTITY("supe", 8839),
 808             ENTITY("szlig", 223),
 809             ENTITY("tau", 964),
 810             ENTITY("there4", 8756),
 811             ENTITY("theta", 952),
 812             ENTITY("thetasym", 977),
 813             ENTITY("thinsp", 8201),
 814             ENTITY("thorn", 254),
 815             ENTITY("tilde", 732),
 816             ENTITY("times", 215),
 817             ENTITY("trade", 8482),
 818             ENTITY("uArr", 8657),
 819             ENTITY("uacute", 250),
 820             ENTITY("uarr", 8593),
 821             ENTITY("ucirc", 251),
 822             ENTITY("ugrave", 249),
 823             ENTITY("uml", 168),
 824             ENTITY("upsih", 978),
 825             ENTITY("upsilon", 965),
 826             ENTITY("uuml", 252),
 827             ENTITY("weierp", 8472),
 828             ENTITY("xi", 958),
 829             ENTITY("yacute", 253),
 830             ENTITY("yen", 165),
 831             ENTITY("yuml", 255),
 832             ENTITY("zeta", 950),
 833             ENTITY("zwj", 8205),
 834             ENTITY("zwnj", 8204),
 835             {NULL, 0}};
 836         #undef ENTITY
 837         static size_t substitutions_cnt = 0;
 838
 839         if (substitutions_cnt == 0)
 840             while (substitutions[substitutions_cnt].code != 0)
 841                 substitutions_cnt++;
 842
 843         wxHtmlEntityInfo *info;
 844 #ifdef __WXWINCE__
 845         // bsearch crashes under WinCE for some reason
 846         info = NULL;
 847         size_t i;
 848         for (i = 0; i < substitutions_cnt; i++)
 849         {
 850             if (entity == substitutions[i].name)
 851             {
 852                 info = & substitutions[i];
 853                 break;
 854             }
 855         }
 856 #else
 857         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 858                                            substitutions_cnt,
 859                                            sizeof(wxHtmlEntityInfo),
 860                                            wxHtmlEntityCompare);
 861 #endif
 862         if (info)
 863             code = info->code;
 864     }
 865
 866     if (code == 0)
 867         return 0;
 868     else
 869         return GetCharForCode(code);
 870 }
 871
 872 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType type,
 873                                 const wxString& url) const
 874 {
 875     int flags = wxFS_READ;
 876     if (type == wxHTML_URL_IMAGE)
 877         flags |= wxFS_SEEKABLE;
 878
 879     return m_FS ? m_FS->OpenFile(url, flags) : NULL;
 880
 881 }
 882
 883
 884 //-----------------------------------------------------------------------------
 885 // wxHtmlParser::ExtractCharsetInformation
 886 //-----------------------------------------------------------------------------
 887
 888 class wxMetaTagParser : public wxHtmlParser
 889 {
 890 public:
 891     wxMetaTagParser() { }
 892
 893     wxObject* GetProduct() { return NULL; }
 894
 895 protected:
 896     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 897
 898     wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
 899 };
 900
 901 class wxMetaTagHandler : public wxHtmlTagHandler
 902 {
 903 public:
 904     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 905     wxString GetSupportedTags() { return wxT("META,BODY"); }
 906     bool HandleTag(const wxHtmlTag& tag);
 907
 908 private:
 909     wxString *m_retval;
 910
 911     wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
 912 };
 913
 914 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 915 {
 916     if (tag.GetName() == wxT("BODY"))
 917     {
 918         m_Parser->StopParsing();
 919         return false;
 920     }
 921
 922     if (tag.HasParam(wxT("HTTP-EQUIV")) &&
 923         tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
 924         tag.HasParam(wxT("CONTENT")))
 925     {
 926         wxString content = tag.GetParam(wxT("CONTENT")).Lower();
 927         if (content.Left(19) == wxT("text/html; charset="))
 928         {
 929             *m_retval = content.Mid(19);
 930             m_Parser->StopParsing();
 931         }
 932     }
 933     return false;
 934 }
 935
 936
 937 /*static*/
 938 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 939 {
 940     wxString charset;
 941     wxMetaTagParser *parser = new wxMetaTagParser();
 942     if(parser)
 943     {
 944         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 945         parser->Parse(markup);
 946         delete parser;
 947     }
 948     return charset;
 949 }
 950
 951 /* static */
 952 bool
 953 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 954                              wxString::const_iterator end)
 955 {
 956     wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
 957
 958     wxString::const_iterator p = start;
 959
 960     // Comments begin with "<!--" in HTML 4.0; anything shorter or not containing
 961     // these characters is not a comment and we're not going to skip it.
 962     if ( ++p == end || *p != '!' )
 963       return false;
 964     if ( ++p == end || *p != '-' )
 965       return false;
 966     if ( ++p == end || *p != '-' )
 967       return false;
 968
 969     // skip the start of the comment tag in any case, if we don't find the
 970     // closing tag we should ignore broken markup
 971     start = p;
 972
 973     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 974     // comment delimiter and the closing tag character (section 3.2.4 of
 975     // http://www.w3.org/TR/html401/)
 976     int dashes = 0;
 977     while ( ++p < end )
 978     {
 979         const wxChar c = *p;
 980
 981         if ( (c == wxT(' ') || c == wxT('\n') ||
 982               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 983         {
 984             // ignore white space before potential tag end
 985             continue;
 986         }
 987
 988         if ( c == wxT('>') && dashes >= 2 )
 989         {
 990             // found end of comment
 991             start = p;
 992             break;
 993         }
 994
 995         if ( c == wxT('-') )
 996             dashes++;
 997         else
 998             dashes = 0;
 999     }
1000
1001     return true;
1002 }
1003
1004 #endif // wxUSE_HTML