src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // Copyright:   (c) 1999 Vaclav Slavik
   6 // Licence:     wxWindows licence
   7 /////////////////////////////////////////////////////////////////////////////
   8
   9 #include "wx/wxprec.h"
  10
  11 #ifdef __BORLANDC__
  12     #pragma hdrstop
  13 #endif
  14
  15 #if wxUSE_HTML && wxUSE_STREAMS
  16
  17 #ifndef WX_PRECOMP
  18     #include "wx/dynarray.h"
  19     #include "wx/log.h"
  20     #include "wx/intl.h"
  21     #include "wx/app.h"
  22     #include "wx/wxcrtvararg.h"
  23 #endif
  24
  25 #include "wx/tokenzr.h"
  26 #include "wx/wfstream.h"
  27 #include "wx/url.h"
  28 #include "wx/fontmap.h"
  29 #include "wx/html/htmldefs.h"
  30 #include "wx/html/htmlpars.h"
  31 #include "wx/vector.h"
  32
  33 #ifdef __WXWINCE__
  34     #include "wx/msw/wince/missing.h"       // for bsearch()
  35 #endif
  36
  37 // DLL options compatibility check:
  38 WX_CHECK_BUILD_OPTIONS("wxHTML")
  39
  40 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
  41
  42 //-----------------------------------------------------------------------------
  43 // wxHtmlParser helpers
  44 //-----------------------------------------------------------------------------
  45
  46 class wxHtmlTextPiece
  47 {
  48 public:
  49     wxHtmlTextPiece() {}
  50     wxHtmlTextPiece(const wxString::const_iterator& start,
  51                     const wxString::const_iterator& end)
  52         : m_start(start), m_end(end) {}
  53     wxString::const_iterator m_start, m_end;
  54 };
  55
  56 // NB: this is an empty class and not typedef because of forward declaration
  57 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
  58 {
  59 };
  60
  61 class wxHtmlParserState
  62 {
  63 public:
  64     wxHtmlTag         *m_curTag;
  65     wxHtmlTag         *m_tags;
  66     wxHtmlTextPieces  *m_textPieces;
  67     int                m_curTextPiece;
  68     const wxString    *m_source;
  69     wxHtmlParserState *m_nextState;
  70 };
  71
  72 //-----------------------------------------------------------------------------
  73 // wxHtmlParser
  74 //-----------------------------------------------------------------------------
  75
  76 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  77
  78 wxHtmlParser::wxHtmlParser()
  79     : wxObject(),
  80       m_FS(NULL)
  81 {
  82     m_Source = NULL;
  83     m_entitiesParser = new wxHtmlEntitiesParser;
  84     m_Tags = NULL;
  85     m_CurTag = NULL;
  86     m_TextPieces = NULL;
  87     m_CurTextPiece = 0;
  88     m_SavedStates = NULL;
  89 }
  90
  91 wxHtmlParser::~wxHtmlParser()
  92 {
  93     while (RestoreState()) {}
  94     DestroyDOMTree();
  95
  96     WX_CLEAR_ARRAY(m_HandlersStack);
  97     WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
  98     delete m_entitiesParser;
  99     delete m_Source;
 100 }
 101
 102 wxObject* wxHtmlParser::Parse(const wxString& source)
 103 {
 104     InitParser(source);
 105     DoParsing();
 106     wxObject *result = GetProduct();
 107     DoneParser();
 108     return result;
 109 }
 110
 111 void wxHtmlParser::InitParser(const wxString& source)
 112 {
 113     SetSource(source);
 114     m_stopParsing = false;
 115 }
 116
 117 void wxHtmlParser::DoneParser()
 118 {
 119     DestroyDOMTree();
 120 }
 121
 122 void wxHtmlParser::SetSource(const wxString& src)
 123 {
 124     DestroyDOMTree();
 125     // NB: This is allocated on heap because wxHtmlTag uses iterators and
 126     //     making a copy of m_Source string in SetSourceAndSaveState() and
 127     //     RestoreState() would invalidate them (because wxString::m_impl's
 128     //     memory would change completely twice and iterators use pointers
 129     //     into it). So instead, we keep the string object intact and only
 130     //     store/restore pointer to it, for which we need it to be allocated
 131     //     on the heap.
 132     delete m_Source;
 133     m_Source = new wxString(src);
 134     CreateDOMTree();
 135     m_CurTag = NULL;
 136     m_CurTextPiece = 0;
 137 }
 138
 139 void wxHtmlParser::CreateDOMTree()
 140 {
 141     wxHtmlTagsCache cache(*m_Source);
 142     m_TextPieces = new wxHtmlTextPieces;
 143     CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
 144     m_CurTextPiece = 0;
 145 }
 146
 147 extern bool wxIsCDATAElement(const wxString& tag);
 148
 149 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 150                                     const wxString::const_iterator& begin_pos,
 151                                     const wxString::const_iterator& end_pos,
 152                                     wxHtmlTagsCache *cache)
 153 {
 154     if (end_pos <= begin_pos)
 155         return;
 156
 157     wxChar c;
 158     wxString::const_iterator i = begin_pos;
 159     wxString::const_iterator textBeginning = begin_pos;
 160
 161     // If the tag contains CDATA text, we include the text between beginning
 162     // and ending tag verbosely. Setting i=end_pos will skip to the very
 163     // end of this function where text piece is added, bypassing any child
 164     // tags parsing (CDATA element can't have child elements by definition):
 165     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 166     {
 167         i = end_pos;
 168     }
 169
 170     while (i < end_pos)
 171     {
 172         c = *i;
 173
 174         if (c == wxT('<'))
 175         {
 176             // add text to m_TextPieces:
 177             if (i > textBeginning)
 178                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 179
 180             // if it is a comment, skip it:
 181             if ( SkipCommentTag(i, m_Source->end()) )
 182             {
 183                 textBeginning = i = i + 1; // skip closing '>' too
 184             }
 185
 186             // add another tag to the tree:
 187             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 188             {
 189                 wxHtmlTag *chd;
 190                 if (cur)
 191                     chd = new wxHtmlTag(cur, m_Source,
 192                                         i, end_pos, cache, m_entitiesParser);
 193                 else
 194                 {
 195                     chd = new wxHtmlTag(NULL, m_Source,
 196                                         i, end_pos, cache, m_entitiesParser);
 197                     if (!m_Tags)
 198                     {
 199                         // if this is the first tag to be created make the root
 200                         // m_Tags point to it:
 201                         m_Tags = chd;
 202                     }
 203                     else
 204                     {
 205                         // if there is already a root tag add this tag as
 206                         // the last sibling:
 207                         chd->m_Prev = m_Tags->GetLastSibling();
 208                         chd->m_Prev->m_Next = chd;
 209                     }
 210                 }
 211
 212                 if (chd->HasEnding())
 213                 {
 214                     CreateDOMSubTree(chd,
 215                                      chd->GetBeginIter(), chd->GetEndIter1(),
 216                                      cache);
 217                     i = chd->GetEndIter2();
 218                 }
 219                 else
 220                     i = chd->GetBeginIter();
 221
 222                 textBeginning = i;
 223             }
 224
 225             // ... or skip ending tag:
 226             else
 227             {
 228                 while (i < end_pos && *i != wxT('>')) ++i;
 229                 textBeginning = i < end_pos ? i+1 : i;
 230             }
 231         }
 232         else ++i;
 233     }
 234
 235     // add remaining text to m_TextPieces:
 236     if (end_pos > textBeginning)
 237         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 238 }
 239
 240 void wxHtmlParser::DestroyDOMTree()
 241 {
 242     wxHtmlTag *t1, *t2;
 243     t1 = m_Tags;
 244     while (t1)
 245     {
 246         t2 = t1->GetNextSibling();
 247         delete t1;
 248         t1 = t2;
 249     }
 250     m_Tags = m_CurTag = NULL;
 251
 252     wxDELETE(m_TextPieces);
 253 }
 254
 255 void wxHtmlParser::DoParsing()
 256 {
 257     m_CurTag = m_Tags;
 258     m_CurTextPiece = 0;
 259     DoParsing(m_Source->begin(), m_Source->end());
 260 }
 261
 262 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
 263                              const wxString::const_iterator& end_pos)
 264 {
 265     wxString::const_iterator begin_pos(begin_pos_);
 266
 267     if (end_pos <= begin_pos)
 268         return;
 269
 270     wxHtmlTextPieces& pieces = *m_TextPieces;
 271     size_t piecesCnt = pieces.size();
 272
 273     while (begin_pos < end_pos)
 274     {
 275         while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
 276             m_CurTag = m_CurTag->GetNextTag();
 277         while (m_CurTextPiece < piecesCnt &&
 278                pieces[m_CurTextPiece].m_start < begin_pos)
 279             m_CurTextPiece++;
 280
 281         if (m_CurTextPiece < piecesCnt &&
 282             (!m_CurTag ||
 283              pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
 284         {
 285             // Add text:
 286             AddText(GetEntitiesParser()->Parse(
 287                        wxString(pieces[m_CurTextPiece].m_start,
 288                                 pieces[m_CurTextPiece].m_end)));
 289             begin_pos = pieces[m_CurTextPiece].m_end;
 290             m_CurTextPiece++;
 291         }
 292         else if (m_CurTag)
 293         {
 294             if (m_CurTag->HasEnding())
 295                 begin_pos = m_CurTag->GetEndIter2();
 296             else
 297                 begin_pos = m_CurTag->GetBeginIter();
 298             wxHtmlTag *t = m_CurTag;
 299             m_CurTag = m_CurTag->GetNextTag();
 300             AddTag(*t);
 301             if (m_stopParsing)
 302                 return;
 303         }
 304         else break;
 305     }
 306 }
 307
 308 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 309 {
 310     bool inner = false;
 311
 312     wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
 313     if (h != m_HandlersHash.end())
 314     {
 315         inner = h->second->HandleTag(tag);
 316         if (m_stopParsing)
 317             return;
 318     }
 319     if (!inner)
 320     {
 321         if (tag.HasEnding())
 322             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 323     }
 324 }
 325
 326 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 327 {
 328     wxString s(handler->GetSupportedTags());
 329     wxStringTokenizer tokenizer(s, wxT(", "));
 330
 331     while (tokenizer.HasMoreTokens())
 332         m_HandlersHash[tokenizer.GetNextToken()] = handler;
 333
 334     m_HandlersSet.insert(handler);
 335
 336     handler->SetParser(this);
 337 }
 338
 339 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 340 {
 341     wxStringTokenizer tokenizer(tags, wxT(", "));
 342     wxString key;
 343
 344     m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
 345
 346     while (tokenizer.HasMoreTokens())
 347     {
 348         key = tokenizer.GetNextToken();
 349         m_HandlersHash[key] = handler;
 350     }
 351 }
 352
 353 void wxHtmlParser::PopTagHandler()
 354 {
 355     wxCHECK_RET( !m_HandlersStack.empty(),
 356                  "attempt to remove HTML tag handler from empty stack" );
 357
 358     wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
 359     m_HandlersStack.pop_back();
 360     m_HandlersHash = *prev;
 361     delete prev;
 362 }
 363
 364 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 365 {
 366     wxHtmlParserState *s = new wxHtmlParserState;
 367
 368     s->m_curTag = m_CurTag;
 369     s->m_tags = m_Tags;
 370     s->m_textPieces = m_TextPieces;
 371     s->m_curTextPiece = m_CurTextPiece;
 372     s->m_source = m_Source;
 373
 374     s->m_nextState = m_SavedStates;
 375     m_SavedStates = s;
 376
 377     m_CurTag = NULL;
 378     m_Tags = NULL;
 379     m_TextPieces = NULL;
 380     m_CurTextPiece = 0;
 381     m_Source = NULL;
 382
 383     SetSource(src);
 384 }
 385
 386 bool wxHtmlParser::RestoreState()
 387 {
 388     if (!m_SavedStates) return false;
 389
 390     DestroyDOMTree();
 391     delete m_Source;
 392
 393     wxHtmlParserState *s = m_SavedStates;
 394     m_SavedStates = s->m_nextState;
 395
 396     m_CurTag = s->m_curTag;
 397     m_Tags = s->m_tags;
 398     m_TextPieces = s->m_textPieces;
 399     m_CurTextPiece = s->m_curTextPiece;
 400     m_Source = s->m_source;
 401
 402     delete s;
 403     return true;
 404 }
 405
 406 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 407 {
 408     return wxString(tag.GetBeginIter(), tag.GetEndIter1());
 409 }
 410
 411 //-----------------------------------------------------------------------------
 412 // wxHtmlTagHandler
 413 //-----------------------------------------------------------------------------
 414
 415 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 416
 417 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 418 {
 419     // It is safe to temporarily change the source being parsed,
 420     // provided we restore the state back after parsing
 421     m_Parser->SetSourceAndSaveState(source);
 422     m_Parser->DoParsing();
 423     m_Parser->RestoreState();
 424 }
 425
 426
 427 //-----------------------------------------------------------------------------
 428 // wxHtmlEntitiesParser
 429 //-----------------------------------------------------------------------------
 430
 431 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 432
 433 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 434 #if !wxUSE_UNICODE
 435     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 436 #endif
 437 {
 438 }
 439
 440 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 441 {
 442 #if !wxUSE_UNICODE
 443     delete m_conv;
 444 #endif
 445 }
 446
 447 #if !wxUSE_UNICODE
 448 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 449 {
 450     if (encoding == m_encoding)
 451         return;
 452
 453     delete m_conv;
 454
 455     m_encoding = encoding;
 456     if (m_encoding == wxFONTENCODING_SYSTEM)
 457         m_conv = NULL;
 458     else
 459         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 460 }
 461 #endif // !wxUSE_UNICODE
 462
 463 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 464 {
 465     wxString output;
 466
 467     const wxString::const_iterator end(input.end());
 468     wxString::const_iterator c(input.begin());
 469     wxString::const_iterator last(c);
 470
 471     for ( ; c < end; ++c )
 472     {
 473         if (*c == wxT('&'))
 474         {
 475             if ( output.empty() )
 476                 output.reserve(input.length());
 477
 478             if (c - last > 0)
 479                 output.append(last, c);
 480             if ( ++c == end )
 481                 break;
 482
 483             wxString entity;
 484             const wxString::const_iterator ent_s = c;
 485             wxChar entity_char;
 486
 487             for ( ; c != end; ++c )
 488             {
 489                 wxChar ch = *c;
 490                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 491                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 492                        (ch >= wxT('0') && ch <= wxT('9')) ||
 493                         ch == wxT('_') || ch == wxT('#')) )
 494                     break;
 495             }
 496
 497             entity.append(ent_s, c);
 498             if (c == end || *c != wxT(';')) --c;
 499             last = c+1;
 500             entity_char = GetEntityChar(entity);
 501             if (entity_char)
 502                 output << entity_char;
 503             else
 504             {
 505                 output.append(ent_s-1, c+1);
 506                 wxLogTrace(wxTRACE_HTML_DEBUG,
 507                            "Unrecognized HTML entity: '%s'",
 508                            entity);
 509             }
 510         }
 511     }
 512     if ( last == input.begin() ) // common case: no entity
 513         return input;
 514     if ( last != end )
 515         output.append(last, end);
 516     return output;
 517 }
 518
 519 #if !wxUSE_UNICODE
 520 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 521 {
 522     char buf[2];
 523     wchar_t wbuf[2];
 524     wbuf[0] = (wchar_t)code;
 525     wbuf[1] = 0;
 526     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 527     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 528         return '?';
 529     return buf[0];
 530 }
 531 #endif
 532
 533 struct wxHtmlEntityInfo
 534 {
 535     const wxStringCharType *name;
 536     unsigned code;
 537 };
 538
 539 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 540 {
 541 #if wxUSE_UNICODE_UTF8
 542     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 543 #else
 544     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 545 #endif
 546 }
 547
 548 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 549 {
 550     unsigned code = 0;
 551
 552     if (entity.empty())
 553       return 0; // invalid entity reference
 554
 555     if (entity[0] == wxT('#'))
 556     {
 557         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 558         //     representation is the same for numbers
 559         const wxStringCharType *ent_s = entity.wx_str();
 560         const wxStringCharType *format;
 561
 562         if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
 563         {
 564             format = wxS("%x");
 565             ent_s++;
 566         }
 567         else
 568             format = wxS("%u");
 569         ent_s++;
 570
 571         if (wxSscanf(ent_s, format, &code) != 1)
 572             code = 0;
 573     }
 574     else
 575     {
 576         // store the literals in wx's internal representation (either char*
 577         // in UTF-8 or wchar_t*) for best performance:
 578         #define ENTITY(name, code) { wxS(name), code }
 579
 580         static wxHtmlEntityInfo substitutions[] = {
 581             ENTITY("AElig", 198),
 582             ENTITY("Aacute", 193),
 583             ENTITY("Acirc", 194),
 584             ENTITY("Agrave", 192),
 585             ENTITY("Alpha", 913),
 586             ENTITY("Aring", 197),
 587             ENTITY("Atilde", 195),
 588             ENTITY("Auml", 196),
 589             ENTITY("Beta", 914),
 590             ENTITY("Ccedil", 199),
 591             ENTITY("Chi", 935),
 592             ENTITY("Dagger", 8225),
 593             ENTITY("Delta", 916),
 594             ENTITY("ETH", 208),
 595             ENTITY("Eacute", 201),
 596             ENTITY("Ecirc", 202),
 597             ENTITY("Egrave", 200),
 598             ENTITY("Epsilon", 917),
 599             ENTITY("Eta", 919),
 600             ENTITY("Euml", 203),
 601             ENTITY("Gamma", 915),
 602             ENTITY("Iacute", 205),
 603             ENTITY("Icirc", 206),
 604             ENTITY("Igrave", 204),
 605             ENTITY("Iota", 921),
 606             ENTITY("Iuml", 207),
 607             ENTITY("Kappa", 922),
 608             ENTITY("Lambda", 923),
 609             ENTITY("Mu", 924),
 610             ENTITY("Ntilde", 209),
 611             ENTITY("Nu", 925),
 612             ENTITY("OElig", 338),
 613             ENTITY("Oacute", 211),
 614             ENTITY("Ocirc", 212),
 615             ENTITY("Ograve", 210),
 616             ENTITY("Omega", 937),
 617             ENTITY("Omicron", 927),
 618             ENTITY("Oslash", 216),
 619             ENTITY("Otilde", 213),
 620             ENTITY("Ouml", 214),
 621             ENTITY("Phi", 934),
 622             ENTITY("Pi", 928),
 623             ENTITY("Prime", 8243),
 624             ENTITY("Psi", 936),
 625             ENTITY("Rho", 929),
 626             ENTITY("Scaron", 352),
 627             ENTITY("Sigma", 931),
 628             ENTITY("THORN", 222),
 629             ENTITY("Tau", 932),
 630             ENTITY("Theta", 920),
 631             ENTITY("Uacute", 218),
 632             ENTITY("Ucirc", 219),
 633             ENTITY("Ugrave", 217),
 634             ENTITY("Upsilon", 933),
 635             ENTITY("Uuml", 220),
 636             ENTITY("Xi", 926),
 637             ENTITY("Yacute", 221),
 638             ENTITY("Yuml", 376),
 639             ENTITY("Zeta", 918),
 640             ENTITY("aacute", 225),
 641             ENTITY("acirc", 226),
 642             ENTITY("acute", 180),
 643             ENTITY("aelig", 230),
 644             ENTITY("agrave", 224),
 645             ENTITY("alefsym", 8501),
 646             ENTITY("alpha", 945),
 647             ENTITY("amp", 38),
 648             ENTITY("and", 8743),
 649             ENTITY("ang", 8736),
 650             ENTITY("apos", 39),
 651             ENTITY("aring", 229),
 652             ENTITY("asymp", 8776),
 653             ENTITY("atilde", 227),
 654             ENTITY("auml", 228),
 655             ENTITY("bdquo", 8222),
 656             ENTITY("beta", 946),
 657             ENTITY("brvbar", 166),
 658             ENTITY("bull", 8226),
 659             ENTITY("cap", 8745),
 660             ENTITY("ccedil", 231),
 661             ENTITY("cedil", 184),
 662             ENTITY("cent", 162),
 663             ENTITY("chi", 967),
 664             ENTITY("circ", 710),
 665             ENTITY("clubs", 9827),
 666             ENTITY("cong", 8773),
 667             ENTITY("copy", 169),
 668             ENTITY("crarr", 8629),
 669             ENTITY("cup", 8746),
 670             ENTITY("curren", 164),
 671             ENTITY("dArr", 8659),
 672             ENTITY("dagger", 8224),
 673             ENTITY("darr", 8595),
 674             ENTITY("deg", 176),
 675             ENTITY("delta", 948),
 676             ENTITY("diams", 9830),
 677             ENTITY("divide", 247),
 678             ENTITY("eacute", 233),
 679             ENTITY("ecirc", 234),
 680             ENTITY("egrave", 232),
 681             ENTITY("empty", 8709),
 682             ENTITY("emsp", 8195),
 683             ENTITY("ensp", 8194),
 684             ENTITY("epsilon", 949),
 685             ENTITY("equiv", 8801),
 686             ENTITY("eta", 951),
 687             ENTITY("eth", 240),
 688             ENTITY("euml", 235),
 689             ENTITY("euro", 8364),
 690             ENTITY("exist", 8707),
 691             ENTITY("fnof", 402),
 692             ENTITY("forall", 8704),
 693             ENTITY("frac12", 189),
 694             ENTITY("frac14", 188),
 695             ENTITY("frac34", 190),
 696             ENTITY("frasl", 8260),
 697             ENTITY("gamma", 947),
 698             ENTITY("ge", 8805),
 699             ENTITY("gt", 62),
 700             ENTITY("hArr", 8660),
 701             ENTITY("harr", 8596),
 702             ENTITY("hearts", 9829),
 703             ENTITY("hellip", 8230),
 704             ENTITY("iacute", 237),
 705             ENTITY("icirc", 238),
 706             ENTITY("iexcl", 161),
 707             ENTITY("igrave", 236),
 708             ENTITY("image", 8465),
 709             ENTITY("infin", 8734),
 710             ENTITY("int", 8747),
 711             ENTITY("iota", 953),
 712             ENTITY("iquest", 191),
 713             ENTITY("isin", 8712),
 714             ENTITY("iuml", 239),
 715             ENTITY("kappa", 954),
 716             ENTITY("lArr", 8656),
 717             ENTITY("lambda", 955),
 718             ENTITY("lang", 9001),
 719             ENTITY("laquo", 171),
 720             ENTITY("larr", 8592),
 721             ENTITY("lceil", 8968),
 722             ENTITY("ldquo", 8220),
 723             ENTITY("le", 8804),
 724             ENTITY("lfloor", 8970),
 725             ENTITY("lowast", 8727),
 726             ENTITY("loz", 9674),
 727             ENTITY("lrm", 8206),
 728             ENTITY("lsaquo", 8249),
 729             ENTITY("lsquo", 8216),
 730             ENTITY("lt", 60),
 731             ENTITY("macr", 175),
 732             ENTITY("mdash", 8212),
 733             ENTITY("micro", 181),
 734             ENTITY("middot", 183),
 735             ENTITY("minus", 8722),
 736             ENTITY("mu", 956),
 737             ENTITY("nabla", 8711),
 738             ENTITY("nbsp", 160),
 739             ENTITY("ndash", 8211),
 740             ENTITY("ne", 8800),
 741             ENTITY("ni", 8715),
 742             ENTITY("not", 172),
 743             ENTITY("notin", 8713),
 744             ENTITY("nsub", 8836),
 745             ENTITY("ntilde", 241),
 746             ENTITY("nu", 957),
 747             ENTITY("oacute", 243),
 748             ENTITY("ocirc", 244),
 749             ENTITY("oelig", 339),
 750             ENTITY("ograve", 242),
 751             ENTITY("oline", 8254),
 752             ENTITY("omega", 969),
 753             ENTITY("omicron", 959),
 754             ENTITY("oplus", 8853),
 755             ENTITY("or", 8744),
 756             ENTITY("ordf", 170),
 757             ENTITY("ordm", 186),
 758             ENTITY("oslash", 248),
 759             ENTITY("otilde", 245),
 760             ENTITY("otimes", 8855),
 761             ENTITY("ouml", 246),
 762             ENTITY("para", 182),
 763             ENTITY("part", 8706),
 764             ENTITY("permil", 8240),
 765             ENTITY("perp", 8869),
 766             ENTITY("phi", 966),
 767             ENTITY("pi", 960),
 768             ENTITY("piv", 982),
 769             ENTITY("plusmn", 177),
 770             ENTITY("pound", 163),
 771             ENTITY("prime", 8242),
 772             ENTITY("prod", 8719),
 773             ENTITY("prop", 8733),
 774             ENTITY("psi", 968),
 775             ENTITY("quot", 34),
 776             ENTITY("rArr", 8658),
 777             ENTITY("radic", 8730),
 778             ENTITY("rang", 9002),
 779             ENTITY("raquo", 187),
 780             ENTITY("rarr", 8594),
 781             ENTITY("rceil", 8969),
 782             ENTITY("rdquo", 8221),
 783             ENTITY("real", 8476),
 784             ENTITY("reg", 174),
 785             ENTITY("rfloor", 8971),
 786             ENTITY("rho", 961),
 787             ENTITY("rlm", 8207),
 788             ENTITY("rsaquo", 8250),
 789             ENTITY("rsquo", 8217),
 790             ENTITY("sbquo", 8218),
 791             ENTITY("scaron", 353),
 792             ENTITY("sdot", 8901),
 793             ENTITY("sect", 167),
 794             ENTITY("shy", 173),
 795             ENTITY("sigma", 963),
 796             ENTITY("sigmaf", 962),
 797             ENTITY("sim", 8764),
 798             ENTITY("spades", 9824),
 799             ENTITY("sub", 8834),
 800             ENTITY("sube", 8838),
 801             ENTITY("sum", 8721),
 802             ENTITY("sup", 8835),
 803             ENTITY("sup1", 185),
 804             ENTITY("sup2", 178),
 805             ENTITY("sup3", 179),
 806             ENTITY("supe", 8839),
 807             ENTITY("szlig", 223),
 808             ENTITY("tau", 964),
 809             ENTITY("there4", 8756),
 810             ENTITY("theta", 952),
 811             ENTITY("thetasym", 977),
 812             ENTITY("thinsp", 8201),
 813             ENTITY("thorn", 254),
 814             ENTITY("tilde", 732),
 815             ENTITY("times", 215),
 816             ENTITY("trade", 8482),
 817             ENTITY("uArr", 8657),
 818             ENTITY("uacute", 250),
 819             ENTITY("uarr", 8593),
 820             ENTITY("ucirc", 251),
 821             ENTITY("ugrave", 249),
 822             ENTITY("uml", 168),
 823             ENTITY("upsih", 978),
 824             ENTITY("upsilon", 965),
 825             ENTITY("uuml", 252),
 826             ENTITY("weierp", 8472),
 827             ENTITY("xi", 958),
 828             ENTITY("yacute", 253),
 829             ENTITY("yen", 165),
 830             ENTITY("yuml", 255),
 831             ENTITY("zeta", 950),
 832             ENTITY("zwj", 8205),
 833             ENTITY("zwnj", 8204),
 834             {NULL, 0}};
 835         #undef ENTITY
 836         static size_t substitutions_cnt = 0;
 837
 838         if (substitutions_cnt == 0)
 839             while (substitutions[substitutions_cnt].code != 0)
 840                 substitutions_cnt++;
 841
 842         wxHtmlEntityInfo *info;
 843 #ifdef __WXWINCE__
 844         // bsearch crashes under WinCE for some reason
 845         info = NULL;
 846         size_t i;
 847         for (i = 0; i < substitutions_cnt; i++)
 848         {
 849             if (entity == substitutions[i].name)
 850             {
 851                 info = & substitutions[i];
 852                 break;
 853             }
 854         }
 855 #else
 856         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 857                                            substitutions_cnt,
 858                                            sizeof(wxHtmlEntityInfo),
 859                                            wxHtmlEntityCompare);
 860 #endif
 861         if (info)
 862             code = info->code;
 863     }
 864
 865     if (code == 0)
 866         return 0;
 867     else
 868         return GetCharForCode(code);
 869 }
 870
 871 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType type,
 872                                 const wxString& url) const
 873 {
 874     int flags = wxFS_READ;
 875     if (type == wxHTML_URL_IMAGE)
 876         flags |= wxFS_SEEKABLE;
 877
 878     return m_FS ? m_FS->OpenFile(url, flags) : NULL;
 879
 880 }
 881
 882
 883 //-----------------------------------------------------------------------------
 884 // wxHtmlParser::ExtractCharsetInformation
 885 //-----------------------------------------------------------------------------
 886
 887 class wxMetaTagParser : public wxHtmlParser
 888 {
 889 public:
 890     wxMetaTagParser() { }
 891
 892     wxObject* GetProduct() { return NULL; }
 893
 894 protected:
 895     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 896
 897     wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
 898 };
 899
 900 class wxMetaTagHandler : public wxHtmlTagHandler
 901 {
 902 public:
 903     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 904     wxString GetSupportedTags() { return wxT("META,BODY"); }
 905     bool HandleTag(const wxHtmlTag& tag);
 906
 907 private:
 908     wxString *m_retval;
 909
 910     wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
 911 };
 912
 913 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 914 {
 915     if (tag.GetName() == wxT("BODY"))
 916     {
 917         m_Parser->StopParsing();
 918         return false;
 919     }
 920
 921     wxString httpEquiv,
 922              content;
 923     if (tag.GetParamAsString(wxT("HTTP-EQUIV"), &httpEquiv) &&
 924         httpEquiv.IsSameAs(wxT("Content-Type"), false) &&
 925         tag.GetParamAsString(wxT("CONTENT"), &content))
 926     {
 927         content.MakeLower();
 928         if (content.Left(19) == wxT("text/html; charset="))
 929         {
 930             *m_retval = content.Mid(19);
 931             m_Parser->StopParsing();
 932         }
 933     }
 934     return false;
 935 }
 936
 937
 938 /*static*/
 939 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 940 {
 941     wxString charset;
 942     wxMetaTagParser *parser = new wxMetaTagParser();
 943     if(parser)
 944     {
 945         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 946         parser->Parse(markup);
 947         delete parser;
 948     }
 949     return charset;
 950 }
 951
 952 /* static */
 953 bool
 954 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 955                              wxString::const_iterator end)
 956 {
 957     wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
 958
 959     wxString::const_iterator p = start;
 960
 961     // Comments begin with "<!--" in HTML 4.0; anything shorter or not containing
 962     // these characters is not a comment and we're not going to skip it.
 963     if ( ++p == end || *p != '!' )
 964       return false;
 965     if ( ++p == end || *p != '-' )
 966       return false;
 967     if ( ++p == end || *p != '-' )
 968       return false;
 969
 970     // skip the start of the comment tag in any case, if we don't find the
 971     // closing tag we should ignore broken markup
 972     start = p;
 973
 974     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 975     // comment delimiter and the closing tag character (section 3.2.4 of
 976     // http://www.w3.org/TR/html401/)
 977     int dashes = 0;
 978     while ( ++p < end )
 979     {
 980         const wxChar c = *p;
 981
 982         if ( (c == wxT(' ') || c == wxT('\n') ||
 983               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 984         {
 985             // ignore white space before potential tag end
 986             continue;
 987         }
 988
 989         if ( c == wxT('>') && dashes >= 2 )
 990         {
 991             // found end of comment
 992             start = p;
 993             break;
 994         }
 995
 996         if ( c == wxT('-') )
 997             dashes++;
 998         else
 999             dashes = 0;
1000     }
1001
1002     return true;
1003 }
1004
1005 #endif // wxUSE_HTML