src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
// Trace mask under which this file emits its wxLogTrace() diagnostics (it is
// used below to report unrecognized HTML entities).
const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(const wxString::const_iterator& start,
  52                     const wxString::const_iterator& end)
  53         : m_start(start), m_end(end) {}
  54     wxString::const_iterator m_start, m_end;
  55 };
  56
// Vector of all text pieces found while building the DOM tree.
//
// NB: this is an empty class and not a typedef because it needs to be
//     forward declared, which is not possible for a typedef
class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
{
};
  61
  62 class wxHtmlParserState
  63 {
  64 public:
  65     wxHtmlTag         *m_curTag;
  66     wxHtmlTag         *m_tags;
  67     wxHtmlTextPieces  *m_textPieces;
  68     int                m_curTextPiece;
  69     const wxString    *m_source;
  70     wxHtmlParserState *m_nextState;
  71 };
  72
  73 //-----------------------------------------------------------------------------
  74 // wxHtmlParser
  75 //-----------------------------------------------------------------------------
  76
// wxWidgets class information for wxHtmlParser, with wxObject as its base
IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  78
  79 wxHtmlParser::wxHtmlParser()
  80     : wxObject(), m_HandlersHash(wxKEY_STRING),
  81       m_FS(NULL), m_HandlersStack(NULL)
  82 {
  83     m_Source = NULL;
  84     m_entitiesParser = new wxHtmlEntitiesParser;
  85     m_Tags = NULL;
  86     m_CurTag = NULL;
  87     m_TextPieces = NULL;
  88     m_CurTextPiece = 0;
  89     m_SavedStates = NULL;
  90 }
  91
  92 wxHtmlParser::~wxHtmlParser()
  93 {
  94     while (RestoreState()) {}
  95     DestroyDOMTree();
  96
  97     if (m_HandlersStack)
  98     {
  99         wxList& tmp = *m_HandlersStack;
 100         wxList::iterator it, en;
 101         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
 102             delete (wxHashTable*)*it;
 103         tmp.clear();
 104     }
 105     delete m_HandlersStack;
 106     m_HandlersHash.Clear();
 107     WX_CLEAR_LIST(wxList, m_HandlersList);
 108     delete m_entitiesParser;
 109     delete m_Source;
 110 }
 111
 112 wxObject* wxHtmlParser::Parse(const wxString& source)
 113 {
 114     InitParser(source);
 115     DoParsing();
 116     wxObject *result = GetProduct();
 117     DoneParser();
 118     return result;
 119 }
 120
 121 void wxHtmlParser::InitParser(const wxString& source)
 122 {
 123     SetSource(source);
 124     m_stopParsing = false;
 125 }
 126
 127 void wxHtmlParser::DoneParser()
 128 {
 129     DestroyDOMTree();
 130 }
 131
 132 void wxHtmlParser::SetSource(const wxString& src)
 133 {
 134     DestroyDOMTree();
 135     // NB: this is allocated on heap because wxHtmlTag keeps a pointer to
 136     //     this string if WXWIN_COMPATIBILITY_2_8
 137     delete m_Source;
 138     m_Source = new wxString(src);
 139     CreateDOMTree();
 140     m_CurTag = NULL;
 141     m_CurTextPiece = 0;
 142 }
 143
 144 void wxHtmlParser::CreateDOMTree()
 145 {
 146     wxHtmlTagsCache cache(*m_Source);
 147     m_TextPieces = new wxHtmlTextPieces;
 148     CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
 149     m_CurTextPiece = 0;
 150 }
 151
 152 extern bool wxIsCDATAElement(const wxString& tag);
 153
 154 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 155                                     const wxString::const_iterator& begin_pos,
 156                                     const wxString::const_iterator& end_pos,
 157                                     wxHtmlTagsCache *cache)
 158 {
 159     if (end_pos <= begin_pos)
 160         return;
 161
 162     wxChar c;
 163     wxString::const_iterator i = begin_pos;
 164     wxString::const_iterator textBeginning = begin_pos;
 165
 166     // If the tag contains CDATA text, we include the text between beginning
 167     // and ending tag verbosely. Setting i=end_pos will skip to the very
 168     // end of this function where text piece is added, bypassing any child
 169     // tags parsing (CDATA element can't have child elements by definition):
 170     if (cur != NULL && wxIsCDATAElement(cur->GetName()))
 171     {
 172         i = end_pos;
 173     }
 174
 175     while (i < end_pos)
 176     {
 177         c = *i;
 178
 179         if (c == wxT('<'))
 180         {
 181             // add text to m_TextPieces:
 182             if (i > textBeginning)
 183                 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
 184
 185             // if it is a comment, skip it:
 186             if ( SkipCommentTag(i, m_Source->end()) )
 187             {
 188                 textBeginning = i = i + 1; // skip closing '>' too
 189             }
 190
 191             // add another tag to the tree:
 192             else if (i < end_pos-1 && *(i+1) != wxT('/'))
 193             {
 194                 wxHtmlTag *chd;
 195                 if (cur)
 196                     chd = new wxHtmlTag(cur, m_Source,
 197                                         i, end_pos, cache, m_entitiesParser);
 198                 else
 199                 {
 200                     chd = new wxHtmlTag(NULL, m_Source,
 201                                         i, end_pos, cache, m_entitiesParser);
 202                     if (!m_Tags)
 203                     {
 204                         // if this is the first tag to be created make the root
 205                         // m_Tags point to it:
 206                         m_Tags = chd;
 207                     }
 208                     else
 209                     {
 210                         // if there is already a root tag add this tag as
 211                         // the last sibling:
 212                         chd->m_Prev = m_Tags->GetLastSibling();
 213                         chd->m_Prev->m_Next = chd;
 214                     }
 215                 }
 216
 217                 if (chd->HasEnding())
 218                 {
 219                     CreateDOMSubTree(chd,
 220                                      chd->GetBeginIter(), chd->GetEndIter1(),
 221                                      cache);
 222                     i = chd->GetEndIter2();
 223                 }
 224                 else
 225                     i = chd->GetBeginIter();
 226
 227                 textBeginning = i;
 228             }
 229
 230             // ... or skip ending tag:
 231             else
 232             {
 233                 while (i < end_pos && *i != wxT('>')) ++i;
 234                 textBeginning = i+1;
 235             }
 236         }
 237         else ++i;
 238     }
 239
 240     // add remaining text to m_TextPieces:
 241     if (end_pos > textBeginning)
 242         m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
 243 }
 244
 245 void wxHtmlParser::DestroyDOMTree()
 246 {
 247     wxHtmlTag *t1, *t2;
 248     t1 = m_Tags;
 249     while (t1)
 250     {
 251         t2 = t1->GetNextSibling();
 252         delete t1;
 253         t1 = t2;
 254     }
 255     m_Tags = m_CurTag = NULL;
 256
 257     delete m_TextPieces;
 258     m_TextPieces = NULL;
 259 }
 260
 261 void wxHtmlParser::DoParsing()
 262 {
 263     m_CurTag = m_Tags;
 264     m_CurTextPiece = 0;
 265     DoParsing(m_Source->begin(), m_Source->end());
 266 }
 267
 268 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
 269                              const wxString::const_iterator& end_pos)
 270 {
 271     wxString::const_iterator begin_pos(begin_pos_);
 272
 273     if (end_pos <= begin_pos)
 274         return;
 275
 276     wxHtmlTextPieces& pieces = *m_TextPieces;
 277     size_t piecesCnt = pieces.size();
 278
 279     while (begin_pos < end_pos)
 280     {
 281         while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
 282             m_CurTag = m_CurTag->GetNextTag();
 283         while (m_CurTextPiece < piecesCnt &&
 284                pieces[m_CurTextPiece].m_start < begin_pos)
 285             m_CurTextPiece++;
 286
 287         if (m_CurTextPiece < piecesCnt &&
 288             (!m_CurTag ||
 289              pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
 290         {
 291             // Add text:
 292             AddText(GetEntitiesParser()->Parse(
 293                        wxString(pieces[m_CurTextPiece].m_start,
 294                                 pieces[m_CurTextPiece].m_end)));
 295             begin_pos = pieces[m_CurTextPiece].m_end;
 296             m_CurTextPiece++;
 297         }
 298         else if (m_CurTag)
 299         {
 300             if (m_CurTag->HasEnding())
 301                 begin_pos = m_CurTag->GetEndIter2();
 302             else
 303                 begin_pos = m_CurTag->GetBeginIter();
 304             wxHtmlTag *t = m_CurTag;
 305             m_CurTag = m_CurTag->GetNextTag();
 306             AddTag(*t);
 307             if (m_stopParsing)
 308                 return;
 309         }
 310         else break;
 311     }
 312 }
 313
 314 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 315 {
 316     wxHtmlTagHandler *h;
 317     bool inner = false;
 318
 319     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
 320     if (h)
 321     {
 322         inner = h->HandleTag(tag);
 323         if (m_stopParsing)
 324             return;
 325     }
 326     if (!inner)
 327     {
 328         if (tag.HasEnding())
 329             DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
 330     }
 331 }
 332
 333 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 334 {
 335     wxString s(handler->GetSupportedTags());
 336     wxStringTokenizer tokenizer(s, wxT(", "));
 337
 338     while (tokenizer.HasMoreTokens())
 339         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
 340
 341     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
 342         m_HandlersList.Append(handler);
 343
 344     handler->SetParser(this);
 345 }
 346
 347 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 348 {
 349     wxStringTokenizer tokenizer(tags, wxT(", "));
 350     wxString key;
 351
 352     if (m_HandlersStack == NULL)
 353     {
 354         m_HandlersStack = new wxList;
 355     }
 356
 357     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
 358
 359     while (tokenizer.HasMoreTokens())
 360     {
 361         key = tokenizer.GetNextToken();
 362         m_HandlersHash.Delete(key);
 363         m_HandlersHash.Put(key, handler);
 364     }
 365 }
 366
 367 void wxHtmlParser::PopTagHandler()
 368 {
 369     wxList::compatibility_iterator first;
 370
 371     if ( !m_HandlersStack ||
 372 #if wxUSE_STL
 373          !(first = m_HandlersStack->GetFirst())
 374 #else // !wxUSE_STL
 375          ((first = m_HandlersStack->GetFirst()) == NULL)
 376 #endif // wxUSE_STL/!wxUSE_STL
 377         )
 378     {
 379         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
 380         return;
 381     }
 382     m_HandlersHash = *((wxHashTable*) first->GetData());
 383     delete (wxHashTable*) first->GetData();
 384     m_HandlersStack->Erase(first);
 385 }
 386
 387 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 388 {
 389     wxHtmlParserState *s = new wxHtmlParserState;
 390
 391     s->m_curTag = m_CurTag;
 392     s->m_tags = m_Tags;
 393     s->m_textPieces = m_TextPieces;
 394     s->m_curTextPiece = m_CurTextPiece;
 395     s->m_source = m_Source;
 396
 397     s->m_nextState = m_SavedStates;
 398     m_SavedStates = s;
 399
 400     m_CurTag = NULL;
 401     m_Tags = NULL;
 402     m_TextPieces = NULL;
 403     m_CurTextPiece = 0;
 404     m_Source = NULL;
 405
 406     SetSource(src);
 407 }
 408
 409 bool wxHtmlParser::RestoreState()
 410 {
 411     if (!m_SavedStates) return false;
 412
 413     DestroyDOMTree();
 414
 415     wxHtmlParserState *s = m_SavedStates;
 416     m_SavedStates = s->m_nextState;
 417
 418     m_CurTag = s->m_curTag;
 419     m_Tags = s->m_tags;
 420     m_TextPieces = s->m_textPieces;
 421     m_CurTextPiece = s->m_curTextPiece;
 422     m_Source = s->m_source;
 423
 424     delete s;
 425     return true;
 426 }
 427
 428 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 429 {
 430     return wxString(tag.GetBeginIter(), tag.GetEndIter1());
 431 }
 432
 433 //-----------------------------------------------------------------------------
 434 // wxHtmlTagHandler
 435 //-----------------------------------------------------------------------------
 436
// wxWidgets class information for wxHtmlTagHandler, with wxObject as its base
IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 438
 439 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 440 {
 441     // It is safe to temporarily change the source being parsed,
 442     // provided we restore the state back after parsing
 443     m_Parser->SetSourceAndSaveState(source);
 444     m_Parser->DoParsing();
 445     m_Parser->RestoreState();
 446 }
 447
 448
 449 //-----------------------------------------------------------------------------
 450 // wxHtmlEntitiesParser
 451 //-----------------------------------------------------------------------------
 452
// wxWidgets class information for wxHtmlEntitiesParser, with wxObject as its
// base
IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 454
 455 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 456 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 457     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 458 #endif
 459 {
 460 }
 461
 462 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 463 {
 464 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 465     delete m_conv;
 466 #endif
 467 }
 468
 469 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 470 {
 471 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 472     if (encoding == m_encoding)
 473         return;
 474
 475     delete m_conv;
 476
 477     m_encoding = encoding;
 478     if (m_encoding == wxFONTENCODING_SYSTEM)
 479         m_conv = NULL;
 480     else
 481         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 482 #else
 483     (void) encoding;
 484 #endif
 485 }
 486
 487 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 488 {
 489     wxString output;
 490
 491     const wxString::const_iterator end(input.end());
 492     wxString::const_iterator c(input.begin());
 493     wxString::const_iterator last(c);
 494
 495     for ( ; c < end; ++c )
 496     {
 497         if (*c == wxT('&'))
 498         {
 499             if ( output.empty() )
 500                 output.reserve(input.length());
 501
 502             if (c - last > 0)
 503                 output.append(last, c);
 504             if ( ++c == end )
 505                 break;
 506
 507             wxString entity;
 508             const wxString::const_iterator ent_s = c;
 509             wxChar entity_char;
 510
 511             for ( ; c != end; ++c )
 512             {
 513                 wxChar ch = *c;
 514                 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
 515                        (ch >= wxT('A') && ch <= wxT('Z')) ||
 516                        (ch >= wxT('0') && ch <= wxT('9')) ||
 517                         ch == wxT('_') || ch == wxT('#')) )
 518                     break;
 519             }
 520
 521             entity.append(ent_s, c);
 522             if (c == end || *c != wxT(';')) --c;
 523             last = c+1;
 524             entity_char = GetEntityChar(entity);
 525             if (entity_char)
 526                 output << entity_char;
 527             else
 528             {
 529                 output.append(ent_s-1, c+1);
 530                 wxLogTrace(wxTRACE_HTML_DEBUG,
 531                            "Unrecognized HTML entity: '%s'",
 532                            entity);
 533             }
 534         }
 535     }
 536     if ( last == input.begin() ) // common case: no entity
 537         return input;
 538     if ( last != end )
 539         output.append(last, end);
 540     return output;
 541 }
 542
 543 #if !wxUSE_UNICODE
 544 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 545 {
 546 #if wxUSE_WCHAR_T
 547     char buf[2];
 548     wchar_t wbuf[2];
 549     wbuf[0] = (wchar_t)code;
 550     wbuf[1] = 0;
 551     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 552     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 553         return '?';
 554     return buf[0];
 555 #else
 556     return (code < 256) ? (wxChar)code : '?';
 557 #endif
 558 }
 559 #endif
 560
// Element of the table of named HTML entities used by
// wxHtmlEntitiesParser::GetEntityChar().
struct wxHtmlEntityInfo
{
    // entity name without the leading '&' and trailing ';', as a string of
    // wxStringCharType characters; NULL in the entry terminating the table
    const wxStringCharType *name;

    // the character code of the entity; 0 in the entry terminating the table
    unsigned code;
};
 566
 567 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 568 {
 569 #if wxUSE_UNICODE_UTF8
 570     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 571 #else
 572     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 573 #endif
 574 }
 575
// Returns the character corresponding to the entity with the given name
// (without the leading '&' and the trailing ';') or 0 if the entity is not
// recognized.
//
// Two kinds of entities are handled:
//  - numeric ones, starting with '#', followed by the character code in
//    decimal or, if prefixed with 'x' or 'X', in hexadecimal
//  - named ones, which are looked up in the table below
//
// The code found is converted to the character using GetCharForCode().
wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
{
    // 0 means that the entity wasn't recognized
    unsigned code = 0;

    if (entity[0] == wxT('#'))
    {
        // NB: parsed value is a number, so it's OK to use wx_str(), internal
        //     representation is the same for numbers
        const wxStringCharType *ent_s = entity.wx_str();
        const wxStringCharType *format;

        if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
        {
            // hexadecimal form: skip the 'x' here...
            format = wxSTRING_TEXT("%x");
            ent_s++;
        }
        else
            format = wxSTRING_TEXT("%u");
        // ... and the '#' in any case
        ent_s++;

        if (wxSscanf(ent_s, format, &code) != 1)
            code = 0;
    }
    else
    {
        // store the literals in wx's internal representation (either char*
        // in UTF-8 or wchar_t*) for best performance:
        #define ENTITY(name, code) { wxSTRING_TEXT(name), code }

        // NB: the table is searched using bsearch() with
        //     wxHtmlEntityCompare(), so the entries must be kept sorted by
        //     name in the order of the character codes (i.e. all upper case
        //     letters come before the lower case ones)
        static wxHtmlEntityInfo substitutions[] = {
            ENTITY("AElig", 198),
            ENTITY("Aacute", 193),
            ENTITY("Acirc", 194),
            ENTITY("Agrave", 192),
            ENTITY("Alpha", 913),
            ENTITY("Aring", 197),
            ENTITY("Atilde", 195),
            ENTITY("Auml", 196),
            ENTITY("Beta", 914),
            ENTITY("Ccedil", 199),
            ENTITY("Chi", 935),
            ENTITY("Dagger", 8225),
            ENTITY("Delta", 916),
            ENTITY("ETH", 208),
            ENTITY("Eacute", 201),
            ENTITY("Ecirc", 202),
            ENTITY("Egrave", 200),
            ENTITY("Epsilon", 917),
            ENTITY("Eta", 919),
            ENTITY("Euml", 203),
            ENTITY("Gamma", 915),
            ENTITY("Iacute", 205),
            ENTITY("Icirc", 206),
            ENTITY("Igrave", 204),
            ENTITY("Iota", 921),
            ENTITY("Iuml", 207),
            ENTITY("Kappa", 922),
            ENTITY("Lambda", 923),
            ENTITY("Mu", 924),
            ENTITY("Ntilde", 209),
            ENTITY("Nu", 925),
            ENTITY("OElig", 338),
            ENTITY("Oacute", 211),
            ENTITY("Ocirc", 212),
            ENTITY("Ograve", 210),
            ENTITY("Omega", 937),
            ENTITY("Omicron", 927),
            ENTITY("Oslash", 216),
            ENTITY("Otilde", 213),
            ENTITY("Ouml", 214),
            ENTITY("Phi", 934),
            ENTITY("Pi", 928),
            ENTITY("Prime", 8243),
            ENTITY("Psi", 936),
            ENTITY("Rho", 929),
            ENTITY("Scaron", 352),
            ENTITY("Sigma", 931),
            ENTITY("THORN", 222),
            ENTITY("Tau", 932),
            ENTITY("Theta", 920),
            ENTITY("Uacute", 218),
            ENTITY("Ucirc", 219),
            ENTITY("Ugrave", 217),
            ENTITY("Upsilon", 933),
            ENTITY("Uuml", 220),
            ENTITY("Xi", 926),
            ENTITY("Yacute", 221),
            ENTITY("Yuml", 376),
            ENTITY("Zeta", 918),
            ENTITY("aacute", 225),
            ENTITY("acirc", 226),
            ENTITY("acute", 180),
            ENTITY("aelig", 230),
            ENTITY("agrave", 224),
            ENTITY("alefsym", 8501),
            ENTITY("alpha", 945),
            ENTITY("amp", 38),
            ENTITY("and", 8743),
            ENTITY("ang", 8736),
            ENTITY("aring", 229),
            ENTITY("asymp", 8776),
            ENTITY("atilde", 227),
            ENTITY("auml", 228),
            ENTITY("bdquo", 8222),
            ENTITY("beta", 946),
            ENTITY("brvbar", 166),
            ENTITY("bull", 8226),
            ENTITY("cap", 8745),
            ENTITY("ccedil", 231),
            ENTITY("cedil", 184),
            ENTITY("cent", 162),
            ENTITY("chi", 967),
            ENTITY("circ", 710),
            ENTITY("clubs", 9827),
            ENTITY("cong", 8773),
            ENTITY("copy", 169),
            ENTITY("crarr", 8629),
            ENTITY("cup", 8746),
            ENTITY("curren", 164),
            ENTITY("dArr", 8659),
            ENTITY("dagger", 8224),
            ENTITY("darr", 8595),
            ENTITY("deg", 176),
            ENTITY("delta", 948),
            ENTITY("diams", 9830),
            ENTITY("divide", 247),
            ENTITY("eacute", 233),
            ENTITY("ecirc", 234),
            ENTITY("egrave", 232),
            ENTITY("empty", 8709),
            ENTITY("emsp", 8195),
            ENTITY("ensp", 8194),
            ENTITY("epsilon", 949),
            ENTITY("equiv", 8801),
            ENTITY("eta", 951),
            ENTITY("eth", 240),
            ENTITY("euml", 235),
            ENTITY("euro", 8364),
            ENTITY("exist", 8707),
            ENTITY("fnof", 402),
            ENTITY("forall", 8704),
            ENTITY("frac12", 189),
            ENTITY("frac14", 188),
            ENTITY("frac34", 190),
            ENTITY("frasl", 8260),
            ENTITY("gamma", 947),
            ENTITY("ge", 8805),
            ENTITY("gt", 62),
            ENTITY("hArr", 8660),
            ENTITY("harr", 8596),
            ENTITY("hearts", 9829),
            ENTITY("hellip", 8230),
            ENTITY("iacute", 237),
            ENTITY("icirc", 238),
            ENTITY("iexcl", 161),
            ENTITY("igrave", 236),
            ENTITY("image", 8465),
            ENTITY("infin", 8734),
            ENTITY("int", 8747),
            ENTITY("iota", 953),
            ENTITY("iquest", 191),
            ENTITY("isin", 8712),
            ENTITY("iuml", 239),
            ENTITY("kappa", 954),
            ENTITY("lArr", 8656),
            ENTITY("lambda", 955),
            ENTITY("lang", 9001),
            ENTITY("laquo", 171),
            ENTITY("larr", 8592),
            ENTITY("lceil", 8968),
            ENTITY("ldquo", 8220),
            ENTITY("le", 8804),
            ENTITY("lfloor", 8970),
            ENTITY("lowast", 8727),
            ENTITY("loz", 9674),
            ENTITY("lrm", 8206),
            ENTITY("lsaquo", 8249),
            ENTITY("lsquo", 8216),
            ENTITY("lt", 60),
            ENTITY("macr", 175),
            ENTITY("mdash", 8212),
            ENTITY("micro", 181),
            ENTITY("middot", 183),
            ENTITY("minus", 8722),
            ENTITY("mu", 956),
            ENTITY("nabla", 8711),
            ENTITY("nbsp", 160),
            ENTITY("ndash", 8211),
            ENTITY("ne", 8800),
            ENTITY("ni", 8715),
            ENTITY("not", 172),
            ENTITY("notin", 8713),
            ENTITY("nsub", 8836),
            ENTITY("ntilde", 241),
            ENTITY("nu", 957),
            ENTITY("oacute", 243),
            ENTITY("ocirc", 244),
            ENTITY("oelig", 339),
            ENTITY("ograve", 242),
            ENTITY("oline", 8254),
            ENTITY("omega", 969),
            ENTITY("omicron", 959),
            ENTITY("oplus", 8853),
            ENTITY("or", 8744),
            ENTITY("ordf", 170),
            ENTITY("ordm", 186),
            ENTITY("oslash", 248),
            ENTITY("otilde", 245),
            ENTITY("otimes", 8855),
            ENTITY("ouml", 246),
            ENTITY("para", 182),
            ENTITY("part", 8706),
            ENTITY("permil", 8240),
            ENTITY("perp", 8869),
            ENTITY("phi", 966),
            ENTITY("pi", 960),
            ENTITY("piv", 982),
            ENTITY("plusmn", 177),
            ENTITY("pound", 163),
            ENTITY("prime", 8242),
            ENTITY("prod", 8719),
            ENTITY("prop", 8733),
            ENTITY("psi", 968),
            ENTITY("quot", 34),
            ENTITY("rArr", 8658),
            ENTITY("radic", 8730),
            ENTITY("rang", 9002),
            ENTITY("raquo", 187),
            ENTITY("rarr", 8594),
            ENTITY("rceil", 8969),
            ENTITY("rdquo", 8221),
            ENTITY("real", 8476),
            ENTITY("reg", 174),
            ENTITY("rfloor", 8971),
            ENTITY("rho", 961),
            ENTITY("rlm", 8207),
            ENTITY("rsaquo", 8250),
            ENTITY("rsquo", 8217),
            ENTITY("sbquo", 8218),
            ENTITY("scaron", 353),
            ENTITY("sdot", 8901),
            ENTITY("sect", 167),
            ENTITY("shy", 173),
            ENTITY("sigma", 963),
            ENTITY("sigmaf", 962),
            ENTITY("sim", 8764),
            ENTITY("spades", 9824),
            ENTITY("sub", 8834),
            ENTITY("sube", 8838),
            ENTITY("sum", 8721),
            ENTITY("sup", 8835),
            ENTITY("sup1", 185),
            ENTITY("sup2", 178),
            ENTITY("sup3", 179),
            ENTITY("supe", 8839),
            ENTITY("szlig", 223),
            ENTITY("tau", 964),
            ENTITY("there4", 8756),
            ENTITY("theta", 952),
            ENTITY("thetasym", 977),
            ENTITY("thinsp", 8201),
            ENTITY("thorn", 254),
            ENTITY("tilde", 732),
            ENTITY("times", 215),
            ENTITY("trade", 8482),
            ENTITY("uArr", 8657),
            ENTITY("uacute", 250),
            ENTITY("uarr", 8593),
            ENTITY("ucirc", 251),
            ENTITY("ugrave", 249),
            ENTITY("uml", 168),
            ENTITY("upsih", 978),
            ENTITY("upsilon", 965),
            ENTITY("uuml", 252),
            ENTITY("weierp", 8472),
            ENTITY("xi", 958),
            ENTITY("yacute", 253),
            ENTITY("yen", 165),
            ENTITY("yuml", 255),
            ENTITY("zeta", 950),
            ENTITY("zwj", 8205),
            ENTITY("zwnj", 8204),
            {NULL, 0}};
        #undef ENTITY

        // number of the entries in the table, not counting the terminating
        // one; it is computed when this code is first executed
        static size_t substitutions_cnt = 0;

        if (substitutions_cnt == 0)
            while (substitutions[substitutions_cnt].code != 0)
                substitutions_cnt++;

        wxHtmlEntityInfo *info = NULL;
#ifdef __WXWINCE__
        // bsearch crashes under WinCE for some reason
        size_t i;
        for (i = 0; i < substitutions_cnt; i++)
        {
            if (entity == substitutions[i].name)
            {
                info = & substitutions[i];
                break;
            }
        }
#else
        // the key is passed in the same representation as is used for the
        // names in the table
        info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
                                           substitutions_cnt,
                                           sizeof(wxHtmlEntityInfo),
                                           wxHtmlEntityCompare);
#endif
        if (info)
            code = info->code;
    }

    if (code == 0)
        return 0;
    else
        return GetCharForCode(code);
}
 893
 894 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 895                                 const wxString& url) const
 896 {
 897     return m_FS ? m_FS->OpenFile(url) : NULL;
 898
 899 }
 900
 901
 902 //-----------------------------------------------------------------------------
 903 // wxHtmlParser::ExtractCharsetInformation
 904 //-----------------------------------------------------------------------------
 905
 906 class wxMetaTagParser : public wxHtmlParser
 907 {
 908 public:
 909     wxMetaTagParser() { }
 910
 911     wxObject* GetProduct() { return NULL; }
 912
 913 protected:
 914     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 915
 916     DECLARE_NO_COPY_CLASS(wxMetaTagParser)
 917 };
 918
 919 class wxMetaTagHandler : public wxHtmlTagHandler
 920 {
 921 public:
 922     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 923     wxString GetSupportedTags() { return wxT("META,BODY"); }
 924     bool HandleTag(const wxHtmlTag& tag);
 925
 926 private:
 927     wxString *m_retval;
 928
 929     DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
 930 };
 931
 932 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 933 {
 934     if (tag.GetName() == _T("BODY"))
 935     {
 936         m_Parser->StopParsing();
 937         return false;
 938     }
 939
 940     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 941         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 942         tag.HasParam(_T("CONTENT")))
 943     {
 944         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 945         if (content.Left(19) == _T("text/html; charset="))
 946         {
 947             *m_retval = content.Mid(19);
 948             m_Parser->StopParsing();
 949         }
 950     }
 951     return false;
 952 }
 953
 954
 955 /*static*/
 956 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 957 {
 958     wxString charset;
 959     wxMetaTagParser *parser = new wxMetaTagParser();
 960     if(parser)
 961     {
 962         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 963         parser->Parse(markup);
 964         delete parser;
 965     }
 966     return charset;
 967 }
 968
 969 /* static */
 970 bool
 971 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 972                              wxString::const_iterator end)
 973 {
 974     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 975
 976     wxString::const_iterator p = start;
 977
 978     // comments begin with "<!--" in HTML 4.0
 979     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 980     {
 981         // not a comment at all
 982         return false;
 983     }
 984
 985     // skip the start of the comment tag in any case, if we don't find the
 986     // closing tag we should ignore broken markup
 987     start = p;
 988
 989     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 990     // comment delimiter and the closing tag character (section 3.2.4 of
 991     // http://www.w3.org/TR/html401/)
 992     int dashes = 0;
 993     while ( ++p < end )
 994     {
 995         const wxChar c = *p;
 996
 997         if ( (c == wxT(' ') || c == wxT('\n') ||
 998               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 999         {
1000             // ignore white space before potential tag end
1001             continue;
1002         }
1003
1004         if ( c == wxT('>') && dashes >= 2 )
1005         {
1006             // found end of comment
1007             start = p;
1008             break;
1009         }
1010
1011         if ( c == wxT('-') )
1012             dashes++;
1013         else
1014             dashes = 0;
1015     }
1016
1017     return true;
1018 }
1019
1020 #endif // wxUSE_HTML