src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/arrimpl.cpp"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
  41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
  51     int m_pos, m_lng;
  52 };
  53
  54 WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
  55 WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
  56
  57 class wxHtmlParserState
  58 {
  59 public:
  60     wxHtmlTag         *m_curTag;
  61     wxHtmlTag         *m_tags;
  62     wxHtmlTextPieces  *m_textPieces;
  63     int                m_curTextPiece;
  64     wxString           m_source;
  65     wxHtmlParserState *m_nextState;
  66 };
  67
  68 //-----------------------------------------------------------------------------
  69 // wxHtmlParser
  70 //-----------------------------------------------------------------------------
  71
  72 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  73
  74 wxHtmlParser::wxHtmlParser()
  75     : wxObject(), m_HandlersHash(wxKEY_STRING),
  76       m_FS(NULL), m_HandlersStack(NULL)
  77 {
  78     m_entitiesParser = new wxHtmlEntitiesParser;
  79     m_Tags = NULL;
  80     m_CurTag = NULL;
  81     m_TextPieces = NULL;
  82     m_CurTextPiece = 0;
  83     m_SavedStates = NULL;
  84 }
  85
  86 wxHtmlParser::~wxHtmlParser()
  87 {
  88     while (RestoreState()) {}
  89     DestroyDOMTree();
  90
  91     if (m_HandlersStack)
  92     {
  93         wxList& tmp = *m_HandlersStack;
  94         wxList::iterator it, en;
  95         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
  96             delete (wxHashTable*)*it;
  97         tmp.clear();
  98     }
  99     delete m_HandlersStack;
 100     m_HandlersHash.Clear();
 101     WX_CLEAR_LIST(wxList, m_HandlersList);
 102     delete m_entitiesParser;
 103 }
 104
 105 wxObject* wxHtmlParser::Parse(const wxString& source)
 106 {
 107     InitParser(source);
 108     DoParsing();
 109     wxObject *result = GetProduct();
 110     DoneParser();
 111     return result;
 112 }
 113
 114 void wxHtmlParser::InitParser(const wxString& source)
 115 {
 116     SetSource(source);
 117     m_stopParsing = false;
 118 }
 119
 120 void wxHtmlParser::DoneParser()
 121 {
 122     DestroyDOMTree();
 123 }
 124
 125 void wxHtmlParser::SetSource(const wxString& src)
 126 {
 127     DestroyDOMTree();
 128     m_Source = src;
 129     CreateDOMTree();
 130     m_CurTag = NULL;
 131     m_CurTextPiece = 0;
 132 }
 133
 134 void wxHtmlParser::CreateDOMTree()
 135 {
 136     wxHtmlTagsCache cache(m_Source);
 137     m_TextPieces = new wxHtmlTextPieces;
 138     CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
 139     m_CurTextPiece = 0;
 140 }
 141
 142 extern bool wxIsCDATAElement(const wxChar *tag);
 143
 144 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 145                                     int begin_pos, int end_pos,
 146                                     wxHtmlTagsCache *cache)
 147 {
 148     if (end_pos <= begin_pos) return;
 149
 150     wxChar c;
 151     int i = begin_pos;
 152     int textBeginning = begin_pos;
 153
 154     // If the tag contains CDATA text, we include the text between beginning
 155     // and ending tag verbosely. Setting i=end_pos will skip to the very
 156     // end of this function where text piece is added, bypassing any child
 157     // tags parsing (CDATA element can't have child elements by definition):
 158     if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
 159     {
 160         i = end_pos;
 161     }
 162
 163     while (i < end_pos)
 164     {
 165         c = m_Source.GetChar(i);
 166
 167         if (c == wxT('<'))
 168         {
 169             // add text to m_TextPieces:
 170             if (i - textBeginning > 0)
 171                 m_TextPieces->Add(
 172                     wxHtmlTextPiece(textBeginning, i - textBeginning));
 173
 174             // if it is a comment, skip it:
 175             wxString::const_iterator iter = m_Source.begin() + i;
 176             if ( SkipCommentTag(iter, m_Source.end()) )
 177             {
 178                 textBeginning =
 179                 i = iter - m_Source.begin() + 1; // skip closing '>' too
 180             }
 181
 182             // add another tag to the tree:
 183             else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
 184             {
 185                 wxHtmlTag *chd;
 186                 if (cur)
 187                     chd = new wxHtmlTag(cur, m_Source,
 188                                         i, end_pos, cache, m_entitiesParser);
 189                 else
 190                 {
 191                     chd = new wxHtmlTag(NULL, m_Source,
 192                                         i, end_pos, cache, m_entitiesParser);
 193                     if (!m_Tags)
 194                     {
 195                         // if this is the first tag to be created make the root
 196                         // m_Tags point to it:
 197                         m_Tags = chd;
 198                     }
 199                     else
 200                     {
 201                         // if there is already a root tag add this tag as
 202                         // the last sibling:
 203                         chd->m_Prev = m_Tags->GetLastSibling();
 204                         chd->m_Prev->m_Next = chd;
 205                     }
 206                 }
 207
 208                 if (chd->HasEnding())
 209                 {
 210                     CreateDOMSubTree(chd,
 211                                      chd->GetBeginPos(), chd->GetEndPos1(),
 212                                      cache);
 213                     i = chd->GetEndPos2();
 214                 }
 215                 else
 216                     i = chd->GetBeginPos();
 217
 218                 textBeginning = i;
 219             }
 220
 221             // ... or skip ending tag:
 222             else
 223             {
 224                 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
 225                 textBeginning = i+1;
 226             }
 227         }
 228         else i++;
 229     }
 230
 231     // add remaining text to m_TextPieces:
 232     if (end_pos - textBeginning > 0)
 233         m_TextPieces->Add(
 234             wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
 235 }
 236
 237 void wxHtmlParser::DestroyDOMTree()
 238 {
 239     wxHtmlTag *t1, *t2;
 240     t1 = m_Tags;
 241     while (t1)
 242     {
 243         t2 = t1->GetNextSibling();
 244         delete t1;
 245         t1 = t2;
 246     }
 247     m_Tags = m_CurTag = NULL;
 248
 249     delete m_TextPieces;
 250     m_TextPieces = NULL;
 251 }
 252
 253 void wxHtmlParser::DoParsing()
 254 {
 255     m_CurTag = m_Tags;
 256     m_CurTextPiece = 0;
 257     DoParsing(0, m_Source.length());
 258 }
 259
 260 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
 261 {
 262     if (end_pos <= begin_pos) return;
 263
 264     wxHtmlTextPieces& pieces = *m_TextPieces;
 265     size_t piecesCnt = pieces.GetCount();
 266
 267     while (begin_pos < end_pos)
 268     {
 269         while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
 270             m_CurTag = m_CurTag->GetNextTag();
 271         while (m_CurTextPiece < piecesCnt &&
 272                pieces[m_CurTextPiece].m_pos < begin_pos)
 273             m_CurTextPiece++;
 274
 275         if (m_CurTextPiece < piecesCnt &&
 276             (!m_CurTag ||
 277              pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
 278         {
 279             // Add text:
 280             AddText(GetEntitiesParser()->Parse(
 281                        m_Source.Mid(pieces[m_CurTextPiece].m_pos,
 282                                     pieces[m_CurTextPiece].m_lng)));
 283             begin_pos = pieces[m_CurTextPiece].m_pos +
 284                         pieces[m_CurTextPiece].m_lng;
 285             m_CurTextPiece++;
 286         }
 287         else if (m_CurTag)
 288         {
 289             if (m_CurTag->HasEnding())
 290                 begin_pos = m_CurTag->GetEndPos2();
 291             else
 292                 begin_pos = m_CurTag->GetBeginPos();
 293             wxHtmlTag *t = m_CurTag;
 294             m_CurTag = m_CurTag->GetNextTag();
 295             AddTag(*t);
 296             if (m_stopParsing)
 297                 return;
 298         }
 299         else break;
 300     }
 301 }
 302
 303 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 304 {
 305     wxHtmlTagHandler *h;
 306     bool inner = false;
 307
 308     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
 309     if (h)
 310     {
 311         inner = h->HandleTag(tag);
 312         if (m_stopParsing)
 313             return;
 314     }
 315     if (!inner)
 316     {
 317         if (tag.HasEnding())
 318             DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
 319     }
 320 }
 321
 322 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 323 {
 324     wxString s(handler->GetSupportedTags());
 325     wxStringTokenizer tokenizer(s, wxT(", "));
 326
 327     while (tokenizer.HasMoreTokens())
 328         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
 329
 330     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
 331         m_HandlersList.Append(handler);
 332
 333     handler->SetParser(this);
 334 }
 335
 336 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 337 {
 338     wxStringTokenizer tokenizer(tags, wxT(", "));
 339     wxString key;
 340
 341     if (m_HandlersStack == NULL)
 342     {
 343         m_HandlersStack = new wxList;
 344     }
 345
 346     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
 347
 348     while (tokenizer.HasMoreTokens())
 349     {
 350         key = tokenizer.GetNextToken();
 351         m_HandlersHash.Delete(key);
 352         m_HandlersHash.Put(key, handler);
 353     }
 354 }
 355
 356 void wxHtmlParser::PopTagHandler()
 357 {
 358     wxList::compatibility_iterator first;
 359
 360     if ( !m_HandlersStack ||
 361 #if wxUSE_STL
 362          !(first = m_HandlersStack->GetFirst())
 363 #else // !wxUSE_STL
 364          ((first = m_HandlersStack->GetFirst()) == NULL)
 365 #endif // wxUSE_STL/!wxUSE_STL
 366         )
 367     {
 368         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
 369         return;
 370     }
 371     m_HandlersHash = *((wxHashTable*) first->GetData());
 372     delete (wxHashTable*) first->GetData();
 373     m_HandlersStack->Erase(first);
 374 }
 375
 376 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 377 {
 378     wxHtmlParserState *s = new wxHtmlParserState;
 379
 380     s->m_curTag = m_CurTag;
 381     s->m_tags = m_Tags;
 382     s->m_textPieces = m_TextPieces;
 383     s->m_curTextPiece = m_CurTextPiece;
 384     s->m_source = m_Source;
 385
 386     s->m_nextState = m_SavedStates;
 387     m_SavedStates = s;
 388
 389     m_CurTag = NULL;
 390     m_Tags = NULL;
 391     m_TextPieces = NULL;
 392     m_CurTextPiece = 0;
 393     m_Source = wxEmptyString;
 394
 395     SetSource(src);
 396 }
 397
 398 bool wxHtmlParser::RestoreState()
 399 {
 400     if (!m_SavedStates) return false;
 401
 402     DestroyDOMTree();
 403
 404     wxHtmlParserState *s = m_SavedStates;
 405     m_SavedStates = s->m_nextState;
 406
 407     m_CurTag = s->m_curTag;
 408     m_Tags = s->m_tags;
 409     m_TextPieces = s->m_textPieces;
 410     m_CurTextPiece = s->m_curTextPiece;
 411     m_Source = s->m_source;
 412
 413     delete s;
 414     return true;
 415 }
 416
 417 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 418 {
 419     return GetSource()->Mid(tag.GetBeginPos(),
 420                             tag.GetEndPos1() - tag.GetBeginPos());
 421 }
 422
 423 //-----------------------------------------------------------------------------
 424 // wxHtmlTagHandler
 425 //-----------------------------------------------------------------------------
 426
 427 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 428
 429 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 430 {
 431     // It is safe to temporarily change the source being parsed,
 432     // provided we restore the state back after parsing
 433     m_Parser->SetSourceAndSaveState(source);
 434     m_Parser->DoParsing();
 435     m_Parser->RestoreState();
 436 }
 437
 438
 439 //-----------------------------------------------------------------------------
 440 // wxHtmlEntitiesParser
 441 //-----------------------------------------------------------------------------
 442
 443 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 444
 445 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 446 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 447     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 448 #endif
 449 {
 450 }
 451
 452 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 453 {
 454 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 455     delete m_conv;
 456 #endif
 457 }
 458
 459 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 460 {
 461 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 462     if (encoding == m_encoding)
 463         return;
 464
 465     delete m_conv;
 466
 467     m_encoding = encoding;
 468     if (m_encoding == wxFONTENCODING_SYSTEM)
 469         m_conv = NULL;
 470     else
 471         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 472 #else
 473     (void) encoding;
 474 #endif
 475 }
 476
 477 wxString wxHtmlEntitiesParser::Parse(const wxString& input)
 478 {
 479     const wxChar *c, *last;
 480     const wxChar *in_str = input.c_str();
 481     wxString output;
 482
 483     output.reserve(input.length());
 484
 485     for (c = in_str, last = in_str; *c != wxT('\0'); c++)
 486     {
 487         if (*c == wxT('&'))
 488         {
 489             if (c - last > 0)
 490                 output.append(last, c - last);
 491             if ( *++c == wxT('\0') )
 492                 break;
 493
 494             wxString entity;
 495             const wxChar *ent_s = c;
 496             wxChar entity_char;
 497
 498             for (; (*c >= wxT('a') && *c <= wxT('z')) ||
 499                    (*c >= wxT('A') && *c <= wxT('Z')) ||
 500                    (*c >= wxT('0') && *c <= wxT('9')) ||
 501                    *c == wxT('_') || *c == wxT('#'); c++) {}
 502             entity.append(ent_s, c - ent_s);
 503             if (*c != wxT(';')) c--;
 504             last = c+1;
 505             entity_char = GetEntityChar(entity);
 506             if (entity_char)
 507                 output << entity_char;
 508             else
 509             {
 510                 output.append(ent_s-1, c-ent_s+2);
 511                 wxLogTrace(wxTRACE_HTML_DEBUG,
 512                            wxT("Unrecognized HTML entity: '%s'"),
 513                            entity.c_str());
 514             }
 515         }
 516     }
 517     if (*last != wxT('\0'))
 518         output.append(last);
 519     return output;
 520 }
 521
 522 #if !wxUSE_UNICODE
 523 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code)
 524 {
 525 #if wxUSE_WCHAR_T
 526     char buf[2];
 527     wchar_t wbuf[2];
 528     wbuf[0] = (wchar_t)code;
 529     wbuf[1] = 0;
 530     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 531     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 532         return '?';
 533     return buf[0];
 534 #else
 535     return (code < 256) ? (wxChar)code : '?';
 536 #endif
 537 }
 538 #endif
 539
 540 struct wxHtmlEntityInfo
 541 {
 542     const wxStringCharType *name;
 543     unsigned code;
 544 };
 545
 546 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 547 {
 548 #if wxUSE_UNICODE_UTF8
 549     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 550 #else
 551     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 552 #endif
 553 }
 554
 555 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity)
 556 {
 557     unsigned code = 0;
 558
 559     if (entity[0] == wxT('#'))
 560     {
 561         const wxChar *ent_s = entity.c_str();
 562         const wxChar *format;
 563
 564         if (ent_s[1] == wxT('x') || ent_s[1] == wxT('X'))
 565         {
 566             format = wxT("%x");
 567             ent_s++;
 568         }
 569         else
 570             format = wxT("%u");
 571         ent_s++;
 572
 573         if (wxSscanf(ent_s, format, &code) != 1)
 574             code = 0;
 575     }
 576     else
 577     {
 578         // store the literals in wx's internal representation (either char*
 579         // in UTF-8 or wchar_t*) for best performance:
 580         #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
 581
 582         static wxHtmlEntityInfo substitutions[] = {
 583             ENTITY("AElig", 198),
 584             ENTITY("Aacute", 193),
 585             ENTITY("Acirc", 194),
 586             ENTITY("Agrave", 192),
 587             ENTITY("Alpha", 913),
 588             ENTITY("Aring", 197),
 589             ENTITY("Atilde", 195),
 590             ENTITY("Auml", 196),
 591             ENTITY("Beta", 914),
 592             ENTITY("Ccedil", 199),
 593             ENTITY("Chi", 935),
 594             ENTITY("Dagger", 8225),
 595             ENTITY("Delta", 916),
 596             ENTITY("ETH", 208),
 597             ENTITY("Eacute", 201),
 598             ENTITY("Ecirc", 202),
 599             ENTITY("Egrave", 200),
 600             ENTITY("Epsilon", 917),
 601             ENTITY("Eta", 919),
 602             ENTITY("Euml", 203),
 603             ENTITY("Gamma", 915),
 604             ENTITY("Iacute", 205),
 605             ENTITY("Icirc", 206),
 606             ENTITY("Igrave", 204),
 607             ENTITY("Iota", 921),
 608             ENTITY("Iuml", 207),
 609             ENTITY("Kappa", 922),
 610             ENTITY("Lambda", 923),
 611             ENTITY("Mu", 924),
 612             ENTITY("Ntilde", 209),
 613             ENTITY("Nu", 925),
 614             ENTITY("OElig", 338),
 615             ENTITY("Oacute", 211),
 616             ENTITY("Ocirc", 212),
 617             ENTITY("Ograve", 210),
 618             ENTITY("Omega", 937),
 619             ENTITY("Omicron", 927),
 620             ENTITY("Oslash", 216),
 621             ENTITY("Otilde", 213),
 622             ENTITY("Ouml", 214),
 623             ENTITY("Phi", 934),
 624             ENTITY("Pi", 928),
 625             ENTITY("Prime", 8243),
 626             ENTITY("Psi", 936),
 627             ENTITY("Rho", 929),
 628             ENTITY("Scaron", 352),
 629             ENTITY("Sigma", 931),
 630             ENTITY("THORN", 222),
 631             ENTITY("Tau", 932),
 632             ENTITY("Theta", 920),
 633             ENTITY("Uacute", 218),
 634             ENTITY("Ucirc", 219),
 635             ENTITY("Ugrave", 217),
 636             ENTITY("Upsilon", 933),
 637             ENTITY("Uuml", 220),
 638             ENTITY("Xi", 926),
 639             ENTITY("Yacute", 221),
 640             ENTITY("Yuml", 376),
 641             ENTITY("Zeta", 918),
 642             ENTITY("aacute", 225),
 643             ENTITY("acirc", 226),
 644             ENTITY("acute", 180),
 645             ENTITY("aelig", 230),
 646             ENTITY("agrave", 224),
 647             ENTITY("alefsym", 8501),
 648             ENTITY("alpha", 945),
 649             ENTITY("amp", 38),
 650             ENTITY("and", 8743),
 651             ENTITY("ang", 8736),
 652             ENTITY("aring", 229),
 653             ENTITY("asymp", 8776),
 654             ENTITY("atilde", 227),
 655             ENTITY("auml", 228),
 656             ENTITY("bdquo", 8222),
 657             ENTITY("beta", 946),
 658             ENTITY("brvbar", 166),
 659             ENTITY("bull", 8226),
 660             ENTITY("cap", 8745),
 661             ENTITY("ccedil", 231),
 662             ENTITY("cedil", 184),
 663             ENTITY("cent", 162),
 664             ENTITY("chi", 967),
 665             ENTITY("circ", 710),
 666             ENTITY("clubs", 9827),
 667             ENTITY("cong", 8773),
 668             ENTITY("copy", 169),
 669             ENTITY("crarr", 8629),
 670             ENTITY("cup", 8746),
 671             ENTITY("curren", 164),
 672             ENTITY("dArr", 8659),
 673             ENTITY("dagger", 8224),
 674             ENTITY("darr", 8595),
 675             ENTITY("deg", 176),
 676             ENTITY("delta", 948),
 677             ENTITY("diams", 9830),
 678             ENTITY("divide", 247),
 679             ENTITY("eacute", 233),
 680             ENTITY("ecirc", 234),
 681             ENTITY("egrave", 232),
 682             ENTITY("empty", 8709),
 683             ENTITY("emsp", 8195),
 684             ENTITY("ensp", 8194),
 685             ENTITY("epsilon", 949),
 686             ENTITY("equiv", 8801),
 687             ENTITY("eta", 951),
 688             ENTITY("eth", 240),
 689             ENTITY("euml", 235),
 690             ENTITY("euro", 8364),
 691             ENTITY("exist", 8707),
 692             ENTITY("fnof", 402),
 693             ENTITY("forall", 8704),
 694             ENTITY("frac12", 189),
 695             ENTITY("frac14", 188),
 696             ENTITY("frac34", 190),
 697             ENTITY("frasl", 8260),
 698             ENTITY("gamma", 947),
 699             ENTITY("ge", 8805),
 700             ENTITY("gt", 62),
 701             ENTITY("hArr", 8660),
 702             ENTITY("harr", 8596),
 703             ENTITY("hearts", 9829),
 704             ENTITY("hellip", 8230),
 705             ENTITY("iacute", 237),
 706             ENTITY("icirc", 238),
 707             ENTITY("iexcl", 161),
 708             ENTITY("igrave", 236),
 709             ENTITY("image", 8465),
 710             ENTITY("infin", 8734),
 711             ENTITY("int", 8747),
 712             ENTITY("iota", 953),
 713             ENTITY("iquest", 191),
 714             ENTITY("isin", 8712),
 715             ENTITY("iuml", 239),
 716             ENTITY("kappa", 954),
 717             ENTITY("lArr", 8656),
 718             ENTITY("lambda", 955),
 719             ENTITY("lang", 9001),
 720             ENTITY("laquo", 171),
 721             ENTITY("larr", 8592),
 722             ENTITY("lceil", 8968),
 723             ENTITY("ldquo", 8220),
 724             ENTITY("le", 8804),
 725             ENTITY("lfloor", 8970),
 726             ENTITY("lowast", 8727),
 727             ENTITY("loz", 9674),
 728             ENTITY("lrm", 8206),
 729             ENTITY("lsaquo", 8249),
 730             ENTITY("lsquo", 8216),
 731             ENTITY("lt", 60),
 732             ENTITY("macr", 175),
 733             ENTITY("mdash", 8212),
 734             ENTITY("micro", 181),
 735             ENTITY("middot", 183),
 736             ENTITY("minus", 8722),
 737             ENTITY("mu", 956),
 738             ENTITY("nabla", 8711),
 739             ENTITY("nbsp", 160),
 740             ENTITY("ndash", 8211),
 741             ENTITY("ne", 8800),
 742             ENTITY("ni", 8715),
 743             ENTITY("not", 172),
 744             ENTITY("notin", 8713),
 745             ENTITY("nsub", 8836),
 746             ENTITY("ntilde", 241),
 747             ENTITY("nu", 957),
 748             ENTITY("oacute", 243),
 749             ENTITY("ocirc", 244),
 750             ENTITY("oelig", 339),
 751             ENTITY("ograve", 242),
 752             ENTITY("oline", 8254),
 753             ENTITY("omega", 969),
 754             ENTITY("omicron", 959),
 755             ENTITY("oplus", 8853),
 756             ENTITY("or", 8744),
 757             ENTITY("ordf", 170),
 758             ENTITY("ordm", 186),
 759             ENTITY("oslash", 248),
 760             ENTITY("otilde", 245),
 761             ENTITY("otimes", 8855),
 762             ENTITY("ouml", 246),
 763             ENTITY("para", 182),
 764             ENTITY("part", 8706),
 765             ENTITY("permil", 8240),
 766             ENTITY("perp", 8869),
 767             ENTITY("phi", 966),
 768             ENTITY("pi", 960),
 769             ENTITY("piv", 982),
 770             ENTITY("plusmn", 177),
 771             ENTITY("pound", 163),
 772             ENTITY("prime", 8242),
 773             ENTITY("prod", 8719),
 774             ENTITY("prop", 8733),
 775             ENTITY("psi", 968),
 776             ENTITY("quot", 34),
 777             ENTITY("rArr", 8658),
 778             ENTITY("radic", 8730),
 779             ENTITY("rang", 9002),
 780             ENTITY("raquo", 187),
 781             ENTITY("rarr", 8594),
 782             ENTITY("rceil", 8969),
 783             ENTITY("rdquo", 8221),
 784             ENTITY("real", 8476),
 785             ENTITY("reg", 174),
 786             ENTITY("rfloor", 8971),
 787             ENTITY("rho", 961),
 788             ENTITY("rlm", 8207),
 789             ENTITY("rsaquo", 8250),
 790             ENTITY("rsquo", 8217),
 791             ENTITY("sbquo", 8218),
 792             ENTITY("scaron", 353),
 793             ENTITY("sdot", 8901),
 794             ENTITY("sect", 167),
 795             ENTITY("shy", 173),
 796             ENTITY("sigma", 963),
 797             ENTITY("sigmaf", 962),
 798             ENTITY("sim", 8764),
 799             ENTITY("spades", 9824),
 800             ENTITY("sub", 8834),
 801             ENTITY("sube", 8838),
 802             ENTITY("sum", 8721),
 803             ENTITY("sup", 8835),
 804             ENTITY("sup1", 185),
 805             ENTITY("sup2", 178),
 806             ENTITY("sup3", 179),
 807             ENTITY("supe", 8839),
 808             ENTITY("szlig", 223),
 809             ENTITY("tau", 964),
 810             ENTITY("there4", 8756),
 811             ENTITY("theta", 952),
 812             ENTITY("thetasym", 977),
 813             ENTITY("thinsp", 8201),
 814             ENTITY("thorn", 254),
 815             ENTITY("tilde", 732),
 816             ENTITY("times", 215),
 817             ENTITY("trade", 8482),
 818             ENTITY("uArr", 8657),
 819             ENTITY("uacute", 250),
 820             ENTITY("uarr", 8593),
 821             ENTITY("ucirc", 251),
 822             ENTITY("ugrave", 249),
 823             ENTITY("uml", 168),
 824             ENTITY("upsih", 978),
 825             ENTITY("upsilon", 965),
 826             ENTITY("uuml", 252),
 827             ENTITY("weierp", 8472),
 828             ENTITY("xi", 958),
 829             ENTITY("yacute", 253),
 830             ENTITY("yen", 165),
 831             ENTITY("yuml", 255),
 832             ENTITY("zeta", 950),
 833             ENTITY("zwj", 8205),
 834             ENTITY("zwnj", 8204),
 835             {NULL, 0}};
 836         #undef ENTITY
 837         static size_t substitutions_cnt = 0;
 838
 839         if (substitutions_cnt == 0)
 840             while (substitutions[substitutions_cnt].code != 0)
 841                 substitutions_cnt++;
 842
 843         wxHtmlEntityInfo *info = NULL;
 844 #ifdef __WXWINCE__
 845         // bsearch crashes under WinCE for some reason
 846         size_t i;
 847         for (i = 0; i < substitutions_cnt; i++)
 848         {
 849             if (entity == substitutions[i].name)
 850             {
 851                 info = & substitutions[i];
 852                 break;
 853             }
 854         }
 855 #else
 856         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 857                                            substitutions_cnt,
 858                                            sizeof(wxHtmlEntityInfo),
 859                                            wxHtmlEntityCompare);
 860 #endif
 861         if (info)
 862             code = info->code;
 863     }
 864
 865     if (code == 0)
 866         return 0;
 867     else
 868         return GetCharForCode(code);
 869 }
 870
 871 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 872                                 const wxString& url) const
 873 {
 874     return m_FS ? m_FS->OpenFile(url) : NULL;
 875
 876 }
 877
 878
 879 //-----------------------------------------------------------------------------
 880 // wxHtmlParser::ExtractCharsetInformation
 881 //-----------------------------------------------------------------------------
 882
 883 class wxMetaTagParser : public wxHtmlParser
 884 {
 885 public:
 886     wxMetaTagParser() { }
 887
 888     wxObject* GetProduct() { return NULL; }
 889
 890 protected:
 891     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 892
 893     DECLARE_NO_COPY_CLASS(wxMetaTagParser)
 894 };
 895
 896 class wxMetaTagHandler : public wxHtmlTagHandler
 897 {
 898 public:
 899     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 900     wxString GetSupportedTags() { return wxT("META,BODY"); }
 901     bool HandleTag(const wxHtmlTag& tag);
 902
 903 private:
 904     wxString *m_retval;
 905
 906     DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
 907 };
 908
 909 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 910 {
 911     if (tag.GetName() == _T("BODY"))
 912     {
 913         m_Parser->StopParsing();
 914         return false;
 915     }
 916
 917     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 918         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 919         tag.HasParam(_T("CONTENT")))
 920     {
 921         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 922         if (content.Left(19) == _T("text/html; charset="))
 923         {
 924             *m_retval = content.Mid(19);
 925             m_Parser->StopParsing();
 926         }
 927     }
 928     return false;
 929 }
 930
 931
 932 /*static*/
 933 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 934 {
 935     wxString charset;
 936     wxMetaTagParser *parser = new wxMetaTagParser();
 937     if(parser)
 938     {
 939         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 940         parser->Parse(markup);
 941         delete parser;
 942     }
 943     return charset;
 944 }
 945
 946 /* static */
 947 bool
 948 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 949                              wxString::const_iterator end)
 950 {
 951     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 952
 953     wxString::const_iterator p = start;
 954
 955     // comments begin with "<!--" in HTML 4.0
 956     if ( end - p < 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 957     {
 958         // not a comment at all
 959         return false;
 960     }
 961
 962     // skip the start of the comment tag in any case, if we don't find the
 963     // closing tag we should ignore broken markup
 964     start = p;
 965
 966     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 967     // comment delimiter and the closing tag character (section 3.2.4 of
 968     // http://www.w3.org/TR/html401/)
 969     int dashes = 0;
 970     while ( ++p < end )
 971     {
 972         const wxChar c = *p;
 973
 974         if ( (c == wxT(' ') || c == wxT('\n') ||
 975               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 976         {
 977             // ignore white space before potential tag end
 978             continue;
 979         }
 980
 981         if ( c == wxT('>') && dashes >= 2 )
 982         {
 983             // found end of comment
 984             start = p;
 985             break;
 986         }
 987
 988         if ( c == wxT('-') )
 989             dashes++;
 990         else
 991             dashes = 0;
 992     }
 993
 994     return true;
 995 }
 996
 997 #endif // wxUSE_HTML