src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/vector.h"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
  41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece() {}
  51     wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
  52     int m_pos, m_lng;
  53 };
  54
  55 // NB: this is an empty class and not typedef because of forward declaration
  56 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
  57 {
  58 };
  59
  60 class wxHtmlParserState
  61 {
  62 public:
  63     wxHtmlTag         *m_curTag;
  64     wxHtmlTag         *m_tags;
  65     wxHtmlTextPieces  *m_textPieces;
  66     int                m_curTextPiece;
  67     wxString           m_source;
  68     wxHtmlParserState *m_nextState;
  69 };
  70
  71 //-----------------------------------------------------------------------------
  72 // wxHtmlParser
  73 //-----------------------------------------------------------------------------
  74
  75 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  76
  77 wxHtmlParser::wxHtmlParser()
  78     : wxObject(), m_HandlersHash(wxKEY_STRING),
  79       m_FS(NULL), m_HandlersStack(NULL)
  80 {
  81     m_entitiesParser = new wxHtmlEntitiesParser;
  82     m_Tags = NULL;
  83     m_CurTag = NULL;
  84     m_TextPieces = NULL;
  85     m_CurTextPiece = 0;
  86     m_SavedStates = NULL;
  87 }
  88
  89 wxHtmlParser::~wxHtmlParser()
  90 {
  91     while (RestoreState()) {}
  92     DestroyDOMTree();
  93
  94     if (m_HandlersStack)
  95     {
  96         wxList& tmp = *m_HandlersStack;
  97         wxList::iterator it, en;
  98         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
  99             delete (wxHashTable*)*it;
 100         tmp.clear();
 101     }
 102     delete m_HandlersStack;
 103     m_HandlersHash.Clear();
 104     WX_CLEAR_LIST(wxList, m_HandlersList);
 105     delete m_entitiesParser;
 106 }
 107
 108 wxObject* wxHtmlParser::Parse(const wxString& source)
 109 {
 110     InitParser(source);
 111     DoParsing();
 112     wxObject *result = GetProduct();
 113     DoneParser();
 114     return result;
 115 }
 116
 117 void wxHtmlParser::InitParser(const wxString& source)
 118 {
 119     SetSource(source);
 120     m_stopParsing = false;
 121 }
 122
 123 void wxHtmlParser::DoneParser()
 124 {
 125     DestroyDOMTree();
 126 }
 127
 128 void wxHtmlParser::SetSource(const wxString& src)
 129 {
 130     DestroyDOMTree();
 131     m_Source = src;
 132     CreateDOMTree();
 133     m_CurTag = NULL;
 134     m_CurTextPiece = 0;
 135 }
 136
 137 void wxHtmlParser::CreateDOMTree()
 138 {
 139     wxHtmlTagsCache cache(m_Source);
 140     m_TextPieces = new wxHtmlTextPieces;
 141     CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
 142     m_CurTextPiece = 0;
 143 }
 144
 145 extern bool wxIsCDATAElement(const wxChar *tag);
 146
 147 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 148                                     int begin_pos, int end_pos,
 149                                     wxHtmlTagsCache *cache)
 150 {
 151     if (end_pos <= begin_pos) return;
 152
 153     wxChar c;
 154     int i = begin_pos;
 155     int textBeginning = begin_pos;
 156
 157     // If the tag contains CDATA text, we include the text between beginning
 158     // and ending tag verbosely. Setting i=end_pos will skip to the very
 159     // end of this function where text piece is added, bypassing any child
 160     // tags parsing (CDATA element can't have child elements by definition):
 161     if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
 162     {
 163         i = end_pos;
 164     }
 165
 166     while (i < end_pos)
 167     {
 168         c = m_Source.GetChar(i);
 169
 170         if (c == wxT('<'))
 171         {
 172             // add text to m_TextPieces:
 173             if (i - textBeginning > 0)
 174                 m_TextPieces->push_back(
 175                     wxHtmlTextPiece(textBeginning, i - textBeginning));
 176
 177             // if it is a comment, skip it:
 178             wxString::const_iterator iter = m_Source.begin() + i;
 179             if ( SkipCommentTag(iter, m_Source.end()) )
 180             {
 181                 textBeginning =
 182                 i = iter - m_Source.begin() + 1; // skip closing '>' too
 183             }
 184
 185             // add another tag to the tree:
 186             else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
 187             {
 188                 wxHtmlTag *chd;
 189                 if (cur)
 190                     chd = new wxHtmlTag(cur, m_Source,
 191                                         i, end_pos, cache, m_entitiesParser);
 192                 else
 193                 {
 194                     chd = new wxHtmlTag(NULL, m_Source,
 195                                         i, end_pos, cache, m_entitiesParser);
 196                     if (!m_Tags)
 197                     {
 198                         // if this is the first tag to be created make the root
 199                         // m_Tags point to it:
 200                         m_Tags = chd;
 201                     }
 202                     else
 203                     {
 204                         // if there is already a root tag add this tag as
 205                         // the last sibling:
 206                         chd->m_Prev = m_Tags->GetLastSibling();
 207                         chd->m_Prev->m_Next = chd;
 208                     }
 209                 }
 210
 211                 if (chd->HasEnding())
 212                 {
 213                     CreateDOMSubTree(chd,
 214                                      chd->GetBeginPos(), chd->GetEndPos1(),
 215                                      cache);
 216                     i = chd->GetEndPos2();
 217                 }
 218                 else
 219                     i = chd->GetBeginPos();
 220
 221                 textBeginning = i;
 222             }
 223
 224             // ... or skip ending tag:
 225             else
 226             {
 227                 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
 228                 textBeginning = i+1;
 229             }
 230         }
 231         else i++;
 232     }
 233
 234     // add remaining text to m_TextPieces:
 235     if (end_pos - textBeginning > 0)
 236         m_TextPieces->push_back(
 237             wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
 238 }
 239
 240 void wxHtmlParser::DestroyDOMTree()
 241 {
 242     wxHtmlTag *t1, *t2;
 243     t1 = m_Tags;
 244     while (t1)
 245     {
 246         t2 = t1->GetNextSibling();
 247         delete t1;
 248         t1 = t2;
 249     }
 250     m_Tags = m_CurTag = NULL;
 251
 252     delete m_TextPieces;
 253     m_TextPieces = NULL;
 254 }
 255
 256 void wxHtmlParser::DoParsing()
 257 {
 258     m_CurTag = m_Tags;
 259     m_CurTextPiece = 0;
 260     DoParsing(0, m_Source.length());
 261 }
 262
 263 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
 264 {
 265     if (end_pos <= begin_pos) return;
 266
 267     wxHtmlTextPieces& pieces = *m_TextPieces;
 268     size_t piecesCnt = pieces.size();
 269
 270     while (begin_pos < end_pos)
 271     {
 272         while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
 273             m_CurTag = m_CurTag->GetNextTag();
 274         while (m_CurTextPiece < piecesCnt &&
 275                pieces[m_CurTextPiece].m_pos < begin_pos)
 276             m_CurTextPiece++;
 277
 278         if (m_CurTextPiece < piecesCnt &&
 279             (!m_CurTag ||
 280              pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
 281         {
 282             // Add text:
 283             AddText(GetEntitiesParser()->Parse(
 284                        m_Source.Mid(pieces[m_CurTextPiece].m_pos,
 285                                     pieces[m_CurTextPiece].m_lng)));
 286             begin_pos = pieces[m_CurTextPiece].m_pos +
 287                         pieces[m_CurTextPiece].m_lng;
 288             m_CurTextPiece++;
 289         }
 290         else if (m_CurTag)
 291         {
 292             if (m_CurTag->HasEnding())
 293                 begin_pos = m_CurTag->GetEndPos2();
 294             else
 295                 begin_pos = m_CurTag->GetBeginPos();
 296             wxHtmlTag *t = m_CurTag;
 297             m_CurTag = m_CurTag->GetNextTag();
 298             AddTag(*t);
 299             if (m_stopParsing)
 300                 return;
 301         }
 302         else break;
 303     }
 304 }
 305
 306 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 307 {
 308     wxHtmlTagHandler *h;
 309     bool inner = false;
 310
 311     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
 312     if (h)
 313     {
 314         inner = h->HandleTag(tag);
 315         if (m_stopParsing)
 316             return;
 317     }
 318     if (!inner)
 319     {
 320         if (tag.HasEnding())
 321             DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
 322     }
 323 }
 324
 325 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 326 {
 327     wxString s(handler->GetSupportedTags());
 328     wxStringTokenizer tokenizer(s, wxT(", "));
 329
 330     while (tokenizer.HasMoreTokens())
 331         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
 332
 333     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
 334         m_HandlersList.Append(handler);
 335
 336     handler->SetParser(this);
 337 }
 338
 339 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 340 {
 341     wxStringTokenizer tokenizer(tags, wxT(", "));
 342     wxString key;
 343
 344     if (m_HandlersStack == NULL)
 345     {
 346         m_HandlersStack = new wxList;
 347     }
 348
 349     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
 350
 351     while (tokenizer.HasMoreTokens())
 352     {
 353         key = tokenizer.GetNextToken();
 354         m_HandlersHash.Delete(key);
 355         m_HandlersHash.Put(key, handler);
 356     }
 357 }
 358
 359 void wxHtmlParser::PopTagHandler()
 360 {
 361     wxList::compatibility_iterator first;
 362
 363     if ( !m_HandlersStack ||
 364 #if wxUSE_STL
 365          !(first = m_HandlersStack->GetFirst())
 366 #else // !wxUSE_STL
 367          ((first = m_HandlersStack->GetFirst()) == NULL)
 368 #endif // wxUSE_STL/!wxUSE_STL
 369         )
 370     {
 371         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
 372         return;
 373     }
 374     m_HandlersHash = *((wxHashTable*) first->GetData());
 375     delete (wxHashTable*) first->GetData();
 376     m_HandlersStack->Erase(first);
 377 }
 378
 379 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 380 {
 381     wxHtmlParserState *s = new wxHtmlParserState;
 382
 383     s->m_curTag = m_CurTag;
 384     s->m_tags = m_Tags;
 385     s->m_textPieces = m_TextPieces;
 386     s->m_curTextPiece = m_CurTextPiece;
 387     s->m_source = m_Source;
 388
 389     s->m_nextState = m_SavedStates;
 390     m_SavedStates = s;
 391
 392     m_CurTag = NULL;
 393     m_Tags = NULL;
 394     m_TextPieces = NULL;
 395     m_CurTextPiece = 0;
 396     m_Source = wxEmptyString;
 397
 398     SetSource(src);
 399 }
 400
 401 bool wxHtmlParser::RestoreState()
 402 {
 403     if (!m_SavedStates) return false;
 404
 405     DestroyDOMTree();
 406
 407     wxHtmlParserState *s = m_SavedStates;
 408     m_SavedStates = s->m_nextState;
 409
 410     m_CurTag = s->m_curTag;
 411     m_Tags = s->m_tags;
 412     m_TextPieces = s->m_textPieces;
 413     m_CurTextPiece = s->m_curTextPiece;
 414     m_Source = s->m_source;
 415
 416     delete s;
 417     return true;
 418 }
 419
 420 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 421 {
 422     return GetSource()->Mid(tag.GetBeginPos(),
 423                             tag.GetEndPos1() - tag.GetBeginPos());
 424 }
 425
 426 //-----------------------------------------------------------------------------
 427 // wxHtmlTagHandler
 428 //-----------------------------------------------------------------------------
 429
 430 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 431
 432 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 433 {
 434     // It is safe to temporarily change the source being parsed,
 435     // provided we restore the state back after parsing
 436     m_Parser->SetSourceAndSaveState(source);
 437     m_Parser->DoParsing();
 438     m_Parser->RestoreState();
 439 }
 440
 441
 442 //-----------------------------------------------------------------------------
 443 // wxHtmlEntitiesParser
 444 //-----------------------------------------------------------------------------
 445
 446 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 447
 448 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 449 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 450     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 451 #endif
 452 {
 453 }
 454
 455 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 456 {
 457 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 458     delete m_conv;
 459 #endif
 460 }
 461
 462 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 463 {
 464 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 465     if (encoding == m_encoding)
 466         return;
 467
 468     delete m_conv;
 469
 470     m_encoding = encoding;
 471     if (m_encoding == wxFONTENCODING_SYSTEM)
 472         m_conv = NULL;
 473     else
 474         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 475 #else
 476     (void) encoding;
 477 #endif
 478 }
 479
 480 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 481 {
 482     wxString output;
 483
 484     const wxString::const_iterator end(input.end());
 485     wxString::const_iterator c(input.begin());
 486     wxString::const_iterator last(c);
 487
 488     for ( ; c < end; ++c )
 489     {
 490         if (*c == wxT('&'))
 491         {
 492             if ( output.empty() )
 493                 output.reserve(input.length());
 494
 495             if (c - last > 0)
 496                 output.append(last, c);
 497             if ( ++c == end )
 498                 break;
 499
 500             wxString entity;
 501             const wxString::const_iterator ent_s = c;
 502             wxChar entity_char;
 503
 504             for (; c != end &&
 505                    ((*c >= wxT('a') && *c <= wxT('z')) ||
 506                     (*c >= wxT('A') && *c <= wxT('Z')) ||
 507                     (*c >= wxT('0') && *c <= wxT('9')) ||
 508                     *c == wxT('_') || *c == wxT('#')); ++c) {}
 509             entity.append(ent_s, c);
 510             if (c == end || *c != wxT(';')) --c;
 511             last = c+1;
 512             entity_char = GetEntityChar(entity);
 513             if (entity_char)
 514                 output << entity_char;
 515             else
 516             {
 517                 output.append(ent_s-1, c+1);
 518                 wxLogTrace(wxTRACE_HTML_DEBUG,
 519                            "Unrecognized HTML entity: '%s'",
 520                            entity);
 521             }
 522         }
 523     }
 524     if ( last == input.begin() ) // common case: no entity
 525         return input;
 526     if ( last != end )
 527         output.append(last, end);
 528     return output;
 529 }
 530
 531 #if !wxUSE_UNICODE
 532 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 533 {
 534 #if wxUSE_WCHAR_T
 535     char buf[2];
 536     wchar_t wbuf[2];
 537     wbuf[0] = (wchar_t)code;
 538     wbuf[1] = 0;
 539     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 540     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 541         return '?';
 542     return buf[0];
 543 #else
 544     return (code < 256) ? (wxChar)code : '?';
 545 #endif
 546 }
 547 #endif
 548
 549 struct wxHtmlEntityInfo
 550 {
 551     const wxStringCharType *name;
 552     unsigned code;
 553 };
 554
 555 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 556 {
 557 #if wxUSE_UNICODE_UTF8
 558     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 559 #else
 560     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 561 #endif
 562 }
 563
 564 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 565 {
 566     unsigned code = 0;
 567
 568     if (entity[0] == wxT('#'))
 569     {
 570         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 571         //     representation is the same for numbers
 572         const wxStringCharType *ent_s = entity.wx_str();
 573         const wxStringCharType *format;
 574
 575         if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
 576         {
 577             format = wxSTRING_TEXT("%x");
 578             ent_s++;
 579         }
 580         else
 581             format = wxSTRING_TEXT("%u");
 582         ent_s++;
 583
 584         if (wxSscanf(ent_s, format, &code) != 1)
 585             code = 0;
 586     }
 587     else
 588     {
 589         // store the literals in wx's internal representation (either char*
 590         // in UTF-8 or wchar_t*) for best performance:
 591         #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
 592
 593         static wxHtmlEntityInfo substitutions[] = {
 594             ENTITY("AElig", 198),
 595             ENTITY("Aacute", 193),
 596             ENTITY("Acirc", 194),
 597             ENTITY("Agrave", 192),
 598             ENTITY("Alpha", 913),
 599             ENTITY("Aring", 197),
 600             ENTITY("Atilde", 195),
 601             ENTITY("Auml", 196),
 602             ENTITY("Beta", 914),
 603             ENTITY("Ccedil", 199),
 604             ENTITY("Chi", 935),
 605             ENTITY("Dagger", 8225),
 606             ENTITY("Delta", 916),
 607             ENTITY("ETH", 208),
 608             ENTITY("Eacute", 201),
 609             ENTITY("Ecirc", 202),
 610             ENTITY("Egrave", 200),
 611             ENTITY("Epsilon", 917),
 612             ENTITY("Eta", 919),
 613             ENTITY("Euml", 203),
 614             ENTITY("Gamma", 915),
 615             ENTITY("Iacute", 205),
 616             ENTITY("Icirc", 206),
 617             ENTITY("Igrave", 204),
 618             ENTITY("Iota", 921),
 619             ENTITY("Iuml", 207),
 620             ENTITY("Kappa", 922),
 621             ENTITY("Lambda", 923),
 622             ENTITY("Mu", 924),
 623             ENTITY("Ntilde", 209),
 624             ENTITY("Nu", 925),
 625             ENTITY("OElig", 338),
 626             ENTITY("Oacute", 211),
 627             ENTITY("Ocirc", 212),
 628             ENTITY("Ograve", 210),
 629             ENTITY("Omega", 937),
 630             ENTITY("Omicron", 927),
 631             ENTITY("Oslash", 216),
 632             ENTITY("Otilde", 213),
 633             ENTITY("Ouml", 214),
 634             ENTITY("Phi", 934),
 635             ENTITY("Pi", 928),
 636             ENTITY("Prime", 8243),
 637             ENTITY("Psi", 936),
 638             ENTITY("Rho", 929),
 639             ENTITY("Scaron", 352),
 640             ENTITY("Sigma", 931),
 641             ENTITY("THORN", 222),
 642             ENTITY("Tau", 932),
 643             ENTITY("Theta", 920),
 644             ENTITY("Uacute", 218),
 645             ENTITY("Ucirc", 219),
 646             ENTITY("Ugrave", 217),
 647             ENTITY("Upsilon", 933),
 648             ENTITY("Uuml", 220),
 649             ENTITY("Xi", 926),
 650             ENTITY("Yacute", 221),
 651             ENTITY("Yuml", 376),
 652             ENTITY("Zeta", 918),
 653             ENTITY("aacute", 225),
 654             ENTITY("acirc", 226),
 655             ENTITY("acute", 180),
 656             ENTITY("aelig", 230),
 657             ENTITY("agrave", 224),
 658             ENTITY("alefsym", 8501),
 659             ENTITY("alpha", 945),
 660             ENTITY("amp", 38),
 661             ENTITY("and", 8743),
 662             ENTITY("ang", 8736),
 663             ENTITY("aring", 229),
 664             ENTITY("asymp", 8776),
 665             ENTITY("atilde", 227),
 666             ENTITY("auml", 228),
 667             ENTITY("bdquo", 8222),
 668             ENTITY("beta", 946),
 669             ENTITY("brvbar", 166),
 670             ENTITY("bull", 8226),
 671             ENTITY("cap", 8745),
 672             ENTITY("ccedil", 231),
 673             ENTITY("cedil", 184),
 674             ENTITY("cent", 162),
 675             ENTITY("chi", 967),
 676             ENTITY("circ", 710),
 677             ENTITY("clubs", 9827),
 678             ENTITY("cong", 8773),
 679             ENTITY("copy", 169),
 680             ENTITY("crarr", 8629),
 681             ENTITY("cup", 8746),
 682             ENTITY("curren", 164),
 683             ENTITY("dArr", 8659),
 684             ENTITY("dagger", 8224),
 685             ENTITY("darr", 8595),
 686             ENTITY("deg", 176),
 687             ENTITY("delta", 948),
 688             ENTITY("diams", 9830),
 689             ENTITY("divide", 247),
 690             ENTITY("eacute", 233),
 691             ENTITY("ecirc", 234),
 692             ENTITY("egrave", 232),
 693             ENTITY("empty", 8709),
 694             ENTITY("emsp", 8195),
 695             ENTITY("ensp", 8194),
 696             ENTITY("epsilon", 949),
 697             ENTITY("equiv", 8801),
 698             ENTITY("eta", 951),
 699             ENTITY("eth", 240),
 700             ENTITY("euml", 235),
 701             ENTITY("euro", 8364),
 702             ENTITY("exist", 8707),
 703             ENTITY("fnof", 402),
 704             ENTITY("forall", 8704),
 705             ENTITY("frac12", 189),
 706             ENTITY("frac14", 188),
 707             ENTITY("frac34", 190),
 708             ENTITY("frasl", 8260),
 709             ENTITY("gamma", 947),
 710             ENTITY("ge", 8805),
 711             ENTITY("gt", 62),
 712             ENTITY("hArr", 8660),
 713             ENTITY("harr", 8596),
 714             ENTITY("hearts", 9829),
 715             ENTITY("hellip", 8230),
 716             ENTITY("iacute", 237),
 717             ENTITY("icirc", 238),
 718             ENTITY("iexcl", 161),
 719             ENTITY("igrave", 236),
 720             ENTITY("image", 8465),
 721             ENTITY("infin", 8734),
 722             ENTITY("int", 8747),
 723             ENTITY("iota", 953),
 724             ENTITY("iquest", 191),
 725             ENTITY("isin", 8712),
 726             ENTITY("iuml", 239),
 727             ENTITY("kappa", 954),
 728             ENTITY("lArr", 8656),
 729             ENTITY("lambda", 955),
 730             ENTITY("lang", 9001),
 731             ENTITY("laquo", 171),
 732             ENTITY("larr", 8592),
 733             ENTITY("lceil", 8968),
 734             ENTITY("ldquo", 8220),
 735             ENTITY("le", 8804),
 736             ENTITY("lfloor", 8970),
 737             ENTITY("lowast", 8727),
 738             ENTITY("loz", 9674),
 739             ENTITY("lrm", 8206),
 740             ENTITY("lsaquo", 8249),
 741             ENTITY("lsquo", 8216),
 742             ENTITY("lt", 60),
 743             ENTITY("macr", 175),
 744             ENTITY("mdash", 8212),
 745             ENTITY("micro", 181),
 746             ENTITY("middot", 183),
 747             ENTITY("minus", 8722),
 748             ENTITY("mu", 956),
 749             ENTITY("nabla", 8711),
 750             ENTITY("nbsp", 160),
 751             ENTITY("ndash", 8211),
 752             ENTITY("ne", 8800),
 753             ENTITY("ni", 8715),
 754             ENTITY("not", 172),
 755             ENTITY("notin", 8713),
 756             ENTITY("nsub", 8836),
 757             ENTITY("ntilde", 241),
 758             ENTITY("nu", 957),
 759             ENTITY("oacute", 243),
 760             ENTITY("ocirc", 244),
 761             ENTITY("oelig", 339),
 762             ENTITY("ograve", 242),
 763             ENTITY("oline", 8254),
 764             ENTITY("omega", 969),
 765             ENTITY("omicron", 959),
 766             ENTITY("oplus", 8853),
 767             ENTITY("or", 8744),
 768             ENTITY("ordf", 170),
 769             ENTITY("ordm", 186),
 770             ENTITY("oslash", 248),
 771             ENTITY("otilde", 245),
 772             ENTITY("otimes", 8855),
 773             ENTITY("ouml", 246),
 774             ENTITY("para", 182),
 775             ENTITY("part", 8706),
 776             ENTITY("permil", 8240),
 777             ENTITY("perp", 8869),
 778             ENTITY("phi", 966),
 779             ENTITY("pi", 960),
 780             ENTITY("piv", 982),
 781             ENTITY("plusmn", 177),
 782             ENTITY("pound", 163),
 783             ENTITY("prime", 8242),
 784             ENTITY("prod", 8719),
 785             ENTITY("prop", 8733),
 786             ENTITY("psi", 968),
 787             ENTITY("quot", 34),
 788             ENTITY("rArr", 8658),
 789             ENTITY("radic", 8730),
 790             ENTITY("rang", 9002),
 791             ENTITY("raquo", 187),
 792             ENTITY("rarr", 8594),
 793             ENTITY("rceil", 8969),
 794             ENTITY("rdquo", 8221),
 795             ENTITY("real", 8476),
 796             ENTITY("reg", 174),
 797             ENTITY("rfloor", 8971),
 798             ENTITY("rho", 961),
 799             ENTITY("rlm", 8207),
 800             ENTITY("rsaquo", 8250),
 801             ENTITY("rsquo", 8217),
 802             ENTITY("sbquo", 8218),
 803             ENTITY("scaron", 353),
 804             ENTITY("sdot", 8901),
 805             ENTITY("sect", 167),
 806             ENTITY("shy", 173),
 807             ENTITY("sigma", 963),
 808             ENTITY("sigmaf", 962),
 809             ENTITY("sim", 8764),
 810             ENTITY("spades", 9824),
 811             ENTITY("sub", 8834),
 812             ENTITY("sube", 8838),
 813             ENTITY("sum", 8721),
 814             ENTITY("sup", 8835),
 815             ENTITY("sup1", 185),
 816             ENTITY("sup2", 178),
 817             ENTITY("sup3", 179),
 818             ENTITY("supe", 8839),
 819             ENTITY("szlig", 223),
 820             ENTITY("tau", 964),
 821             ENTITY("there4", 8756),
 822             ENTITY("theta", 952),
 823             ENTITY("thetasym", 977),
 824             ENTITY("thinsp", 8201),
 825             ENTITY("thorn", 254),
 826             ENTITY("tilde", 732),
 827             ENTITY("times", 215),
 828             ENTITY("trade", 8482),
 829             ENTITY("uArr", 8657),
 830             ENTITY("uacute", 250),
 831             ENTITY("uarr", 8593),
 832             ENTITY("ucirc", 251),
 833             ENTITY("ugrave", 249),
 834             ENTITY("uml", 168),
 835             ENTITY("upsih", 978),
 836             ENTITY("upsilon", 965),
 837             ENTITY("uuml", 252),
 838             ENTITY("weierp", 8472),
 839             ENTITY("xi", 958),
 840             ENTITY("yacute", 253),
 841             ENTITY("yen", 165),
 842             ENTITY("yuml", 255),
 843             ENTITY("zeta", 950),
 844             ENTITY("zwj", 8205),
 845             ENTITY("zwnj", 8204),
 846             {NULL, 0}};
 847         #undef ENTITY
 848         static size_t substitutions_cnt = 0;
 849
 850         if (substitutions_cnt == 0)
 851             while (substitutions[substitutions_cnt].code != 0)
 852                 substitutions_cnt++;
 853
 854         wxHtmlEntityInfo *info = NULL;
 855 #ifdef __WXWINCE__
 856         // bsearch crashes under WinCE for some reason
 857         size_t i;
 858         for (i = 0; i < substitutions_cnt; i++)
 859         {
 860             if (entity == substitutions[i].name)
 861             {
 862                 info = & substitutions[i];
 863                 break;
 864             }
 865         }
 866 #else
 867         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 868                                            substitutions_cnt,
 869                                            sizeof(wxHtmlEntityInfo),
 870                                            wxHtmlEntityCompare);
 871 #endif
 872         if (info)
 873             code = info->code;
 874     }
 875
 876     if (code == 0)
 877         return 0;
 878     else
 879         return GetCharForCode(code);
 880 }
 881
 882 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 883                                 const wxString& url) const
 884 {
 885     return m_FS ? m_FS->OpenFile(url) : NULL;
 886
 887 }
 888
 889
 890 //-----------------------------------------------------------------------------
 891 // wxHtmlParser::ExtractCharsetInformation
 892 //-----------------------------------------------------------------------------
 893
 894 class wxMetaTagParser : public wxHtmlParser
 895 {
 896 public:
 897     wxMetaTagParser() { }
 898
 899     wxObject* GetProduct() { return NULL; }
 900
 901 protected:
 902     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 903
 904     DECLARE_NO_COPY_CLASS(wxMetaTagParser)
 905 };
 906
 907 class wxMetaTagHandler : public wxHtmlTagHandler
 908 {
 909 public:
 910     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 911     wxString GetSupportedTags() { return wxT("META,BODY"); }
 912     bool HandleTag(const wxHtmlTag& tag);
 913
 914 private:
 915     wxString *m_retval;
 916
 917     DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
 918 };
 919
 920 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 921 {
 922     if (tag.GetName() == _T("BODY"))
 923     {
 924         m_Parser->StopParsing();
 925         return false;
 926     }
 927
 928     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 929         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 930         tag.HasParam(_T("CONTENT")))
 931     {
 932         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 933         if (content.Left(19) == _T("text/html; charset="))
 934         {
 935             *m_retval = content.Mid(19);
 936             m_Parser->StopParsing();
 937         }
 938     }
 939     return false;
 940 }
 941
 942
 943 /*static*/
 944 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 945 {
 946     wxString charset;
 947     wxMetaTagParser *parser = new wxMetaTagParser();
 948     if(parser)
 949     {
 950         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 951         parser->Parse(markup);
 952         delete parser;
 953     }
 954     return charset;
 955 }
 956
 957 /* static */
 958 bool
 959 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 960                              wxString::const_iterator end)
 961 {
 962     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 963
 964     wxString::const_iterator p = start;
 965
 966     // comments begin with "<!--" in HTML 4.0
 967     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 968     {
 969         // not a comment at all
 970         return false;
 971     }
 972
 973     // skip the start of the comment tag in any case, if we don't find the
 974     // closing tag we should ignore broken markup
 975     start = p;
 976
 977     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 978     // comment delimiter and the closing tag character (section 3.2.4 of
 979     // http://www.w3.org/TR/html401/)
 980     int dashes = 0;
 981     while ( ++p < end )
 982     {
 983         const wxChar c = *p;
 984
 985         if ( (c == wxT(' ') || c == wxT('\n') ||
 986               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 987         {
 988             // ignore white space before potential tag end
 989             continue;
 990         }
 991
 992         if ( c == wxT('>') && dashes >= 2 )
 993         {
 994             // found end of comment
 995             start = p;
 996             break;
 997         }
 998
 999         if ( c == wxT('-') )
1000             dashes++;
1001         else
1002             dashes = 0;
1003     }
1004
1005     return true;
1006 }
1007
1008 #endif // wxUSE_HTML