src/html/htmlpars.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/html/htmlpars.cpp
   3 // Purpose:     wxHtmlParser class (generic parser)
   4 // Author:      Vaclav Slavik
   5 // RCS-ID:      $Id$
   6 // Copyright:   (c) 1999 Vaclav Slavik
   7 // Licence:     wxWindows licence
   8 /////////////////////////////////////////////////////////////////////////////
   9
  10 #include "wx/wxprec.h"
  11
  12 #ifdef __BORLANDC__
  13     #pragma hdrstop
  14 #endif
  15
  16 #if wxUSE_HTML && wxUSE_STREAMS
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/dynarray.h"
  20     #include "wx/log.h"
  21     #include "wx/intl.h"
  22     #include "wx/app.h"
  23     #include "wx/wxcrtvararg.h"
  24 #endif
  25
  26 #include "wx/tokenzr.h"
  27 #include "wx/wfstream.h"
  28 #include "wx/url.h"
  29 #include "wx/fontmap.h"
  30 #include "wx/html/htmldefs.h"
  31 #include "wx/html/htmlpars.h"
  32 #include "wx/arrimpl.cpp"
  33
  34 #ifdef __WXWINCE__
  35     #include "wx/msw/wince/missing.h"       // for bsearch()
  36 #endif
  37
  38 // DLL options compatibility check:
  39 WX_CHECK_BUILD_OPTIONS("wxHTML")
  40
  41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
  42
  43 //-----------------------------------------------------------------------------
  44 // wxHtmlParser helpers
  45 //-----------------------------------------------------------------------------
  46
  47 class wxHtmlTextPiece
  48 {
  49 public:
  50     wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
  51     int m_pos, m_lng;
  52 };
  53
  54 WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
  55 WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
  56
  57 class wxHtmlParserState
  58 {
  59 public:
  60     wxHtmlTag         *m_curTag;
  61     wxHtmlTag         *m_tags;
  62     wxHtmlTextPieces  *m_textPieces;
  63     int                m_curTextPiece;
  64     wxString           m_source;
  65     wxHtmlParserState *m_nextState;
  66 };
  67
  68 //-----------------------------------------------------------------------------
  69 // wxHtmlParser
  70 //-----------------------------------------------------------------------------
  71
  72 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
  73
  74 wxHtmlParser::wxHtmlParser()
  75     : wxObject(), m_HandlersHash(wxKEY_STRING),
  76       m_FS(NULL), m_HandlersStack(NULL)
  77 {
  78     m_entitiesParser = new wxHtmlEntitiesParser;
  79     m_Tags = NULL;
  80     m_CurTag = NULL;
  81     m_TextPieces = NULL;
  82     m_CurTextPiece = 0;
  83     m_SavedStates = NULL;
  84 }
  85
  86 wxHtmlParser::~wxHtmlParser()
  87 {
  88     while (RestoreState()) {}
  89     DestroyDOMTree();
  90
  91     if (m_HandlersStack)
  92     {
  93         wxList& tmp = *m_HandlersStack;
  94         wxList::iterator it, en;
  95         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
  96             delete (wxHashTable*)*it;
  97         tmp.clear();
  98     }
  99     delete m_HandlersStack;
 100     m_HandlersHash.Clear();
 101     WX_CLEAR_LIST(wxList, m_HandlersList);
 102     delete m_entitiesParser;
 103 }
 104
 105 wxObject* wxHtmlParser::Parse(const wxString& source)
 106 {
 107     InitParser(source);
 108     DoParsing();
 109     wxObject *result = GetProduct();
 110     DoneParser();
 111     return result;
 112 }
 113
 114 void wxHtmlParser::InitParser(const wxString& source)
 115 {
 116     SetSource(source);
 117     m_stopParsing = false;
 118 }
 119
 120 void wxHtmlParser::DoneParser()
 121 {
 122     DestroyDOMTree();
 123 }
 124
 125 void wxHtmlParser::SetSource(const wxString& src)
 126 {
 127     DestroyDOMTree();
 128     m_Source = src;
 129     CreateDOMTree();
 130     m_CurTag = NULL;
 131     m_CurTextPiece = 0;
 132 }
 133
 134 void wxHtmlParser::CreateDOMTree()
 135 {
 136     wxHtmlTagsCache cache(m_Source);
 137     m_TextPieces = new wxHtmlTextPieces;
 138     CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
 139     m_CurTextPiece = 0;
 140 }
 141
 142 extern bool wxIsCDATAElement(const wxChar *tag);
 143
 144 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
 145                                     int begin_pos, int end_pos,
 146                                     wxHtmlTagsCache *cache)
 147 {
 148     if (end_pos <= begin_pos) return;
 149
 150     wxChar c;
 151     int i = begin_pos;
 152     int textBeginning = begin_pos;
 153
 154     // If the tag contains CDATA text, we include the text between beginning
 155     // and ending tag verbosely. Setting i=end_pos will skip to the very
 156     // end of this function where text piece is added, bypassing any child
 157     // tags parsing (CDATA element can't have child elements by definition):
 158     if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
 159     {
 160         i = end_pos;
 161     }
 162
 163     while (i < end_pos)
 164     {
 165         c = m_Source.GetChar(i);
 166
 167         if (c == wxT('<'))
 168         {
 169             // add text to m_TextPieces:
 170             if (i - textBeginning > 0)
 171                 m_TextPieces->Add(
 172                     wxHtmlTextPiece(textBeginning, i - textBeginning));
 173
 174             // if it is a comment, skip it:
 175             wxString::const_iterator iter = m_Source.begin() + i;
 176             if ( SkipCommentTag(iter, m_Source.end()) )
 177             {
 178                 textBeginning =
 179                 i = iter - m_Source.begin() + 1; // skip closing '>' too
 180             }
 181
 182             // add another tag to the tree:
 183             else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
 184             {
 185                 wxHtmlTag *chd;
 186                 if (cur)
 187                     chd = new wxHtmlTag(cur, m_Source,
 188                                         i, end_pos, cache, m_entitiesParser);
 189                 else
 190                 {
 191                     chd = new wxHtmlTag(NULL, m_Source,
 192                                         i, end_pos, cache, m_entitiesParser);
 193                     if (!m_Tags)
 194                     {
 195                         // if this is the first tag to be created make the root
 196                         // m_Tags point to it:
 197                         m_Tags = chd;
 198                     }
 199                     else
 200                     {
 201                         // if there is already a root tag add this tag as
 202                         // the last sibling:
 203                         chd->m_Prev = m_Tags->GetLastSibling();
 204                         chd->m_Prev->m_Next = chd;
 205                     }
 206                 }
 207
 208                 if (chd->HasEnding())
 209                 {
 210                     CreateDOMSubTree(chd,
 211                                      chd->GetBeginPos(), chd->GetEndPos1(),
 212                                      cache);
 213                     i = chd->GetEndPos2();
 214                 }
 215                 else
 216                     i = chd->GetBeginPos();
 217
 218                 textBeginning = i;
 219             }
 220
 221             // ... or skip ending tag:
 222             else
 223             {
 224                 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
 225                 textBeginning = i+1;
 226             }
 227         }
 228         else i++;
 229     }
 230
 231     // add remaining text to m_TextPieces:
 232     if (end_pos - textBeginning > 0)
 233         m_TextPieces->Add(
 234             wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
 235 }
 236
 237 void wxHtmlParser::DestroyDOMTree()
 238 {
 239     wxHtmlTag *t1, *t2;
 240     t1 = m_Tags;
 241     while (t1)
 242     {
 243         t2 = t1->GetNextSibling();
 244         delete t1;
 245         t1 = t2;
 246     }
 247     m_Tags = m_CurTag = NULL;
 248
 249     delete m_TextPieces;
 250     m_TextPieces = NULL;
 251 }
 252
 253 void wxHtmlParser::DoParsing()
 254 {
 255     m_CurTag = m_Tags;
 256     m_CurTextPiece = 0;
 257     DoParsing(0, m_Source.length());
 258 }
 259
 260 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
 261 {
 262     if (end_pos <= begin_pos) return;
 263
 264     wxHtmlTextPieces& pieces = *m_TextPieces;
 265     size_t piecesCnt = pieces.GetCount();
 266
 267     while (begin_pos < end_pos)
 268     {
 269         while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
 270             m_CurTag = m_CurTag->GetNextTag();
 271         while (m_CurTextPiece < piecesCnt &&
 272                pieces[m_CurTextPiece].m_pos < begin_pos)
 273             m_CurTextPiece++;
 274
 275         if (m_CurTextPiece < piecesCnt &&
 276             (!m_CurTag ||
 277              pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
 278         {
 279             // Add text:
 280             AddText(GetEntitiesParser()->Parse(
 281                        m_Source.Mid(pieces[m_CurTextPiece].m_pos,
 282                                     pieces[m_CurTextPiece].m_lng)));
 283             begin_pos = pieces[m_CurTextPiece].m_pos +
 284                         pieces[m_CurTextPiece].m_lng;
 285             m_CurTextPiece++;
 286         }
 287         else if (m_CurTag)
 288         {
 289             if (m_CurTag->HasEnding())
 290                 begin_pos = m_CurTag->GetEndPos2();
 291             else
 292                 begin_pos = m_CurTag->GetBeginPos();
 293             wxHtmlTag *t = m_CurTag;
 294             m_CurTag = m_CurTag->GetNextTag();
 295             AddTag(*t);
 296             if (m_stopParsing)
 297                 return;
 298         }
 299         else break;
 300     }
 301 }
 302
 303 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
 304 {
 305     wxHtmlTagHandler *h;
 306     bool inner = false;
 307
 308     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
 309     if (h)
 310     {
 311         inner = h->HandleTag(tag);
 312         if (m_stopParsing)
 313             return;
 314     }
 315     if (!inner)
 316     {
 317         if (tag.HasEnding())
 318             DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
 319     }
 320 }
 321
 322 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
 323 {
 324     wxString s(handler->GetSupportedTags());
 325     wxStringTokenizer tokenizer(s, wxT(", "));
 326
 327     while (tokenizer.HasMoreTokens())
 328         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
 329
 330     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
 331         m_HandlersList.Append(handler);
 332
 333     handler->SetParser(this);
 334 }
 335
 336 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
 337 {
 338     wxStringTokenizer tokenizer(tags, wxT(", "));
 339     wxString key;
 340
 341     if (m_HandlersStack == NULL)
 342     {
 343         m_HandlersStack = new wxList;
 344     }
 345
 346     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
 347
 348     while (tokenizer.HasMoreTokens())
 349     {
 350         key = tokenizer.GetNextToken();
 351         m_HandlersHash.Delete(key);
 352         m_HandlersHash.Put(key, handler);
 353     }
 354 }
 355
 356 void wxHtmlParser::PopTagHandler()
 357 {
 358     wxList::compatibility_iterator first;
 359
 360     if ( !m_HandlersStack ||
 361 #if wxUSE_STL
 362          !(first = m_HandlersStack->GetFirst())
 363 #else // !wxUSE_STL
 364          ((first = m_HandlersStack->GetFirst()) == NULL)
 365 #endif // wxUSE_STL/!wxUSE_STL
 366         )
 367     {
 368         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
 369         return;
 370     }
 371     m_HandlersHash = *((wxHashTable*) first->GetData());
 372     delete (wxHashTable*) first->GetData();
 373     m_HandlersStack->Erase(first);
 374 }
 375
 376 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
 377 {
 378     wxHtmlParserState *s = new wxHtmlParserState;
 379
 380     s->m_curTag = m_CurTag;
 381     s->m_tags = m_Tags;
 382     s->m_textPieces = m_TextPieces;
 383     s->m_curTextPiece = m_CurTextPiece;
 384     s->m_source = m_Source;
 385
 386     s->m_nextState = m_SavedStates;
 387     m_SavedStates = s;
 388
 389     m_CurTag = NULL;
 390     m_Tags = NULL;
 391     m_TextPieces = NULL;
 392     m_CurTextPiece = 0;
 393     m_Source = wxEmptyString;
 394
 395     SetSource(src);
 396 }
 397
 398 bool wxHtmlParser::RestoreState()
 399 {
 400     if (!m_SavedStates) return false;
 401
 402     DestroyDOMTree();
 403
 404     wxHtmlParserState *s = m_SavedStates;
 405     m_SavedStates = s->m_nextState;
 406
 407     m_CurTag = s->m_curTag;
 408     m_Tags = s->m_tags;
 409     m_TextPieces = s->m_textPieces;
 410     m_CurTextPiece = s->m_curTextPiece;
 411     m_Source = s->m_source;
 412
 413     delete s;
 414     return true;
 415 }
 416
 417 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
 418 {
 419     return GetSource()->Mid(tag.GetBeginPos(),
 420                             tag.GetEndPos1() - tag.GetBeginPos());
 421 }
 422
 423 //-----------------------------------------------------------------------------
 424 // wxHtmlTagHandler
 425 //-----------------------------------------------------------------------------
 426
 427 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
 428
 429 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
 430 {
 431     // It is safe to temporarily change the source being parsed,
 432     // provided we restore the state back after parsing
 433     m_Parser->SetSourceAndSaveState(source);
 434     m_Parser->DoParsing();
 435     m_Parser->RestoreState();
 436 }
 437
 438
 439 //-----------------------------------------------------------------------------
 440 // wxHtmlEntitiesParser
 441 //-----------------------------------------------------------------------------
 442
 443 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
 444
 445 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
 446 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 447     : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
 448 #endif
 449 {
 450 }
 451
 452 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
 453 {
 454 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 455     delete m_conv;
 456 #endif
 457 }
 458
 459 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
 460 {
 461 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
 462     if (encoding == m_encoding)
 463         return;
 464
 465     delete m_conv;
 466
 467     m_encoding = encoding;
 468     if (m_encoding == wxFONTENCODING_SYSTEM)
 469         m_conv = NULL;
 470     else
 471         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
 472 #else
 473     (void) encoding;
 474 #endif
 475 }
 476
 477 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
 478 {
 479     wxString output;
 480
 481     const wxString::const_iterator end(input.end());
 482     wxString::const_iterator c(input.begin());
 483     wxString::const_iterator last(c);
 484
 485     for ( ; c < end; ++c )
 486     {
 487         if (*c == wxT('&'))
 488         {
 489             if ( output.empty() )
 490                 output.reserve(input.length());
 491
 492             if (c - last > 0)
 493                 output.append(last, c);
 494             if ( ++c == end )
 495                 break;
 496
 497             wxString entity;
 498             const wxString::const_iterator ent_s = c;
 499             wxChar entity_char;
 500
 501             for (; c != end &&
 502                    ((*c >= wxT('a') && *c <= wxT('z')) ||
 503                     (*c >= wxT('A') && *c <= wxT('Z')) ||
 504                     (*c >= wxT('0') && *c <= wxT('9')) ||
 505                     *c == wxT('_') || *c == wxT('#')); ++c) {}
 506             entity.append(ent_s, c);
 507             if (c == end || *c != wxT(';')) --c;
 508             last = c+1;
 509             entity_char = GetEntityChar(entity);
 510             if (entity_char)
 511                 output << entity_char;
 512             else
 513             {
 514                 output.append(ent_s-1, c+1);
 515                 wxLogTrace(wxTRACE_HTML_DEBUG,
 516                            "Unrecognized HTML entity: '%s'",
 517                            entity);
 518             }
 519         }
 520     }
 521     if ( last == input.begin() ) // common case: no entity
 522         return input;
 523     if ( last != end )
 524         output.append(last, end);
 525     return output;
 526 }
 527
 528 #if !wxUSE_UNICODE
 529 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
 530 {
 531 #if wxUSE_WCHAR_T
 532     char buf[2];
 533     wchar_t wbuf[2];
 534     wbuf[0] = (wchar_t)code;
 535     wbuf[1] = 0;
 536     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
 537     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
 538         return '?';
 539     return buf[0];
 540 #else
 541     return (code < 256) ? (wxChar)code : '?';
 542 #endif
 543 }
 544 #endif
 545
 546 struct wxHtmlEntityInfo
 547 {
 548     const wxStringCharType *name;
 549     unsigned code;
 550 };
 551
 552 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
 553 {
 554 #if wxUSE_UNICODE_UTF8
 555     return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
 556 #else
 557     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
 558 #endif
 559 }
 560
 561 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
 562 {
 563     unsigned code = 0;
 564
 565     if (entity[0] == wxT('#'))
 566     {
 567         // NB: parsed value is a number, so it's OK to use wx_str(), internal
 568         //     representation is the same for numbers
 569         const wxStringCharType *ent_s = entity.wx_str();
 570         const wxStringCharType *format;
 571
 572         if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
 573         {
 574             format = wxSTRING_TEXT("%x");
 575             ent_s++;
 576         }
 577         else
 578             format = wxSTRING_TEXT("%u");
 579         ent_s++;
 580
 581         if (wxSscanf(ent_s, format, &code) != 1)
 582             code = 0;
 583     }
 584     else
 585     {
 586         // store the literals in wx's internal representation (either char*
 587         // in UTF-8 or wchar_t*) for best performance:
 588         #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
 589
 590         static wxHtmlEntityInfo substitutions[] = {
 591             ENTITY("AElig", 198),
 592             ENTITY("Aacute", 193),
 593             ENTITY("Acirc", 194),
 594             ENTITY("Agrave", 192),
 595             ENTITY("Alpha", 913),
 596             ENTITY("Aring", 197),
 597             ENTITY("Atilde", 195),
 598             ENTITY("Auml", 196),
 599             ENTITY("Beta", 914),
 600             ENTITY("Ccedil", 199),
 601             ENTITY("Chi", 935),
 602             ENTITY("Dagger", 8225),
 603             ENTITY("Delta", 916),
 604             ENTITY("ETH", 208),
 605             ENTITY("Eacute", 201),
 606             ENTITY("Ecirc", 202),
 607             ENTITY("Egrave", 200),
 608             ENTITY("Epsilon", 917),
 609             ENTITY("Eta", 919),
 610             ENTITY("Euml", 203),
 611             ENTITY("Gamma", 915),
 612             ENTITY("Iacute", 205),
 613             ENTITY("Icirc", 206),
 614             ENTITY("Igrave", 204),
 615             ENTITY("Iota", 921),
 616             ENTITY("Iuml", 207),
 617             ENTITY("Kappa", 922),
 618             ENTITY("Lambda", 923),
 619             ENTITY("Mu", 924),
 620             ENTITY("Ntilde", 209),
 621             ENTITY("Nu", 925),
 622             ENTITY("OElig", 338),
 623             ENTITY("Oacute", 211),
 624             ENTITY("Ocirc", 212),
 625             ENTITY("Ograve", 210),
 626             ENTITY("Omega", 937),
 627             ENTITY("Omicron", 927),
 628             ENTITY("Oslash", 216),
 629             ENTITY("Otilde", 213),
 630             ENTITY("Ouml", 214),
 631             ENTITY("Phi", 934),
 632             ENTITY("Pi", 928),
 633             ENTITY("Prime", 8243),
 634             ENTITY("Psi", 936),
 635             ENTITY("Rho", 929),
 636             ENTITY("Scaron", 352),
 637             ENTITY("Sigma", 931),
 638             ENTITY("THORN", 222),
 639             ENTITY("Tau", 932),
 640             ENTITY("Theta", 920),
 641             ENTITY("Uacute", 218),
 642             ENTITY("Ucirc", 219),
 643             ENTITY("Ugrave", 217),
 644             ENTITY("Upsilon", 933),
 645             ENTITY("Uuml", 220),
 646             ENTITY("Xi", 926),
 647             ENTITY("Yacute", 221),
 648             ENTITY("Yuml", 376),
 649             ENTITY("Zeta", 918),
 650             ENTITY("aacute", 225),
 651             ENTITY("acirc", 226),
 652             ENTITY("acute", 180),
 653             ENTITY("aelig", 230),
 654             ENTITY("agrave", 224),
 655             ENTITY("alefsym", 8501),
 656             ENTITY("alpha", 945),
 657             ENTITY("amp", 38),
 658             ENTITY("and", 8743),
 659             ENTITY("ang", 8736),
 660             ENTITY("aring", 229),
 661             ENTITY("asymp", 8776),
 662             ENTITY("atilde", 227),
 663             ENTITY("auml", 228),
 664             ENTITY("bdquo", 8222),
 665             ENTITY("beta", 946),
 666             ENTITY("brvbar", 166),
 667             ENTITY("bull", 8226),
 668             ENTITY("cap", 8745),
 669             ENTITY("ccedil", 231),
 670             ENTITY("cedil", 184),
 671             ENTITY("cent", 162),
 672             ENTITY("chi", 967),
 673             ENTITY("circ", 710),
 674             ENTITY("clubs", 9827),
 675             ENTITY("cong", 8773),
 676             ENTITY("copy", 169),
 677             ENTITY("crarr", 8629),
 678             ENTITY("cup", 8746),
 679             ENTITY("curren", 164),
 680             ENTITY("dArr", 8659),
 681             ENTITY("dagger", 8224),
 682             ENTITY("darr", 8595),
 683             ENTITY("deg", 176),
 684             ENTITY("delta", 948),
 685             ENTITY("diams", 9830),
 686             ENTITY("divide", 247),
 687             ENTITY("eacute", 233),
 688             ENTITY("ecirc", 234),
 689             ENTITY("egrave", 232),
 690             ENTITY("empty", 8709),
 691             ENTITY("emsp", 8195),
 692             ENTITY("ensp", 8194),
 693             ENTITY("epsilon", 949),
 694             ENTITY("equiv", 8801),
 695             ENTITY("eta", 951),
 696             ENTITY("eth", 240),
 697             ENTITY("euml", 235),
 698             ENTITY("euro", 8364),
 699             ENTITY("exist", 8707),
 700             ENTITY("fnof", 402),
 701             ENTITY("forall", 8704),
 702             ENTITY("frac12", 189),
 703             ENTITY("frac14", 188),
 704             ENTITY("frac34", 190),
 705             ENTITY("frasl", 8260),
 706             ENTITY("gamma", 947),
 707             ENTITY("ge", 8805),
 708             ENTITY("gt", 62),
 709             ENTITY("hArr", 8660),
 710             ENTITY("harr", 8596),
 711             ENTITY("hearts", 9829),
 712             ENTITY("hellip", 8230),
 713             ENTITY("iacute", 237),
 714             ENTITY("icirc", 238),
 715             ENTITY("iexcl", 161),
 716             ENTITY("igrave", 236),
 717             ENTITY("image", 8465),
 718             ENTITY("infin", 8734),
 719             ENTITY("int", 8747),
 720             ENTITY("iota", 953),
 721             ENTITY("iquest", 191),
 722             ENTITY("isin", 8712),
 723             ENTITY("iuml", 239),
 724             ENTITY("kappa", 954),
 725             ENTITY("lArr", 8656),
 726             ENTITY("lambda", 955),
 727             ENTITY("lang", 9001),
 728             ENTITY("laquo", 171),
 729             ENTITY("larr", 8592),
 730             ENTITY("lceil", 8968),
 731             ENTITY("ldquo", 8220),
 732             ENTITY("le", 8804),
 733             ENTITY("lfloor", 8970),
 734             ENTITY("lowast", 8727),
 735             ENTITY("loz", 9674),
 736             ENTITY("lrm", 8206),
 737             ENTITY("lsaquo", 8249),
 738             ENTITY("lsquo", 8216),
 739             ENTITY("lt", 60),
 740             ENTITY("macr", 175),
 741             ENTITY("mdash", 8212),
 742             ENTITY("micro", 181),
 743             ENTITY("middot", 183),
 744             ENTITY("minus", 8722),
 745             ENTITY("mu", 956),
 746             ENTITY("nabla", 8711),
 747             ENTITY("nbsp", 160),
 748             ENTITY("ndash", 8211),
 749             ENTITY("ne", 8800),
 750             ENTITY("ni", 8715),
 751             ENTITY("not", 172),
 752             ENTITY("notin", 8713),
 753             ENTITY("nsub", 8836),
 754             ENTITY("ntilde", 241),
 755             ENTITY("nu", 957),
 756             ENTITY("oacute", 243),
 757             ENTITY("ocirc", 244),
 758             ENTITY("oelig", 339),
 759             ENTITY("ograve", 242),
 760             ENTITY("oline", 8254),
 761             ENTITY("omega", 969),
 762             ENTITY("omicron", 959),
 763             ENTITY("oplus", 8853),
 764             ENTITY("or", 8744),
 765             ENTITY("ordf", 170),
 766             ENTITY("ordm", 186),
 767             ENTITY("oslash", 248),
 768             ENTITY("otilde", 245),
 769             ENTITY("otimes", 8855),
 770             ENTITY("ouml", 246),
 771             ENTITY("para", 182),
 772             ENTITY("part", 8706),
 773             ENTITY("permil", 8240),
 774             ENTITY("perp", 8869),
 775             ENTITY("phi", 966),
 776             ENTITY("pi", 960),
 777             ENTITY("piv", 982),
 778             ENTITY("plusmn", 177),
 779             ENTITY("pound", 163),
 780             ENTITY("prime", 8242),
 781             ENTITY("prod", 8719),
 782             ENTITY("prop", 8733),
 783             ENTITY("psi", 968),
 784             ENTITY("quot", 34),
 785             ENTITY("rArr", 8658),
 786             ENTITY("radic", 8730),
 787             ENTITY("rang", 9002),
 788             ENTITY("raquo", 187),
 789             ENTITY("rarr", 8594),
 790             ENTITY("rceil", 8969),
 791             ENTITY("rdquo", 8221),
 792             ENTITY("real", 8476),
 793             ENTITY("reg", 174),
 794             ENTITY("rfloor", 8971),
 795             ENTITY("rho", 961),
 796             ENTITY("rlm", 8207),
 797             ENTITY("rsaquo", 8250),
 798             ENTITY("rsquo", 8217),
 799             ENTITY("sbquo", 8218),
 800             ENTITY("scaron", 353),
 801             ENTITY("sdot", 8901),
 802             ENTITY("sect", 167),
 803             ENTITY("shy", 173),
 804             ENTITY("sigma", 963),
 805             ENTITY("sigmaf", 962),
 806             ENTITY("sim", 8764),
 807             ENTITY("spades", 9824),
 808             ENTITY("sub", 8834),
 809             ENTITY("sube", 8838),
 810             ENTITY("sum", 8721),
 811             ENTITY("sup", 8835),
 812             ENTITY("sup1", 185),
 813             ENTITY("sup2", 178),
 814             ENTITY("sup3", 179),
 815             ENTITY("supe", 8839),
 816             ENTITY("szlig", 223),
 817             ENTITY("tau", 964),
 818             ENTITY("there4", 8756),
 819             ENTITY("theta", 952),
 820             ENTITY("thetasym", 977),
 821             ENTITY("thinsp", 8201),
 822             ENTITY("thorn", 254),
 823             ENTITY("tilde", 732),
 824             ENTITY("times", 215),
 825             ENTITY("trade", 8482),
 826             ENTITY("uArr", 8657),
 827             ENTITY("uacute", 250),
 828             ENTITY("uarr", 8593),
 829             ENTITY("ucirc", 251),
 830             ENTITY("ugrave", 249),
 831             ENTITY("uml", 168),
 832             ENTITY("upsih", 978),
 833             ENTITY("upsilon", 965),
 834             ENTITY("uuml", 252),
 835             ENTITY("weierp", 8472),
 836             ENTITY("xi", 958),
 837             ENTITY("yacute", 253),
 838             ENTITY("yen", 165),
 839             ENTITY("yuml", 255),
 840             ENTITY("zeta", 950),
 841             ENTITY("zwj", 8205),
 842             ENTITY("zwnj", 8204),
 843             {NULL, 0}};
 844         #undef ENTITY
 845         static size_t substitutions_cnt = 0;
 846
 847         if (substitutions_cnt == 0)
 848             while (substitutions[substitutions_cnt].code != 0)
 849                 substitutions_cnt++;
 850
 851         wxHtmlEntityInfo *info = NULL;
 852 #ifdef __WXWINCE__
 853         // bsearch crashes under WinCE for some reason
 854         size_t i;
 855         for (i = 0; i < substitutions_cnt; i++)
 856         {
 857             if (entity == substitutions[i].name)
 858             {
 859                 info = & substitutions[i];
 860                 break;
 861             }
 862         }
 863 #else
 864         info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
 865                                            substitutions_cnt,
 866                                            sizeof(wxHtmlEntityInfo),
 867                                            wxHtmlEntityCompare);
 868 #endif
 869         if (info)
 870             code = info->code;
 871     }
 872
 873     if (code == 0)
 874         return 0;
 875     else
 876         return GetCharForCode(code);
 877 }
 878
 879 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
 880                                 const wxString& url) const
 881 {
 882     return m_FS ? m_FS->OpenFile(url) : NULL;
 883
 884 }
 885
 886
 887 //-----------------------------------------------------------------------------
 888 // wxHtmlParser::ExtractCharsetInformation
 889 //-----------------------------------------------------------------------------
 890
 891 class wxMetaTagParser : public wxHtmlParser
 892 {
 893 public:
 894     wxMetaTagParser() { }
 895
 896     wxObject* GetProduct() { return NULL; }
 897
 898 protected:
 899     virtual void AddText(const wxString& WXUNUSED(txt)) {}
 900
 901     DECLARE_NO_COPY_CLASS(wxMetaTagParser)
 902 };
 903
 904 class wxMetaTagHandler : public wxHtmlTagHandler
 905 {
 906 public:
 907     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
 908     wxString GetSupportedTags() { return wxT("META,BODY"); }
 909     bool HandleTag(const wxHtmlTag& tag);
 910
 911 private:
 912     wxString *m_retval;
 913
 914     DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
 915 };
 916
 917 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
 918 {
 919     if (tag.GetName() == _T("BODY"))
 920     {
 921         m_Parser->StopParsing();
 922         return false;
 923     }
 924
 925     if (tag.HasParam(_T("HTTP-EQUIV")) &&
 926         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
 927         tag.HasParam(_T("CONTENT")))
 928     {
 929         wxString content = tag.GetParam(_T("CONTENT")).Lower();
 930         if (content.Left(19) == _T("text/html; charset="))
 931         {
 932             *m_retval = content.Mid(19);
 933             m_Parser->StopParsing();
 934         }
 935     }
 936     return false;
 937 }
 938
 939
 940 /*static*/
 941 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
 942 {
 943     wxString charset;
 944     wxMetaTagParser *parser = new wxMetaTagParser();
 945     if(parser)
 946     {
 947         parser->AddTagHandler(new wxMetaTagHandler(&charset));
 948         parser->Parse(markup);
 949         delete parser;
 950     }
 951     return charset;
 952 }
 953
 954 /* static */
 955 bool
 956 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
 957                              wxString::const_iterator end)
 958 {
 959     wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
 960
 961     wxString::const_iterator p = start;
 962
 963     // comments begin with "<!--" in HTML 4.0
 964     if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
 965     {
 966         // not a comment at all
 967         return false;
 968     }
 969
 970     // skip the start of the comment tag in any case, if we don't find the
 971     // closing tag we should ignore broken markup
 972     start = p;
 973
 974     // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
 975     // comment delimiter and the closing tag character (section 3.2.4 of
 976     // http://www.w3.org/TR/html401/)
 977     int dashes = 0;
 978     while ( ++p < end )
 979     {
 980         const wxChar c = *p;
 981
 982         if ( (c == wxT(' ') || c == wxT('\n') ||
 983               c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
 984         {
 985             // ignore white space before potential tag end
 986             continue;
 987         }
 988
 989         if ( c == wxT('>') && dashes >= 2 )
 990         {
 991             // found end of comment
 992             start = p;
 993             break;
 994         }
 995
 996         if ( c == wxT('-') )
 997             dashes++;
 998         else
 999             dashes = 0;
1000     }
1001
1002     return true;
1003 }
1004
1005 #endif // wxUSE_HTML