rewrote wxHtmlEntitiesParser::Parse() using iterators, optimized for the common case...
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/arrimpl.cpp"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
// One run of plain text inside the parsed source, described by the offset
// of its first character and by its length.
class wxHtmlTextPiece
{
public:
    wxHtmlTextPiece(int pos, int lng)
        : m_pos(pos),
          m_lng(lng)
    {
    }

    int m_pos;  // offset of the first character of the text
    int m_lng;  // length of the text
};
53
54 WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
55 WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
56
57 class wxHtmlParserState
58 {
59 public:
60 wxHtmlTag *m_curTag;
61 wxHtmlTag *m_tags;
62 wxHtmlTextPieces *m_textPieces;
63 int m_curTextPiece;
64 wxString m_source;
65 wxHtmlParserState *m_nextState;
66 };
67
68 //-----------------------------------------------------------------------------
69 // wxHtmlParser
70 //-----------------------------------------------------------------------------
71
72 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
73
74 wxHtmlParser::wxHtmlParser()
75 : wxObject(), m_HandlersHash(wxKEY_STRING),
76 m_FS(NULL), m_HandlersStack(NULL)
77 {
78 m_entitiesParser = new wxHtmlEntitiesParser;
79 m_Tags = NULL;
80 m_CurTag = NULL;
81 m_TextPieces = NULL;
82 m_CurTextPiece = 0;
83 m_SavedStates = NULL;
84 }
85
86 wxHtmlParser::~wxHtmlParser()
87 {
88 while (RestoreState()) {}
89 DestroyDOMTree();
90
91 if (m_HandlersStack)
92 {
93 wxList& tmp = *m_HandlersStack;
94 wxList::iterator it, en;
95 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
96 delete (wxHashTable*)*it;
97 tmp.clear();
98 }
99 delete m_HandlersStack;
100 m_HandlersHash.Clear();
101 WX_CLEAR_LIST(wxList, m_HandlersList);
102 delete m_entitiesParser;
103 }
104
105 wxObject* wxHtmlParser::Parse(const wxString& source)
106 {
107 InitParser(source);
108 DoParsing();
109 wxObject *result = GetProduct();
110 DoneParser();
111 return result;
112 }
113
114 void wxHtmlParser::InitParser(const wxString& source)
115 {
116 SetSource(source);
117 m_stopParsing = false;
118 }
119
120 void wxHtmlParser::DoneParser()
121 {
122 DestroyDOMTree();
123 }
124
125 void wxHtmlParser::SetSource(const wxString& src)
126 {
127 DestroyDOMTree();
128 m_Source = src;
129 CreateDOMTree();
130 m_CurTag = NULL;
131 m_CurTextPiece = 0;
132 }
133
134 void wxHtmlParser::CreateDOMTree()
135 {
136 wxHtmlTagsCache cache(m_Source);
137 m_TextPieces = new wxHtmlTextPieces;
138 CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
139 m_CurTextPiece = 0;
140 }
141
142 extern bool wxIsCDATAElement(const wxChar *tag);
143
144 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
145 int begin_pos, int end_pos,
146 wxHtmlTagsCache *cache)
147 {
148 if (end_pos <= begin_pos) return;
149
150 wxChar c;
151 int i = begin_pos;
152 int textBeginning = begin_pos;
153
154 // If the tag contains CDATA text, we include the text between beginning
155 // and ending tag verbosely. Setting i=end_pos will skip to the very
156 // end of this function where text piece is added, bypassing any child
157 // tags parsing (CDATA element can't have child elements by definition):
158 if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
159 {
160 i = end_pos;
161 }
162
163 while (i < end_pos)
164 {
165 c = m_Source.GetChar(i);
166
167 if (c == wxT('<'))
168 {
169 // add text to m_TextPieces:
170 if (i - textBeginning > 0)
171 m_TextPieces->Add(
172 wxHtmlTextPiece(textBeginning, i - textBeginning));
173
174 // if it is a comment, skip it:
175 wxString::const_iterator iter = m_Source.begin() + i;
176 if ( SkipCommentTag(iter, m_Source.end()) )
177 {
178 textBeginning =
179 i = iter - m_Source.begin() + 1; // skip closing '>' too
180 }
181
182 // add another tag to the tree:
183 else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
184 {
185 wxHtmlTag *chd;
186 if (cur)
187 chd = new wxHtmlTag(cur, m_Source,
188 i, end_pos, cache, m_entitiesParser);
189 else
190 {
191 chd = new wxHtmlTag(NULL, m_Source,
192 i, end_pos, cache, m_entitiesParser);
193 if (!m_Tags)
194 {
195 // if this is the first tag to be created make the root
196 // m_Tags point to it:
197 m_Tags = chd;
198 }
199 else
200 {
201 // if there is already a root tag add this tag as
202 // the last sibling:
203 chd->m_Prev = m_Tags->GetLastSibling();
204 chd->m_Prev->m_Next = chd;
205 }
206 }
207
208 if (chd->HasEnding())
209 {
210 CreateDOMSubTree(chd,
211 chd->GetBeginPos(), chd->GetEndPos1(),
212 cache);
213 i = chd->GetEndPos2();
214 }
215 else
216 i = chd->GetBeginPos();
217
218 textBeginning = i;
219 }
220
221 // ... or skip ending tag:
222 else
223 {
224 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
225 textBeginning = i+1;
226 }
227 }
228 else i++;
229 }
230
231 // add remaining text to m_TextPieces:
232 if (end_pos - textBeginning > 0)
233 m_TextPieces->Add(
234 wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
235 }
236
237 void wxHtmlParser::DestroyDOMTree()
238 {
239 wxHtmlTag *t1, *t2;
240 t1 = m_Tags;
241 while (t1)
242 {
243 t2 = t1->GetNextSibling();
244 delete t1;
245 t1 = t2;
246 }
247 m_Tags = m_CurTag = NULL;
248
249 delete m_TextPieces;
250 m_TextPieces = NULL;
251 }
252
253 void wxHtmlParser::DoParsing()
254 {
255 m_CurTag = m_Tags;
256 m_CurTextPiece = 0;
257 DoParsing(0, m_Source.length());
258 }
259
260 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
261 {
262 if (end_pos <= begin_pos) return;
263
264 wxHtmlTextPieces& pieces = *m_TextPieces;
265 size_t piecesCnt = pieces.GetCount();
266
267 while (begin_pos < end_pos)
268 {
269 while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
270 m_CurTag = m_CurTag->GetNextTag();
271 while (m_CurTextPiece < piecesCnt &&
272 pieces[m_CurTextPiece].m_pos < begin_pos)
273 m_CurTextPiece++;
274
275 if (m_CurTextPiece < piecesCnt &&
276 (!m_CurTag ||
277 pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
278 {
279 // Add text:
280 AddText(GetEntitiesParser()->Parse(
281 m_Source.Mid(pieces[m_CurTextPiece].m_pos,
282 pieces[m_CurTextPiece].m_lng)));
283 begin_pos = pieces[m_CurTextPiece].m_pos +
284 pieces[m_CurTextPiece].m_lng;
285 m_CurTextPiece++;
286 }
287 else if (m_CurTag)
288 {
289 if (m_CurTag->HasEnding())
290 begin_pos = m_CurTag->GetEndPos2();
291 else
292 begin_pos = m_CurTag->GetBeginPos();
293 wxHtmlTag *t = m_CurTag;
294 m_CurTag = m_CurTag->GetNextTag();
295 AddTag(*t);
296 if (m_stopParsing)
297 return;
298 }
299 else break;
300 }
301 }
302
303 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
304 {
305 wxHtmlTagHandler *h;
306 bool inner = false;
307
308 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
309 if (h)
310 {
311 inner = h->HandleTag(tag);
312 if (m_stopParsing)
313 return;
314 }
315 if (!inner)
316 {
317 if (tag.HasEnding())
318 DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
319 }
320 }
321
322 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
323 {
324 wxString s(handler->GetSupportedTags());
325 wxStringTokenizer tokenizer(s, wxT(", "));
326
327 while (tokenizer.HasMoreTokens())
328 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
329
330 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
331 m_HandlersList.Append(handler);
332
333 handler->SetParser(this);
334 }
335
336 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
337 {
338 wxStringTokenizer tokenizer(tags, wxT(", "));
339 wxString key;
340
341 if (m_HandlersStack == NULL)
342 {
343 m_HandlersStack = new wxList;
344 }
345
346 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
347
348 while (tokenizer.HasMoreTokens())
349 {
350 key = tokenizer.GetNextToken();
351 m_HandlersHash.Delete(key);
352 m_HandlersHash.Put(key, handler);
353 }
354 }
355
356 void wxHtmlParser::PopTagHandler()
357 {
358 wxList::compatibility_iterator first;
359
360 if ( !m_HandlersStack ||
361 #if wxUSE_STL
362 !(first = m_HandlersStack->GetFirst())
363 #else // !wxUSE_STL
364 ((first = m_HandlersStack->GetFirst()) == NULL)
365 #endif // wxUSE_STL/!wxUSE_STL
366 )
367 {
368 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
369 return;
370 }
371 m_HandlersHash = *((wxHashTable*) first->GetData());
372 delete (wxHashTable*) first->GetData();
373 m_HandlersStack->Erase(first);
374 }
375
376 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
377 {
378 wxHtmlParserState *s = new wxHtmlParserState;
379
380 s->m_curTag = m_CurTag;
381 s->m_tags = m_Tags;
382 s->m_textPieces = m_TextPieces;
383 s->m_curTextPiece = m_CurTextPiece;
384 s->m_source = m_Source;
385
386 s->m_nextState = m_SavedStates;
387 m_SavedStates = s;
388
389 m_CurTag = NULL;
390 m_Tags = NULL;
391 m_TextPieces = NULL;
392 m_CurTextPiece = 0;
393 m_Source = wxEmptyString;
394
395 SetSource(src);
396 }
397
398 bool wxHtmlParser::RestoreState()
399 {
400 if (!m_SavedStates) return false;
401
402 DestroyDOMTree();
403
404 wxHtmlParserState *s = m_SavedStates;
405 m_SavedStates = s->m_nextState;
406
407 m_CurTag = s->m_curTag;
408 m_Tags = s->m_tags;
409 m_TextPieces = s->m_textPieces;
410 m_CurTextPiece = s->m_curTextPiece;
411 m_Source = s->m_source;
412
413 delete s;
414 return true;
415 }
416
417 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
418 {
419 return GetSource()->Mid(tag.GetBeginPos(),
420 tag.GetEndPos1() - tag.GetBeginPos());
421 }
422
423 //-----------------------------------------------------------------------------
424 // wxHtmlTagHandler
425 //-----------------------------------------------------------------------------
426
427 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
428
429 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
430 {
431 // It is safe to temporarily change the source being parsed,
432 // provided we restore the state back after parsing
433 m_Parser->SetSourceAndSaveState(source);
434 m_Parser->DoParsing();
435 m_Parser->RestoreState();
436 }
437
438
439 //-----------------------------------------------------------------------------
440 // wxHtmlEntitiesParser
441 //-----------------------------------------------------------------------------
442
443 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
444
445 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
446 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
447 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
448 #endif
449 {
450 }
451
452 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
453 {
454 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
455 delete m_conv;
456 #endif
457 }
458
459 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
460 {
461 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
462 if (encoding == m_encoding)
463 return;
464
465 delete m_conv;
466
467 m_encoding = encoding;
468 if (m_encoding == wxFONTENCODING_SYSTEM)
469 m_conv = NULL;
470 else
471 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
472 #else
473 (void) encoding;
474 #endif
475 }
476
477 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
478 {
479 wxString output;
480
481 const wxString::const_iterator end(input.end());
482 wxString::const_iterator c(input.begin());
483 wxString::const_iterator last(c);
484
485 for ( ; c < end; ++c )
486 {
487 if (*c == wxT('&'))
488 {
489 if ( output.empty() )
490 output.reserve(input.length());
491
492 if (c - last > 0)
493 output.append(last, c);
494 if ( ++c == end )
495 break;
496
497 wxString entity;
498 const wxString::const_iterator ent_s = c;
499 wxChar entity_char;
500
501 for (; c != end &&
502 ((*c >= wxT('a') && *c <= wxT('z')) ||
503 (*c >= wxT('A') && *c <= wxT('Z')) ||
504 (*c >= wxT('0') && *c <= wxT('9')) ||
505 *c == wxT('_') || *c == wxT('#')); ++c) {}
506 entity.append(ent_s, c);
507 if (c == end || *c != wxT(';')) --c;
508 last = c+1;
509 entity_char = GetEntityChar(entity);
510 if (entity_char)
511 output << entity_char;
512 else
513 {
514 output.append(ent_s-1, c+1);
515 wxLogTrace(wxTRACE_HTML_DEBUG,
516 "Unrecognized HTML entity: '%s'",
517 entity);
518 }
519 }
520 }
521 if ( last == input.begin() ) // common case: no entity
522 return input;
523 if ( last != end )
524 output.append(last, end);
525 return output;
526 }
527
528 #if !wxUSE_UNICODE
529 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
530 {
531 #if wxUSE_WCHAR_T
532 char buf[2];
533 wchar_t wbuf[2];
534 wbuf[0] = (wchar_t)code;
535 wbuf[1] = 0;
536 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
537 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
538 return '?';
539 return buf[0];
540 #else
541 return (code < 256) ? (wxChar)code : '?';
542 #endif
543 }
544 #endif
545
546 struct wxHtmlEntityInfo
547 {
548 const wxStringCharType *name;
549 unsigned code;
550 };
551
552 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
553 {
554 #if wxUSE_UNICODE_UTF8
555 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
556 #else
557 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
558 #endif
559 }
560
561 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
562 {
563 unsigned code = 0;
564
565 if (entity[0] == wxT('#'))
566 {
567 // NB: parsed value is a number, so it's OK to use wx_str(), internal
568 // representation is the same for numbers
569 const wxStringCharType *ent_s = entity.wx_str();
570 const wxStringCharType *format;
571
572 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
573 {
574 format = wxSTRING_TEXT("%x");
575 ent_s++;
576 }
577 else
578 format = wxSTRING_TEXT("%u");
579 ent_s++;
580
581 if (wxSscanf(ent_s, format, &code) != 1)
582 code = 0;
583 }
584 else
585 {
586 // store the literals in wx's internal representation (either char*
587 // in UTF-8 or wchar_t*) for best performance:
588 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
589
590 static wxHtmlEntityInfo substitutions[] = {
591 ENTITY("AElig", 198),
592 ENTITY("Aacute", 193),
593 ENTITY("Acirc", 194),
594 ENTITY("Agrave", 192),
595 ENTITY("Alpha", 913),
596 ENTITY("Aring", 197),
597 ENTITY("Atilde", 195),
598 ENTITY("Auml", 196),
599 ENTITY("Beta", 914),
600 ENTITY("Ccedil", 199),
601 ENTITY("Chi", 935),
602 ENTITY("Dagger", 8225),
603 ENTITY("Delta", 916),
604 ENTITY("ETH", 208),
605 ENTITY("Eacute", 201),
606 ENTITY("Ecirc", 202),
607 ENTITY("Egrave", 200),
608 ENTITY("Epsilon", 917),
609 ENTITY("Eta", 919),
610 ENTITY("Euml", 203),
611 ENTITY("Gamma", 915),
612 ENTITY("Iacute", 205),
613 ENTITY("Icirc", 206),
614 ENTITY("Igrave", 204),
615 ENTITY("Iota", 921),
616 ENTITY("Iuml", 207),
617 ENTITY("Kappa", 922),
618 ENTITY("Lambda", 923),
619 ENTITY("Mu", 924),
620 ENTITY("Ntilde", 209),
621 ENTITY("Nu", 925),
622 ENTITY("OElig", 338),
623 ENTITY("Oacute", 211),
624 ENTITY("Ocirc", 212),
625 ENTITY("Ograve", 210),
626 ENTITY("Omega", 937),
627 ENTITY("Omicron", 927),
628 ENTITY("Oslash", 216),
629 ENTITY("Otilde", 213),
630 ENTITY("Ouml", 214),
631 ENTITY("Phi", 934),
632 ENTITY("Pi", 928),
633 ENTITY("Prime", 8243),
634 ENTITY("Psi", 936),
635 ENTITY("Rho", 929),
636 ENTITY("Scaron", 352),
637 ENTITY("Sigma", 931),
638 ENTITY("THORN", 222),
639 ENTITY("Tau", 932),
640 ENTITY("Theta", 920),
641 ENTITY("Uacute", 218),
642 ENTITY("Ucirc", 219),
643 ENTITY("Ugrave", 217),
644 ENTITY("Upsilon", 933),
645 ENTITY("Uuml", 220),
646 ENTITY("Xi", 926),
647 ENTITY("Yacute", 221),
648 ENTITY("Yuml", 376),
649 ENTITY("Zeta", 918),
650 ENTITY("aacute", 225),
651 ENTITY("acirc", 226),
652 ENTITY("acute", 180),
653 ENTITY("aelig", 230),
654 ENTITY("agrave", 224),
655 ENTITY("alefsym", 8501),
656 ENTITY("alpha", 945),
657 ENTITY("amp", 38),
658 ENTITY("and", 8743),
659 ENTITY("ang", 8736),
660 ENTITY("aring", 229),
661 ENTITY("asymp", 8776),
662 ENTITY("atilde", 227),
663 ENTITY("auml", 228),
664 ENTITY("bdquo", 8222),
665 ENTITY("beta", 946),
666 ENTITY("brvbar", 166),
667 ENTITY("bull", 8226),
668 ENTITY("cap", 8745),
669 ENTITY("ccedil", 231),
670 ENTITY("cedil", 184),
671 ENTITY("cent", 162),
672 ENTITY("chi", 967),
673 ENTITY("circ", 710),
674 ENTITY("clubs", 9827),
675 ENTITY("cong", 8773),
676 ENTITY("copy", 169),
677 ENTITY("crarr", 8629),
678 ENTITY("cup", 8746),
679 ENTITY("curren", 164),
680 ENTITY("dArr", 8659),
681 ENTITY("dagger", 8224),
682 ENTITY("darr", 8595),
683 ENTITY("deg", 176),
684 ENTITY("delta", 948),
685 ENTITY("diams", 9830),
686 ENTITY("divide", 247),
687 ENTITY("eacute", 233),
688 ENTITY("ecirc", 234),
689 ENTITY("egrave", 232),
690 ENTITY("empty", 8709),
691 ENTITY("emsp", 8195),
692 ENTITY("ensp", 8194),
693 ENTITY("epsilon", 949),
694 ENTITY("equiv", 8801),
695 ENTITY("eta", 951),
696 ENTITY("eth", 240),
697 ENTITY("euml", 235),
698 ENTITY("euro", 8364),
699 ENTITY("exist", 8707),
700 ENTITY("fnof", 402),
701 ENTITY("forall", 8704),
702 ENTITY("frac12", 189),
703 ENTITY("frac14", 188),
704 ENTITY("frac34", 190),
705 ENTITY("frasl", 8260),
706 ENTITY("gamma", 947),
707 ENTITY("ge", 8805),
708 ENTITY("gt", 62),
709 ENTITY("hArr", 8660),
710 ENTITY("harr", 8596),
711 ENTITY("hearts", 9829),
712 ENTITY("hellip", 8230),
713 ENTITY("iacute", 237),
714 ENTITY("icirc", 238),
715 ENTITY("iexcl", 161),
716 ENTITY("igrave", 236),
717 ENTITY("image", 8465),
718 ENTITY("infin", 8734),
719 ENTITY("int", 8747),
720 ENTITY("iota", 953),
721 ENTITY("iquest", 191),
722 ENTITY("isin", 8712),
723 ENTITY("iuml", 239),
724 ENTITY("kappa", 954),
725 ENTITY("lArr", 8656),
726 ENTITY("lambda", 955),
727 ENTITY("lang", 9001),
728 ENTITY("laquo", 171),
729 ENTITY("larr", 8592),
730 ENTITY("lceil", 8968),
731 ENTITY("ldquo", 8220),
732 ENTITY("le", 8804),
733 ENTITY("lfloor", 8970),
734 ENTITY("lowast", 8727),
735 ENTITY("loz", 9674),
736 ENTITY("lrm", 8206),
737 ENTITY("lsaquo", 8249),
738 ENTITY("lsquo", 8216),
739 ENTITY("lt", 60),
740 ENTITY("macr", 175),
741 ENTITY("mdash", 8212),
742 ENTITY("micro", 181),
743 ENTITY("middot", 183),
744 ENTITY("minus", 8722),
745 ENTITY("mu", 956),
746 ENTITY("nabla", 8711),
747 ENTITY("nbsp", 160),
748 ENTITY("ndash", 8211),
749 ENTITY("ne", 8800),
750 ENTITY("ni", 8715),
751 ENTITY("not", 172),
752 ENTITY("notin", 8713),
753 ENTITY("nsub", 8836),
754 ENTITY("ntilde", 241),
755 ENTITY("nu", 957),
756 ENTITY("oacute", 243),
757 ENTITY("ocirc", 244),
758 ENTITY("oelig", 339),
759 ENTITY("ograve", 242),
760 ENTITY("oline", 8254),
761 ENTITY("omega", 969),
762 ENTITY("omicron", 959),
763 ENTITY("oplus", 8853),
764 ENTITY("or", 8744),
765 ENTITY("ordf", 170),
766 ENTITY("ordm", 186),
767 ENTITY("oslash", 248),
768 ENTITY("otilde", 245),
769 ENTITY("otimes", 8855),
770 ENTITY("ouml", 246),
771 ENTITY("para", 182),
772 ENTITY("part", 8706),
773 ENTITY("permil", 8240),
774 ENTITY("perp", 8869),
775 ENTITY("phi", 966),
776 ENTITY("pi", 960),
777 ENTITY("piv", 982),
778 ENTITY("plusmn", 177),
779 ENTITY("pound", 163),
780 ENTITY("prime", 8242),
781 ENTITY("prod", 8719),
782 ENTITY("prop", 8733),
783 ENTITY("psi", 968),
784 ENTITY("quot", 34),
785 ENTITY("rArr", 8658),
786 ENTITY("radic", 8730),
787 ENTITY("rang", 9002),
788 ENTITY("raquo", 187),
789 ENTITY("rarr", 8594),
790 ENTITY("rceil", 8969),
791 ENTITY("rdquo", 8221),
792 ENTITY("real", 8476),
793 ENTITY("reg", 174),
794 ENTITY("rfloor", 8971),
795 ENTITY("rho", 961),
796 ENTITY("rlm", 8207),
797 ENTITY("rsaquo", 8250),
798 ENTITY("rsquo", 8217),
799 ENTITY("sbquo", 8218),
800 ENTITY("scaron", 353),
801 ENTITY("sdot", 8901),
802 ENTITY("sect", 167),
803 ENTITY("shy", 173),
804 ENTITY("sigma", 963),
805 ENTITY("sigmaf", 962),
806 ENTITY("sim", 8764),
807 ENTITY("spades", 9824),
808 ENTITY("sub", 8834),
809 ENTITY("sube", 8838),
810 ENTITY("sum", 8721),
811 ENTITY("sup", 8835),
812 ENTITY("sup1", 185),
813 ENTITY("sup2", 178),
814 ENTITY("sup3", 179),
815 ENTITY("supe", 8839),
816 ENTITY("szlig", 223),
817 ENTITY("tau", 964),
818 ENTITY("there4", 8756),
819 ENTITY("theta", 952),
820 ENTITY("thetasym", 977),
821 ENTITY("thinsp", 8201),
822 ENTITY("thorn", 254),
823 ENTITY("tilde", 732),
824 ENTITY("times", 215),
825 ENTITY("trade", 8482),
826 ENTITY("uArr", 8657),
827 ENTITY("uacute", 250),
828 ENTITY("uarr", 8593),
829 ENTITY("ucirc", 251),
830 ENTITY("ugrave", 249),
831 ENTITY("uml", 168),
832 ENTITY("upsih", 978),
833 ENTITY("upsilon", 965),
834 ENTITY("uuml", 252),
835 ENTITY("weierp", 8472),
836 ENTITY("xi", 958),
837 ENTITY("yacute", 253),
838 ENTITY("yen", 165),
839 ENTITY("yuml", 255),
840 ENTITY("zeta", 950),
841 ENTITY("zwj", 8205),
842 ENTITY("zwnj", 8204),
843 {NULL, 0}};
844 #undef ENTITY
845 static size_t substitutions_cnt = 0;
846
847 if (substitutions_cnt == 0)
848 while (substitutions[substitutions_cnt].code != 0)
849 substitutions_cnt++;
850
851 wxHtmlEntityInfo *info = NULL;
852 #ifdef __WXWINCE__
853 // bsearch crashes under WinCE for some reason
854 size_t i;
855 for (i = 0; i < substitutions_cnt; i++)
856 {
857 if (entity == substitutions[i].name)
858 {
859 info = & substitutions[i];
860 break;
861 }
862 }
863 #else
864 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
865 substitutions_cnt,
866 sizeof(wxHtmlEntityInfo),
867 wxHtmlEntityCompare);
868 #endif
869 if (info)
870 code = info->code;
871 }
872
873 if (code == 0)
874 return 0;
875 else
876 return GetCharForCode(code);
877 }
878
879 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
880 const wxString& url) const
881 {
882 return m_FS ? m_FS->OpenFile(url) : NULL;
883
884 }
885
886
887 //-----------------------------------------------------------------------------
888 // wxHtmlParser::ExtractCharsetInformation
889 //-----------------------------------------------------------------------------
890
891 class wxMetaTagParser : public wxHtmlParser
892 {
893 public:
894 wxMetaTagParser() { }
895
896 wxObject* GetProduct() { return NULL; }
897
898 protected:
899 virtual void AddText(const wxString& WXUNUSED(txt)) {}
900
901 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
902 };
903
904 class wxMetaTagHandler : public wxHtmlTagHandler
905 {
906 public:
907 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
908 wxString GetSupportedTags() { return wxT("META,BODY"); }
909 bool HandleTag(const wxHtmlTag& tag);
910
911 private:
912 wxString *m_retval;
913
914 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
915 };
916
917 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
918 {
919 if (tag.GetName() == _T("BODY"))
920 {
921 m_Parser->StopParsing();
922 return false;
923 }
924
925 if (tag.HasParam(_T("HTTP-EQUIV")) &&
926 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
927 tag.HasParam(_T("CONTENT")))
928 {
929 wxString content = tag.GetParam(_T("CONTENT")).Lower();
930 if (content.Left(19) == _T("text/html; charset="))
931 {
932 *m_retval = content.Mid(19);
933 m_Parser->StopParsing();
934 }
935 }
936 return false;
937 }
938
939
940 /*static*/
941 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
942 {
943 wxString charset;
944 wxMetaTagParser *parser = new wxMetaTagParser();
945 if(parser)
946 {
947 parser->AddTagHandler(new wxMetaTagHandler(&charset));
948 parser->Parse(markup);
949 delete parser;
950 }
951 return charset;
952 }
953
954 /* static */
955 bool
956 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
957 wxString::const_iterator end)
958 {
959 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
960
961 wxString::const_iterator p = start;
962
963 // comments begin with "<!--" in HTML 4.0
964 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
965 {
966 // not a comment at all
967 return false;
968 }
969
970 // skip the start of the comment tag in any case, if we don't find the
971 // closing tag we should ignore broken markup
972 start = p;
973
974 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
975 // comment delimiter and the closing tag character (section 3.2.4 of
976 // http://www.w3.org/TR/html401/)
977 int dashes = 0;
978 while ( ++p < end )
979 {
980 const wxChar c = *p;
981
982 if ( (c == wxT(' ') || c == wxT('\n') ||
983 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
984 {
985 // ignore white space before potential tag end
986 continue;
987 }
988
989 if ( c == wxT('>') && dashes >= 2 )
990 {
991 // found end of comment
992 start = p;
993 break;
994 }
995
996 if ( c == wxT('-') )
997 dashes++;
998 else
999 dashes = 0;
1000 }
1001
1002 return true;
1003 }
1004
1005 #endif // wxUSE_HTML