fixed memory corruption in wxHTML when parsing '&;' in the markup
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
47 class wxHtmlTextPiece
48 {
49 public:
50 wxHtmlTextPiece() {}
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
55 };
56
57 // NB: this is an empty class and not typedef because of forward declaration
58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59 {
60 };
61
62 class wxHtmlParserState
63 {
64 public:
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
69 const wxString *m_source;
70 wxHtmlParserState *m_nextState;
71 };
72
73 //-----------------------------------------------------------------------------
74 // wxHtmlParser
75 //-----------------------------------------------------------------------------
76
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
79 wxHtmlParser::wxHtmlParser()
80 : wxObject(),
81 m_FS(NULL)
82 {
83 m_Source = NULL;
84 m_entitiesParser = new wxHtmlEntitiesParser;
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
90 }
91
92 wxHtmlParser::~wxHtmlParser()
93 {
94 while (RestoreState()) {}
95 DestroyDOMTree();
96
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
99 delete m_entitiesParser;
100 delete m_Source;
101 }
102
103 wxObject* wxHtmlParser::Parse(const wxString& source)
104 {
105 InitParser(source);
106 DoParsing();
107 wxObject *result = GetProduct();
108 DoneParser();
109 return result;
110 }
111
112 void wxHtmlParser::InitParser(const wxString& source)
113 {
114 SetSource(source);
115 m_stopParsing = false;
116 }
117
118 void wxHtmlParser::DoneParser()
119 {
120 DestroyDOMTree();
121 }
122
123 void wxHtmlParser::SetSource(const wxString& src)
124 {
125 DestroyDOMTree();
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
133 delete m_Source;
134 m_Source = new wxString(src);
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
138 }
139
140 void wxHtmlParser::CreateDOMTree()
141 {
142 wxHtmlTagsCache cache(*m_Source);
143 m_TextPieces = new wxHtmlTextPieces;
144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
145 m_CurTextPiece = 0;
146 }
147
148 extern bool wxIsCDATAElement(const wxString& tag);
149
150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
153 wxHtmlTagsCache *cache)
154 {
155 if (end_pos <= begin_pos)
156 return;
157
158 wxChar c;
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
161
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
167 {
168 i = end_pos;
169 }
170
171 while (i < end_pos)
172 {
173 c = *i;
174
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
180
181 // if it is a comment, skip it:
182 if ( SkipCommentTag(i, m_Source->end()) )
183 {
184 textBeginning = i = i + 1; // skip closing '>' too
185 }
186
187 // add another tag to the tree:
188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
189 {
190 wxHtmlTag *chd;
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
193 i, end_pos, cache, m_entitiesParser);
194 else
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
198 if (!m_Tags)
199 {
200 // if this is the first tag to be created make the root
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
206 // if there is already a root tag add this tag as
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
216 chd->GetBeginIter(), chd->GetEndIter1(),
217 cache);
218 i = chd->GetEndIter2();
219 }
220 else
221 i = chd->GetBeginIter();
222
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
227 else
228 {
229 while (i < end_pos && *i != wxT('>')) ++i;
230 textBeginning = i+1;
231 }
232 }
233 else ++i;
234 }
235
236 // add remaining text to m_TextPieces:
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
239 }
240
241 void wxHtmlParser::DestroyDOMTree()
242 {
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
253 delete m_TextPieces;
254 m_TextPieces = NULL;
255 }
256
257 void wxHtmlParser::DoParsing()
258 {
259 m_CurTag = m_Tags;
260 m_CurTextPiece = 0;
261 DoParsing(m_Source->begin(), m_Source->end());
262 }
263
264 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
265 const wxString::const_iterator& end_pos)
266 {
267 wxString::const_iterator begin_pos(begin_pos_);
268
269 if (end_pos <= begin_pos)
270 return;
271
272 wxHtmlTextPieces& pieces = *m_TextPieces;
273 size_t piecesCnt = pieces.size();
274
275 while (begin_pos < end_pos)
276 {
277 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
278 m_CurTag = m_CurTag->GetNextTag();
279 while (m_CurTextPiece < piecesCnt &&
280 pieces[m_CurTextPiece].m_start < begin_pos)
281 m_CurTextPiece++;
282
283 if (m_CurTextPiece < piecesCnt &&
284 (!m_CurTag ||
285 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
286 {
287 // Add text:
288 AddText(GetEntitiesParser()->Parse(
289 wxString(pieces[m_CurTextPiece].m_start,
290 pieces[m_CurTextPiece].m_end)));
291 begin_pos = pieces[m_CurTextPiece].m_end;
292 m_CurTextPiece++;
293 }
294 else if (m_CurTag)
295 {
296 if (m_CurTag->HasEnding())
297 begin_pos = m_CurTag->GetEndIter2();
298 else
299 begin_pos = m_CurTag->GetBeginIter();
300 wxHtmlTag *t = m_CurTag;
301 m_CurTag = m_CurTag->GetNextTag();
302 AddTag(*t);
303 if (m_stopParsing)
304 return;
305 }
306 else break;
307 }
308 }
309
310 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
311 {
312 bool inner = false;
313
314 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
315 if (h != m_HandlersHash.end())
316 {
317 inner = h->second->HandleTag(tag);
318 if (m_stopParsing)
319 return;
320 }
321 if (!inner)
322 {
323 if (tag.HasEnding())
324 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
325 }
326 }
327
328 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
329 {
330 wxString s(handler->GetSupportedTags());
331 wxStringTokenizer tokenizer(s, wxT(", "));
332
333 while (tokenizer.HasMoreTokens())
334 m_HandlersHash[tokenizer.GetNextToken()] = handler;
335
336 m_HandlersSet.insert(handler);
337
338 handler->SetParser(this);
339 }
340
341 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
342 {
343 wxStringTokenizer tokenizer(tags, wxT(", "));
344 wxString key;
345
346 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
347
348 while (tokenizer.HasMoreTokens())
349 {
350 key = tokenizer.GetNextToken();
351 m_HandlersHash[key] = handler;
352 }
353 }
354
355 void wxHtmlParser::PopTagHandler()
356 {
357 wxCHECK_RET( !m_HandlersStack.empty(),
358 "attempt to remove HTML tag handler from empty stack" );
359
360 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
361 m_HandlersStack.pop_back();
362 m_HandlersHash = *prev;
363 delete prev;
364 }
365
366 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
367 {
368 wxHtmlParserState *s = new wxHtmlParserState;
369
370 s->m_curTag = m_CurTag;
371 s->m_tags = m_Tags;
372 s->m_textPieces = m_TextPieces;
373 s->m_curTextPiece = m_CurTextPiece;
374 s->m_source = m_Source;
375
376 s->m_nextState = m_SavedStates;
377 m_SavedStates = s;
378
379 m_CurTag = NULL;
380 m_Tags = NULL;
381 m_TextPieces = NULL;
382 m_CurTextPiece = 0;
383 m_Source = NULL;
384
385 SetSource(src);
386 }
387
388 bool wxHtmlParser::RestoreState()
389 {
390 if (!m_SavedStates) return false;
391
392 DestroyDOMTree();
393 delete m_Source;
394
395 wxHtmlParserState *s = m_SavedStates;
396 m_SavedStates = s->m_nextState;
397
398 m_CurTag = s->m_curTag;
399 m_Tags = s->m_tags;
400 m_TextPieces = s->m_textPieces;
401 m_CurTextPiece = s->m_curTextPiece;
402 m_Source = s->m_source;
403
404 delete s;
405 return true;
406 }
407
408 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
409 {
410 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
411 }
412
413 //-----------------------------------------------------------------------------
414 // wxHtmlTagHandler
415 //-----------------------------------------------------------------------------
416
417 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
418
419 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
420 {
421 // It is safe to temporarily change the source being parsed,
422 // provided we restore the state back after parsing
423 m_Parser->SetSourceAndSaveState(source);
424 m_Parser->DoParsing();
425 m_Parser->RestoreState();
426 }
427
428
429 //-----------------------------------------------------------------------------
430 // wxHtmlEntitiesParser
431 //-----------------------------------------------------------------------------
432
433 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
434
435 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
436 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
437 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
438 #endif
439 {
440 }
441
442 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
443 {
444 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
445 delete m_conv;
446 #endif
447 }
448
449 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
450 {
451 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
452 if (encoding == m_encoding)
453 return;
454
455 delete m_conv;
456
457 m_encoding = encoding;
458 if (m_encoding == wxFONTENCODING_SYSTEM)
459 m_conv = NULL;
460 else
461 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
462 #else
463 (void) encoding;
464 #endif
465 }
466
467 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
468 {
469 wxString output;
470
471 const wxString::const_iterator end(input.end());
472 wxString::const_iterator c(input.begin());
473 wxString::const_iterator last(c);
474
475 for ( ; c < end; ++c )
476 {
477 if (*c == wxT('&'))
478 {
479 if ( output.empty() )
480 output.reserve(input.length());
481
482 if (c - last > 0)
483 output.append(last, c);
484 if ( ++c == end )
485 break;
486
487 wxString entity;
488 const wxString::const_iterator ent_s = c;
489 wxChar entity_char;
490
491 for ( ; c != end; ++c )
492 {
493 wxChar ch = *c;
494 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
495 (ch >= wxT('A') && ch <= wxT('Z')) ||
496 (ch >= wxT('0') && ch <= wxT('9')) ||
497 ch == wxT('_') || ch == wxT('#')) )
498 break;
499 }
500
501 entity.append(ent_s, c);
502 if (c == end || *c != wxT(';')) --c;
503 last = c+1;
504 entity_char = GetEntityChar(entity);
505 if (entity_char)
506 output << entity_char;
507 else
508 {
509 output.append(ent_s-1, c+1);
510 wxLogTrace(wxTRACE_HTML_DEBUG,
511 "Unrecognized HTML entity: '%s'",
512 entity);
513 }
514 }
515 }
516 if ( last == input.begin() ) // common case: no entity
517 return input;
518 if ( last != end )
519 output.append(last, end);
520 return output;
521 }
522
523 #if !wxUSE_UNICODE
524 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
525 {
526 #if wxUSE_WCHAR_T
527 char buf[2];
528 wchar_t wbuf[2];
529 wbuf[0] = (wchar_t)code;
530 wbuf[1] = 0;
531 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
532 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
533 return '?';
534 return buf[0];
535 #else
536 return (code < 256) ? (wxChar)code : '?';
537 #endif
538 }
539 #endif
540
541 struct wxHtmlEntityInfo
542 {
543 const wxStringCharType *name;
544 unsigned code;
545 };
546
547 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
548 {
549 #if wxUSE_UNICODE_UTF8
550 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
551 #else
552 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
553 #endif
554 }
555
556 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
557 {
558 unsigned code = 0;
559
560 if (entity.empty())
561 return 0; // invalid entity reference
562
563 if (entity[0] == wxT('#'))
564 {
565 // NB: parsed value is a number, so it's OK to use wx_str(), internal
566 // representation is the same for numbers
567 const wxStringCharType *ent_s = entity.wx_str();
568 const wxStringCharType *format;
569
570 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
571 {
572 format = wxS("%x");
573 ent_s++;
574 }
575 else
576 format = wxS("%u");
577 ent_s++;
578
579 if (wxSscanf(ent_s, format, &code) != 1)
580 code = 0;
581 }
582 else
583 {
584 // store the literals in wx's internal representation (either char*
585 // in UTF-8 or wchar_t*) for best performance:
586 #define ENTITY(name, code) { wxS(name), code }
587
588 static wxHtmlEntityInfo substitutions[] = {
589 ENTITY("AElig", 198),
590 ENTITY("Aacute", 193),
591 ENTITY("Acirc", 194),
592 ENTITY("Agrave", 192),
593 ENTITY("Alpha", 913),
594 ENTITY("Aring", 197),
595 ENTITY("Atilde", 195),
596 ENTITY("Auml", 196),
597 ENTITY("Beta", 914),
598 ENTITY("Ccedil", 199),
599 ENTITY("Chi", 935),
600 ENTITY("Dagger", 8225),
601 ENTITY("Delta", 916),
602 ENTITY("ETH", 208),
603 ENTITY("Eacute", 201),
604 ENTITY("Ecirc", 202),
605 ENTITY("Egrave", 200),
606 ENTITY("Epsilon", 917),
607 ENTITY("Eta", 919),
608 ENTITY("Euml", 203),
609 ENTITY("Gamma", 915),
610 ENTITY("Iacute", 205),
611 ENTITY("Icirc", 206),
612 ENTITY("Igrave", 204),
613 ENTITY("Iota", 921),
614 ENTITY("Iuml", 207),
615 ENTITY("Kappa", 922),
616 ENTITY("Lambda", 923),
617 ENTITY("Mu", 924),
618 ENTITY("Ntilde", 209),
619 ENTITY("Nu", 925),
620 ENTITY("OElig", 338),
621 ENTITY("Oacute", 211),
622 ENTITY("Ocirc", 212),
623 ENTITY("Ograve", 210),
624 ENTITY("Omega", 937),
625 ENTITY("Omicron", 927),
626 ENTITY("Oslash", 216),
627 ENTITY("Otilde", 213),
628 ENTITY("Ouml", 214),
629 ENTITY("Phi", 934),
630 ENTITY("Pi", 928),
631 ENTITY("Prime", 8243),
632 ENTITY("Psi", 936),
633 ENTITY("Rho", 929),
634 ENTITY("Scaron", 352),
635 ENTITY("Sigma", 931),
636 ENTITY("THORN", 222),
637 ENTITY("Tau", 932),
638 ENTITY("Theta", 920),
639 ENTITY("Uacute", 218),
640 ENTITY("Ucirc", 219),
641 ENTITY("Ugrave", 217),
642 ENTITY("Upsilon", 933),
643 ENTITY("Uuml", 220),
644 ENTITY("Xi", 926),
645 ENTITY("Yacute", 221),
646 ENTITY("Yuml", 376),
647 ENTITY("Zeta", 918),
648 ENTITY("aacute", 225),
649 ENTITY("acirc", 226),
650 ENTITY("acute", 180),
651 ENTITY("aelig", 230),
652 ENTITY("agrave", 224),
653 ENTITY("alefsym", 8501),
654 ENTITY("alpha", 945),
655 ENTITY("amp", 38),
656 ENTITY("and", 8743),
657 ENTITY("ang", 8736),
658 ENTITY("aring", 229),
659 ENTITY("asymp", 8776),
660 ENTITY("atilde", 227),
661 ENTITY("auml", 228),
662 ENTITY("bdquo", 8222),
663 ENTITY("beta", 946),
664 ENTITY("brvbar", 166),
665 ENTITY("bull", 8226),
666 ENTITY("cap", 8745),
667 ENTITY("ccedil", 231),
668 ENTITY("cedil", 184),
669 ENTITY("cent", 162),
670 ENTITY("chi", 967),
671 ENTITY("circ", 710),
672 ENTITY("clubs", 9827),
673 ENTITY("cong", 8773),
674 ENTITY("copy", 169),
675 ENTITY("crarr", 8629),
676 ENTITY("cup", 8746),
677 ENTITY("curren", 164),
678 ENTITY("dArr", 8659),
679 ENTITY("dagger", 8224),
680 ENTITY("darr", 8595),
681 ENTITY("deg", 176),
682 ENTITY("delta", 948),
683 ENTITY("diams", 9830),
684 ENTITY("divide", 247),
685 ENTITY("eacute", 233),
686 ENTITY("ecirc", 234),
687 ENTITY("egrave", 232),
688 ENTITY("empty", 8709),
689 ENTITY("emsp", 8195),
690 ENTITY("ensp", 8194),
691 ENTITY("epsilon", 949),
692 ENTITY("equiv", 8801),
693 ENTITY("eta", 951),
694 ENTITY("eth", 240),
695 ENTITY("euml", 235),
696 ENTITY("euro", 8364),
697 ENTITY("exist", 8707),
698 ENTITY("fnof", 402),
699 ENTITY("forall", 8704),
700 ENTITY("frac12", 189),
701 ENTITY("frac14", 188),
702 ENTITY("frac34", 190),
703 ENTITY("frasl", 8260),
704 ENTITY("gamma", 947),
705 ENTITY("ge", 8805),
706 ENTITY("gt", 62),
707 ENTITY("hArr", 8660),
708 ENTITY("harr", 8596),
709 ENTITY("hearts", 9829),
710 ENTITY("hellip", 8230),
711 ENTITY("iacute", 237),
712 ENTITY("icirc", 238),
713 ENTITY("iexcl", 161),
714 ENTITY("igrave", 236),
715 ENTITY("image", 8465),
716 ENTITY("infin", 8734),
717 ENTITY("int", 8747),
718 ENTITY("iota", 953),
719 ENTITY("iquest", 191),
720 ENTITY("isin", 8712),
721 ENTITY("iuml", 239),
722 ENTITY("kappa", 954),
723 ENTITY("lArr", 8656),
724 ENTITY("lambda", 955),
725 ENTITY("lang", 9001),
726 ENTITY("laquo", 171),
727 ENTITY("larr", 8592),
728 ENTITY("lceil", 8968),
729 ENTITY("ldquo", 8220),
730 ENTITY("le", 8804),
731 ENTITY("lfloor", 8970),
732 ENTITY("lowast", 8727),
733 ENTITY("loz", 9674),
734 ENTITY("lrm", 8206),
735 ENTITY("lsaquo", 8249),
736 ENTITY("lsquo", 8216),
737 ENTITY("lt", 60),
738 ENTITY("macr", 175),
739 ENTITY("mdash", 8212),
740 ENTITY("micro", 181),
741 ENTITY("middot", 183),
742 ENTITY("minus", 8722),
743 ENTITY("mu", 956),
744 ENTITY("nabla", 8711),
745 ENTITY("nbsp", 160),
746 ENTITY("ndash", 8211),
747 ENTITY("ne", 8800),
748 ENTITY("ni", 8715),
749 ENTITY("not", 172),
750 ENTITY("notin", 8713),
751 ENTITY("nsub", 8836),
752 ENTITY("ntilde", 241),
753 ENTITY("nu", 957),
754 ENTITY("oacute", 243),
755 ENTITY("ocirc", 244),
756 ENTITY("oelig", 339),
757 ENTITY("ograve", 242),
758 ENTITY("oline", 8254),
759 ENTITY("omega", 969),
760 ENTITY("omicron", 959),
761 ENTITY("oplus", 8853),
762 ENTITY("or", 8744),
763 ENTITY("ordf", 170),
764 ENTITY("ordm", 186),
765 ENTITY("oslash", 248),
766 ENTITY("otilde", 245),
767 ENTITY("otimes", 8855),
768 ENTITY("ouml", 246),
769 ENTITY("para", 182),
770 ENTITY("part", 8706),
771 ENTITY("permil", 8240),
772 ENTITY("perp", 8869),
773 ENTITY("phi", 966),
774 ENTITY("pi", 960),
775 ENTITY("piv", 982),
776 ENTITY("plusmn", 177),
777 ENTITY("pound", 163),
778 ENTITY("prime", 8242),
779 ENTITY("prod", 8719),
780 ENTITY("prop", 8733),
781 ENTITY("psi", 968),
782 ENTITY("quot", 34),
783 ENTITY("rArr", 8658),
784 ENTITY("radic", 8730),
785 ENTITY("rang", 9002),
786 ENTITY("raquo", 187),
787 ENTITY("rarr", 8594),
788 ENTITY("rceil", 8969),
789 ENTITY("rdquo", 8221),
790 ENTITY("real", 8476),
791 ENTITY("reg", 174),
792 ENTITY("rfloor", 8971),
793 ENTITY("rho", 961),
794 ENTITY("rlm", 8207),
795 ENTITY("rsaquo", 8250),
796 ENTITY("rsquo", 8217),
797 ENTITY("sbquo", 8218),
798 ENTITY("scaron", 353),
799 ENTITY("sdot", 8901),
800 ENTITY("sect", 167),
801 ENTITY("shy", 173),
802 ENTITY("sigma", 963),
803 ENTITY("sigmaf", 962),
804 ENTITY("sim", 8764),
805 ENTITY("spades", 9824),
806 ENTITY("sub", 8834),
807 ENTITY("sube", 8838),
808 ENTITY("sum", 8721),
809 ENTITY("sup", 8835),
810 ENTITY("sup1", 185),
811 ENTITY("sup2", 178),
812 ENTITY("sup3", 179),
813 ENTITY("supe", 8839),
814 ENTITY("szlig", 223),
815 ENTITY("tau", 964),
816 ENTITY("there4", 8756),
817 ENTITY("theta", 952),
818 ENTITY("thetasym", 977),
819 ENTITY("thinsp", 8201),
820 ENTITY("thorn", 254),
821 ENTITY("tilde", 732),
822 ENTITY("times", 215),
823 ENTITY("trade", 8482),
824 ENTITY("uArr", 8657),
825 ENTITY("uacute", 250),
826 ENTITY("uarr", 8593),
827 ENTITY("ucirc", 251),
828 ENTITY("ugrave", 249),
829 ENTITY("uml", 168),
830 ENTITY("upsih", 978),
831 ENTITY("upsilon", 965),
832 ENTITY("uuml", 252),
833 ENTITY("weierp", 8472),
834 ENTITY("xi", 958),
835 ENTITY("yacute", 253),
836 ENTITY("yen", 165),
837 ENTITY("yuml", 255),
838 ENTITY("zeta", 950),
839 ENTITY("zwj", 8205),
840 ENTITY("zwnj", 8204),
841 {NULL, 0}};
842 #undef ENTITY
843 static size_t substitutions_cnt = 0;
844
845 if (substitutions_cnt == 0)
846 while (substitutions[substitutions_cnt].code != 0)
847 substitutions_cnt++;
848
849 wxHtmlEntityInfo *info = NULL;
850 #ifdef __WXWINCE__
851 // bsearch crashes under WinCE for some reason
852 size_t i;
853 for (i = 0; i < substitutions_cnt; i++)
854 {
855 if (entity == substitutions[i].name)
856 {
857 info = & substitutions[i];
858 break;
859 }
860 }
861 #else
862 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
863 substitutions_cnt,
864 sizeof(wxHtmlEntityInfo),
865 wxHtmlEntityCompare);
866 #endif
867 if (info)
868 code = info->code;
869 }
870
871 if (code == 0)
872 return 0;
873 else
874 return GetCharForCode(code);
875 }
876
877 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
878 const wxString& url) const
879 {
880 return m_FS ? m_FS->OpenFile(url) : NULL;
881
882 }
883
884
885 //-----------------------------------------------------------------------------
886 // wxHtmlParser::ExtractCharsetInformation
887 //-----------------------------------------------------------------------------
888
889 class wxMetaTagParser : public wxHtmlParser
890 {
891 public:
892 wxMetaTagParser() { }
893
894 wxObject* GetProduct() { return NULL; }
895
896 protected:
897 virtual void AddText(const wxString& WXUNUSED(txt)) {}
898
899 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
900 };
901
902 class wxMetaTagHandler : public wxHtmlTagHandler
903 {
904 public:
905 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
906 wxString GetSupportedTags() { return wxT("META,BODY"); }
907 bool HandleTag(const wxHtmlTag& tag);
908
909 private:
910 wxString *m_retval;
911
912 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
913 };
914
915 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
916 {
917 if (tag.GetName() == _T("BODY"))
918 {
919 m_Parser->StopParsing();
920 return false;
921 }
922
923 if (tag.HasParam(_T("HTTP-EQUIV")) &&
924 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
925 tag.HasParam(_T("CONTENT")))
926 {
927 wxString content = tag.GetParam(_T("CONTENT")).Lower();
928 if (content.Left(19) == _T("text/html; charset="))
929 {
930 *m_retval = content.Mid(19);
931 m_Parser->StopParsing();
932 }
933 }
934 return false;
935 }
936
937
938 /*static*/
939 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
940 {
941 wxString charset;
942 wxMetaTagParser *parser = new wxMetaTagParser();
943 if(parser)
944 {
945 parser->AddTagHandler(new wxMetaTagHandler(&charset));
946 parser->Parse(markup);
947 delete parser;
948 }
949 return charset;
950 }
951
952 /* static */
953 bool
954 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
955 wxString::const_iterator end)
956 {
957 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
958
959 wxString::const_iterator p = start;
960
961 // comments begin with "<!--" in HTML 4.0
962 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
963 {
964 // not a comment at all
965 return false;
966 }
967
968 // skip the start of the comment tag in any case, if we don't find the
969 // closing tag we should ignore broken markup
970 start = p;
971
972 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
973 // comment delimiter and the closing tag character (section 3.2.4 of
974 // http://www.w3.org/TR/html401/)
975 int dashes = 0;
976 while ( ++p < end )
977 {
978 const wxChar c = *p;
979
980 if ( (c == wxT(' ') || c == wxT('\n') ||
981 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
982 {
983 // ignore white space before potential tag end
984 continue;
985 }
986
987 if ( c == wxT('>') && dashes >= 2 )
988 {
989 // found end of comment
990 start = p;
991 break;
992 }
993
994 if ( c == wxT('-') )
995 dashes++;
996 else
997 dashes = 0;
998 }
999
1000 return true;
1001 }
1002
1003 #endif // wxUSE_HTML