fixed memory leak in RestoreState
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
47 class wxHtmlTextPiece
48 {
49 public:
50 wxHtmlTextPiece() {}
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
55 };
56
57 // NB: this is an empty class and not typedef because of forward declaration
58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59 {
60 };
61
62 class wxHtmlParserState
63 {
64 public:
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
69 const wxString *m_source;
70 wxHtmlParserState *m_nextState;
71 };
72
73 //-----------------------------------------------------------------------------
74 // wxHtmlParser
75 //-----------------------------------------------------------------------------
76
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
79 wxHtmlParser::wxHtmlParser()
80 : wxObject(), m_HandlersHash(wxKEY_STRING),
81 m_FS(NULL), m_HandlersStack(NULL)
82 {
83 m_Source = NULL;
84 m_entitiesParser = new wxHtmlEntitiesParser;
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
90 }
91
92 wxHtmlParser::~wxHtmlParser()
93 {
94 while (RestoreState()) {}
95 DestroyDOMTree();
96
97 if (m_HandlersStack)
98 {
99 wxList& tmp = *m_HandlersStack;
100 wxList::iterator it, en;
101 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
102 delete (wxHashTable*)*it;
103 tmp.clear();
104 }
105 delete m_HandlersStack;
106 m_HandlersHash.Clear();
107 WX_CLEAR_LIST(wxList, m_HandlersList);
108 delete m_entitiesParser;
109 delete m_Source;
110 }
111
112 wxObject* wxHtmlParser::Parse(const wxString& source)
113 {
114 InitParser(source);
115 DoParsing();
116 wxObject *result = GetProduct();
117 DoneParser();
118 return result;
119 }
120
121 void wxHtmlParser::InitParser(const wxString& source)
122 {
123 SetSource(source);
124 m_stopParsing = false;
125 }
126
127 void wxHtmlParser::DoneParser()
128 {
129 DestroyDOMTree();
130 }
131
132 void wxHtmlParser::SetSource(const wxString& src)
133 {
134 DestroyDOMTree();
135 // NB: This is allocated on heap because wxHtmlTag uses iterators and
136 // making a copy of m_Source string in SetSourceAndSaveState() and
137 // RestoreState() would invalidate them (because wxString::m_impl's
138 // memory would change completely twice and iterators use pointers
139 // into it). So instead, we keep the string object intact and only
140 // store/restore pointer to it, for which we need it to be allocated
141 // on the heap.
142 delete m_Source;
143 m_Source = new wxString(src);
144 CreateDOMTree();
145 m_CurTag = NULL;
146 m_CurTextPiece = 0;
147 }
148
149 void wxHtmlParser::CreateDOMTree()
150 {
151 wxHtmlTagsCache cache(*m_Source);
152 m_TextPieces = new wxHtmlTextPieces;
153 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
154 m_CurTextPiece = 0;
155 }
156
157 extern bool wxIsCDATAElement(const wxString& tag);
158
159 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
160 const wxString::const_iterator& begin_pos,
161 const wxString::const_iterator& end_pos,
162 wxHtmlTagsCache *cache)
163 {
164 if (end_pos <= begin_pos)
165 return;
166
167 wxChar c;
168 wxString::const_iterator i = begin_pos;
169 wxString::const_iterator textBeginning = begin_pos;
170
171 // If the tag contains CDATA text, we include the text between beginning
172 // and ending tag verbosely. Setting i=end_pos will skip to the very
173 // end of this function where text piece is added, bypassing any child
174 // tags parsing (CDATA element can't have child elements by definition):
175 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
176 {
177 i = end_pos;
178 }
179
180 while (i < end_pos)
181 {
182 c = *i;
183
184 if (c == wxT('<'))
185 {
186 // add text to m_TextPieces:
187 if (i > textBeginning)
188 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
189
190 // if it is a comment, skip it:
191 if ( SkipCommentTag(i, m_Source->end()) )
192 {
193 textBeginning = i = i + 1; // skip closing '>' too
194 }
195
196 // add another tag to the tree:
197 else if (i < end_pos-1 && *(i+1) != wxT('/'))
198 {
199 wxHtmlTag *chd;
200 if (cur)
201 chd = new wxHtmlTag(cur, m_Source,
202 i, end_pos, cache, m_entitiesParser);
203 else
204 {
205 chd = new wxHtmlTag(NULL, m_Source,
206 i, end_pos, cache, m_entitiesParser);
207 if (!m_Tags)
208 {
209 // if this is the first tag to be created make the root
210 // m_Tags point to it:
211 m_Tags = chd;
212 }
213 else
214 {
215 // if there is already a root tag add this tag as
216 // the last sibling:
217 chd->m_Prev = m_Tags->GetLastSibling();
218 chd->m_Prev->m_Next = chd;
219 }
220 }
221
222 if (chd->HasEnding())
223 {
224 CreateDOMSubTree(chd,
225 chd->GetBeginIter(), chd->GetEndIter1(),
226 cache);
227 i = chd->GetEndIter2();
228 }
229 else
230 i = chd->GetBeginIter();
231
232 textBeginning = i;
233 }
234
235 // ... or skip ending tag:
236 else
237 {
238 while (i < end_pos && *i != wxT('>')) ++i;
239 textBeginning = i+1;
240 }
241 }
242 else ++i;
243 }
244
245 // add remaining text to m_TextPieces:
246 if (end_pos > textBeginning)
247 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
248 }
249
250 void wxHtmlParser::DestroyDOMTree()
251 {
252 wxHtmlTag *t1, *t2;
253 t1 = m_Tags;
254 while (t1)
255 {
256 t2 = t1->GetNextSibling();
257 delete t1;
258 t1 = t2;
259 }
260 m_Tags = m_CurTag = NULL;
261
262 delete m_TextPieces;
263 m_TextPieces = NULL;
264 }
265
266 void wxHtmlParser::DoParsing()
267 {
268 m_CurTag = m_Tags;
269 m_CurTextPiece = 0;
270 DoParsing(m_Source->begin(), m_Source->end());
271 }
272
273 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
274 const wxString::const_iterator& end_pos)
275 {
276 wxString::const_iterator begin_pos(begin_pos_);
277
278 if (end_pos <= begin_pos)
279 return;
280
281 wxHtmlTextPieces& pieces = *m_TextPieces;
282 size_t piecesCnt = pieces.size();
283
284 while (begin_pos < end_pos)
285 {
286 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
287 m_CurTag = m_CurTag->GetNextTag();
288 while (m_CurTextPiece < piecesCnt &&
289 pieces[m_CurTextPiece].m_start < begin_pos)
290 m_CurTextPiece++;
291
292 if (m_CurTextPiece < piecesCnt &&
293 (!m_CurTag ||
294 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
295 {
296 // Add text:
297 AddText(GetEntitiesParser()->Parse(
298 wxString(pieces[m_CurTextPiece].m_start,
299 pieces[m_CurTextPiece].m_end)));
300 begin_pos = pieces[m_CurTextPiece].m_end;
301 m_CurTextPiece++;
302 }
303 else if (m_CurTag)
304 {
305 if (m_CurTag->HasEnding())
306 begin_pos = m_CurTag->GetEndIter2();
307 else
308 begin_pos = m_CurTag->GetBeginIter();
309 wxHtmlTag *t = m_CurTag;
310 m_CurTag = m_CurTag->GetNextTag();
311 AddTag(*t);
312 if (m_stopParsing)
313 return;
314 }
315 else break;
316 }
317 }
318
319 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
320 {
321 wxHtmlTagHandler *h;
322 bool inner = false;
323
324 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
325 if (h)
326 {
327 inner = h->HandleTag(tag);
328 if (m_stopParsing)
329 return;
330 }
331 if (!inner)
332 {
333 if (tag.HasEnding())
334 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
335 }
336 }
337
338 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
339 {
340 wxString s(handler->GetSupportedTags());
341 wxStringTokenizer tokenizer(s, wxT(", "));
342
343 while (tokenizer.HasMoreTokens())
344 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
345
346 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
347 m_HandlersList.Append(handler);
348
349 handler->SetParser(this);
350 }
351
352 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
353 {
354 wxStringTokenizer tokenizer(tags, wxT(", "));
355 wxString key;
356
357 if (m_HandlersStack == NULL)
358 {
359 m_HandlersStack = new wxList;
360 }
361
362 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
363
364 while (tokenizer.HasMoreTokens())
365 {
366 key = tokenizer.GetNextToken();
367 m_HandlersHash.Delete(key);
368 m_HandlersHash.Put(key, handler);
369 }
370 }
371
372 void wxHtmlParser::PopTagHandler()
373 {
374 wxList::compatibility_iterator first;
375
376 if ( !m_HandlersStack ||
377 #if wxUSE_STL
378 !(first = m_HandlersStack->GetFirst())
379 #else // !wxUSE_STL
380 ((first = m_HandlersStack->GetFirst()) == NULL)
381 #endif // wxUSE_STL/!wxUSE_STL
382 )
383 {
384 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
385 return;
386 }
387 m_HandlersHash = *((wxHashTable*) first->GetData());
388 delete (wxHashTable*) first->GetData();
389 m_HandlersStack->Erase(first);
390 }
391
392 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
393 {
394 wxHtmlParserState *s = new wxHtmlParserState;
395
396 s->m_curTag = m_CurTag;
397 s->m_tags = m_Tags;
398 s->m_textPieces = m_TextPieces;
399 s->m_curTextPiece = m_CurTextPiece;
400 s->m_source = m_Source;
401
402 s->m_nextState = m_SavedStates;
403 m_SavedStates = s;
404
405 m_CurTag = NULL;
406 m_Tags = NULL;
407 m_TextPieces = NULL;
408 m_CurTextPiece = 0;
409 m_Source = NULL;
410
411 SetSource(src);
412 }
413
414 bool wxHtmlParser::RestoreState()
415 {
416 if (!m_SavedStates) return false;
417
418 DestroyDOMTree();
419 delete m_Source;
420
421 wxHtmlParserState *s = m_SavedStates;
422 m_SavedStates = s->m_nextState;
423
424 m_CurTag = s->m_curTag;
425 m_Tags = s->m_tags;
426 m_TextPieces = s->m_textPieces;
427 m_CurTextPiece = s->m_curTextPiece;
428 m_Source = s->m_source;
429
430 delete s;
431 return true;
432 }
433
434 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
435 {
436 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
437 }
438
439 //-----------------------------------------------------------------------------
440 // wxHtmlTagHandler
441 //-----------------------------------------------------------------------------
442
443 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
444
445 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
446 {
447 // It is safe to temporarily change the source being parsed,
448 // provided we restore the state back after parsing
449 m_Parser->SetSourceAndSaveState(source);
450 m_Parser->DoParsing();
451 m_Parser->RestoreState();
452 }
453
454
455 //-----------------------------------------------------------------------------
456 // wxHtmlEntitiesParser
457 //-----------------------------------------------------------------------------
458
459 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
460
461 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
462 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
463 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
464 #endif
465 {
466 }
467
468 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
469 {
470 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
471 delete m_conv;
472 #endif
473 }
474
475 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
476 {
477 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
478 if (encoding == m_encoding)
479 return;
480
481 delete m_conv;
482
483 m_encoding = encoding;
484 if (m_encoding == wxFONTENCODING_SYSTEM)
485 m_conv = NULL;
486 else
487 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
488 #else
489 (void) encoding;
490 #endif
491 }
492
493 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
494 {
495 wxString output;
496
497 const wxString::const_iterator end(input.end());
498 wxString::const_iterator c(input.begin());
499 wxString::const_iterator last(c);
500
501 for ( ; c < end; ++c )
502 {
503 if (*c == wxT('&'))
504 {
505 if ( output.empty() )
506 output.reserve(input.length());
507
508 if (c - last > 0)
509 output.append(last, c);
510 if ( ++c == end )
511 break;
512
513 wxString entity;
514 const wxString::const_iterator ent_s = c;
515 wxChar entity_char;
516
517 for ( ; c != end; ++c )
518 {
519 wxChar ch = *c;
520 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
521 (ch >= wxT('A') && ch <= wxT('Z')) ||
522 (ch >= wxT('0') && ch <= wxT('9')) ||
523 ch == wxT('_') || ch == wxT('#')) )
524 break;
525 }
526
527 entity.append(ent_s, c);
528 if (c == end || *c != wxT(';')) --c;
529 last = c+1;
530 entity_char = GetEntityChar(entity);
531 if (entity_char)
532 output << entity_char;
533 else
534 {
535 output.append(ent_s-1, c+1);
536 wxLogTrace(wxTRACE_HTML_DEBUG,
537 "Unrecognized HTML entity: '%s'",
538 entity);
539 }
540 }
541 }
542 if ( last == input.begin() ) // common case: no entity
543 return input;
544 if ( last != end )
545 output.append(last, end);
546 return output;
547 }
548
549 #if !wxUSE_UNICODE
550 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
551 {
552 #if wxUSE_WCHAR_T
553 char buf[2];
554 wchar_t wbuf[2];
555 wbuf[0] = (wchar_t)code;
556 wbuf[1] = 0;
557 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
558 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
559 return '?';
560 return buf[0];
561 #else
562 return (code < 256) ? (wxChar)code : '?';
563 #endif
564 }
565 #endif
566
567 struct wxHtmlEntityInfo
568 {
569 const wxStringCharType *name;
570 unsigned code;
571 };
572
573 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
574 {
575 #if wxUSE_UNICODE_UTF8
576 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
577 #else
578 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
579 #endif
580 }
581
582 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
583 {
584 unsigned code = 0;
585
586 if (entity[0] == wxT('#'))
587 {
588 // NB: parsed value is a number, so it's OK to use wx_str(), internal
589 // representation is the same for numbers
590 const wxStringCharType *ent_s = entity.wx_str();
591 const wxStringCharType *format;
592
593 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
594 {
595 format = wxSTRING_TEXT("%x");
596 ent_s++;
597 }
598 else
599 format = wxSTRING_TEXT("%u");
600 ent_s++;
601
602 if (wxSscanf(ent_s, format, &code) != 1)
603 code = 0;
604 }
605 else
606 {
607 // store the literals in wx's internal representation (either char*
608 // in UTF-8 or wchar_t*) for best performance:
609 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
610
611 static wxHtmlEntityInfo substitutions[] = {
612 ENTITY("AElig", 198),
613 ENTITY("Aacute", 193),
614 ENTITY("Acirc", 194),
615 ENTITY("Agrave", 192),
616 ENTITY("Alpha", 913),
617 ENTITY("Aring", 197),
618 ENTITY("Atilde", 195),
619 ENTITY("Auml", 196),
620 ENTITY("Beta", 914),
621 ENTITY("Ccedil", 199),
622 ENTITY("Chi", 935),
623 ENTITY("Dagger", 8225),
624 ENTITY("Delta", 916),
625 ENTITY("ETH", 208),
626 ENTITY("Eacute", 201),
627 ENTITY("Ecirc", 202),
628 ENTITY("Egrave", 200),
629 ENTITY("Epsilon", 917),
630 ENTITY("Eta", 919),
631 ENTITY("Euml", 203),
632 ENTITY("Gamma", 915),
633 ENTITY("Iacute", 205),
634 ENTITY("Icirc", 206),
635 ENTITY("Igrave", 204),
636 ENTITY("Iota", 921),
637 ENTITY("Iuml", 207),
638 ENTITY("Kappa", 922),
639 ENTITY("Lambda", 923),
640 ENTITY("Mu", 924),
641 ENTITY("Ntilde", 209),
642 ENTITY("Nu", 925),
643 ENTITY("OElig", 338),
644 ENTITY("Oacute", 211),
645 ENTITY("Ocirc", 212),
646 ENTITY("Ograve", 210),
647 ENTITY("Omega", 937),
648 ENTITY("Omicron", 927),
649 ENTITY("Oslash", 216),
650 ENTITY("Otilde", 213),
651 ENTITY("Ouml", 214),
652 ENTITY("Phi", 934),
653 ENTITY("Pi", 928),
654 ENTITY("Prime", 8243),
655 ENTITY("Psi", 936),
656 ENTITY("Rho", 929),
657 ENTITY("Scaron", 352),
658 ENTITY("Sigma", 931),
659 ENTITY("THORN", 222),
660 ENTITY("Tau", 932),
661 ENTITY("Theta", 920),
662 ENTITY("Uacute", 218),
663 ENTITY("Ucirc", 219),
664 ENTITY("Ugrave", 217),
665 ENTITY("Upsilon", 933),
666 ENTITY("Uuml", 220),
667 ENTITY("Xi", 926),
668 ENTITY("Yacute", 221),
669 ENTITY("Yuml", 376),
670 ENTITY("Zeta", 918),
671 ENTITY("aacute", 225),
672 ENTITY("acirc", 226),
673 ENTITY("acute", 180),
674 ENTITY("aelig", 230),
675 ENTITY("agrave", 224),
676 ENTITY("alefsym", 8501),
677 ENTITY("alpha", 945),
678 ENTITY("amp", 38),
679 ENTITY("and", 8743),
680 ENTITY("ang", 8736),
681 ENTITY("aring", 229),
682 ENTITY("asymp", 8776),
683 ENTITY("atilde", 227),
684 ENTITY("auml", 228),
685 ENTITY("bdquo", 8222),
686 ENTITY("beta", 946),
687 ENTITY("brvbar", 166),
688 ENTITY("bull", 8226),
689 ENTITY("cap", 8745),
690 ENTITY("ccedil", 231),
691 ENTITY("cedil", 184),
692 ENTITY("cent", 162),
693 ENTITY("chi", 967),
694 ENTITY("circ", 710),
695 ENTITY("clubs", 9827),
696 ENTITY("cong", 8773),
697 ENTITY("copy", 169),
698 ENTITY("crarr", 8629),
699 ENTITY("cup", 8746),
700 ENTITY("curren", 164),
701 ENTITY("dArr", 8659),
702 ENTITY("dagger", 8224),
703 ENTITY("darr", 8595),
704 ENTITY("deg", 176),
705 ENTITY("delta", 948),
706 ENTITY("diams", 9830),
707 ENTITY("divide", 247),
708 ENTITY("eacute", 233),
709 ENTITY("ecirc", 234),
710 ENTITY("egrave", 232),
711 ENTITY("empty", 8709),
712 ENTITY("emsp", 8195),
713 ENTITY("ensp", 8194),
714 ENTITY("epsilon", 949),
715 ENTITY("equiv", 8801),
716 ENTITY("eta", 951),
717 ENTITY("eth", 240),
718 ENTITY("euml", 235),
719 ENTITY("euro", 8364),
720 ENTITY("exist", 8707),
721 ENTITY("fnof", 402),
722 ENTITY("forall", 8704),
723 ENTITY("frac12", 189),
724 ENTITY("frac14", 188),
725 ENTITY("frac34", 190),
726 ENTITY("frasl", 8260),
727 ENTITY("gamma", 947),
728 ENTITY("ge", 8805),
729 ENTITY("gt", 62),
730 ENTITY("hArr", 8660),
731 ENTITY("harr", 8596),
732 ENTITY("hearts", 9829),
733 ENTITY("hellip", 8230),
734 ENTITY("iacute", 237),
735 ENTITY("icirc", 238),
736 ENTITY("iexcl", 161),
737 ENTITY("igrave", 236),
738 ENTITY("image", 8465),
739 ENTITY("infin", 8734),
740 ENTITY("int", 8747),
741 ENTITY("iota", 953),
742 ENTITY("iquest", 191),
743 ENTITY("isin", 8712),
744 ENTITY("iuml", 239),
745 ENTITY("kappa", 954),
746 ENTITY("lArr", 8656),
747 ENTITY("lambda", 955),
748 ENTITY("lang", 9001),
749 ENTITY("laquo", 171),
750 ENTITY("larr", 8592),
751 ENTITY("lceil", 8968),
752 ENTITY("ldquo", 8220),
753 ENTITY("le", 8804),
754 ENTITY("lfloor", 8970),
755 ENTITY("lowast", 8727),
756 ENTITY("loz", 9674),
757 ENTITY("lrm", 8206),
758 ENTITY("lsaquo", 8249),
759 ENTITY("lsquo", 8216),
760 ENTITY("lt", 60),
761 ENTITY("macr", 175),
762 ENTITY("mdash", 8212),
763 ENTITY("micro", 181),
764 ENTITY("middot", 183),
765 ENTITY("minus", 8722),
766 ENTITY("mu", 956),
767 ENTITY("nabla", 8711),
768 ENTITY("nbsp", 160),
769 ENTITY("ndash", 8211),
770 ENTITY("ne", 8800),
771 ENTITY("ni", 8715),
772 ENTITY("not", 172),
773 ENTITY("notin", 8713),
774 ENTITY("nsub", 8836),
775 ENTITY("ntilde", 241),
776 ENTITY("nu", 957),
777 ENTITY("oacute", 243),
778 ENTITY("ocirc", 244),
779 ENTITY("oelig", 339),
780 ENTITY("ograve", 242),
781 ENTITY("oline", 8254),
782 ENTITY("omega", 969),
783 ENTITY("omicron", 959),
784 ENTITY("oplus", 8853),
785 ENTITY("or", 8744),
786 ENTITY("ordf", 170),
787 ENTITY("ordm", 186),
788 ENTITY("oslash", 248),
789 ENTITY("otilde", 245),
790 ENTITY("otimes", 8855),
791 ENTITY("ouml", 246),
792 ENTITY("para", 182),
793 ENTITY("part", 8706),
794 ENTITY("permil", 8240),
795 ENTITY("perp", 8869),
796 ENTITY("phi", 966),
797 ENTITY("pi", 960),
798 ENTITY("piv", 982),
799 ENTITY("plusmn", 177),
800 ENTITY("pound", 163),
801 ENTITY("prime", 8242),
802 ENTITY("prod", 8719),
803 ENTITY("prop", 8733),
804 ENTITY("psi", 968),
805 ENTITY("quot", 34),
806 ENTITY("rArr", 8658),
807 ENTITY("radic", 8730),
808 ENTITY("rang", 9002),
809 ENTITY("raquo", 187),
810 ENTITY("rarr", 8594),
811 ENTITY("rceil", 8969),
812 ENTITY("rdquo", 8221),
813 ENTITY("real", 8476),
814 ENTITY("reg", 174),
815 ENTITY("rfloor", 8971),
816 ENTITY("rho", 961),
817 ENTITY("rlm", 8207),
818 ENTITY("rsaquo", 8250),
819 ENTITY("rsquo", 8217),
820 ENTITY("sbquo", 8218),
821 ENTITY("scaron", 353),
822 ENTITY("sdot", 8901),
823 ENTITY("sect", 167),
824 ENTITY("shy", 173),
825 ENTITY("sigma", 963),
826 ENTITY("sigmaf", 962),
827 ENTITY("sim", 8764),
828 ENTITY("spades", 9824),
829 ENTITY("sub", 8834),
830 ENTITY("sube", 8838),
831 ENTITY("sum", 8721),
832 ENTITY("sup", 8835),
833 ENTITY("sup1", 185),
834 ENTITY("sup2", 178),
835 ENTITY("sup3", 179),
836 ENTITY("supe", 8839),
837 ENTITY("szlig", 223),
838 ENTITY("tau", 964),
839 ENTITY("there4", 8756),
840 ENTITY("theta", 952),
841 ENTITY("thetasym", 977),
842 ENTITY("thinsp", 8201),
843 ENTITY("thorn", 254),
844 ENTITY("tilde", 732),
845 ENTITY("times", 215),
846 ENTITY("trade", 8482),
847 ENTITY("uArr", 8657),
848 ENTITY("uacute", 250),
849 ENTITY("uarr", 8593),
850 ENTITY("ucirc", 251),
851 ENTITY("ugrave", 249),
852 ENTITY("uml", 168),
853 ENTITY("upsih", 978),
854 ENTITY("upsilon", 965),
855 ENTITY("uuml", 252),
856 ENTITY("weierp", 8472),
857 ENTITY("xi", 958),
858 ENTITY("yacute", 253),
859 ENTITY("yen", 165),
860 ENTITY("yuml", 255),
861 ENTITY("zeta", 950),
862 ENTITY("zwj", 8205),
863 ENTITY("zwnj", 8204),
864 {NULL, 0}};
865 #undef ENTITY
866 static size_t substitutions_cnt = 0;
867
868 if (substitutions_cnt == 0)
869 while (substitutions[substitutions_cnt].code != 0)
870 substitutions_cnt++;
871
872 wxHtmlEntityInfo *info = NULL;
873 #ifdef __WXWINCE__
874 // bsearch crashes under WinCE for some reason
875 size_t i;
876 for (i = 0; i < substitutions_cnt; i++)
877 {
878 if (entity == substitutions[i].name)
879 {
880 info = & substitutions[i];
881 break;
882 }
883 }
884 #else
885 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
886 substitutions_cnt,
887 sizeof(wxHtmlEntityInfo),
888 wxHtmlEntityCompare);
889 #endif
890 if (info)
891 code = info->code;
892 }
893
894 if (code == 0)
895 return 0;
896 else
897 return GetCharForCode(code);
898 }
899
900 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
901 const wxString& url) const
902 {
903 return m_FS ? m_FS->OpenFile(url) : NULL;
904
905 }
906
907
908 //-----------------------------------------------------------------------------
909 // wxHtmlParser::ExtractCharsetInformation
910 //-----------------------------------------------------------------------------
911
912 class wxMetaTagParser : public wxHtmlParser
913 {
914 public:
915 wxMetaTagParser() { }
916
917 wxObject* GetProduct() { return NULL; }
918
919 protected:
920 virtual void AddText(const wxString& WXUNUSED(txt)) {}
921
922 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
923 };
924
925 class wxMetaTagHandler : public wxHtmlTagHandler
926 {
927 public:
928 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
929 wxString GetSupportedTags() { return wxT("META,BODY"); }
930 bool HandleTag(const wxHtmlTag& tag);
931
932 private:
933 wxString *m_retval;
934
935 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
936 };
937
938 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
939 {
940 if (tag.GetName() == _T("BODY"))
941 {
942 m_Parser->StopParsing();
943 return false;
944 }
945
946 if (tag.HasParam(_T("HTTP-EQUIV")) &&
947 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
948 tag.HasParam(_T("CONTENT")))
949 {
950 wxString content = tag.GetParam(_T("CONTENT")).Lower();
951 if (content.Left(19) == _T("text/html; charset="))
952 {
953 *m_retval = content.Mid(19);
954 m_Parser->StopParsing();
955 }
956 }
957 return false;
958 }
959
960
961 /*static*/
962 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
963 {
964 wxString charset;
965 wxMetaTagParser *parser = new wxMetaTagParser();
966 if(parser)
967 {
968 parser->AddTagHandler(new wxMetaTagHandler(&charset));
969 parser->Parse(markup);
970 delete parser;
971 }
972 return charset;
973 }
974
975 /* static */
976 bool
977 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
978 wxString::const_iterator end)
979 {
980 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
981
982 wxString::const_iterator p = start;
983
984 // comments begin with "<!--" in HTML 4.0
985 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
986 {
987 // not a comment at all
988 return false;
989 }
990
991 // skip the start of the comment tag in any case, if we don't find the
992 // closing tag we should ignore broken markup
993 start = p;
994
995 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
996 // comment delimiter and the closing tag character (section 3.2.4 of
997 // http://www.w3.org/TR/html401/)
998 int dashes = 0;
999 while ( ++p < end )
1000 {
1001 const wxChar c = *p;
1002
1003 if ( (c == wxT(' ') || c == wxT('\n') ||
1004 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
1005 {
1006 // ignore white space before potential tag end
1007 continue;
1008 }
1009
1010 if ( c == wxT('>') && dashes >= 2 )
1011 {
1012 // found end of comment
1013 start = p;
1014 break;
1015 }
1016
1017 if ( c == wxT('-') )
1018 dashes++;
1019 else
1020 dashes = 0;
1021 }
1022
1023 return true;
1024 }
1025
1026 #endif // wxUSE_HTML