fixed wxHTML parsing to run in O(n) even in UTF8 build
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
47 class wxHtmlTextPiece
48 {
49 public:
50 wxHtmlTextPiece() {}
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
55 };
56
57 // NB: this is an empty class and not typedef because of forward declaration
58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59 {
60 };
61
62 class wxHtmlParserState
63 {
64 public:
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
69 const wxString *m_source;
70 wxHtmlParserState *m_nextState;
71 };
72
73 //-----------------------------------------------------------------------------
74 // wxHtmlParser
75 //-----------------------------------------------------------------------------
76
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
79 wxHtmlParser::wxHtmlParser()
80 : wxObject(), m_HandlersHash(wxKEY_STRING),
81 m_FS(NULL), m_HandlersStack(NULL)
82 {
83 m_Source = NULL;
84 m_entitiesParser = new wxHtmlEntitiesParser;
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
90 }
91
92 wxHtmlParser::~wxHtmlParser()
93 {
94 while (RestoreState()) {}
95 DestroyDOMTree();
96
97 if (m_HandlersStack)
98 {
99 wxList& tmp = *m_HandlersStack;
100 wxList::iterator it, en;
101 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
102 delete (wxHashTable*)*it;
103 tmp.clear();
104 }
105 delete m_HandlersStack;
106 m_HandlersHash.Clear();
107 WX_CLEAR_LIST(wxList, m_HandlersList);
108 delete m_entitiesParser;
109 delete m_Source;
110 }
111
112 wxObject* wxHtmlParser::Parse(const wxString& source)
113 {
114 InitParser(source);
115 DoParsing();
116 wxObject *result = GetProduct();
117 DoneParser();
118 return result;
119 }
120
121 void wxHtmlParser::InitParser(const wxString& source)
122 {
123 SetSource(source);
124 m_stopParsing = false;
125 }
126
127 void wxHtmlParser::DoneParser()
128 {
129 DestroyDOMTree();
130 }
131
132 void wxHtmlParser::SetSource(const wxString& src)
133 {
134 DestroyDOMTree();
135 // NB: this is allocated on heap because wxHtmlTag keeps a pointer to
136 // this string if WXWIN_COMPATIBILITY_2_8
137 delete m_Source;
138 m_Source = new wxString(src);
139 CreateDOMTree();
140 m_CurTag = NULL;
141 m_CurTextPiece = 0;
142 }
143
144 void wxHtmlParser::CreateDOMTree()
145 {
146 wxHtmlTagsCache cache(*m_Source);
147 m_TextPieces = new wxHtmlTextPieces;
148 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
149 m_CurTextPiece = 0;
150 }
151
152 extern bool wxIsCDATAElement(const wxString& tag);
153
154 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
155 const wxString::const_iterator& begin_pos,
156 const wxString::const_iterator& end_pos,
157 wxHtmlTagsCache *cache)
158 {
159 if (end_pos <= begin_pos)
160 return;
161
162 wxChar c;
163 wxString::const_iterator i = begin_pos;
164 wxString::const_iterator textBeginning = begin_pos;
165
166 // If the tag contains CDATA text, we include the text between beginning
167 // and ending tag verbosely. Setting i=end_pos will skip to the very
168 // end of this function where text piece is added, bypassing any child
169 // tags parsing (CDATA element can't have child elements by definition):
170 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
171 {
172 i = end_pos;
173 }
174
175 while (i < end_pos)
176 {
177 c = *i;
178
179 if (c == wxT('<'))
180 {
181 // add text to m_TextPieces:
182 if (i > textBeginning)
183 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
184
185 // if it is a comment, skip it:
186 if ( SkipCommentTag(i, m_Source->end()) )
187 {
188 textBeginning = i = i + 1; // skip closing '>' too
189 }
190
191 // add another tag to the tree:
192 else if (i < end_pos-1 && *(i+1) != wxT('/'))
193 {
194 wxHtmlTag *chd;
195 if (cur)
196 chd = new wxHtmlTag(cur, m_Source,
197 i, end_pos, cache, m_entitiesParser);
198 else
199 {
200 chd = new wxHtmlTag(NULL, m_Source,
201 i, end_pos, cache, m_entitiesParser);
202 if (!m_Tags)
203 {
204 // if this is the first tag to be created make the root
205 // m_Tags point to it:
206 m_Tags = chd;
207 }
208 else
209 {
210 // if there is already a root tag add this tag as
211 // the last sibling:
212 chd->m_Prev = m_Tags->GetLastSibling();
213 chd->m_Prev->m_Next = chd;
214 }
215 }
216
217 if (chd->HasEnding())
218 {
219 CreateDOMSubTree(chd,
220 chd->GetBeginIter(), chd->GetEndIter1(),
221 cache);
222 i = chd->GetEndIter2();
223 }
224 else
225 i = chd->GetBeginIter();
226
227 textBeginning = i;
228 }
229
230 // ... or skip ending tag:
231 else
232 {
233 while (i < end_pos && *i != wxT('>')) ++i;
234 textBeginning = i+1;
235 }
236 }
237 else ++i;
238 }
239
240 // add remaining text to m_TextPieces:
241 if (end_pos > textBeginning)
242 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
243 }
244
245 void wxHtmlParser::DestroyDOMTree()
246 {
247 wxHtmlTag *t1, *t2;
248 t1 = m_Tags;
249 while (t1)
250 {
251 t2 = t1->GetNextSibling();
252 delete t1;
253 t1 = t2;
254 }
255 m_Tags = m_CurTag = NULL;
256
257 delete m_TextPieces;
258 m_TextPieces = NULL;
259 }
260
261 void wxHtmlParser::DoParsing()
262 {
263 m_CurTag = m_Tags;
264 m_CurTextPiece = 0;
265 DoParsing(m_Source->begin(), m_Source->end());
266 }
267
268 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
269 const wxString::const_iterator& end_pos)
270 {
271 wxString::const_iterator begin_pos(begin_pos_);
272
273 if (end_pos <= begin_pos)
274 return;
275
276 wxHtmlTextPieces& pieces = *m_TextPieces;
277 size_t piecesCnt = pieces.size();
278
279 while (begin_pos < end_pos)
280 {
281 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
282 m_CurTag = m_CurTag->GetNextTag();
283 while (m_CurTextPiece < piecesCnt &&
284 pieces[m_CurTextPiece].m_start < begin_pos)
285 m_CurTextPiece++;
286
287 if (m_CurTextPiece < piecesCnt &&
288 (!m_CurTag ||
289 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
290 {
291 // Add text:
292 AddText(GetEntitiesParser()->Parse(
293 wxString(pieces[m_CurTextPiece].m_start,
294 pieces[m_CurTextPiece].m_end)));
295 begin_pos = pieces[m_CurTextPiece].m_end;
296 m_CurTextPiece++;
297 }
298 else if (m_CurTag)
299 {
300 if (m_CurTag->HasEnding())
301 begin_pos = m_CurTag->GetEndIter2();
302 else
303 begin_pos = m_CurTag->GetBeginIter();
304 wxHtmlTag *t = m_CurTag;
305 m_CurTag = m_CurTag->GetNextTag();
306 AddTag(*t);
307 if (m_stopParsing)
308 return;
309 }
310 else break;
311 }
312 }
313
314 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
315 {
316 wxHtmlTagHandler *h;
317 bool inner = false;
318
319 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
320 if (h)
321 {
322 inner = h->HandleTag(tag);
323 if (m_stopParsing)
324 return;
325 }
326 if (!inner)
327 {
328 if (tag.HasEnding())
329 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
330 }
331 }
332
333 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
334 {
335 wxString s(handler->GetSupportedTags());
336 wxStringTokenizer tokenizer(s, wxT(", "));
337
338 while (tokenizer.HasMoreTokens())
339 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
340
341 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
342 m_HandlersList.Append(handler);
343
344 handler->SetParser(this);
345 }
346
347 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
348 {
349 wxStringTokenizer tokenizer(tags, wxT(", "));
350 wxString key;
351
352 if (m_HandlersStack == NULL)
353 {
354 m_HandlersStack = new wxList;
355 }
356
357 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
358
359 while (tokenizer.HasMoreTokens())
360 {
361 key = tokenizer.GetNextToken();
362 m_HandlersHash.Delete(key);
363 m_HandlersHash.Put(key, handler);
364 }
365 }
366
367 void wxHtmlParser::PopTagHandler()
368 {
369 wxList::compatibility_iterator first;
370
371 if ( !m_HandlersStack ||
372 #if wxUSE_STL
373 !(first = m_HandlersStack->GetFirst())
374 #else // !wxUSE_STL
375 ((first = m_HandlersStack->GetFirst()) == NULL)
376 #endif // wxUSE_STL/!wxUSE_STL
377 )
378 {
379 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
380 return;
381 }
382 m_HandlersHash = *((wxHashTable*) first->GetData());
383 delete (wxHashTable*) first->GetData();
384 m_HandlersStack->Erase(first);
385 }
386
387 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
388 {
389 wxHtmlParserState *s = new wxHtmlParserState;
390
391 s->m_curTag = m_CurTag;
392 s->m_tags = m_Tags;
393 s->m_textPieces = m_TextPieces;
394 s->m_curTextPiece = m_CurTextPiece;
395 s->m_source = m_Source;
396
397 s->m_nextState = m_SavedStates;
398 m_SavedStates = s;
399
400 m_CurTag = NULL;
401 m_Tags = NULL;
402 m_TextPieces = NULL;
403 m_CurTextPiece = 0;
404 m_Source = NULL;
405
406 SetSource(src);
407 }
408
409 bool wxHtmlParser::RestoreState()
410 {
411 if (!m_SavedStates) return false;
412
413 DestroyDOMTree();
414
415 wxHtmlParserState *s = m_SavedStates;
416 m_SavedStates = s->m_nextState;
417
418 m_CurTag = s->m_curTag;
419 m_Tags = s->m_tags;
420 m_TextPieces = s->m_textPieces;
421 m_CurTextPiece = s->m_curTextPiece;
422 m_Source = s->m_source;
423
424 delete s;
425 return true;
426 }
427
428 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
429 {
430 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
431 }
432
433 //-----------------------------------------------------------------------------
434 // wxHtmlTagHandler
435 //-----------------------------------------------------------------------------
436
437 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
438
439 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
440 {
441 // It is safe to temporarily change the source being parsed,
442 // provided we restore the state back after parsing
443 m_Parser->SetSourceAndSaveState(source);
444 m_Parser->DoParsing();
445 m_Parser->RestoreState();
446 }
447
448
449 //-----------------------------------------------------------------------------
450 // wxHtmlEntitiesParser
451 //-----------------------------------------------------------------------------
452
453 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
454
455 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
456 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
457 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
458 #endif
459 {
460 }
461
462 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
463 {
464 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
465 delete m_conv;
466 #endif
467 }
468
469 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
470 {
471 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
472 if (encoding == m_encoding)
473 return;
474
475 delete m_conv;
476
477 m_encoding = encoding;
478 if (m_encoding == wxFONTENCODING_SYSTEM)
479 m_conv = NULL;
480 else
481 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
482 #else
483 (void) encoding;
484 #endif
485 }
486
487 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
488 {
489 wxString output;
490
491 const wxString::const_iterator end(input.end());
492 wxString::const_iterator c(input.begin());
493 wxString::const_iterator last(c);
494
495 for ( ; c < end; ++c )
496 {
497 if (*c == wxT('&'))
498 {
499 if ( output.empty() )
500 output.reserve(input.length());
501
502 if (c - last > 0)
503 output.append(last, c);
504 if ( ++c == end )
505 break;
506
507 wxString entity;
508 const wxString::const_iterator ent_s = c;
509 wxChar entity_char;
510
511 for ( ; c != end; ++c )
512 {
513 wxChar ch = *c;
514 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
515 (ch >= wxT('A') && ch <= wxT('Z')) ||
516 (ch >= wxT('0') && ch <= wxT('9')) ||
517 ch == wxT('_') || ch == wxT('#')) )
518 break;
519 }
520
521 entity.append(ent_s, c);
522 if (c == end || *c != wxT(';')) --c;
523 last = c+1;
524 entity_char = GetEntityChar(entity);
525 if (entity_char)
526 output << entity_char;
527 else
528 {
529 output.append(ent_s-1, c+1);
530 wxLogTrace(wxTRACE_HTML_DEBUG,
531 "Unrecognized HTML entity: '%s'",
532 entity);
533 }
534 }
535 }
536 if ( last == input.begin() ) // common case: no entity
537 return input;
538 if ( last != end )
539 output.append(last, end);
540 return output;
541 }
542
543 #if !wxUSE_UNICODE
544 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
545 {
546 #if wxUSE_WCHAR_T
547 char buf[2];
548 wchar_t wbuf[2];
549 wbuf[0] = (wchar_t)code;
550 wbuf[1] = 0;
551 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
552 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
553 return '?';
554 return buf[0];
555 #else
556 return (code < 256) ? (wxChar)code : '?';
557 #endif
558 }
559 #endif
560
561 struct wxHtmlEntityInfo
562 {
563 const wxStringCharType *name;
564 unsigned code;
565 };
566
567 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
568 {
569 #if wxUSE_UNICODE_UTF8
570 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
571 #else
572 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
573 #endif
574 }
575
576 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
577 {
578 unsigned code = 0;
579
580 if (entity[0] == wxT('#'))
581 {
582 // NB: parsed value is a number, so it's OK to use wx_str(), internal
583 // representation is the same for numbers
584 const wxStringCharType *ent_s = entity.wx_str();
585 const wxStringCharType *format;
586
587 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
588 {
589 format = wxSTRING_TEXT("%x");
590 ent_s++;
591 }
592 else
593 format = wxSTRING_TEXT("%u");
594 ent_s++;
595
596 if (wxSscanf(ent_s, format, &code) != 1)
597 code = 0;
598 }
599 else
600 {
601 // store the literals in wx's internal representation (either char*
602 // in UTF-8 or wchar_t*) for best performance:
603 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
604
605 static wxHtmlEntityInfo substitutions[] = {
606 ENTITY("AElig", 198),
607 ENTITY("Aacute", 193),
608 ENTITY("Acirc", 194),
609 ENTITY("Agrave", 192),
610 ENTITY("Alpha", 913),
611 ENTITY("Aring", 197),
612 ENTITY("Atilde", 195),
613 ENTITY("Auml", 196),
614 ENTITY("Beta", 914),
615 ENTITY("Ccedil", 199),
616 ENTITY("Chi", 935),
617 ENTITY("Dagger", 8225),
618 ENTITY("Delta", 916),
619 ENTITY("ETH", 208),
620 ENTITY("Eacute", 201),
621 ENTITY("Ecirc", 202),
622 ENTITY("Egrave", 200),
623 ENTITY("Epsilon", 917),
624 ENTITY("Eta", 919),
625 ENTITY("Euml", 203),
626 ENTITY("Gamma", 915),
627 ENTITY("Iacute", 205),
628 ENTITY("Icirc", 206),
629 ENTITY("Igrave", 204),
630 ENTITY("Iota", 921),
631 ENTITY("Iuml", 207),
632 ENTITY("Kappa", 922),
633 ENTITY("Lambda", 923),
634 ENTITY("Mu", 924),
635 ENTITY("Ntilde", 209),
636 ENTITY("Nu", 925),
637 ENTITY("OElig", 338),
638 ENTITY("Oacute", 211),
639 ENTITY("Ocirc", 212),
640 ENTITY("Ograve", 210),
641 ENTITY("Omega", 937),
642 ENTITY("Omicron", 927),
643 ENTITY("Oslash", 216),
644 ENTITY("Otilde", 213),
645 ENTITY("Ouml", 214),
646 ENTITY("Phi", 934),
647 ENTITY("Pi", 928),
648 ENTITY("Prime", 8243),
649 ENTITY("Psi", 936),
650 ENTITY("Rho", 929),
651 ENTITY("Scaron", 352),
652 ENTITY("Sigma", 931),
653 ENTITY("THORN", 222),
654 ENTITY("Tau", 932),
655 ENTITY("Theta", 920),
656 ENTITY("Uacute", 218),
657 ENTITY("Ucirc", 219),
658 ENTITY("Ugrave", 217),
659 ENTITY("Upsilon", 933),
660 ENTITY("Uuml", 220),
661 ENTITY("Xi", 926),
662 ENTITY("Yacute", 221),
663 ENTITY("Yuml", 376),
664 ENTITY("Zeta", 918),
665 ENTITY("aacute", 225),
666 ENTITY("acirc", 226),
667 ENTITY("acute", 180),
668 ENTITY("aelig", 230),
669 ENTITY("agrave", 224),
670 ENTITY("alefsym", 8501),
671 ENTITY("alpha", 945),
672 ENTITY("amp", 38),
673 ENTITY("and", 8743),
674 ENTITY("ang", 8736),
675 ENTITY("aring", 229),
676 ENTITY("asymp", 8776),
677 ENTITY("atilde", 227),
678 ENTITY("auml", 228),
679 ENTITY("bdquo", 8222),
680 ENTITY("beta", 946),
681 ENTITY("brvbar", 166),
682 ENTITY("bull", 8226),
683 ENTITY("cap", 8745),
684 ENTITY("ccedil", 231),
685 ENTITY("cedil", 184),
686 ENTITY("cent", 162),
687 ENTITY("chi", 967),
688 ENTITY("circ", 710),
689 ENTITY("clubs", 9827),
690 ENTITY("cong", 8773),
691 ENTITY("copy", 169),
692 ENTITY("crarr", 8629),
693 ENTITY("cup", 8746),
694 ENTITY("curren", 164),
695 ENTITY("dArr", 8659),
696 ENTITY("dagger", 8224),
697 ENTITY("darr", 8595),
698 ENTITY("deg", 176),
699 ENTITY("delta", 948),
700 ENTITY("diams", 9830),
701 ENTITY("divide", 247),
702 ENTITY("eacute", 233),
703 ENTITY("ecirc", 234),
704 ENTITY("egrave", 232),
705 ENTITY("empty", 8709),
706 ENTITY("emsp", 8195),
707 ENTITY("ensp", 8194),
708 ENTITY("epsilon", 949),
709 ENTITY("equiv", 8801),
710 ENTITY("eta", 951),
711 ENTITY("eth", 240),
712 ENTITY("euml", 235),
713 ENTITY("euro", 8364),
714 ENTITY("exist", 8707),
715 ENTITY("fnof", 402),
716 ENTITY("forall", 8704),
717 ENTITY("frac12", 189),
718 ENTITY("frac14", 188),
719 ENTITY("frac34", 190),
720 ENTITY("frasl", 8260),
721 ENTITY("gamma", 947),
722 ENTITY("ge", 8805),
723 ENTITY("gt", 62),
724 ENTITY("hArr", 8660),
725 ENTITY("harr", 8596),
726 ENTITY("hearts", 9829),
727 ENTITY("hellip", 8230),
728 ENTITY("iacute", 237),
729 ENTITY("icirc", 238),
730 ENTITY("iexcl", 161),
731 ENTITY("igrave", 236),
732 ENTITY("image", 8465),
733 ENTITY("infin", 8734),
734 ENTITY("int", 8747),
735 ENTITY("iota", 953),
736 ENTITY("iquest", 191),
737 ENTITY("isin", 8712),
738 ENTITY("iuml", 239),
739 ENTITY("kappa", 954),
740 ENTITY("lArr", 8656),
741 ENTITY("lambda", 955),
742 ENTITY("lang", 9001),
743 ENTITY("laquo", 171),
744 ENTITY("larr", 8592),
745 ENTITY("lceil", 8968),
746 ENTITY("ldquo", 8220),
747 ENTITY("le", 8804),
748 ENTITY("lfloor", 8970),
749 ENTITY("lowast", 8727),
750 ENTITY("loz", 9674),
751 ENTITY("lrm", 8206),
752 ENTITY("lsaquo", 8249),
753 ENTITY("lsquo", 8216),
754 ENTITY("lt", 60),
755 ENTITY("macr", 175),
756 ENTITY("mdash", 8212),
757 ENTITY("micro", 181),
758 ENTITY("middot", 183),
759 ENTITY("minus", 8722),
760 ENTITY("mu", 956),
761 ENTITY("nabla", 8711),
762 ENTITY("nbsp", 160),
763 ENTITY("ndash", 8211),
764 ENTITY("ne", 8800),
765 ENTITY("ni", 8715),
766 ENTITY("not", 172),
767 ENTITY("notin", 8713),
768 ENTITY("nsub", 8836),
769 ENTITY("ntilde", 241),
770 ENTITY("nu", 957),
771 ENTITY("oacute", 243),
772 ENTITY("ocirc", 244),
773 ENTITY("oelig", 339),
774 ENTITY("ograve", 242),
775 ENTITY("oline", 8254),
776 ENTITY("omega", 969),
777 ENTITY("omicron", 959),
778 ENTITY("oplus", 8853),
779 ENTITY("or", 8744),
780 ENTITY("ordf", 170),
781 ENTITY("ordm", 186),
782 ENTITY("oslash", 248),
783 ENTITY("otilde", 245),
784 ENTITY("otimes", 8855),
785 ENTITY("ouml", 246),
786 ENTITY("para", 182),
787 ENTITY("part", 8706),
788 ENTITY("permil", 8240),
789 ENTITY("perp", 8869),
790 ENTITY("phi", 966),
791 ENTITY("pi", 960),
792 ENTITY("piv", 982),
793 ENTITY("plusmn", 177),
794 ENTITY("pound", 163),
795 ENTITY("prime", 8242),
796 ENTITY("prod", 8719),
797 ENTITY("prop", 8733),
798 ENTITY("psi", 968),
799 ENTITY("quot", 34),
800 ENTITY("rArr", 8658),
801 ENTITY("radic", 8730),
802 ENTITY("rang", 9002),
803 ENTITY("raquo", 187),
804 ENTITY("rarr", 8594),
805 ENTITY("rceil", 8969),
806 ENTITY("rdquo", 8221),
807 ENTITY("real", 8476),
808 ENTITY("reg", 174),
809 ENTITY("rfloor", 8971),
810 ENTITY("rho", 961),
811 ENTITY("rlm", 8207),
812 ENTITY("rsaquo", 8250),
813 ENTITY("rsquo", 8217),
814 ENTITY("sbquo", 8218),
815 ENTITY("scaron", 353),
816 ENTITY("sdot", 8901),
817 ENTITY("sect", 167),
818 ENTITY("shy", 173),
819 ENTITY("sigma", 963),
820 ENTITY("sigmaf", 962),
821 ENTITY("sim", 8764),
822 ENTITY("spades", 9824),
823 ENTITY("sub", 8834),
824 ENTITY("sube", 8838),
825 ENTITY("sum", 8721),
826 ENTITY("sup", 8835),
827 ENTITY("sup1", 185),
828 ENTITY("sup2", 178),
829 ENTITY("sup3", 179),
830 ENTITY("supe", 8839),
831 ENTITY("szlig", 223),
832 ENTITY("tau", 964),
833 ENTITY("there4", 8756),
834 ENTITY("theta", 952),
835 ENTITY("thetasym", 977),
836 ENTITY("thinsp", 8201),
837 ENTITY("thorn", 254),
838 ENTITY("tilde", 732),
839 ENTITY("times", 215),
840 ENTITY("trade", 8482),
841 ENTITY("uArr", 8657),
842 ENTITY("uacute", 250),
843 ENTITY("uarr", 8593),
844 ENTITY("ucirc", 251),
845 ENTITY("ugrave", 249),
846 ENTITY("uml", 168),
847 ENTITY("upsih", 978),
848 ENTITY("upsilon", 965),
849 ENTITY("uuml", 252),
850 ENTITY("weierp", 8472),
851 ENTITY("xi", 958),
852 ENTITY("yacute", 253),
853 ENTITY("yen", 165),
854 ENTITY("yuml", 255),
855 ENTITY("zeta", 950),
856 ENTITY("zwj", 8205),
857 ENTITY("zwnj", 8204),
858 {NULL, 0}};
859 #undef ENTITY
860 static size_t substitutions_cnt = 0;
861
862 if (substitutions_cnt == 0)
863 while (substitutions[substitutions_cnt].code != 0)
864 substitutions_cnt++;
865
866 wxHtmlEntityInfo *info = NULL;
867 #ifdef __WXWINCE__
868 // bsearch crashes under WinCE for some reason
869 size_t i;
870 for (i = 0; i < substitutions_cnt; i++)
871 {
872 if (entity == substitutions[i].name)
873 {
874 info = & substitutions[i];
875 break;
876 }
877 }
878 #else
879 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
880 substitutions_cnt,
881 sizeof(wxHtmlEntityInfo),
882 wxHtmlEntityCompare);
883 #endif
884 if (info)
885 code = info->code;
886 }
887
888 if (code == 0)
889 return 0;
890 else
891 return GetCharForCode(code);
892 }
893
894 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
895 const wxString& url) const
896 {
897 return m_FS ? m_FS->OpenFile(url) : NULL;
898
899 }
900
901
902 //-----------------------------------------------------------------------------
903 // wxHtmlParser::ExtractCharsetInformation
904 //-----------------------------------------------------------------------------
905
906 class wxMetaTagParser : public wxHtmlParser
907 {
908 public:
909 wxMetaTagParser() { }
910
911 wxObject* GetProduct() { return NULL; }
912
913 protected:
914 virtual void AddText(const wxString& WXUNUSED(txt)) {}
915
916 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
917 };
918
919 class wxMetaTagHandler : public wxHtmlTagHandler
920 {
921 public:
922 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
923 wxString GetSupportedTags() { return wxT("META,BODY"); }
924 bool HandleTag(const wxHtmlTag& tag);
925
926 private:
927 wxString *m_retval;
928
929 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
930 };
931
932 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
933 {
934 if (tag.GetName() == _T("BODY"))
935 {
936 m_Parser->StopParsing();
937 return false;
938 }
939
940 if (tag.HasParam(_T("HTTP-EQUIV")) &&
941 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
942 tag.HasParam(_T("CONTENT")))
943 {
944 wxString content = tag.GetParam(_T("CONTENT")).Lower();
945 if (content.Left(19) == _T("text/html; charset="))
946 {
947 *m_retval = content.Mid(19);
948 m_Parser->StopParsing();
949 }
950 }
951 return false;
952 }
953
954
955 /*static*/
956 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
957 {
958 wxString charset;
959 wxMetaTagParser *parser = new wxMetaTagParser();
960 if(parser)
961 {
962 parser->AddTagHandler(new wxMetaTagHandler(&charset));
963 parser->Parse(markup);
964 delete parser;
965 }
966 return charset;
967 }
968
969 /* static */
970 bool
971 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
972 wxString::const_iterator end)
973 {
974 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
975
976 wxString::const_iterator p = start;
977
978 // comments begin with "<!--" in HTML 4.0
979 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
980 {
981 // not a comment at all
982 return false;
983 }
984
985 // skip the start of the comment tag in any case, if we don't find the
986 // closing tag we should ignore broken markup
987 start = p;
988
989 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
990 // comment delimiter and the closing tag character (section 3.2.4 of
991 // http://www.w3.org/TR/html401/)
992 int dashes = 0;
993 while ( ++p < end )
994 {
995 const wxChar c = *p;
996
997 if ( (c == wxT(' ') || c == wxT('\n') ||
998 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
999 {
1000 // ignore white space before potential tag end
1001 continue;
1002 }
1003
1004 if ( c == wxT('>') && dashes >= 2 )
1005 {
1006 // found end of comment
1007 start = p;
1008 break;
1009 }
1010
1011 if ( c == wxT('-') )
1012 dashes++;
1013 else
1014 dashes = 0;
1015 }
1016
1017 return true;
1018 }
1019
1020 #endif // wxUSE_HTML