Remove wxUSE_WCHAR_T checks.
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
47 class wxHtmlTextPiece
48 {
49 public:
50 wxHtmlTextPiece() {}
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
55 };
56
57 // NB: this is an empty class and not typedef because of forward declaration
58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59 {
60 };
61
62 class wxHtmlParserState
63 {
64 public:
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
69 const wxString *m_source;
70 wxHtmlParserState *m_nextState;
71 };
72
73 //-----------------------------------------------------------------------------
74 // wxHtmlParser
75 //-----------------------------------------------------------------------------
76
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
79 wxHtmlParser::wxHtmlParser()
80 : wxObject(),
81 m_FS(NULL)
82 {
83 m_Source = NULL;
84 m_entitiesParser = new wxHtmlEntitiesParser;
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
90 }
91
92 wxHtmlParser::~wxHtmlParser()
93 {
94 while (RestoreState()) {}
95 DestroyDOMTree();
96
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
99 delete m_entitiesParser;
100 delete m_Source;
101 }
102
103 wxObject* wxHtmlParser::Parse(const wxString& source)
104 {
105 InitParser(source);
106 DoParsing();
107 wxObject *result = GetProduct();
108 DoneParser();
109 return result;
110 }
111
112 void wxHtmlParser::InitParser(const wxString& source)
113 {
114 SetSource(source);
115 m_stopParsing = false;
116 }
117
118 void wxHtmlParser::DoneParser()
119 {
120 DestroyDOMTree();
121 }
122
123 void wxHtmlParser::SetSource(const wxString& src)
124 {
125 DestroyDOMTree();
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
133 delete m_Source;
134 m_Source = new wxString(src);
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
138 }
139
140 void wxHtmlParser::CreateDOMTree()
141 {
142 wxHtmlTagsCache cache(*m_Source);
143 m_TextPieces = new wxHtmlTextPieces;
144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
145 m_CurTextPiece = 0;
146 }
147
148 extern bool wxIsCDATAElement(const wxString& tag);
149
150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
153 wxHtmlTagsCache *cache)
154 {
155 if (end_pos <= begin_pos)
156 return;
157
158 wxChar c;
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
161
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
167 {
168 i = end_pos;
169 }
170
171 while (i < end_pos)
172 {
173 c = *i;
174
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
180
181 // if it is a comment, skip it:
182 if ( SkipCommentTag(i, m_Source->end()) )
183 {
184 textBeginning = i = i + 1; // skip closing '>' too
185 }
186
187 // add another tag to the tree:
188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
189 {
190 wxHtmlTag *chd;
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
193 i, end_pos, cache, m_entitiesParser);
194 else
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
198 if (!m_Tags)
199 {
200 // if this is the first tag to be created make the root
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
206 // if there is already a root tag add this tag as
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
216 chd->GetBeginIter(), chd->GetEndIter1(),
217 cache);
218 i = chd->GetEndIter2();
219 }
220 else
221 i = chd->GetBeginIter();
222
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
227 else
228 {
229 while (i < end_pos && *i != wxT('>')) ++i;
230 textBeginning = i+1;
231 }
232 }
233 else ++i;
234 }
235
236 // add remaining text to m_TextPieces:
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
239 }
240
241 void wxHtmlParser::DestroyDOMTree()
242 {
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
253 delete m_TextPieces;
254 m_TextPieces = NULL;
255 }
256
257 void wxHtmlParser::DoParsing()
258 {
259 m_CurTag = m_Tags;
260 m_CurTextPiece = 0;
261 DoParsing(m_Source->begin(), m_Source->end());
262 }
263
264 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
265 const wxString::const_iterator& end_pos)
266 {
267 wxString::const_iterator begin_pos(begin_pos_);
268
269 if (end_pos <= begin_pos)
270 return;
271
272 wxHtmlTextPieces& pieces = *m_TextPieces;
273 size_t piecesCnt = pieces.size();
274
275 while (begin_pos < end_pos)
276 {
277 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
278 m_CurTag = m_CurTag->GetNextTag();
279 while (m_CurTextPiece < piecesCnt &&
280 pieces[m_CurTextPiece].m_start < begin_pos)
281 m_CurTextPiece++;
282
283 if (m_CurTextPiece < piecesCnt &&
284 (!m_CurTag ||
285 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
286 {
287 // Add text:
288 AddText(GetEntitiesParser()->Parse(
289 wxString(pieces[m_CurTextPiece].m_start,
290 pieces[m_CurTextPiece].m_end)));
291 begin_pos = pieces[m_CurTextPiece].m_end;
292 m_CurTextPiece++;
293 }
294 else if (m_CurTag)
295 {
296 if (m_CurTag->HasEnding())
297 begin_pos = m_CurTag->GetEndIter2();
298 else
299 begin_pos = m_CurTag->GetBeginIter();
300 wxHtmlTag *t = m_CurTag;
301 m_CurTag = m_CurTag->GetNextTag();
302 AddTag(*t);
303 if (m_stopParsing)
304 return;
305 }
306 else break;
307 }
308 }
309
310 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
311 {
312 bool inner = false;
313
314 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
315 if (h != m_HandlersHash.end())
316 {
317 inner = h->second->HandleTag(tag);
318 if (m_stopParsing)
319 return;
320 }
321 if (!inner)
322 {
323 if (tag.HasEnding())
324 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
325 }
326 }
327
328 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
329 {
330 wxString s(handler->GetSupportedTags());
331 wxStringTokenizer tokenizer(s, wxT(", "));
332
333 while (tokenizer.HasMoreTokens())
334 m_HandlersHash[tokenizer.GetNextToken()] = handler;
335
336 m_HandlersSet.insert(handler);
337
338 handler->SetParser(this);
339 }
340
341 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
342 {
343 wxStringTokenizer tokenizer(tags, wxT(", "));
344 wxString key;
345
346 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
347
348 while (tokenizer.HasMoreTokens())
349 {
350 key = tokenizer.GetNextToken();
351 m_HandlersHash[key] = handler;
352 }
353 }
354
355 void wxHtmlParser::PopTagHandler()
356 {
357 wxCHECK_RET( !m_HandlersStack.empty(),
358 "attempt to remove HTML tag handler from empty stack" );
359
360 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
361 m_HandlersStack.pop_back();
362 m_HandlersHash = *prev;
363 delete prev;
364 }
365
366 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
367 {
368 wxHtmlParserState *s = new wxHtmlParserState;
369
370 s->m_curTag = m_CurTag;
371 s->m_tags = m_Tags;
372 s->m_textPieces = m_TextPieces;
373 s->m_curTextPiece = m_CurTextPiece;
374 s->m_source = m_Source;
375
376 s->m_nextState = m_SavedStates;
377 m_SavedStates = s;
378
379 m_CurTag = NULL;
380 m_Tags = NULL;
381 m_TextPieces = NULL;
382 m_CurTextPiece = 0;
383 m_Source = NULL;
384
385 SetSource(src);
386 }
387
388 bool wxHtmlParser::RestoreState()
389 {
390 if (!m_SavedStates) return false;
391
392 DestroyDOMTree();
393 delete m_Source;
394
395 wxHtmlParserState *s = m_SavedStates;
396 m_SavedStates = s->m_nextState;
397
398 m_CurTag = s->m_curTag;
399 m_Tags = s->m_tags;
400 m_TextPieces = s->m_textPieces;
401 m_CurTextPiece = s->m_curTextPiece;
402 m_Source = s->m_source;
403
404 delete s;
405 return true;
406 }
407
408 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
409 {
410 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
411 }
412
413 //-----------------------------------------------------------------------------
414 // wxHtmlTagHandler
415 //-----------------------------------------------------------------------------
416
417 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
418
419 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
420 {
421 // It is safe to temporarily change the source being parsed,
422 // provided we restore the state back after parsing
423 m_Parser->SetSourceAndSaveState(source);
424 m_Parser->DoParsing();
425 m_Parser->RestoreState();
426 }
427
428
429 //-----------------------------------------------------------------------------
430 // wxHtmlEntitiesParser
431 //-----------------------------------------------------------------------------
432
433 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
434
435 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
436 #if !wxUSE_UNICODE
437 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
438 #endif
439 {
440 }
441
442 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
443 {
444 #if !wxUSE_UNICODE
445 delete m_conv;
446 #endif
447 }
448
449 #if !wxUSE_UNICODE
450 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
451 {
452 if (encoding == m_encoding)
453 return;
454
455 delete m_conv;
456
457 m_encoding = encoding;
458 if (m_encoding == wxFONTENCODING_SYSTEM)
459 m_conv = NULL;
460 else
461 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
462 }
463 #endif // !wxUSE_UNICODE
464
465 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
466 {
467 wxString output;
468
469 const wxString::const_iterator end(input.end());
470 wxString::const_iterator c(input.begin());
471 wxString::const_iterator last(c);
472
473 for ( ; c < end; ++c )
474 {
475 if (*c == wxT('&'))
476 {
477 if ( output.empty() )
478 output.reserve(input.length());
479
480 if (c - last > 0)
481 output.append(last, c);
482 if ( ++c == end )
483 break;
484
485 wxString entity;
486 const wxString::const_iterator ent_s = c;
487 wxChar entity_char;
488
489 for ( ; c != end; ++c )
490 {
491 wxChar ch = *c;
492 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
493 (ch >= wxT('A') && ch <= wxT('Z')) ||
494 (ch >= wxT('0') && ch <= wxT('9')) ||
495 ch == wxT('_') || ch == wxT('#')) )
496 break;
497 }
498
499 entity.append(ent_s, c);
500 if (c == end || *c != wxT(';')) --c;
501 last = c+1;
502 entity_char = GetEntityChar(entity);
503 if (entity_char)
504 output << entity_char;
505 else
506 {
507 output.append(ent_s-1, c+1);
508 wxLogTrace(wxTRACE_HTML_DEBUG,
509 "Unrecognized HTML entity: '%s'",
510 entity);
511 }
512 }
513 }
514 if ( last == input.begin() ) // common case: no entity
515 return input;
516 if ( last != end )
517 output.append(last, end);
518 return output;
519 }
520
521 #if !wxUSE_UNICODE
522 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
523 {
524 char buf[2];
525 wchar_t wbuf[2];
526 wbuf[0] = (wchar_t)code;
527 wbuf[1] = 0;
528 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
529 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
530 return '?';
531 return buf[0];
532 }
533 #endif
534
535 struct wxHtmlEntityInfo
536 {
537 const wxStringCharType *name;
538 unsigned code;
539 };
540
541 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
542 {
543 #if wxUSE_UNICODE_UTF8
544 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
545 #else
546 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
547 #endif
548 }
549
550 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
551 {
552 unsigned code = 0;
553
554 if (entity.empty())
555 return 0; // invalid entity reference
556
557 if (entity[0] == wxT('#'))
558 {
559 // NB: parsed value is a number, so it's OK to use wx_str(), internal
560 // representation is the same for numbers
561 const wxStringCharType *ent_s = entity.wx_str();
562 const wxStringCharType *format;
563
564 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
565 {
566 format = wxS("%x");
567 ent_s++;
568 }
569 else
570 format = wxS("%u");
571 ent_s++;
572
573 if (wxSscanf(ent_s, format, &code) != 1)
574 code = 0;
575 }
576 else
577 {
578 // store the literals in wx's internal representation (either char*
579 // in UTF-8 or wchar_t*) for best performance:
580 #define ENTITY(name, code) { wxS(name), code }
581
582 static wxHtmlEntityInfo substitutions[] = {
583 ENTITY("AElig", 198),
584 ENTITY("Aacute", 193),
585 ENTITY("Acirc", 194),
586 ENTITY("Agrave", 192),
587 ENTITY("Alpha", 913),
588 ENTITY("Aring", 197),
589 ENTITY("Atilde", 195),
590 ENTITY("Auml", 196),
591 ENTITY("Beta", 914),
592 ENTITY("Ccedil", 199),
593 ENTITY("Chi", 935),
594 ENTITY("Dagger", 8225),
595 ENTITY("Delta", 916),
596 ENTITY("ETH", 208),
597 ENTITY("Eacute", 201),
598 ENTITY("Ecirc", 202),
599 ENTITY("Egrave", 200),
600 ENTITY("Epsilon", 917),
601 ENTITY("Eta", 919),
602 ENTITY("Euml", 203),
603 ENTITY("Gamma", 915),
604 ENTITY("Iacute", 205),
605 ENTITY("Icirc", 206),
606 ENTITY("Igrave", 204),
607 ENTITY("Iota", 921),
608 ENTITY("Iuml", 207),
609 ENTITY("Kappa", 922),
610 ENTITY("Lambda", 923),
611 ENTITY("Mu", 924),
612 ENTITY("Ntilde", 209),
613 ENTITY("Nu", 925),
614 ENTITY("OElig", 338),
615 ENTITY("Oacute", 211),
616 ENTITY("Ocirc", 212),
617 ENTITY("Ograve", 210),
618 ENTITY("Omega", 937),
619 ENTITY("Omicron", 927),
620 ENTITY("Oslash", 216),
621 ENTITY("Otilde", 213),
622 ENTITY("Ouml", 214),
623 ENTITY("Phi", 934),
624 ENTITY("Pi", 928),
625 ENTITY("Prime", 8243),
626 ENTITY("Psi", 936),
627 ENTITY("Rho", 929),
628 ENTITY("Scaron", 352),
629 ENTITY("Sigma", 931),
630 ENTITY("THORN", 222),
631 ENTITY("Tau", 932),
632 ENTITY("Theta", 920),
633 ENTITY("Uacute", 218),
634 ENTITY("Ucirc", 219),
635 ENTITY("Ugrave", 217),
636 ENTITY("Upsilon", 933),
637 ENTITY("Uuml", 220),
638 ENTITY("Xi", 926),
639 ENTITY("Yacute", 221),
640 ENTITY("Yuml", 376),
641 ENTITY("Zeta", 918),
642 ENTITY("aacute", 225),
643 ENTITY("acirc", 226),
644 ENTITY("acute", 180),
645 ENTITY("aelig", 230),
646 ENTITY("agrave", 224),
647 ENTITY("alefsym", 8501),
648 ENTITY("alpha", 945),
649 ENTITY("amp", 38),
650 ENTITY("and", 8743),
651 ENTITY("ang", 8736),
652 ENTITY("apos", 39),
653 ENTITY("aring", 229),
654 ENTITY("asymp", 8776),
655 ENTITY("atilde", 227),
656 ENTITY("auml", 228),
657 ENTITY("bdquo", 8222),
658 ENTITY("beta", 946),
659 ENTITY("brvbar", 166),
660 ENTITY("bull", 8226),
661 ENTITY("cap", 8745),
662 ENTITY("ccedil", 231),
663 ENTITY("cedil", 184),
664 ENTITY("cent", 162),
665 ENTITY("chi", 967),
666 ENTITY("circ", 710),
667 ENTITY("clubs", 9827),
668 ENTITY("cong", 8773),
669 ENTITY("copy", 169),
670 ENTITY("crarr", 8629),
671 ENTITY("cup", 8746),
672 ENTITY("curren", 164),
673 ENTITY("dArr", 8659),
674 ENTITY("dagger", 8224),
675 ENTITY("darr", 8595),
676 ENTITY("deg", 176),
677 ENTITY("delta", 948),
678 ENTITY("diams", 9830),
679 ENTITY("divide", 247),
680 ENTITY("eacute", 233),
681 ENTITY("ecirc", 234),
682 ENTITY("egrave", 232),
683 ENTITY("empty", 8709),
684 ENTITY("emsp", 8195),
685 ENTITY("ensp", 8194),
686 ENTITY("epsilon", 949),
687 ENTITY("equiv", 8801),
688 ENTITY("eta", 951),
689 ENTITY("eth", 240),
690 ENTITY("euml", 235),
691 ENTITY("euro", 8364),
692 ENTITY("exist", 8707),
693 ENTITY("fnof", 402),
694 ENTITY("forall", 8704),
695 ENTITY("frac12", 189),
696 ENTITY("frac14", 188),
697 ENTITY("frac34", 190),
698 ENTITY("frasl", 8260),
699 ENTITY("gamma", 947),
700 ENTITY("ge", 8805),
701 ENTITY("gt", 62),
702 ENTITY("hArr", 8660),
703 ENTITY("harr", 8596),
704 ENTITY("hearts", 9829),
705 ENTITY("hellip", 8230),
706 ENTITY("iacute", 237),
707 ENTITY("icirc", 238),
708 ENTITY("iexcl", 161),
709 ENTITY("igrave", 236),
710 ENTITY("image", 8465),
711 ENTITY("infin", 8734),
712 ENTITY("int", 8747),
713 ENTITY("iota", 953),
714 ENTITY("iquest", 191),
715 ENTITY("isin", 8712),
716 ENTITY("iuml", 239),
717 ENTITY("kappa", 954),
718 ENTITY("lArr", 8656),
719 ENTITY("lambda", 955),
720 ENTITY("lang", 9001),
721 ENTITY("laquo", 171),
722 ENTITY("larr", 8592),
723 ENTITY("lceil", 8968),
724 ENTITY("ldquo", 8220),
725 ENTITY("le", 8804),
726 ENTITY("lfloor", 8970),
727 ENTITY("lowast", 8727),
728 ENTITY("loz", 9674),
729 ENTITY("lrm", 8206),
730 ENTITY("lsaquo", 8249),
731 ENTITY("lsquo", 8216),
732 ENTITY("lt", 60),
733 ENTITY("macr", 175),
734 ENTITY("mdash", 8212),
735 ENTITY("micro", 181),
736 ENTITY("middot", 183),
737 ENTITY("minus", 8722),
738 ENTITY("mu", 956),
739 ENTITY("nabla", 8711),
740 ENTITY("nbsp", 160),
741 ENTITY("ndash", 8211),
742 ENTITY("ne", 8800),
743 ENTITY("ni", 8715),
744 ENTITY("not", 172),
745 ENTITY("notin", 8713),
746 ENTITY("nsub", 8836),
747 ENTITY("ntilde", 241),
748 ENTITY("nu", 957),
749 ENTITY("oacute", 243),
750 ENTITY("ocirc", 244),
751 ENTITY("oelig", 339),
752 ENTITY("ograve", 242),
753 ENTITY("oline", 8254),
754 ENTITY("omega", 969),
755 ENTITY("omicron", 959),
756 ENTITY("oplus", 8853),
757 ENTITY("or", 8744),
758 ENTITY("ordf", 170),
759 ENTITY("ordm", 186),
760 ENTITY("oslash", 248),
761 ENTITY("otilde", 245),
762 ENTITY("otimes", 8855),
763 ENTITY("ouml", 246),
764 ENTITY("para", 182),
765 ENTITY("part", 8706),
766 ENTITY("permil", 8240),
767 ENTITY("perp", 8869),
768 ENTITY("phi", 966),
769 ENTITY("pi", 960),
770 ENTITY("piv", 982),
771 ENTITY("plusmn", 177),
772 ENTITY("pound", 163),
773 ENTITY("prime", 8242),
774 ENTITY("prod", 8719),
775 ENTITY("prop", 8733),
776 ENTITY("psi", 968),
777 ENTITY("quot", 34),
778 ENTITY("rArr", 8658),
779 ENTITY("radic", 8730),
780 ENTITY("rang", 9002),
781 ENTITY("raquo", 187),
782 ENTITY("rarr", 8594),
783 ENTITY("rceil", 8969),
784 ENTITY("rdquo", 8221),
785 ENTITY("real", 8476),
786 ENTITY("reg", 174),
787 ENTITY("rfloor", 8971),
788 ENTITY("rho", 961),
789 ENTITY("rlm", 8207),
790 ENTITY("rsaquo", 8250),
791 ENTITY("rsquo", 8217),
792 ENTITY("sbquo", 8218),
793 ENTITY("scaron", 353),
794 ENTITY("sdot", 8901),
795 ENTITY("sect", 167),
796 ENTITY("shy", 173),
797 ENTITY("sigma", 963),
798 ENTITY("sigmaf", 962),
799 ENTITY("sim", 8764),
800 ENTITY("spades", 9824),
801 ENTITY("sub", 8834),
802 ENTITY("sube", 8838),
803 ENTITY("sum", 8721),
804 ENTITY("sup", 8835),
805 ENTITY("sup1", 185),
806 ENTITY("sup2", 178),
807 ENTITY("sup3", 179),
808 ENTITY("supe", 8839),
809 ENTITY("szlig", 223),
810 ENTITY("tau", 964),
811 ENTITY("there4", 8756),
812 ENTITY("theta", 952),
813 ENTITY("thetasym", 977),
814 ENTITY("thinsp", 8201),
815 ENTITY("thorn", 254),
816 ENTITY("tilde", 732),
817 ENTITY("times", 215),
818 ENTITY("trade", 8482),
819 ENTITY("uArr", 8657),
820 ENTITY("uacute", 250),
821 ENTITY("uarr", 8593),
822 ENTITY("ucirc", 251),
823 ENTITY("ugrave", 249),
824 ENTITY("uml", 168),
825 ENTITY("upsih", 978),
826 ENTITY("upsilon", 965),
827 ENTITY("uuml", 252),
828 ENTITY("weierp", 8472),
829 ENTITY("xi", 958),
830 ENTITY("yacute", 253),
831 ENTITY("yen", 165),
832 ENTITY("yuml", 255),
833 ENTITY("zeta", 950),
834 ENTITY("zwj", 8205),
835 ENTITY("zwnj", 8204),
836 {NULL, 0}};
837 #undef ENTITY
838 static size_t substitutions_cnt = 0;
839
840 if (substitutions_cnt == 0)
841 while (substitutions[substitutions_cnt].code != 0)
842 substitutions_cnt++;
843
844 wxHtmlEntityInfo *info;
845 #ifdef __WXWINCE__
846 // bsearch crashes under WinCE for some reason
847 info = NULL;
848 size_t i;
849 for (i = 0; i < substitutions_cnt; i++)
850 {
851 if (entity == substitutions[i].name)
852 {
853 info = & substitutions[i];
854 break;
855 }
856 }
857 #else
858 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
859 substitutions_cnt,
860 sizeof(wxHtmlEntityInfo),
861 wxHtmlEntityCompare);
862 #endif
863 if (info)
864 code = info->code;
865 }
866
867 if (code == 0)
868 return 0;
869 else
870 return GetCharForCode(code);
871 }
872
873 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
874 const wxString& url) const
875 {
876 return m_FS ? m_FS->OpenFile(url) : NULL;
877
878 }
879
880
881 //-----------------------------------------------------------------------------
882 // wxHtmlParser::ExtractCharsetInformation
883 //-----------------------------------------------------------------------------
884
885 class wxMetaTagParser : public wxHtmlParser
886 {
887 public:
888 wxMetaTagParser() { }
889
890 wxObject* GetProduct() { return NULL; }
891
892 protected:
893 virtual void AddText(const wxString& WXUNUSED(txt)) {}
894
895 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
896 };
897
898 class wxMetaTagHandler : public wxHtmlTagHandler
899 {
900 public:
901 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
902 wxString GetSupportedTags() { return wxT("META,BODY"); }
903 bool HandleTag(const wxHtmlTag& tag);
904
905 private:
906 wxString *m_retval;
907
908 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
909 };
910
911 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
912 {
913 if (tag.GetName() == wxT("BODY"))
914 {
915 m_Parser->StopParsing();
916 return false;
917 }
918
919 if (tag.HasParam(wxT("HTTP-EQUIV")) &&
920 tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
921 tag.HasParam(wxT("CONTENT")))
922 {
923 wxString content = tag.GetParam(wxT("CONTENT")).Lower();
924 if (content.Left(19) == wxT("text/html; charset="))
925 {
926 *m_retval = content.Mid(19);
927 m_Parser->StopParsing();
928 }
929 }
930 return false;
931 }
932
933
934 /*static*/
935 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
936 {
937 wxString charset;
938 wxMetaTagParser *parser = new wxMetaTagParser();
939 if(parser)
940 {
941 parser->AddTagHandler(new wxMetaTagHandler(&charset));
942 parser->Parse(markup);
943 delete parser;
944 }
945 return charset;
946 }
947
948 /* static */
949 bool
950 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
951 wxString::const_iterator end)
952 {
953 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
954
955 wxString::const_iterator p = start;
956
957 // comments begin with "<!--" in HTML 4.0
958 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
959 {
960 // not a comment at all
961 return false;
962 }
963
964 // skip the start of the comment tag in any case, if we don't find the
965 // closing tag we should ignore broken markup
966 start = p;
967
968 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
969 // comment delimiter and the closing tag character (section 3.2.4 of
970 // http://www.w3.org/TR/html401/)
971 int dashes = 0;
972 while ( ++p < end )
973 {
974 const wxChar c = *p;
975
976 if ( (c == wxT(' ') || c == wxT('\n') ||
977 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
978 {
979 // ignore white space before potential tag end
980 continue;
981 }
982
983 if ( c == wxT('>') && dashes >= 2 )
984 {
985 // found end of comment
986 start = p;
987 break;
988 }
989
990 if ( c == wxT('-') )
991 dashes++;
992 else
993 dashes = 0;
994 }
995
996 return true;
997 }
998
999 #endif // wxUSE_HTML