Update OpenVMS compile support
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
47 class wxHtmlTextPiece
48 {
49 public:
50 wxHtmlTextPiece() {}
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
55 };
56
57 // NB: this is an empty class and not typedef because of forward declaration
58 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59 {
60 };
61
62 class wxHtmlParserState
63 {
64 public:
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
69 const wxString *m_source;
70 wxHtmlParserState *m_nextState;
71 };
72
73 //-----------------------------------------------------------------------------
74 // wxHtmlParser
75 //-----------------------------------------------------------------------------
76
77 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
79 wxHtmlParser::wxHtmlParser()
80 : wxObject(),
81 m_FS(NULL)
82 {
83 m_Source = NULL;
84 m_entitiesParser = new wxHtmlEntitiesParser;
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
90 }
91
92 wxHtmlParser::~wxHtmlParser()
93 {
94 while (RestoreState()) {}
95 DestroyDOMTree();
96
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
99 delete m_entitiesParser;
100 delete m_Source;
101 }
102
103 wxObject* wxHtmlParser::Parse(const wxString& source)
104 {
105 InitParser(source);
106 DoParsing();
107 wxObject *result = GetProduct();
108 DoneParser();
109 return result;
110 }
111
112 void wxHtmlParser::InitParser(const wxString& source)
113 {
114 SetSource(source);
115 m_stopParsing = false;
116 }
117
118 void wxHtmlParser::DoneParser()
119 {
120 DestroyDOMTree();
121 }
122
123 void wxHtmlParser::SetSource(const wxString& src)
124 {
125 DestroyDOMTree();
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
133 delete m_Source;
134 m_Source = new wxString(src);
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
138 }
139
140 void wxHtmlParser::CreateDOMTree()
141 {
142 wxHtmlTagsCache cache(*m_Source);
143 m_TextPieces = new wxHtmlTextPieces;
144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
145 m_CurTextPiece = 0;
146 }
147
148 extern bool wxIsCDATAElement(const wxString& tag);
149
150 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
153 wxHtmlTagsCache *cache)
154 {
155 if (end_pos <= begin_pos)
156 return;
157
158 wxChar c;
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
161
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
167 {
168 i = end_pos;
169 }
170
171 while (i < end_pos)
172 {
173 c = *i;
174
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
180
181 // if it is a comment, skip it:
182 if ( SkipCommentTag(i, m_Source->end()) )
183 {
184 textBeginning = i = i + 1; // skip closing '>' too
185 }
186
187 // add another tag to the tree:
188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
189 {
190 wxHtmlTag *chd;
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
193 i, end_pos, cache, m_entitiesParser);
194 else
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
198 if (!m_Tags)
199 {
200 // if this is the first tag to be created make the root
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
206 // if there is already a root tag add this tag as
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
216 chd->GetBeginIter(), chd->GetEndIter1(),
217 cache);
218 i = chd->GetEndIter2();
219 }
220 else
221 i = chd->GetBeginIter();
222
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
227 else
228 {
229 while (i < end_pos && *i != wxT('>')) ++i;
230 textBeginning = i+1;
231 }
232 }
233 else ++i;
234 }
235
236 // add remaining text to m_TextPieces:
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
239 }
240
241 void wxHtmlParser::DestroyDOMTree()
242 {
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
253 wxDELETE(m_TextPieces);
254 }
255
256 void wxHtmlParser::DoParsing()
257 {
258 m_CurTag = m_Tags;
259 m_CurTextPiece = 0;
260 DoParsing(m_Source->begin(), m_Source->end());
261 }
262
263 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
264 const wxString::const_iterator& end_pos)
265 {
266 wxString::const_iterator begin_pos(begin_pos_);
267
268 if (end_pos <= begin_pos)
269 return;
270
271 wxHtmlTextPieces& pieces = *m_TextPieces;
272 size_t piecesCnt = pieces.size();
273
274 while (begin_pos < end_pos)
275 {
276 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
277 m_CurTag = m_CurTag->GetNextTag();
278 while (m_CurTextPiece < piecesCnt &&
279 pieces[m_CurTextPiece].m_start < begin_pos)
280 m_CurTextPiece++;
281
282 if (m_CurTextPiece < piecesCnt &&
283 (!m_CurTag ||
284 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
285 {
286 // Add text:
287 AddText(GetEntitiesParser()->Parse(
288 wxString(pieces[m_CurTextPiece].m_start,
289 pieces[m_CurTextPiece].m_end)));
290 begin_pos = pieces[m_CurTextPiece].m_end;
291 m_CurTextPiece++;
292 }
293 else if (m_CurTag)
294 {
295 if (m_CurTag->HasEnding())
296 begin_pos = m_CurTag->GetEndIter2();
297 else
298 begin_pos = m_CurTag->GetBeginIter();
299 wxHtmlTag *t = m_CurTag;
300 m_CurTag = m_CurTag->GetNextTag();
301 AddTag(*t);
302 if (m_stopParsing)
303 return;
304 }
305 else break;
306 }
307 }
308
309 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
310 {
311 bool inner = false;
312
313 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
314 if (h != m_HandlersHash.end())
315 {
316 inner = h->second->HandleTag(tag);
317 if (m_stopParsing)
318 return;
319 }
320 if (!inner)
321 {
322 if (tag.HasEnding())
323 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
324 }
325 }
326
327 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
328 {
329 wxString s(handler->GetSupportedTags());
330 wxStringTokenizer tokenizer(s, wxT(", "));
331
332 while (tokenizer.HasMoreTokens())
333 m_HandlersHash[tokenizer.GetNextToken()] = handler;
334
335 m_HandlersSet.insert(handler);
336
337 handler->SetParser(this);
338 }
339
340 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
341 {
342 wxStringTokenizer tokenizer(tags, wxT(", "));
343 wxString key;
344
345 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
346
347 while (tokenizer.HasMoreTokens())
348 {
349 key = tokenizer.GetNextToken();
350 m_HandlersHash[key] = handler;
351 }
352 }
353
354 void wxHtmlParser::PopTagHandler()
355 {
356 wxCHECK_RET( !m_HandlersStack.empty(),
357 "attempt to remove HTML tag handler from empty stack" );
358
359 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
360 m_HandlersStack.pop_back();
361 m_HandlersHash = *prev;
362 delete prev;
363 }
364
365 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
366 {
367 wxHtmlParserState *s = new wxHtmlParserState;
368
369 s->m_curTag = m_CurTag;
370 s->m_tags = m_Tags;
371 s->m_textPieces = m_TextPieces;
372 s->m_curTextPiece = m_CurTextPiece;
373 s->m_source = m_Source;
374
375 s->m_nextState = m_SavedStates;
376 m_SavedStates = s;
377
378 m_CurTag = NULL;
379 m_Tags = NULL;
380 m_TextPieces = NULL;
381 m_CurTextPiece = 0;
382 m_Source = NULL;
383
384 SetSource(src);
385 }
386
387 bool wxHtmlParser::RestoreState()
388 {
389 if (!m_SavedStates) return false;
390
391 DestroyDOMTree();
392 delete m_Source;
393
394 wxHtmlParserState *s = m_SavedStates;
395 m_SavedStates = s->m_nextState;
396
397 m_CurTag = s->m_curTag;
398 m_Tags = s->m_tags;
399 m_TextPieces = s->m_textPieces;
400 m_CurTextPiece = s->m_curTextPiece;
401 m_Source = s->m_source;
402
403 delete s;
404 return true;
405 }
406
407 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
408 {
409 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
410 }
411
412 //-----------------------------------------------------------------------------
413 // wxHtmlTagHandler
414 //-----------------------------------------------------------------------------
415
416 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
417
418 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
419 {
420 // It is safe to temporarily change the source being parsed,
421 // provided we restore the state back after parsing
422 m_Parser->SetSourceAndSaveState(source);
423 m_Parser->DoParsing();
424 m_Parser->RestoreState();
425 }
426
427
428 //-----------------------------------------------------------------------------
429 // wxHtmlEntitiesParser
430 //-----------------------------------------------------------------------------
431
432 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
433
434 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
435 #if !wxUSE_UNICODE
436 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
437 #endif
438 {
439 }
440
441 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
442 {
443 #if !wxUSE_UNICODE
444 delete m_conv;
445 #endif
446 }
447
448 #if !wxUSE_UNICODE
449 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
450 {
451 if (encoding == m_encoding)
452 return;
453
454 delete m_conv;
455
456 m_encoding = encoding;
457 if (m_encoding == wxFONTENCODING_SYSTEM)
458 m_conv = NULL;
459 else
460 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
461 }
462 #endif // !wxUSE_UNICODE
463
464 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
465 {
466 wxString output;
467
468 const wxString::const_iterator end(input.end());
469 wxString::const_iterator c(input.begin());
470 wxString::const_iterator last(c);
471
472 for ( ; c < end; ++c )
473 {
474 if (*c == wxT('&'))
475 {
476 if ( output.empty() )
477 output.reserve(input.length());
478
479 if (c - last > 0)
480 output.append(last, c);
481 if ( ++c == end )
482 break;
483
484 wxString entity;
485 const wxString::const_iterator ent_s = c;
486 wxChar entity_char;
487
488 for ( ; c != end; ++c )
489 {
490 wxChar ch = *c;
491 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
492 (ch >= wxT('A') && ch <= wxT('Z')) ||
493 (ch >= wxT('0') && ch <= wxT('9')) ||
494 ch == wxT('_') || ch == wxT('#')) )
495 break;
496 }
497
498 entity.append(ent_s, c);
499 if (c == end || *c != wxT(';')) --c;
500 last = c+1;
501 entity_char = GetEntityChar(entity);
502 if (entity_char)
503 output << entity_char;
504 else
505 {
506 output.append(ent_s-1, c+1);
507 wxLogTrace(wxTRACE_HTML_DEBUG,
508 "Unrecognized HTML entity: '%s'",
509 entity);
510 }
511 }
512 }
513 if ( last == input.begin() ) // common case: no entity
514 return input;
515 if ( last != end )
516 output.append(last, end);
517 return output;
518 }
519
520 #if !wxUSE_UNICODE
521 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
522 {
523 char buf[2];
524 wchar_t wbuf[2];
525 wbuf[0] = (wchar_t)code;
526 wbuf[1] = 0;
527 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
528 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
529 return '?';
530 return buf[0];
531 }
532 #endif
533
534 struct wxHtmlEntityInfo
535 {
536 const wxStringCharType *name;
537 unsigned code;
538 };
539
540 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
541 {
542 #if wxUSE_UNICODE_UTF8
543 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
544 #else
545 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
546 #endif
547 }
548
549 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
550 {
551 unsigned code = 0;
552
553 if (entity.empty())
554 return 0; // invalid entity reference
555
556 if (entity[0] == wxT('#'))
557 {
558 // NB: parsed value is a number, so it's OK to use wx_str(), internal
559 // representation is the same for numbers
560 const wxStringCharType *ent_s = entity.wx_str();
561 const wxStringCharType *format;
562
563 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
564 {
565 format = wxS("%x");
566 ent_s++;
567 }
568 else
569 format = wxS("%u");
570 ent_s++;
571
572 if (wxSscanf(ent_s, format, &code) != 1)
573 code = 0;
574 }
575 else
576 {
577 // store the literals in wx's internal representation (either char*
578 // in UTF-8 or wchar_t*) for best performance:
579 #define ENTITY(name, code) { wxS(name), code }
580
581 static wxHtmlEntityInfo substitutions[] = {
582 ENTITY("AElig", 198),
583 ENTITY("Aacute", 193),
584 ENTITY("Acirc", 194),
585 ENTITY("Agrave", 192),
586 ENTITY("Alpha", 913),
587 ENTITY("Aring", 197),
588 ENTITY("Atilde", 195),
589 ENTITY("Auml", 196),
590 ENTITY("Beta", 914),
591 ENTITY("Ccedil", 199),
592 ENTITY("Chi", 935),
593 ENTITY("Dagger", 8225),
594 ENTITY("Delta", 916),
595 ENTITY("ETH", 208),
596 ENTITY("Eacute", 201),
597 ENTITY("Ecirc", 202),
598 ENTITY("Egrave", 200),
599 ENTITY("Epsilon", 917),
600 ENTITY("Eta", 919),
601 ENTITY("Euml", 203),
602 ENTITY("Gamma", 915),
603 ENTITY("Iacute", 205),
604 ENTITY("Icirc", 206),
605 ENTITY("Igrave", 204),
606 ENTITY("Iota", 921),
607 ENTITY("Iuml", 207),
608 ENTITY("Kappa", 922),
609 ENTITY("Lambda", 923),
610 ENTITY("Mu", 924),
611 ENTITY("Ntilde", 209),
612 ENTITY("Nu", 925),
613 ENTITY("OElig", 338),
614 ENTITY("Oacute", 211),
615 ENTITY("Ocirc", 212),
616 ENTITY("Ograve", 210),
617 ENTITY("Omega", 937),
618 ENTITY("Omicron", 927),
619 ENTITY("Oslash", 216),
620 ENTITY("Otilde", 213),
621 ENTITY("Ouml", 214),
622 ENTITY("Phi", 934),
623 ENTITY("Pi", 928),
624 ENTITY("Prime", 8243),
625 ENTITY("Psi", 936),
626 ENTITY("Rho", 929),
627 ENTITY("Scaron", 352),
628 ENTITY("Sigma", 931),
629 ENTITY("THORN", 222),
630 ENTITY("Tau", 932),
631 ENTITY("Theta", 920),
632 ENTITY("Uacute", 218),
633 ENTITY("Ucirc", 219),
634 ENTITY("Ugrave", 217),
635 ENTITY("Upsilon", 933),
636 ENTITY("Uuml", 220),
637 ENTITY("Xi", 926),
638 ENTITY("Yacute", 221),
639 ENTITY("Yuml", 376),
640 ENTITY("Zeta", 918),
641 ENTITY("aacute", 225),
642 ENTITY("acirc", 226),
643 ENTITY("acute", 180),
644 ENTITY("aelig", 230),
645 ENTITY("agrave", 224),
646 ENTITY("alefsym", 8501),
647 ENTITY("alpha", 945),
648 ENTITY("amp", 38),
649 ENTITY("and", 8743),
650 ENTITY("ang", 8736),
651 ENTITY("apos", 39),
652 ENTITY("aring", 229),
653 ENTITY("asymp", 8776),
654 ENTITY("atilde", 227),
655 ENTITY("auml", 228),
656 ENTITY("bdquo", 8222),
657 ENTITY("beta", 946),
658 ENTITY("brvbar", 166),
659 ENTITY("bull", 8226),
660 ENTITY("cap", 8745),
661 ENTITY("ccedil", 231),
662 ENTITY("cedil", 184),
663 ENTITY("cent", 162),
664 ENTITY("chi", 967),
665 ENTITY("circ", 710),
666 ENTITY("clubs", 9827),
667 ENTITY("cong", 8773),
668 ENTITY("copy", 169),
669 ENTITY("crarr", 8629),
670 ENTITY("cup", 8746),
671 ENTITY("curren", 164),
672 ENTITY("dArr", 8659),
673 ENTITY("dagger", 8224),
674 ENTITY("darr", 8595),
675 ENTITY("deg", 176),
676 ENTITY("delta", 948),
677 ENTITY("diams", 9830),
678 ENTITY("divide", 247),
679 ENTITY("eacute", 233),
680 ENTITY("ecirc", 234),
681 ENTITY("egrave", 232),
682 ENTITY("empty", 8709),
683 ENTITY("emsp", 8195),
684 ENTITY("ensp", 8194),
685 ENTITY("epsilon", 949),
686 ENTITY("equiv", 8801),
687 ENTITY("eta", 951),
688 ENTITY("eth", 240),
689 ENTITY("euml", 235),
690 ENTITY("euro", 8364),
691 ENTITY("exist", 8707),
692 ENTITY("fnof", 402),
693 ENTITY("forall", 8704),
694 ENTITY("frac12", 189),
695 ENTITY("frac14", 188),
696 ENTITY("frac34", 190),
697 ENTITY("frasl", 8260),
698 ENTITY("gamma", 947),
699 ENTITY("ge", 8805),
700 ENTITY("gt", 62),
701 ENTITY("hArr", 8660),
702 ENTITY("harr", 8596),
703 ENTITY("hearts", 9829),
704 ENTITY("hellip", 8230),
705 ENTITY("iacute", 237),
706 ENTITY("icirc", 238),
707 ENTITY("iexcl", 161),
708 ENTITY("igrave", 236),
709 ENTITY("image", 8465),
710 ENTITY("infin", 8734),
711 ENTITY("int", 8747),
712 ENTITY("iota", 953),
713 ENTITY("iquest", 191),
714 ENTITY("isin", 8712),
715 ENTITY("iuml", 239),
716 ENTITY("kappa", 954),
717 ENTITY("lArr", 8656),
718 ENTITY("lambda", 955),
719 ENTITY("lang", 9001),
720 ENTITY("laquo", 171),
721 ENTITY("larr", 8592),
722 ENTITY("lceil", 8968),
723 ENTITY("ldquo", 8220),
724 ENTITY("le", 8804),
725 ENTITY("lfloor", 8970),
726 ENTITY("lowast", 8727),
727 ENTITY("loz", 9674),
728 ENTITY("lrm", 8206),
729 ENTITY("lsaquo", 8249),
730 ENTITY("lsquo", 8216),
731 ENTITY("lt", 60),
732 ENTITY("macr", 175),
733 ENTITY("mdash", 8212),
734 ENTITY("micro", 181),
735 ENTITY("middot", 183),
736 ENTITY("minus", 8722),
737 ENTITY("mu", 956),
738 ENTITY("nabla", 8711),
739 ENTITY("nbsp", 160),
740 ENTITY("ndash", 8211),
741 ENTITY("ne", 8800),
742 ENTITY("ni", 8715),
743 ENTITY("not", 172),
744 ENTITY("notin", 8713),
745 ENTITY("nsub", 8836),
746 ENTITY("ntilde", 241),
747 ENTITY("nu", 957),
748 ENTITY("oacute", 243),
749 ENTITY("ocirc", 244),
750 ENTITY("oelig", 339),
751 ENTITY("ograve", 242),
752 ENTITY("oline", 8254),
753 ENTITY("omega", 969),
754 ENTITY("omicron", 959),
755 ENTITY("oplus", 8853),
756 ENTITY("or", 8744),
757 ENTITY("ordf", 170),
758 ENTITY("ordm", 186),
759 ENTITY("oslash", 248),
760 ENTITY("otilde", 245),
761 ENTITY("otimes", 8855),
762 ENTITY("ouml", 246),
763 ENTITY("para", 182),
764 ENTITY("part", 8706),
765 ENTITY("permil", 8240),
766 ENTITY("perp", 8869),
767 ENTITY("phi", 966),
768 ENTITY("pi", 960),
769 ENTITY("piv", 982),
770 ENTITY("plusmn", 177),
771 ENTITY("pound", 163),
772 ENTITY("prime", 8242),
773 ENTITY("prod", 8719),
774 ENTITY("prop", 8733),
775 ENTITY("psi", 968),
776 ENTITY("quot", 34),
777 ENTITY("rArr", 8658),
778 ENTITY("radic", 8730),
779 ENTITY("rang", 9002),
780 ENTITY("raquo", 187),
781 ENTITY("rarr", 8594),
782 ENTITY("rceil", 8969),
783 ENTITY("rdquo", 8221),
784 ENTITY("real", 8476),
785 ENTITY("reg", 174),
786 ENTITY("rfloor", 8971),
787 ENTITY("rho", 961),
788 ENTITY("rlm", 8207),
789 ENTITY("rsaquo", 8250),
790 ENTITY("rsquo", 8217),
791 ENTITY("sbquo", 8218),
792 ENTITY("scaron", 353),
793 ENTITY("sdot", 8901),
794 ENTITY("sect", 167),
795 ENTITY("shy", 173),
796 ENTITY("sigma", 963),
797 ENTITY("sigmaf", 962),
798 ENTITY("sim", 8764),
799 ENTITY("spades", 9824),
800 ENTITY("sub", 8834),
801 ENTITY("sube", 8838),
802 ENTITY("sum", 8721),
803 ENTITY("sup", 8835),
804 ENTITY("sup1", 185),
805 ENTITY("sup2", 178),
806 ENTITY("sup3", 179),
807 ENTITY("supe", 8839),
808 ENTITY("szlig", 223),
809 ENTITY("tau", 964),
810 ENTITY("there4", 8756),
811 ENTITY("theta", 952),
812 ENTITY("thetasym", 977),
813 ENTITY("thinsp", 8201),
814 ENTITY("thorn", 254),
815 ENTITY("tilde", 732),
816 ENTITY("times", 215),
817 ENTITY("trade", 8482),
818 ENTITY("uArr", 8657),
819 ENTITY("uacute", 250),
820 ENTITY("uarr", 8593),
821 ENTITY("ucirc", 251),
822 ENTITY("ugrave", 249),
823 ENTITY("uml", 168),
824 ENTITY("upsih", 978),
825 ENTITY("upsilon", 965),
826 ENTITY("uuml", 252),
827 ENTITY("weierp", 8472),
828 ENTITY("xi", 958),
829 ENTITY("yacute", 253),
830 ENTITY("yen", 165),
831 ENTITY("yuml", 255),
832 ENTITY("zeta", 950),
833 ENTITY("zwj", 8205),
834 ENTITY("zwnj", 8204),
835 {NULL, 0}};
836 #undef ENTITY
837 static size_t substitutions_cnt = 0;
838
839 if (substitutions_cnt == 0)
840 while (substitutions[substitutions_cnt].code != 0)
841 substitutions_cnt++;
842
843 wxHtmlEntityInfo *info;
844 #ifdef __WXWINCE__
845 // bsearch crashes under WinCE for some reason
846 info = NULL;
847 size_t i;
848 for (i = 0; i < substitutions_cnt; i++)
849 {
850 if (entity == substitutions[i].name)
851 {
852 info = & substitutions[i];
853 break;
854 }
855 }
856 #else
857 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
858 substitutions_cnt,
859 sizeof(wxHtmlEntityInfo),
860 wxHtmlEntityCompare);
861 #endif
862 if (info)
863 code = info->code;
864 }
865
866 if (code == 0)
867 return 0;
868 else
869 return GetCharForCode(code);
870 }
871
872 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
873 const wxString& url) const
874 {
875 return m_FS ? m_FS->OpenFile(url) : NULL;
876
877 }
878
879
880 //-----------------------------------------------------------------------------
881 // wxHtmlParser::ExtractCharsetInformation
882 //-----------------------------------------------------------------------------
883
884 class wxMetaTagParser : public wxHtmlParser
885 {
886 public:
887 wxMetaTagParser() { }
888
889 wxObject* GetProduct() { return NULL; }
890
891 protected:
892 virtual void AddText(const wxString& WXUNUSED(txt)) {}
893
894 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
895 };
896
897 class wxMetaTagHandler : public wxHtmlTagHandler
898 {
899 public:
900 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
901 wxString GetSupportedTags() { return wxT("META,BODY"); }
902 bool HandleTag(const wxHtmlTag& tag);
903
904 private:
905 wxString *m_retval;
906
907 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
908 };
909
910 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
911 {
912 if (tag.GetName() == wxT("BODY"))
913 {
914 m_Parser->StopParsing();
915 return false;
916 }
917
918 if (tag.HasParam(wxT("HTTP-EQUIV")) &&
919 tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
920 tag.HasParam(wxT("CONTENT")))
921 {
922 wxString content = tag.GetParam(wxT("CONTENT")).Lower();
923 if (content.Left(19) == wxT("text/html; charset="))
924 {
925 *m_retval = content.Mid(19);
926 m_Parser->StopParsing();
927 }
928 }
929 return false;
930 }
931
932
933 /*static*/
934 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
935 {
936 wxString charset;
937 wxMetaTagParser *parser = new wxMetaTagParser();
938 if(parser)
939 {
940 parser->AddTagHandler(new wxMetaTagHandler(&charset));
941 parser->Parse(markup);
942 delete parser;
943 }
944 return charset;
945 }
946
947 /* static */
948 bool
949 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
950 wxString::const_iterator end)
951 {
952 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
953
954 wxString::const_iterator p = start;
955
956 // comments begin with "<!--" in HTML 4.0
957 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
958 {
959 // not a comment at all
960 return false;
961 }
962
963 // skip the start of the comment tag in any case, if we don't find the
964 // closing tag we should ignore broken markup
965 start = p;
966
967 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
968 // comment delimiter and the closing tag character (section 3.2.4 of
969 // http://www.w3.org/TR/html401/)
970 int dashes = 0;
971 while ( ++p < end )
972 {
973 const wxChar c = *p;
974
975 if ( (c == wxT(' ') || c == wxT('\n') ||
976 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
977 {
978 // ignore white space before potential tag end
979 continue;
980 }
981
982 if ( c == wxT('>') && dashes >= 2 )
983 {
984 // found end of comment
985 start = p;
986 break;
987 }
988
989 if ( c == wxT('-') )
990 dashes++;
991 else
992 dashes = 0;
993 }
994
995 return true;
996 }
997
998 #endif // wxUSE_HTML