The rounded corners look really dumb at this size.
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // Copyright: (c) 1999 Vaclav Slavik
6 // Licence: wxWindows licence
7 /////////////////////////////////////////////////////////////////////////////
8
9 #include "wx/wxprec.h"
10
11 #ifdef __BORLANDC__
12 #pragma hdrstop
13 #endif
14
15 #if wxUSE_HTML && wxUSE_STREAMS
16
17 #ifndef WX_PRECOMP
18 #include "wx/dynarray.h"
19 #include "wx/log.h"
20 #include "wx/intl.h"
21 #include "wx/app.h"
22 #include "wx/wxcrtvararg.h"
23 #endif
24
25 #include "wx/tokenzr.h"
26 #include "wx/wfstream.h"
27 #include "wx/url.h"
28 #include "wx/fontmap.h"
29 #include "wx/html/htmldefs.h"
30 #include "wx/html/htmlpars.h"
31 #include "wx/vector.h"
32
33 #ifdef __WXWINCE__
34 #include "wx/msw/wince/missing.h" // for bsearch()
35 #endif
36
37 // DLL options compatibility check:
38 WX_CHECK_BUILD_OPTIONS("wxHTML")
39
40 const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
41
42 //-----------------------------------------------------------------------------
43 // wxHtmlParser helpers
44 //-----------------------------------------------------------------------------
45
46 class wxHtmlTextPiece
47 {
48 public:
49 wxHtmlTextPiece() {}
50 wxHtmlTextPiece(const wxString::const_iterator& start,
51 const wxString::const_iterator& end)
52 : m_start(start), m_end(end) {}
53 wxString::const_iterator m_start, m_end;
54 };
55
56 // NB: this is an empty class and not typedef because of forward declaration
57 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
58 {
59 };
60
61 class wxHtmlParserState
62 {
63 public:
64 wxHtmlTag *m_curTag;
65 wxHtmlTag *m_tags;
66 wxHtmlTextPieces *m_textPieces;
67 int m_curTextPiece;
68 const wxString *m_source;
69 wxHtmlParserState *m_nextState;
70 };
71
72 //-----------------------------------------------------------------------------
73 // wxHtmlParser
74 //-----------------------------------------------------------------------------
75
76 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
77
78 wxHtmlParser::wxHtmlParser()
79 : wxObject(),
80 m_FS(NULL)
81 {
82 m_Source = NULL;
83 m_entitiesParser = new wxHtmlEntitiesParser;
84 m_Tags = NULL;
85 m_CurTag = NULL;
86 m_TextPieces = NULL;
87 m_CurTextPiece = 0;
88 m_SavedStates = NULL;
89 }
90
91 wxHtmlParser::~wxHtmlParser()
92 {
93 while (RestoreState()) {}
94 DestroyDOMTree();
95
96 WX_CLEAR_ARRAY(m_HandlersStack);
97 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
98 delete m_entitiesParser;
99 delete m_Source;
100 }
101
102 wxObject* wxHtmlParser::Parse(const wxString& source)
103 {
104 InitParser(source);
105 DoParsing();
106 wxObject *result = GetProduct();
107 DoneParser();
108 return result;
109 }
110
111 void wxHtmlParser::InitParser(const wxString& source)
112 {
113 SetSource(source);
114 m_stopParsing = false;
115 }
116
117 void wxHtmlParser::DoneParser()
118 {
119 DestroyDOMTree();
120 }
121
122 void wxHtmlParser::SetSource(const wxString& src)
123 {
124 DestroyDOMTree();
125 // NB: This is allocated on heap because wxHtmlTag uses iterators and
126 // making a copy of m_Source string in SetSourceAndSaveState() and
127 // RestoreState() would invalidate them (because wxString::m_impl's
128 // memory would change completely twice and iterators use pointers
129 // into it). So instead, we keep the string object intact and only
130 // store/restore pointer to it, for which we need it to be allocated
131 // on the heap.
132 delete m_Source;
133 m_Source = new wxString(src);
134 CreateDOMTree();
135 m_CurTag = NULL;
136 m_CurTextPiece = 0;
137 }
138
139 void wxHtmlParser::CreateDOMTree()
140 {
141 wxHtmlTagsCache cache(*m_Source);
142 m_TextPieces = new wxHtmlTextPieces;
143 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
144 m_CurTextPiece = 0;
145 }
146
147 extern bool wxIsCDATAElement(const wxString& tag);
148
149 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
150 const wxString::const_iterator& begin_pos,
151 const wxString::const_iterator& end_pos,
152 wxHtmlTagsCache *cache)
153 {
154 if (end_pos <= begin_pos)
155 return;
156
157 wxChar c;
158 wxString::const_iterator i = begin_pos;
159 wxString::const_iterator textBeginning = begin_pos;
160
161 // If the tag contains CDATA text, we include the text between beginning
162 // and ending tag verbosely. Setting i=end_pos will skip to the very
163 // end of this function where text piece is added, bypassing any child
164 // tags parsing (CDATA element can't have child elements by definition):
165 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
166 {
167 i = end_pos;
168 }
169
170 while (i < end_pos)
171 {
172 c = *i;
173
174 if (c == wxT('<'))
175 {
176 // add text to m_TextPieces:
177 if (i > textBeginning)
178 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
179
180 // if it is a comment, skip it:
181 if ( SkipCommentTag(i, m_Source->end()) )
182 {
183 textBeginning = i = i + 1; // skip closing '>' too
184 }
185
186 // add another tag to the tree:
187 else if (i < end_pos-1 && *(i+1) != wxT('/'))
188 {
189 wxHtmlTag *chd;
190 if (cur)
191 chd = new wxHtmlTag(cur, m_Source,
192 i, end_pos, cache, m_entitiesParser);
193 else
194 {
195 chd = new wxHtmlTag(NULL, m_Source,
196 i, end_pos, cache, m_entitiesParser);
197 if (!m_Tags)
198 {
199 // if this is the first tag to be created make the root
200 // m_Tags point to it:
201 m_Tags = chd;
202 }
203 else
204 {
205 // if there is already a root tag add this tag as
206 // the last sibling:
207 chd->m_Prev = m_Tags->GetLastSibling();
208 chd->m_Prev->m_Next = chd;
209 }
210 }
211
212 if (chd->HasEnding())
213 {
214 CreateDOMSubTree(chd,
215 chd->GetBeginIter(), chd->GetEndIter1(),
216 cache);
217 i = chd->GetEndIter2();
218 }
219 else
220 i = chd->GetBeginIter();
221
222 textBeginning = i;
223 }
224
225 // ... or skip ending tag:
226 else
227 {
228 while (i < end_pos && *i != wxT('>')) ++i;
229 textBeginning = i < end_pos ? i+1 : i;
230 }
231 }
232 else ++i;
233 }
234
235 // add remaining text to m_TextPieces:
236 if (end_pos > textBeginning)
237 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
238 }
239
240 void wxHtmlParser::DestroyDOMTree()
241 {
242 wxHtmlTag *t1, *t2;
243 t1 = m_Tags;
244 while (t1)
245 {
246 t2 = t1->GetNextSibling();
247 delete t1;
248 t1 = t2;
249 }
250 m_Tags = m_CurTag = NULL;
251
252 wxDELETE(m_TextPieces);
253 }
254
255 void wxHtmlParser::DoParsing()
256 {
257 m_CurTag = m_Tags;
258 m_CurTextPiece = 0;
259 DoParsing(m_Source->begin(), m_Source->end());
260 }
261
262 void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
263 const wxString::const_iterator& end_pos)
264 {
265 wxString::const_iterator begin_pos(begin_pos_);
266
267 if (end_pos <= begin_pos)
268 return;
269
270 wxHtmlTextPieces& pieces = *m_TextPieces;
271 size_t piecesCnt = pieces.size();
272
273 while (begin_pos < end_pos)
274 {
275 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
276 m_CurTag = m_CurTag->GetNextTag();
277 while (m_CurTextPiece < piecesCnt &&
278 pieces[m_CurTextPiece].m_start < begin_pos)
279 m_CurTextPiece++;
280
281 if (m_CurTextPiece < piecesCnt &&
282 (!m_CurTag ||
283 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
284 {
285 // Add text:
286 AddText(GetEntitiesParser()->Parse(
287 wxString(pieces[m_CurTextPiece].m_start,
288 pieces[m_CurTextPiece].m_end)));
289 begin_pos = pieces[m_CurTextPiece].m_end;
290 m_CurTextPiece++;
291 }
292 else if (m_CurTag)
293 {
294 if (m_CurTag->HasEnding())
295 begin_pos = m_CurTag->GetEndIter2();
296 else
297 begin_pos = m_CurTag->GetBeginIter();
298 wxHtmlTag *t = m_CurTag;
299 m_CurTag = m_CurTag->GetNextTag();
300 AddTag(*t);
301 if (m_stopParsing)
302 return;
303 }
304 else break;
305 }
306 }
307
308 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
309 {
310 bool inner = false;
311
312 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
313 if (h != m_HandlersHash.end())
314 {
315 inner = h->second->HandleTag(tag);
316 if (m_stopParsing)
317 return;
318 }
319 if (!inner)
320 {
321 if (tag.HasEnding())
322 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
323 }
324 }
325
326 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
327 {
328 wxString s(handler->GetSupportedTags());
329 wxStringTokenizer tokenizer(s, wxT(", "));
330
331 while (tokenizer.HasMoreTokens())
332 m_HandlersHash[tokenizer.GetNextToken()] = handler;
333
334 m_HandlersSet.insert(handler);
335
336 handler->SetParser(this);
337 }
338
339 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
340 {
341 wxStringTokenizer tokenizer(tags, wxT(", "));
342 wxString key;
343
344 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
345
346 while (tokenizer.HasMoreTokens())
347 {
348 key = tokenizer.GetNextToken();
349 m_HandlersHash[key] = handler;
350 }
351 }
352
353 void wxHtmlParser::PopTagHandler()
354 {
355 wxCHECK_RET( !m_HandlersStack.empty(),
356 "attempt to remove HTML tag handler from empty stack" );
357
358 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
359 m_HandlersStack.pop_back();
360 m_HandlersHash = *prev;
361 delete prev;
362 }
363
364 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
365 {
366 wxHtmlParserState *s = new wxHtmlParserState;
367
368 s->m_curTag = m_CurTag;
369 s->m_tags = m_Tags;
370 s->m_textPieces = m_TextPieces;
371 s->m_curTextPiece = m_CurTextPiece;
372 s->m_source = m_Source;
373
374 s->m_nextState = m_SavedStates;
375 m_SavedStates = s;
376
377 m_CurTag = NULL;
378 m_Tags = NULL;
379 m_TextPieces = NULL;
380 m_CurTextPiece = 0;
381 m_Source = NULL;
382
383 SetSource(src);
384 }
385
386 bool wxHtmlParser::RestoreState()
387 {
388 if (!m_SavedStates) return false;
389
390 DestroyDOMTree();
391 delete m_Source;
392
393 wxHtmlParserState *s = m_SavedStates;
394 m_SavedStates = s->m_nextState;
395
396 m_CurTag = s->m_curTag;
397 m_Tags = s->m_tags;
398 m_TextPieces = s->m_textPieces;
399 m_CurTextPiece = s->m_curTextPiece;
400 m_Source = s->m_source;
401
402 delete s;
403 return true;
404 }
405
406 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
407 {
408 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
409 }
410
411 //-----------------------------------------------------------------------------
412 // wxHtmlTagHandler
413 //-----------------------------------------------------------------------------
414
415 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
416
417 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
418 {
419 // It is safe to temporarily change the source being parsed,
420 // provided we restore the state back after parsing
421 m_Parser->SetSourceAndSaveState(source);
422 m_Parser->DoParsing();
423 m_Parser->RestoreState();
424 }
425
426
427 //-----------------------------------------------------------------------------
428 // wxHtmlEntitiesParser
429 //-----------------------------------------------------------------------------
430
431 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
432
433 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
434 #if !wxUSE_UNICODE
435 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
436 #endif
437 {
438 }
439
440 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
441 {
442 #if !wxUSE_UNICODE
443 delete m_conv;
444 #endif
445 }
446
447 #if !wxUSE_UNICODE
448 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
449 {
450 if (encoding == m_encoding)
451 return;
452
453 delete m_conv;
454
455 m_encoding = encoding;
456 if (m_encoding == wxFONTENCODING_SYSTEM)
457 m_conv = NULL;
458 else
459 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
460 }
461 #endif // !wxUSE_UNICODE
462
463 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
464 {
465 wxString output;
466
467 const wxString::const_iterator end(input.end());
468 wxString::const_iterator c(input.begin());
469 wxString::const_iterator last(c);
470
471 for ( ; c < end; ++c )
472 {
473 if (*c == wxT('&'))
474 {
475 if ( output.empty() )
476 output.reserve(input.length());
477
478 if (c - last > 0)
479 output.append(last, c);
480 if ( ++c == end )
481 break;
482
483 wxString entity;
484 const wxString::const_iterator ent_s = c;
485 wxChar entity_char;
486
487 for ( ; c != end; ++c )
488 {
489 wxChar ch = *c;
490 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
491 (ch >= wxT('A') && ch <= wxT('Z')) ||
492 (ch >= wxT('0') && ch <= wxT('9')) ||
493 ch == wxT('_') || ch == wxT('#')) )
494 break;
495 }
496
497 entity.append(ent_s, c);
498 if (c == end || *c != wxT(';')) --c;
499 last = c+1;
500 entity_char = GetEntityChar(entity);
501 if (entity_char)
502 output << entity_char;
503 else
504 {
505 output.append(ent_s-1, c+1);
506 wxLogTrace(wxTRACE_HTML_DEBUG,
507 "Unrecognized HTML entity: '%s'",
508 entity);
509 }
510 }
511 }
512 if ( last == input.begin() ) // common case: no entity
513 return input;
514 if ( last != end )
515 output.append(last, end);
516 return output;
517 }
518
519 #if !wxUSE_UNICODE
520 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
521 {
522 char buf[2];
523 wchar_t wbuf[2];
524 wbuf[0] = (wchar_t)code;
525 wbuf[1] = 0;
526 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
527 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
528 return '?';
529 return buf[0];
530 }
531 #endif
532
533 struct wxHtmlEntityInfo
534 {
535 const wxStringCharType *name;
536 unsigned code;
537 };
538
539 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
540 {
541 #if wxUSE_UNICODE_UTF8
542 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
543 #else
544 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
545 #endif
546 }
547
548 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
549 {
550 unsigned code = 0;
551
552 if (entity.empty())
553 return 0; // invalid entity reference
554
555 if (entity[0] == wxT('#'))
556 {
557 // NB: parsed value is a number, so it's OK to use wx_str(), internal
558 // representation is the same for numbers
559 const wxStringCharType *ent_s = entity.wx_str();
560 const wxStringCharType *format;
561
562 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
563 {
564 format = wxS("%x");
565 ent_s++;
566 }
567 else
568 format = wxS("%u");
569 ent_s++;
570
571 if (wxSscanf(ent_s, format, &code) != 1)
572 code = 0;
573 }
574 else
575 {
576 // store the literals in wx's internal representation (either char*
577 // in UTF-8 or wchar_t*) for best performance:
578 #define ENTITY(name, code) { wxS(name), code }
579
580 static wxHtmlEntityInfo substitutions[] = {
581 ENTITY("AElig", 198),
582 ENTITY("Aacute", 193),
583 ENTITY("Acirc", 194),
584 ENTITY("Agrave", 192),
585 ENTITY("Alpha", 913),
586 ENTITY("Aring", 197),
587 ENTITY("Atilde", 195),
588 ENTITY("Auml", 196),
589 ENTITY("Beta", 914),
590 ENTITY("Ccedil", 199),
591 ENTITY("Chi", 935),
592 ENTITY("Dagger", 8225),
593 ENTITY("Delta", 916),
594 ENTITY("ETH", 208),
595 ENTITY("Eacute", 201),
596 ENTITY("Ecirc", 202),
597 ENTITY("Egrave", 200),
598 ENTITY("Epsilon", 917),
599 ENTITY("Eta", 919),
600 ENTITY("Euml", 203),
601 ENTITY("Gamma", 915),
602 ENTITY("Iacute", 205),
603 ENTITY("Icirc", 206),
604 ENTITY("Igrave", 204),
605 ENTITY("Iota", 921),
606 ENTITY("Iuml", 207),
607 ENTITY("Kappa", 922),
608 ENTITY("Lambda", 923),
609 ENTITY("Mu", 924),
610 ENTITY("Ntilde", 209),
611 ENTITY("Nu", 925),
612 ENTITY("OElig", 338),
613 ENTITY("Oacute", 211),
614 ENTITY("Ocirc", 212),
615 ENTITY("Ograve", 210),
616 ENTITY("Omega", 937),
617 ENTITY("Omicron", 927),
618 ENTITY("Oslash", 216),
619 ENTITY("Otilde", 213),
620 ENTITY("Ouml", 214),
621 ENTITY("Phi", 934),
622 ENTITY("Pi", 928),
623 ENTITY("Prime", 8243),
624 ENTITY("Psi", 936),
625 ENTITY("Rho", 929),
626 ENTITY("Scaron", 352),
627 ENTITY("Sigma", 931),
628 ENTITY("THORN", 222),
629 ENTITY("Tau", 932),
630 ENTITY("Theta", 920),
631 ENTITY("Uacute", 218),
632 ENTITY("Ucirc", 219),
633 ENTITY("Ugrave", 217),
634 ENTITY("Upsilon", 933),
635 ENTITY("Uuml", 220),
636 ENTITY("Xi", 926),
637 ENTITY("Yacute", 221),
638 ENTITY("Yuml", 376),
639 ENTITY("Zeta", 918),
640 ENTITY("aacute", 225),
641 ENTITY("acirc", 226),
642 ENTITY("acute", 180),
643 ENTITY("aelig", 230),
644 ENTITY("agrave", 224),
645 ENTITY("alefsym", 8501),
646 ENTITY("alpha", 945),
647 ENTITY("amp", 38),
648 ENTITY("and", 8743),
649 ENTITY("ang", 8736),
650 ENTITY("apos", 39),
651 ENTITY("aring", 229),
652 ENTITY("asymp", 8776),
653 ENTITY("atilde", 227),
654 ENTITY("auml", 228),
655 ENTITY("bdquo", 8222),
656 ENTITY("beta", 946),
657 ENTITY("brvbar", 166),
658 ENTITY("bull", 8226),
659 ENTITY("cap", 8745),
660 ENTITY("ccedil", 231),
661 ENTITY("cedil", 184),
662 ENTITY("cent", 162),
663 ENTITY("chi", 967),
664 ENTITY("circ", 710),
665 ENTITY("clubs", 9827),
666 ENTITY("cong", 8773),
667 ENTITY("copy", 169),
668 ENTITY("crarr", 8629),
669 ENTITY("cup", 8746),
670 ENTITY("curren", 164),
671 ENTITY("dArr", 8659),
672 ENTITY("dagger", 8224),
673 ENTITY("darr", 8595),
674 ENTITY("deg", 176),
675 ENTITY("delta", 948),
676 ENTITY("diams", 9830),
677 ENTITY("divide", 247),
678 ENTITY("eacute", 233),
679 ENTITY("ecirc", 234),
680 ENTITY("egrave", 232),
681 ENTITY("empty", 8709),
682 ENTITY("emsp", 8195),
683 ENTITY("ensp", 8194),
684 ENTITY("epsilon", 949),
685 ENTITY("equiv", 8801),
686 ENTITY("eta", 951),
687 ENTITY("eth", 240),
688 ENTITY("euml", 235),
689 ENTITY("euro", 8364),
690 ENTITY("exist", 8707),
691 ENTITY("fnof", 402),
692 ENTITY("forall", 8704),
693 ENTITY("frac12", 189),
694 ENTITY("frac14", 188),
695 ENTITY("frac34", 190),
696 ENTITY("frasl", 8260),
697 ENTITY("gamma", 947),
698 ENTITY("ge", 8805),
699 ENTITY("gt", 62),
700 ENTITY("hArr", 8660),
701 ENTITY("harr", 8596),
702 ENTITY("hearts", 9829),
703 ENTITY("hellip", 8230),
704 ENTITY("iacute", 237),
705 ENTITY("icirc", 238),
706 ENTITY("iexcl", 161),
707 ENTITY("igrave", 236),
708 ENTITY("image", 8465),
709 ENTITY("infin", 8734),
710 ENTITY("int", 8747),
711 ENTITY("iota", 953),
712 ENTITY("iquest", 191),
713 ENTITY("isin", 8712),
714 ENTITY("iuml", 239),
715 ENTITY("kappa", 954),
716 ENTITY("lArr", 8656),
717 ENTITY("lambda", 955),
718 ENTITY("lang", 9001),
719 ENTITY("laquo", 171),
720 ENTITY("larr", 8592),
721 ENTITY("lceil", 8968),
722 ENTITY("ldquo", 8220),
723 ENTITY("le", 8804),
724 ENTITY("lfloor", 8970),
725 ENTITY("lowast", 8727),
726 ENTITY("loz", 9674),
727 ENTITY("lrm", 8206),
728 ENTITY("lsaquo", 8249),
729 ENTITY("lsquo", 8216),
730 ENTITY("lt", 60),
731 ENTITY("macr", 175),
732 ENTITY("mdash", 8212),
733 ENTITY("micro", 181),
734 ENTITY("middot", 183),
735 ENTITY("minus", 8722),
736 ENTITY("mu", 956),
737 ENTITY("nabla", 8711),
738 ENTITY("nbsp", 160),
739 ENTITY("ndash", 8211),
740 ENTITY("ne", 8800),
741 ENTITY("ni", 8715),
742 ENTITY("not", 172),
743 ENTITY("notin", 8713),
744 ENTITY("nsub", 8836),
745 ENTITY("ntilde", 241),
746 ENTITY("nu", 957),
747 ENTITY("oacute", 243),
748 ENTITY("ocirc", 244),
749 ENTITY("oelig", 339),
750 ENTITY("ograve", 242),
751 ENTITY("oline", 8254),
752 ENTITY("omega", 969),
753 ENTITY("omicron", 959),
754 ENTITY("oplus", 8853),
755 ENTITY("or", 8744),
756 ENTITY("ordf", 170),
757 ENTITY("ordm", 186),
758 ENTITY("oslash", 248),
759 ENTITY("otilde", 245),
760 ENTITY("otimes", 8855),
761 ENTITY("ouml", 246),
762 ENTITY("para", 182),
763 ENTITY("part", 8706),
764 ENTITY("permil", 8240),
765 ENTITY("perp", 8869),
766 ENTITY("phi", 966),
767 ENTITY("pi", 960),
768 ENTITY("piv", 982),
769 ENTITY("plusmn", 177),
770 ENTITY("pound", 163),
771 ENTITY("prime", 8242),
772 ENTITY("prod", 8719),
773 ENTITY("prop", 8733),
774 ENTITY("psi", 968),
775 ENTITY("quot", 34),
776 ENTITY("rArr", 8658),
777 ENTITY("radic", 8730),
778 ENTITY("rang", 9002),
779 ENTITY("raquo", 187),
780 ENTITY("rarr", 8594),
781 ENTITY("rceil", 8969),
782 ENTITY("rdquo", 8221),
783 ENTITY("real", 8476),
784 ENTITY("reg", 174),
785 ENTITY("rfloor", 8971),
786 ENTITY("rho", 961),
787 ENTITY("rlm", 8207),
788 ENTITY("rsaquo", 8250),
789 ENTITY("rsquo", 8217),
790 ENTITY("sbquo", 8218),
791 ENTITY("scaron", 353),
792 ENTITY("sdot", 8901),
793 ENTITY("sect", 167),
794 ENTITY("shy", 173),
795 ENTITY("sigma", 963),
796 ENTITY("sigmaf", 962),
797 ENTITY("sim", 8764),
798 ENTITY("spades", 9824),
799 ENTITY("sub", 8834),
800 ENTITY("sube", 8838),
801 ENTITY("sum", 8721),
802 ENTITY("sup", 8835),
803 ENTITY("sup1", 185),
804 ENTITY("sup2", 178),
805 ENTITY("sup3", 179),
806 ENTITY("supe", 8839),
807 ENTITY("szlig", 223),
808 ENTITY("tau", 964),
809 ENTITY("there4", 8756),
810 ENTITY("theta", 952),
811 ENTITY("thetasym", 977),
812 ENTITY("thinsp", 8201),
813 ENTITY("thorn", 254),
814 ENTITY("tilde", 732),
815 ENTITY("times", 215),
816 ENTITY("trade", 8482),
817 ENTITY("uArr", 8657),
818 ENTITY("uacute", 250),
819 ENTITY("uarr", 8593),
820 ENTITY("ucirc", 251),
821 ENTITY("ugrave", 249),
822 ENTITY("uml", 168),
823 ENTITY("upsih", 978),
824 ENTITY("upsilon", 965),
825 ENTITY("uuml", 252),
826 ENTITY("weierp", 8472),
827 ENTITY("xi", 958),
828 ENTITY("yacute", 253),
829 ENTITY("yen", 165),
830 ENTITY("yuml", 255),
831 ENTITY("zeta", 950),
832 ENTITY("zwj", 8205),
833 ENTITY("zwnj", 8204),
834 {NULL, 0}};
835 #undef ENTITY
836 static size_t substitutions_cnt = 0;
837
838 if (substitutions_cnt == 0)
839 while (substitutions[substitutions_cnt].code != 0)
840 substitutions_cnt++;
841
842 wxHtmlEntityInfo *info;
843 #ifdef __WXWINCE__
844 // bsearch crashes under WinCE for some reason
845 info = NULL;
846 size_t i;
847 for (i = 0; i < substitutions_cnt; i++)
848 {
849 if (entity == substitutions[i].name)
850 {
851 info = & substitutions[i];
852 break;
853 }
854 }
855 #else
856 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
857 substitutions_cnt,
858 sizeof(wxHtmlEntityInfo),
859 wxHtmlEntityCompare);
860 #endif
861 if (info)
862 code = info->code;
863 }
864
865 if (code == 0)
866 return 0;
867 else
868 return GetCharForCode(code);
869 }
870
871 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType type,
872 const wxString& url) const
873 {
874 int flags = wxFS_READ;
875 if (type == wxHTML_URL_IMAGE)
876 flags |= wxFS_SEEKABLE;
877
878 return m_FS ? m_FS->OpenFile(url, flags) : NULL;
879
880 }
881
882
883 //-----------------------------------------------------------------------------
884 // wxHtmlParser::ExtractCharsetInformation
885 //-----------------------------------------------------------------------------
886
887 class wxMetaTagParser : public wxHtmlParser
888 {
889 public:
890 wxMetaTagParser() { }
891
892 wxObject* GetProduct() { return NULL; }
893
894 protected:
895 virtual void AddText(const wxString& WXUNUSED(txt)) {}
896
897 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
898 };
899
900 class wxMetaTagHandler : public wxHtmlTagHandler
901 {
902 public:
903 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
904 wxString GetSupportedTags() { return wxT("META,BODY"); }
905 bool HandleTag(const wxHtmlTag& tag);
906
907 private:
908 wxString *m_retval;
909
910 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
911 };
912
913 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
914 {
915 if (tag.GetName() == wxT("BODY"))
916 {
917 m_Parser->StopParsing();
918 return false;
919 }
920
921 wxString httpEquiv,
922 content;
923 if (tag.GetParamAsString(wxT("HTTP-EQUIV"), &httpEquiv) &&
924 httpEquiv.IsSameAs(wxT("Content-Type"), false) &&
925 tag.GetParamAsString(wxT("CONTENT"), &content))
926 {
927 content.MakeLower();
928 if (content.Left(19) == wxT("text/html; charset="))
929 {
930 *m_retval = content.Mid(19);
931 m_Parser->StopParsing();
932 }
933 }
934 return false;
935 }
936
937
938 /*static*/
939 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
940 {
941 wxString charset;
942 wxMetaTagParser *parser = new wxMetaTagParser();
943 if(parser)
944 {
945 parser->AddTagHandler(new wxMetaTagHandler(&charset));
946 parser->Parse(markup);
947 delete parser;
948 }
949 return charset;
950 }
951
952 /* static */
953 bool
954 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
955 wxString::const_iterator end)
956 {
957 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
958
959 wxString::const_iterator p = start;
960
961 // Comments begin with "<!--" in HTML 4.0; anything shorter or not containing
962 // these characters is not a comment and we're not going to skip it.
963 if ( ++p == end || *p != '!' )
964 return false;
965 if ( ++p == end || *p != '-' )
966 return false;
967 if ( ++p == end || *p != '-' )
968 return false;
969
970 // skip the start of the comment tag in any case, if we don't find the
971 // closing tag we should ignore broken markup
972 start = p;
973
974 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
975 // comment delimiter and the closing tag character (section 3.2.4 of
976 // http://www.w3.org/TR/html401/)
977 int dashes = 0;
978 while ( ++p < end )
979 {
980 const wxChar c = *p;
981
982 if ( (c == wxT(' ') || c == wxT('\n') ||
983 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
984 {
985 // ignore white space before potential tag end
986 continue;
987 }
988
989 if ( c == wxT('>') && dashes >= 2 )
990 {
991 // found end of comment
992 start = p;
993 break;
994 }
995
996 if ( c == wxT('-') )
997 dashes++;
998 else
999 dashes = 0;
1000 }
1001
1002 return true;
1003 }
1004
1005 #endif // wxUSE_HTML