]> git.saurik.com Git - wxWidgets.git/blob - src/html/htmlpars.cpp
use wxVector<T> instead of homegrown growing array in wxHtmlTagsCache
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/arrimpl.cpp"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
// A run of plain text inside the parsed source, recorded as an offset/length
// pair into the source string instead of a copy of the characters.
class wxHtmlTextPiece
{
public:
    // pos: index of the first character of the run in the source
    // lng: number of characters in the run
    wxHtmlTextPiece(int pos, int lng)
        : m_pos(pos),
          m_lng(lng)
    {
    }

    int m_pos;  // start offset of the text
    int m_lng;  // length of the text
};
53
54 WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
55 WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
56
57 class wxHtmlParserState
58 {
59 public:
60 wxHtmlTag *m_curTag;
61 wxHtmlTag *m_tags;
62 wxHtmlTextPieces *m_textPieces;
63 int m_curTextPiece;
64 wxString m_source;
65 wxHtmlParserState *m_nextState;
66 };
67
68 //-----------------------------------------------------------------------------
69 // wxHtmlParser
70 //-----------------------------------------------------------------------------
71
72 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
73
74 wxHtmlParser::wxHtmlParser()
75 : wxObject(), m_HandlersHash(wxKEY_STRING),
76 m_FS(NULL), m_HandlersStack(NULL)
77 {
78 m_entitiesParser = new wxHtmlEntitiesParser;
79 m_Tags = NULL;
80 m_CurTag = NULL;
81 m_TextPieces = NULL;
82 m_CurTextPiece = 0;
83 m_SavedStates = NULL;
84 }
85
86 wxHtmlParser::~wxHtmlParser()
87 {
88 while (RestoreState()) {}
89 DestroyDOMTree();
90
91 if (m_HandlersStack)
92 {
93 wxList& tmp = *m_HandlersStack;
94 wxList::iterator it, en;
95 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
96 delete (wxHashTable*)*it;
97 tmp.clear();
98 }
99 delete m_HandlersStack;
100 m_HandlersHash.Clear();
101 WX_CLEAR_LIST(wxList, m_HandlersList);
102 delete m_entitiesParser;
103 }
104
105 wxObject* wxHtmlParser::Parse(const wxString& source)
106 {
107 InitParser(source);
108 DoParsing();
109 wxObject *result = GetProduct();
110 DoneParser();
111 return result;
112 }
113
114 void wxHtmlParser::InitParser(const wxString& source)
115 {
116 SetSource(source);
117 m_stopParsing = false;
118 }
119
120 void wxHtmlParser::DoneParser()
121 {
122 DestroyDOMTree();
123 }
124
125 void wxHtmlParser::SetSource(const wxString& src)
126 {
127 DestroyDOMTree();
128 m_Source = src;
129 CreateDOMTree();
130 m_CurTag = NULL;
131 m_CurTextPiece = 0;
132 }
133
134 void wxHtmlParser::CreateDOMTree()
135 {
136 wxHtmlTagsCache cache(m_Source);
137 m_TextPieces = new wxHtmlTextPieces;
138 CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
139 m_CurTextPiece = 0;
140 }
141
142 extern bool wxIsCDATAElement(const wxChar *tag);
143
144 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
145 int begin_pos, int end_pos,
146 wxHtmlTagsCache *cache)
147 {
148 if (end_pos <= begin_pos) return;
149
150 wxChar c;
151 int i = begin_pos;
152 int textBeginning = begin_pos;
153
154 // If the tag contains CDATA text, we include the text between beginning
155 // and ending tag verbosely. Setting i=end_pos will skip to the very
156 // end of this function where text piece is added, bypassing any child
157 // tags parsing (CDATA element can't have child elements by definition):
158 if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
159 {
160 i = end_pos;
161 }
162
163 while (i < end_pos)
164 {
165 c = m_Source.GetChar(i);
166
167 if (c == wxT('<'))
168 {
169 // add text to m_TextPieces:
170 if (i - textBeginning > 0)
171 m_TextPieces->Add(
172 wxHtmlTextPiece(textBeginning, i - textBeginning));
173
174 // if it is a comment, skip it:
175 wxString::const_iterator iter = m_Source.begin() + i;
176 if ( SkipCommentTag(iter, m_Source.end()) )
177 {
178 textBeginning =
179 i = iter - m_Source.begin() + 1; // skip closing '>' too
180 }
181
182 // add another tag to the tree:
183 else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
184 {
185 wxHtmlTag *chd;
186 if (cur)
187 chd = new wxHtmlTag(cur, m_Source,
188 i, end_pos, cache, m_entitiesParser);
189 else
190 {
191 chd = new wxHtmlTag(NULL, m_Source,
192 i, end_pos, cache, m_entitiesParser);
193 if (!m_Tags)
194 {
195 // if this is the first tag to be created make the root
196 // m_Tags point to it:
197 m_Tags = chd;
198 }
199 else
200 {
201 // if there is already a root tag add this tag as
202 // the last sibling:
203 chd->m_Prev = m_Tags->GetLastSibling();
204 chd->m_Prev->m_Next = chd;
205 }
206 }
207
208 if (chd->HasEnding())
209 {
210 CreateDOMSubTree(chd,
211 chd->GetBeginPos(), chd->GetEndPos1(),
212 cache);
213 i = chd->GetEndPos2();
214 }
215 else
216 i = chd->GetBeginPos();
217
218 textBeginning = i;
219 }
220
221 // ... or skip ending tag:
222 else
223 {
224 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
225 textBeginning = i+1;
226 }
227 }
228 else i++;
229 }
230
231 // add remaining text to m_TextPieces:
232 if (end_pos - textBeginning > 0)
233 m_TextPieces->Add(
234 wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
235 }
236
237 void wxHtmlParser::DestroyDOMTree()
238 {
239 wxHtmlTag *t1, *t2;
240 t1 = m_Tags;
241 while (t1)
242 {
243 t2 = t1->GetNextSibling();
244 delete t1;
245 t1 = t2;
246 }
247 m_Tags = m_CurTag = NULL;
248
249 delete m_TextPieces;
250 m_TextPieces = NULL;
251 }
252
253 void wxHtmlParser::DoParsing()
254 {
255 m_CurTag = m_Tags;
256 m_CurTextPiece = 0;
257 DoParsing(0, m_Source.length());
258 }
259
260 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
261 {
262 if (end_pos <= begin_pos) return;
263
264 wxHtmlTextPieces& pieces = *m_TextPieces;
265 size_t piecesCnt = pieces.GetCount();
266
267 while (begin_pos < end_pos)
268 {
269 while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
270 m_CurTag = m_CurTag->GetNextTag();
271 while (m_CurTextPiece < piecesCnt &&
272 pieces[m_CurTextPiece].m_pos < begin_pos)
273 m_CurTextPiece++;
274
275 if (m_CurTextPiece < piecesCnt &&
276 (!m_CurTag ||
277 pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
278 {
279 // Add text:
280 AddText(GetEntitiesParser()->Parse(
281 m_Source.Mid(pieces[m_CurTextPiece].m_pos,
282 pieces[m_CurTextPiece].m_lng)));
283 begin_pos = pieces[m_CurTextPiece].m_pos +
284 pieces[m_CurTextPiece].m_lng;
285 m_CurTextPiece++;
286 }
287 else if (m_CurTag)
288 {
289 if (m_CurTag->HasEnding())
290 begin_pos = m_CurTag->GetEndPos2();
291 else
292 begin_pos = m_CurTag->GetBeginPos();
293 wxHtmlTag *t = m_CurTag;
294 m_CurTag = m_CurTag->GetNextTag();
295 AddTag(*t);
296 if (m_stopParsing)
297 return;
298 }
299 else break;
300 }
301 }
302
303 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
304 {
305 wxHtmlTagHandler *h;
306 bool inner = false;
307
308 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
309 if (h)
310 {
311 inner = h->HandleTag(tag);
312 if (m_stopParsing)
313 return;
314 }
315 if (!inner)
316 {
317 if (tag.HasEnding())
318 DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
319 }
320 }
321
322 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
323 {
324 wxString s(handler->GetSupportedTags());
325 wxStringTokenizer tokenizer(s, wxT(", "));
326
327 while (tokenizer.HasMoreTokens())
328 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
329
330 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
331 m_HandlersList.Append(handler);
332
333 handler->SetParser(this);
334 }
335
336 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
337 {
338 wxStringTokenizer tokenizer(tags, wxT(", "));
339 wxString key;
340
341 if (m_HandlersStack == NULL)
342 {
343 m_HandlersStack = new wxList;
344 }
345
346 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
347
348 while (tokenizer.HasMoreTokens())
349 {
350 key = tokenizer.GetNextToken();
351 m_HandlersHash.Delete(key);
352 m_HandlersHash.Put(key, handler);
353 }
354 }
355
356 void wxHtmlParser::PopTagHandler()
357 {
358 wxList::compatibility_iterator first;
359
360 if ( !m_HandlersStack ||
361 #if wxUSE_STL
362 !(first = m_HandlersStack->GetFirst())
363 #else // !wxUSE_STL
364 ((first = m_HandlersStack->GetFirst()) == NULL)
365 #endif // wxUSE_STL/!wxUSE_STL
366 )
367 {
368 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
369 return;
370 }
371 m_HandlersHash = *((wxHashTable*) first->GetData());
372 delete (wxHashTable*) first->GetData();
373 m_HandlersStack->Erase(first);
374 }
375
376 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
377 {
378 wxHtmlParserState *s = new wxHtmlParserState;
379
380 s->m_curTag = m_CurTag;
381 s->m_tags = m_Tags;
382 s->m_textPieces = m_TextPieces;
383 s->m_curTextPiece = m_CurTextPiece;
384 s->m_source = m_Source;
385
386 s->m_nextState = m_SavedStates;
387 m_SavedStates = s;
388
389 m_CurTag = NULL;
390 m_Tags = NULL;
391 m_TextPieces = NULL;
392 m_CurTextPiece = 0;
393 m_Source = wxEmptyString;
394
395 SetSource(src);
396 }
397
398 bool wxHtmlParser::RestoreState()
399 {
400 if (!m_SavedStates) return false;
401
402 DestroyDOMTree();
403
404 wxHtmlParserState *s = m_SavedStates;
405 m_SavedStates = s->m_nextState;
406
407 m_CurTag = s->m_curTag;
408 m_Tags = s->m_tags;
409 m_TextPieces = s->m_textPieces;
410 m_CurTextPiece = s->m_curTextPiece;
411 m_Source = s->m_source;
412
413 delete s;
414 return true;
415 }
416
417 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
418 {
419 return GetSource()->Mid(tag.GetBeginPos(),
420 tag.GetEndPos1() - tag.GetBeginPos());
421 }
422
423 //-----------------------------------------------------------------------------
424 // wxHtmlTagHandler
425 //-----------------------------------------------------------------------------
426
427 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
428
429 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
430 {
431 // It is safe to temporarily change the source being parsed,
432 // provided we restore the state back after parsing
433 m_Parser->SetSourceAndSaveState(source);
434 m_Parser->DoParsing();
435 m_Parser->RestoreState();
436 }
437
438
439 //-----------------------------------------------------------------------------
440 // wxHtmlEntitiesParser
441 //-----------------------------------------------------------------------------
442
443 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
444
445 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
446 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
447 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
448 #endif
449 {
450 }
451
452 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
453 {
454 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
455 delete m_conv;
456 #endif
457 }
458
459 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
460 {
461 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
462 if (encoding == m_encoding)
463 return;
464
465 delete m_conv;
466
467 m_encoding = encoding;
468 if (m_encoding == wxFONTENCODING_SYSTEM)
469 m_conv = NULL;
470 else
471 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
472 #else
473 (void) encoding;
474 #endif
475 }
476
477 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
478 {
479 const wxChar *c, *last;
480 const wxChar *in_str = input.c_str();
481 wxString output;
482
483 output.reserve(input.length());
484
485 for (c = in_str, last = in_str; *c != wxT('\0'); c++)
486 {
487 if (*c == wxT('&'))
488 {
489 if (c - last > 0)
490 output.append(last, c - last);
491 if ( *++c == wxT('\0') )
492 break;
493
494 wxString entity;
495 const wxChar *ent_s = c;
496 wxChar entity_char;
497
498 for (; (*c >= wxT('a') && *c <= wxT('z')) ||
499 (*c >= wxT('A') && *c <= wxT('Z')) ||
500 (*c >= wxT('0') && *c <= wxT('9')) ||
501 *c == wxT('_') || *c == wxT('#'); c++) {}
502 entity.append(ent_s, c - ent_s);
503 if (*c != wxT(';')) c--;
504 last = c+1;
505 entity_char = GetEntityChar(entity);
506 if (entity_char)
507 output << entity_char;
508 else
509 {
510 output.append(ent_s-1, c-ent_s+2);
511 wxLogTrace(wxTRACE_HTML_DEBUG,
512 wxT("Unrecognized HTML entity: '%s'"),
513 entity.c_str());
514 }
515 }
516 }
517 if (*last != wxT('\0'))
518 output.append(last);
519 return output;
520 }
521
522 #if !wxUSE_UNICODE
523 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
524 {
525 #if wxUSE_WCHAR_T
526 char buf[2];
527 wchar_t wbuf[2];
528 wbuf[0] = (wchar_t)code;
529 wbuf[1] = 0;
530 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
531 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
532 return '?';
533 return buf[0];
534 #else
535 return (code < 256) ? (wxChar)code : '?';
536 #endif
537 }
538 #endif
539
540 struct wxHtmlEntityInfo
541 {
542 const wxStringCharType *name;
543 unsigned code;
544 };
545
546 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
547 {
548 #if wxUSE_UNICODE_UTF8
549 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
550 #else
551 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
552 #endif
553 }
554
555 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
556 {
557 unsigned code = 0;
558
559 if (entity[0] == wxT('#'))
560 {
561 const wxChar *ent_s = entity.c_str();
562 const wxChar *format;
563
564 if (ent_s[1] == wxT('x') || ent_s[1] == wxT('X'))
565 {
566 format = wxT("%x");
567 ent_s++;
568 }
569 else
570 format = wxT("%u");
571 ent_s++;
572
573 if (wxSscanf(ent_s, format, &code) != 1)
574 code = 0;
575 }
576 else
577 {
578 // store the literals in wx's internal representation (either char*
579 // in UTF-8 or wchar_t*) for best performance:
580 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
581
582 static wxHtmlEntityInfo substitutions[] = {
583 ENTITY("AElig", 198),
584 ENTITY("Aacute", 193),
585 ENTITY("Acirc", 194),
586 ENTITY("Agrave", 192),
587 ENTITY("Alpha", 913),
588 ENTITY("Aring", 197),
589 ENTITY("Atilde", 195),
590 ENTITY("Auml", 196),
591 ENTITY("Beta", 914),
592 ENTITY("Ccedil", 199),
593 ENTITY("Chi", 935),
594 ENTITY("Dagger", 8225),
595 ENTITY("Delta", 916),
596 ENTITY("ETH", 208),
597 ENTITY("Eacute", 201),
598 ENTITY("Ecirc", 202),
599 ENTITY("Egrave", 200),
600 ENTITY("Epsilon", 917),
601 ENTITY("Eta", 919),
602 ENTITY("Euml", 203),
603 ENTITY("Gamma", 915),
604 ENTITY("Iacute", 205),
605 ENTITY("Icirc", 206),
606 ENTITY("Igrave", 204),
607 ENTITY("Iota", 921),
608 ENTITY("Iuml", 207),
609 ENTITY("Kappa", 922),
610 ENTITY("Lambda", 923),
611 ENTITY("Mu", 924),
612 ENTITY("Ntilde", 209),
613 ENTITY("Nu", 925),
614 ENTITY("OElig", 338),
615 ENTITY("Oacute", 211),
616 ENTITY("Ocirc", 212),
617 ENTITY("Ograve", 210),
618 ENTITY("Omega", 937),
619 ENTITY("Omicron", 927),
620 ENTITY("Oslash", 216),
621 ENTITY("Otilde", 213),
622 ENTITY("Ouml", 214),
623 ENTITY("Phi", 934),
624 ENTITY("Pi", 928),
625 ENTITY("Prime", 8243),
626 ENTITY("Psi", 936),
627 ENTITY("Rho", 929),
628 ENTITY("Scaron", 352),
629 ENTITY("Sigma", 931),
630 ENTITY("THORN", 222),
631 ENTITY("Tau", 932),
632 ENTITY("Theta", 920),
633 ENTITY("Uacute", 218),
634 ENTITY("Ucirc", 219),
635 ENTITY("Ugrave", 217),
636 ENTITY("Upsilon", 933),
637 ENTITY("Uuml", 220),
638 ENTITY("Xi", 926),
639 ENTITY("Yacute", 221),
640 ENTITY("Yuml", 376),
641 ENTITY("Zeta", 918),
642 ENTITY("aacute", 225),
643 ENTITY("acirc", 226),
644 ENTITY("acute", 180),
645 ENTITY("aelig", 230),
646 ENTITY("agrave", 224),
647 ENTITY("alefsym", 8501),
648 ENTITY("alpha", 945),
649 ENTITY("amp", 38),
650 ENTITY("and", 8743),
651 ENTITY("ang", 8736),
652 ENTITY("aring", 229),
653 ENTITY("asymp", 8776),
654 ENTITY("atilde", 227),
655 ENTITY("auml", 228),
656 ENTITY("bdquo", 8222),
657 ENTITY("beta", 946),
658 ENTITY("brvbar", 166),
659 ENTITY("bull", 8226),
660 ENTITY("cap", 8745),
661 ENTITY("ccedil", 231),
662 ENTITY("cedil", 184),
663 ENTITY("cent", 162),
664 ENTITY("chi", 967),
665 ENTITY("circ", 710),
666 ENTITY("clubs", 9827),
667 ENTITY("cong", 8773),
668 ENTITY("copy", 169),
669 ENTITY("crarr", 8629),
670 ENTITY("cup", 8746),
671 ENTITY("curren", 164),
672 ENTITY("dArr", 8659),
673 ENTITY("dagger", 8224),
674 ENTITY("darr", 8595),
675 ENTITY("deg", 176),
676 ENTITY("delta", 948),
677 ENTITY("diams", 9830),
678 ENTITY("divide", 247),
679 ENTITY("eacute", 233),
680 ENTITY("ecirc", 234),
681 ENTITY("egrave", 232),
682 ENTITY("empty", 8709),
683 ENTITY("emsp", 8195),
684 ENTITY("ensp", 8194),
685 ENTITY("epsilon", 949),
686 ENTITY("equiv", 8801),
687 ENTITY("eta", 951),
688 ENTITY("eth", 240),
689 ENTITY("euml", 235),
690 ENTITY("euro", 8364),
691 ENTITY("exist", 8707),
692 ENTITY("fnof", 402),
693 ENTITY("forall", 8704),
694 ENTITY("frac12", 189),
695 ENTITY("frac14", 188),
696 ENTITY("frac34", 190),
697 ENTITY("frasl", 8260),
698 ENTITY("gamma", 947),
699 ENTITY("ge", 8805),
700 ENTITY("gt", 62),
701 ENTITY("hArr", 8660),
702 ENTITY("harr", 8596),
703 ENTITY("hearts", 9829),
704 ENTITY("hellip", 8230),
705 ENTITY("iacute", 237),
706 ENTITY("icirc", 238),
707 ENTITY("iexcl", 161),
708 ENTITY("igrave", 236),
709 ENTITY("image", 8465),
710 ENTITY("infin", 8734),
711 ENTITY("int", 8747),
712 ENTITY("iota", 953),
713 ENTITY("iquest", 191),
714 ENTITY("isin", 8712),
715 ENTITY("iuml", 239),
716 ENTITY("kappa", 954),
717 ENTITY("lArr", 8656),
718 ENTITY("lambda", 955),
719 ENTITY("lang", 9001),
720 ENTITY("laquo", 171),
721 ENTITY("larr", 8592),
722 ENTITY("lceil", 8968),
723 ENTITY("ldquo", 8220),
724 ENTITY("le", 8804),
725 ENTITY("lfloor", 8970),
726 ENTITY("lowast", 8727),
727 ENTITY("loz", 9674),
728 ENTITY("lrm", 8206),
729 ENTITY("lsaquo", 8249),
730 ENTITY("lsquo", 8216),
731 ENTITY("lt", 60),
732 ENTITY("macr", 175),
733 ENTITY("mdash", 8212),
734 ENTITY("micro", 181),
735 ENTITY("middot", 183),
736 ENTITY("minus", 8722),
737 ENTITY("mu", 956),
738 ENTITY("nabla", 8711),
739 ENTITY("nbsp", 160),
740 ENTITY("ndash", 8211),
741 ENTITY("ne", 8800),
742 ENTITY("ni", 8715),
743 ENTITY("not", 172),
744 ENTITY("notin", 8713),
745 ENTITY("nsub", 8836),
746 ENTITY("ntilde", 241),
747 ENTITY("nu", 957),
748 ENTITY("oacute", 243),
749 ENTITY("ocirc", 244),
750 ENTITY("oelig", 339),
751 ENTITY("ograve", 242),
752 ENTITY("oline", 8254),
753 ENTITY("omega", 969),
754 ENTITY("omicron", 959),
755 ENTITY("oplus", 8853),
756 ENTITY("or", 8744),
757 ENTITY("ordf", 170),
758 ENTITY("ordm", 186),
759 ENTITY("oslash", 248),
760 ENTITY("otilde", 245),
761 ENTITY("otimes", 8855),
762 ENTITY("ouml", 246),
763 ENTITY("para", 182),
764 ENTITY("part", 8706),
765 ENTITY("permil", 8240),
766 ENTITY("perp", 8869),
767 ENTITY("phi", 966),
768 ENTITY("pi", 960),
769 ENTITY("piv", 982),
770 ENTITY("plusmn", 177),
771 ENTITY("pound", 163),
772 ENTITY("prime", 8242),
773 ENTITY("prod", 8719),
774 ENTITY("prop", 8733),
775 ENTITY("psi", 968),
776 ENTITY("quot", 34),
777 ENTITY("rArr", 8658),
778 ENTITY("radic", 8730),
779 ENTITY("rang", 9002),
780 ENTITY("raquo", 187),
781 ENTITY("rarr", 8594),
782 ENTITY("rceil", 8969),
783 ENTITY("rdquo", 8221),
784 ENTITY("real", 8476),
785 ENTITY("reg", 174),
786 ENTITY("rfloor", 8971),
787 ENTITY("rho", 961),
788 ENTITY("rlm", 8207),
789 ENTITY("rsaquo", 8250),
790 ENTITY("rsquo", 8217),
791 ENTITY("sbquo", 8218),
792 ENTITY("scaron", 353),
793 ENTITY("sdot", 8901),
794 ENTITY("sect", 167),
795 ENTITY("shy", 173),
796 ENTITY("sigma", 963),
797 ENTITY("sigmaf", 962),
798 ENTITY("sim", 8764),
799 ENTITY("spades", 9824),
800 ENTITY("sub", 8834),
801 ENTITY("sube", 8838),
802 ENTITY("sum", 8721),
803 ENTITY("sup", 8835),
804 ENTITY("sup1", 185),
805 ENTITY("sup2", 178),
806 ENTITY("sup3", 179),
807 ENTITY("supe", 8839),
808 ENTITY("szlig", 223),
809 ENTITY("tau", 964),
810 ENTITY("there4", 8756),
811 ENTITY("theta", 952),
812 ENTITY("thetasym", 977),
813 ENTITY("thinsp", 8201),
814 ENTITY("thorn", 254),
815 ENTITY("tilde", 732),
816 ENTITY("times", 215),
817 ENTITY("trade", 8482),
818 ENTITY("uArr", 8657),
819 ENTITY("uacute", 250),
820 ENTITY("uarr", 8593),
821 ENTITY("ucirc", 251),
822 ENTITY("ugrave", 249),
823 ENTITY("uml", 168),
824 ENTITY("upsih", 978),
825 ENTITY("upsilon", 965),
826 ENTITY("uuml", 252),
827 ENTITY("weierp", 8472),
828 ENTITY("xi", 958),
829 ENTITY("yacute", 253),
830 ENTITY("yen", 165),
831 ENTITY("yuml", 255),
832 ENTITY("zeta", 950),
833 ENTITY("zwj", 8205),
834 ENTITY("zwnj", 8204),
835 {NULL, 0}};
836 #undef ENTITY
837 static size_t substitutions_cnt = 0;
838
839 if (substitutions_cnt == 0)
840 while (substitutions[substitutions_cnt].code != 0)
841 substitutions_cnt++;
842
843 wxHtmlEntityInfo *info = NULL;
844 #ifdef __WXWINCE__
845 // bsearch crashes under WinCE for some reason
846 size_t i;
847 for (i = 0; i < substitutions_cnt; i++)
848 {
849 if (entity == substitutions[i].name)
850 {
851 info = & substitutions[i];
852 break;
853 }
854 }
855 #else
856 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
857 substitutions_cnt,
858 sizeof(wxHtmlEntityInfo),
859 wxHtmlEntityCompare);
860 #endif
861 if (info)
862 code = info->code;
863 }
864
865 if (code == 0)
866 return 0;
867 else
868 return GetCharForCode(code);
869 }
870
871 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
872 const wxString& url) const
873 {
874 return m_FS ? m_FS->OpenFile(url) : NULL;
875
876 }
877
878
879 //-----------------------------------------------------------------------------
880 // wxHtmlParser::ExtractCharsetInformation
881 //-----------------------------------------------------------------------------
882
883 class wxMetaTagParser : public wxHtmlParser
884 {
885 public:
886 wxMetaTagParser() { }
887
888 wxObject* GetProduct() { return NULL; }
889
890 protected:
891 virtual void AddText(const wxString& WXUNUSED(txt)) {}
892
893 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
894 };
895
896 class wxMetaTagHandler : public wxHtmlTagHandler
897 {
898 public:
899 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
900 wxString GetSupportedTags() { return wxT("META,BODY"); }
901 bool HandleTag(const wxHtmlTag& tag);
902
903 private:
904 wxString *m_retval;
905
906 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
907 };
908
909 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
910 {
911 if (tag.GetName() == _T("BODY"))
912 {
913 m_Parser->StopParsing();
914 return false;
915 }
916
917 if (tag.HasParam(_T("HTTP-EQUIV")) &&
918 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
919 tag.HasParam(_T("CONTENT")))
920 {
921 wxString content = tag.GetParam(_T("CONTENT")).Lower();
922 if (content.Left(19) == _T("text/html; charset="))
923 {
924 *m_retval = content.Mid(19);
925 m_Parser->StopParsing();
926 }
927 }
928 return false;
929 }
930
931
932 /*static*/
933 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
934 {
935 wxString charset;
936 wxMetaTagParser *parser = new wxMetaTagParser();
937 if(parser)
938 {
939 parser->AddTagHandler(new wxMetaTagHandler(&charset));
940 parser->Parse(markup);
941 delete parser;
942 }
943 return charset;
944 }
945
946 /* static */
947 bool
948 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
949 wxString::const_iterator end)
950 {
951 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
952
953 wxString::const_iterator p = start;
954
955 // comments begin with "<!--" in HTML 4.0
956 if ( end - p < 3 || *++p != '!' || *++p != '-' || *++p != '-' )
957 {
958 // not a comment at all
959 return false;
960 }
961
962 // skip the start of the comment tag in any case, if we don't find the
963 // closing tag we should ignore broken markup
964 start = p;
965
966 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
967 // comment delimiter and the closing tag character (section 3.2.4 of
968 // http://www.w3.org/TR/html401/)
969 int dashes = 0;
970 while ( ++p < end )
971 {
972 const wxChar c = *p;
973
974 if ( (c == wxT(' ') || c == wxT('\n') ||
975 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
976 {
977 // ignore white space before potential tag end
978 continue;
979 }
980
981 if ( c == wxT('>') && dashes >= 2 )
982 {
983 // found end of comment
984 start = p;
985 break;
986 }
987
988 if ( c == wxT('-') )
989 dashes++;
990 else
991 dashes = 0;
992 }
993
994 return true;
995 }
996
997 #endif // wxUSE_HTML