use wxVector<T> instead of OBJARRAY macros for wxHtmlTextPieces (slightly better...
[wxWidgets.git] / src / html / htmlpars.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmlpars.cpp
3 // Purpose: wxHtmlParser class (generic parser)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML && wxUSE_STREAMS
17
18 #ifndef WX_PRECOMP
19 #include "wx/dynarray.h"
20 #include "wx/log.h"
21 #include "wx/intl.h"
22 #include "wx/app.h"
23 #include "wx/wxcrtvararg.h"
24 #endif
25
26 #include "wx/tokenzr.h"
27 #include "wx/wfstream.h"
28 #include "wx/url.h"
29 #include "wx/fontmap.h"
30 #include "wx/html/htmldefs.h"
31 #include "wx/html/htmlpars.h"
32 #include "wx/vector.h"
33
34 #ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36 #endif
37
38 // DLL options compatibility check:
39 WX_CHECK_BUILD_OPTIONS("wxHTML")
40
41 const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
43 //-----------------------------------------------------------------------------
44 // wxHtmlParser helpers
45 //-----------------------------------------------------------------------------
46
// A run of plain text inside the parsed source, described by the offset of
// its first character (m_pos) and its length in characters (m_lng).
class wxHtmlTextPiece
{
public:
    // The default constructor yields an empty piece at offset 0 instead of
    // leaving both members with indeterminate values.
    wxHtmlTextPiece() : m_pos(0), m_lng(0) {}

    wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}

    int m_pos, m_lng;
};
54
55 // NB: this is an empty class and not typedef because of forward declaration
56 class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
57 {
58 };
59
60 class wxHtmlParserState
61 {
62 public:
63 wxHtmlTag *m_curTag;
64 wxHtmlTag *m_tags;
65 wxHtmlTextPieces *m_textPieces;
66 int m_curTextPiece;
67 wxString m_source;
68 wxHtmlParserState *m_nextState;
69 };
70
71 //-----------------------------------------------------------------------------
72 // wxHtmlParser
73 //-----------------------------------------------------------------------------
74
75 IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
76
77 wxHtmlParser::wxHtmlParser()
78 : wxObject(), m_HandlersHash(wxKEY_STRING),
79 m_FS(NULL), m_HandlersStack(NULL)
80 {
81 m_entitiesParser = new wxHtmlEntitiesParser;
82 m_Tags = NULL;
83 m_CurTag = NULL;
84 m_TextPieces = NULL;
85 m_CurTextPiece = 0;
86 m_SavedStates = NULL;
87 }
88
89 wxHtmlParser::~wxHtmlParser()
90 {
91 while (RestoreState()) {}
92 DestroyDOMTree();
93
94 if (m_HandlersStack)
95 {
96 wxList& tmp = *m_HandlersStack;
97 wxList::iterator it, en;
98 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
99 delete (wxHashTable*)*it;
100 tmp.clear();
101 }
102 delete m_HandlersStack;
103 m_HandlersHash.Clear();
104 WX_CLEAR_LIST(wxList, m_HandlersList);
105 delete m_entitiesParser;
106 }
107
108 wxObject* wxHtmlParser::Parse(const wxString& source)
109 {
110 InitParser(source);
111 DoParsing();
112 wxObject *result = GetProduct();
113 DoneParser();
114 return result;
115 }
116
117 void wxHtmlParser::InitParser(const wxString& source)
118 {
119 SetSource(source);
120 m_stopParsing = false;
121 }
122
123 void wxHtmlParser::DoneParser()
124 {
125 DestroyDOMTree();
126 }
127
128 void wxHtmlParser::SetSource(const wxString& src)
129 {
130 DestroyDOMTree();
131 m_Source = src;
132 CreateDOMTree();
133 m_CurTag = NULL;
134 m_CurTextPiece = 0;
135 }
136
137 void wxHtmlParser::CreateDOMTree()
138 {
139 wxHtmlTagsCache cache(m_Source);
140 m_TextPieces = new wxHtmlTextPieces;
141 CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
142 m_CurTextPiece = 0;
143 }
144
145 extern bool wxIsCDATAElement(const wxChar *tag);
146
147 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
148 int begin_pos, int end_pos,
149 wxHtmlTagsCache *cache)
150 {
151 if (end_pos <= begin_pos) return;
152
153 wxChar c;
154 int i = begin_pos;
155 int textBeginning = begin_pos;
156
157 // If the tag contains CDATA text, we include the text between beginning
158 // and ending tag verbosely. Setting i=end_pos will skip to the very
159 // end of this function where text piece is added, bypassing any child
160 // tags parsing (CDATA element can't have child elements by definition):
161 if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
162 {
163 i = end_pos;
164 }
165
166 while (i < end_pos)
167 {
168 c = m_Source.GetChar(i);
169
170 if (c == wxT('<'))
171 {
172 // add text to m_TextPieces:
173 if (i - textBeginning > 0)
174 m_TextPieces->push_back(
175 wxHtmlTextPiece(textBeginning, i - textBeginning));
176
177 // if it is a comment, skip it:
178 wxString::const_iterator iter = m_Source.begin() + i;
179 if ( SkipCommentTag(iter, m_Source.end()) )
180 {
181 textBeginning =
182 i = iter - m_Source.begin() + 1; // skip closing '>' too
183 }
184
185 // add another tag to the tree:
186 else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
187 {
188 wxHtmlTag *chd;
189 if (cur)
190 chd = new wxHtmlTag(cur, m_Source,
191 i, end_pos, cache, m_entitiesParser);
192 else
193 {
194 chd = new wxHtmlTag(NULL, m_Source,
195 i, end_pos, cache, m_entitiesParser);
196 if (!m_Tags)
197 {
198 // if this is the first tag to be created make the root
199 // m_Tags point to it:
200 m_Tags = chd;
201 }
202 else
203 {
204 // if there is already a root tag add this tag as
205 // the last sibling:
206 chd->m_Prev = m_Tags->GetLastSibling();
207 chd->m_Prev->m_Next = chd;
208 }
209 }
210
211 if (chd->HasEnding())
212 {
213 CreateDOMSubTree(chd,
214 chd->GetBeginPos(), chd->GetEndPos1(),
215 cache);
216 i = chd->GetEndPos2();
217 }
218 else
219 i = chd->GetBeginPos();
220
221 textBeginning = i;
222 }
223
224 // ... or skip ending tag:
225 else
226 {
227 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
228 textBeginning = i+1;
229 }
230 }
231 else i++;
232 }
233
234 // add remaining text to m_TextPieces:
235 if (end_pos - textBeginning > 0)
236 m_TextPieces->push_back(
237 wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
238 }
239
240 void wxHtmlParser::DestroyDOMTree()
241 {
242 wxHtmlTag *t1, *t2;
243 t1 = m_Tags;
244 while (t1)
245 {
246 t2 = t1->GetNextSibling();
247 delete t1;
248 t1 = t2;
249 }
250 m_Tags = m_CurTag = NULL;
251
252 delete m_TextPieces;
253 m_TextPieces = NULL;
254 }
255
256 void wxHtmlParser::DoParsing()
257 {
258 m_CurTag = m_Tags;
259 m_CurTextPiece = 0;
260 DoParsing(0, m_Source.length());
261 }
262
263 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
264 {
265 if (end_pos <= begin_pos) return;
266
267 wxHtmlTextPieces& pieces = *m_TextPieces;
268 size_t piecesCnt = pieces.size();
269
270 while (begin_pos < end_pos)
271 {
272 while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
273 m_CurTag = m_CurTag->GetNextTag();
274 while (m_CurTextPiece < piecesCnt &&
275 pieces[m_CurTextPiece].m_pos < begin_pos)
276 m_CurTextPiece++;
277
278 if (m_CurTextPiece < piecesCnt &&
279 (!m_CurTag ||
280 pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
281 {
282 // Add text:
283 AddText(GetEntitiesParser()->Parse(
284 m_Source.Mid(pieces[m_CurTextPiece].m_pos,
285 pieces[m_CurTextPiece].m_lng)));
286 begin_pos = pieces[m_CurTextPiece].m_pos +
287 pieces[m_CurTextPiece].m_lng;
288 m_CurTextPiece++;
289 }
290 else if (m_CurTag)
291 {
292 if (m_CurTag->HasEnding())
293 begin_pos = m_CurTag->GetEndPos2();
294 else
295 begin_pos = m_CurTag->GetBeginPos();
296 wxHtmlTag *t = m_CurTag;
297 m_CurTag = m_CurTag->GetNextTag();
298 AddTag(*t);
299 if (m_stopParsing)
300 return;
301 }
302 else break;
303 }
304 }
305
306 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
307 {
308 wxHtmlTagHandler *h;
309 bool inner = false;
310
311 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
312 if (h)
313 {
314 inner = h->HandleTag(tag);
315 if (m_stopParsing)
316 return;
317 }
318 if (!inner)
319 {
320 if (tag.HasEnding())
321 DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
322 }
323 }
324
325 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
326 {
327 wxString s(handler->GetSupportedTags());
328 wxStringTokenizer tokenizer(s, wxT(", "));
329
330 while (tokenizer.HasMoreTokens())
331 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
332
333 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
334 m_HandlersList.Append(handler);
335
336 handler->SetParser(this);
337 }
338
339 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
340 {
341 wxStringTokenizer tokenizer(tags, wxT(", "));
342 wxString key;
343
344 if (m_HandlersStack == NULL)
345 {
346 m_HandlersStack = new wxList;
347 }
348
349 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
350
351 while (tokenizer.HasMoreTokens())
352 {
353 key = tokenizer.GetNextToken();
354 m_HandlersHash.Delete(key);
355 m_HandlersHash.Put(key, handler);
356 }
357 }
358
359 void wxHtmlParser::PopTagHandler()
360 {
361 wxList::compatibility_iterator first;
362
363 if ( !m_HandlersStack ||
364 #if wxUSE_STL
365 !(first = m_HandlersStack->GetFirst())
366 #else // !wxUSE_STL
367 ((first = m_HandlersStack->GetFirst()) == NULL)
368 #endif // wxUSE_STL/!wxUSE_STL
369 )
370 {
371 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
372 return;
373 }
374 m_HandlersHash = *((wxHashTable*) first->GetData());
375 delete (wxHashTable*) first->GetData();
376 m_HandlersStack->Erase(first);
377 }
378
379 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
380 {
381 wxHtmlParserState *s = new wxHtmlParserState;
382
383 s->m_curTag = m_CurTag;
384 s->m_tags = m_Tags;
385 s->m_textPieces = m_TextPieces;
386 s->m_curTextPiece = m_CurTextPiece;
387 s->m_source = m_Source;
388
389 s->m_nextState = m_SavedStates;
390 m_SavedStates = s;
391
392 m_CurTag = NULL;
393 m_Tags = NULL;
394 m_TextPieces = NULL;
395 m_CurTextPiece = 0;
396 m_Source = wxEmptyString;
397
398 SetSource(src);
399 }
400
401 bool wxHtmlParser::RestoreState()
402 {
403 if (!m_SavedStates) return false;
404
405 DestroyDOMTree();
406
407 wxHtmlParserState *s = m_SavedStates;
408 m_SavedStates = s->m_nextState;
409
410 m_CurTag = s->m_curTag;
411 m_Tags = s->m_tags;
412 m_TextPieces = s->m_textPieces;
413 m_CurTextPiece = s->m_curTextPiece;
414 m_Source = s->m_source;
415
416 delete s;
417 return true;
418 }
419
420 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
421 {
422 return GetSource()->Mid(tag.GetBeginPos(),
423 tag.GetEndPos1() - tag.GetBeginPos());
424 }
425
426 //-----------------------------------------------------------------------------
427 // wxHtmlTagHandler
428 //-----------------------------------------------------------------------------
429
430 IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
431
432 void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
433 {
434 // It is safe to temporarily change the source being parsed,
435 // provided we restore the state back after parsing
436 m_Parser->SetSourceAndSaveState(source);
437 m_Parser->DoParsing();
438 m_Parser->RestoreState();
439 }
440
441
442 //-----------------------------------------------------------------------------
443 // wxHtmlEntitiesParser
444 //-----------------------------------------------------------------------------
445
446 IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
447
448 wxHtmlEntitiesParser::wxHtmlEntitiesParser()
449 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
450 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
451 #endif
452 {
453 }
454
455 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
456 {
457 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
458 delete m_conv;
459 #endif
460 }
461
462 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
463 {
464 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
465 if (encoding == m_encoding)
466 return;
467
468 delete m_conv;
469
470 m_encoding = encoding;
471 if (m_encoding == wxFONTENCODING_SYSTEM)
472 m_conv = NULL;
473 else
474 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
475 #else
476 (void) encoding;
477 #endif
478 }
479
480 wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
481 {
482 wxString output;
483
484 const wxString::const_iterator end(input.end());
485 wxString::const_iterator c(input.begin());
486 wxString::const_iterator last(c);
487
488 for ( ; c < end; ++c )
489 {
490 if (*c == wxT('&'))
491 {
492 if ( output.empty() )
493 output.reserve(input.length());
494
495 if (c - last > 0)
496 output.append(last, c);
497 if ( ++c == end )
498 break;
499
500 wxString entity;
501 const wxString::const_iterator ent_s = c;
502 wxChar entity_char;
503
504 for (; c != end &&
505 ((*c >= wxT('a') && *c <= wxT('z')) ||
506 (*c >= wxT('A') && *c <= wxT('Z')) ||
507 (*c >= wxT('0') && *c <= wxT('9')) ||
508 *c == wxT('_') || *c == wxT('#')); ++c) {}
509 entity.append(ent_s, c);
510 if (c == end || *c != wxT(';')) --c;
511 last = c+1;
512 entity_char = GetEntityChar(entity);
513 if (entity_char)
514 output << entity_char;
515 else
516 {
517 output.append(ent_s-1, c+1);
518 wxLogTrace(wxTRACE_HTML_DEBUG,
519 "Unrecognized HTML entity: '%s'",
520 entity);
521 }
522 }
523 }
524 if ( last == input.begin() ) // common case: no entity
525 return input;
526 if ( last != end )
527 output.append(last, end);
528 return output;
529 }
530
531 #if !wxUSE_UNICODE
532 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
533 {
534 #if wxUSE_WCHAR_T
535 char buf[2];
536 wchar_t wbuf[2];
537 wbuf[0] = (wchar_t)code;
538 wbuf[1] = 0;
539 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
540 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
541 return '?';
542 return buf[0];
543 #else
544 return (code < 256) ? (wxChar)code : '?';
545 #endif
546 }
547 #endif
548
549 struct wxHtmlEntityInfo
550 {
551 const wxStringCharType *name;
552 unsigned code;
553 };
554
555 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
556 {
557 #if wxUSE_UNICODE_UTF8
558 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
559 #else
560 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
561 #endif
562 }
563
564 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
565 {
566 unsigned code = 0;
567
568 if (entity[0] == wxT('#'))
569 {
570 // NB: parsed value is a number, so it's OK to use wx_str(), internal
571 // representation is the same for numbers
572 const wxStringCharType *ent_s = entity.wx_str();
573 const wxStringCharType *format;
574
575 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
576 {
577 format = wxSTRING_TEXT("%x");
578 ent_s++;
579 }
580 else
581 format = wxSTRING_TEXT("%u");
582 ent_s++;
583
584 if (wxSscanf(ent_s, format, &code) != 1)
585 code = 0;
586 }
587 else
588 {
589 // store the literals in wx's internal representation (either char*
590 // in UTF-8 or wchar_t*) for best performance:
591 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
592
593 static wxHtmlEntityInfo substitutions[] = {
594 ENTITY("AElig", 198),
595 ENTITY("Aacute", 193),
596 ENTITY("Acirc", 194),
597 ENTITY("Agrave", 192),
598 ENTITY("Alpha", 913),
599 ENTITY("Aring", 197),
600 ENTITY("Atilde", 195),
601 ENTITY("Auml", 196),
602 ENTITY("Beta", 914),
603 ENTITY("Ccedil", 199),
604 ENTITY("Chi", 935),
605 ENTITY("Dagger", 8225),
606 ENTITY("Delta", 916),
607 ENTITY("ETH", 208),
608 ENTITY("Eacute", 201),
609 ENTITY("Ecirc", 202),
610 ENTITY("Egrave", 200),
611 ENTITY("Epsilon", 917),
612 ENTITY("Eta", 919),
613 ENTITY("Euml", 203),
614 ENTITY("Gamma", 915),
615 ENTITY("Iacute", 205),
616 ENTITY("Icirc", 206),
617 ENTITY("Igrave", 204),
618 ENTITY("Iota", 921),
619 ENTITY("Iuml", 207),
620 ENTITY("Kappa", 922),
621 ENTITY("Lambda", 923),
622 ENTITY("Mu", 924),
623 ENTITY("Ntilde", 209),
624 ENTITY("Nu", 925),
625 ENTITY("OElig", 338),
626 ENTITY("Oacute", 211),
627 ENTITY("Ocirc", 212),
628 ENTITY("Ograve", 210),
629 ENTITY("Omega", 937),
630 ENTITY("Omicron", 927),
631 ENTITY("Oslash", 216),
632 ENTITY("Otilde", 213),
633 ENTITY("Ouml", 214),
634 ENTITY("Phi", 934),
635 ENTITY("Pi", 928),
636 ENTITY("Prime", 8243),
637 ENTITY("Psi", 936),
638 ENTITY("Rho", 929),
639 ENTITY("Scaron", 352),
640 ENTITY("Sigma", 931),
641 ENTITY("THORN", 222),
642 ENTITY("Tau", 932),
643 ENTITY("Theta", 920),
644 ENTITY("Uacute", 218),
645 ENTITY("Ucirc", 219),
646 ENTITY("Ugrave", 217),
647 ENTITY("Upsilon", 933),
648 ENTITY("Uuml", 220),
649 ENTITY("Xi", 926),
650 ENTITY("Yacute", 221),
651 ENTITY("Yuml", 376),
652 ENTITY("Zeta", 918),
653 ENTITY("aacute", 225),
654 ENTITY("acirc", 226),
655 ENTITY("acute", 180),
656 ENTITY("aelig", 230),
657 ENTITY("agrave", 224),
658 ENTITY("alefsym", 8501),
659 ENTITY("alpha", 945),
660 ENTITY("amp", 38),
661 ENTITY("and", 8743),
662 ENTITY("ang", 8736),
663 ENTITY("aring", 229),
664 ENTITY("asymp", 8776),
665 ENTITY("atilde", 227),
666 ENTITY("auml", 228),
667 ENTITY("bdquo", 8222),
668 ENTITY("beta", 946),
669 ENTITY("brvbar", 166),
670 ENTITY("bull", 8226),
671 ENTITY("cap", 8745),
672 ENTITY("ccedil", 231),
673 ENTITY("cedil", 184),
674 ENTITY("cent", 162),
675 ENTITY("chi", 967),
676 ENTITY("circ", 710),
677 ENTITY("clubs", 9827),
678 ENTITY("cong", 8773),
679 ENTITY("copy", 169),
680 ENTITY("crarr", 8629),
681 ENTITY("cup", 8746),
682 ENTITY("curren", 164),
683 ENTITY("dArr", 8659),
684 ENTITY("dagger", 8224),
685 ENTITY("darr", 8595),
686 ENTITY("deg", 176),
687 ENTITY("delta", 948),
688 ENTITY("diams", 9830),
689 ENTITY("divide", 247),
690 ENTITY("eacute", 233),
691 ENTITY("ecirc", 234),
692 ENTITY("egrave", 232),
693 ENTITY("empty", 8709),
694 ENTITY("emsp", 8195),
695 ENTITY("ensp", 8194),
696 ENTITY("epsilon", 949),
697 ENTITY("equiv", 8801),
698 ENTITY("eta", 951),
699 ENTITY("eth", 240),
700 ENTITY("euml", 235),
701 ENTITY("euro", 8364),
702 ENTITY("exist", 8707),
703 ENTITY("fnof", 402),
704 ENTITY("forall", 8704),
705 ENTITY("frac12", 189),
706 ENTITY("frac14", 188),
707 ENTITY("frac34", 190),
708 ENTITY("frasl", 8260),
709 ENTITY("gamma", 947),
710 ENTITY("ge", 8805),
711 ENTITY("gt", 62),
712 ENTITY("hArr", 8660),
713 ENTITY("harr", 8596),
714 ENTITY("hearts", 9829),
715 ENTITY("hellip", 8230),
716 ENTITY("iacute", 237),
717 ENTITY("icirc", 238),
718 ENTITY("iexcl", 161),
719 ENTITY("igrave", 236),
720 ENTITY("image", 8465),
721 ENTITY("infin", 8734),
722 ENTITY("int", 8747),
723 ENTITY("iota", 953),
724 ENTITY("iquest", 191),
725 ENTITY("isin", 8712),
726 ENTITY("iuml", 239),
727 ENTITY("kappa", 954),
728 ENTITY("lArr", 8656),
729 ENTITY("lambda", 955),
730 ENTITY("lang", 9001),
731 ENTITY("laquo", 171),
732 ENTITY("larr", 8592),
733 ENTITY("lceil", 8968),
734 ENTITY("ldquo", 8220),
735 ENTITY("le", 8804),
736 ENTITY("lfloor", 8970),
737 ENTITY("lowast", 8727),
738 ENTITY("loz", 9674),
739 ENTITY("lrm", 8206),
740 ENTITY("lsaquo", 8249),
741 ENTITY("lsquo", 8216),
742 ENTITY("lt", 60),
743 ENTITY("macr", 175),
744 ENTITY("mdash", 8212),
745 ENTITY("micro", 181),
746 ENTITY("middot", 183),
747 ENTITY("minus", 8722),
748 ENTITY("mu", 956),
749 ENTITY("nabla", 8711),
750 ENTITY("nbsp", 160),
751 ENTITY("ndash", 8211),
752 ENTITY("ne", 8800),
753 ENTITY("ni", 8715),
754 ENTITY("not", 172),
755 ENTITY("notin", 8713),
756 ENTITY("nsub", 8836),
757 ENTITY("ntilde", 241),
758 ENTITY("nu", 957),
759 ENTITY("oacute", 243),
760 ENTITY("ocirc", 244),
761 ENTITY("oelig", 339),
762 ENTITY("ograve", 242),
763 ENTITY("oline", 8254),
764 ENTITY("omega", 969),
765 ENTITY("omicron", 959),
766 ENTITY("oplus", 8853),
767 ENTITY("or", 8744),
768 ENTITY("ordf", 170),
769 ENTITY("ordm", 186),
770 ENTITY("oslash", 248),
771 ENTITY("otilde", 245),
772 ENTITY("otimes", 8855),
773 ENTITY("ouml", 246),
774 ENTITY("para", 182),
775 ENTITY("part", 8706),
776 ENTITY("permil", 8240),
777 ENTITY("perp", 8869),
778 ENTITY("phi", 966),
779 ENTITY("pi", 960),
780 ENTITY("piv", 982),
781 ENTITY("plusmn", 177),
782 ENTITY("pound", 163),
783 ENTITY("prime", 8242),
784 ENTITY("prod", 8719),
785 ENTITY("prop", 8733),
786 ENTITY("psi", 968),
787 ENTITY("quot", 34),
788 ENTITY("rArr", 8658),
789 ENTITY("radic", 8730),
790 ENTITY("rang", 9002),
791 ENTITY("raquo", 187),
792 ENTITY("rarr", 8594),
793 ENTITY("rceil", 8969),
794 ENTITY("rdquo", 8221),
795 ENTITY("real", 8476),
796 ENTITY("reg", 174),
797 ENTITY("rfloor", 8971),
798 ENTITY("rho", 961),
799 ENTITY("rlm", 8207),
800 ENTITY("rsaquo", 8250),
801 ENTITY("rsquo", 8217),
802 ENTITY("sbquo", 8218),
803 ENTITY("scaron", 353),
804 ENTITY("sdot", 8901),
805 ENTITY("sect", 167),
806 ENTITY("shy", 173),
807 ENTITY("sigma", 963),
808 ENTITY("sigmaf", 962),
809 ENTITY("sim", 8764),
810 ENTITY("spades", 9824),
811 ENTITY("sub", 8834),
812 ENTITY("sube", 8838),
813 ENTITY("sum", 8721),
814 ENTITY("sup", 8835),
815 ENTITY("sup1", 185),
816 ENTITY("sup2", 178),
817 ENTITY("sup3", 179),
818 ENTITY("supe", 8839),
819 ENTITY("szlig", 223),
820 ENTITY("tau", 964),
821 ENTITY("there4", 8756),
822 ENTITY("theta", 952),
823 ENTITY("thetasym", 977),
824 ENTITY("thinsp", 8201),
825 ENTITY("thorn", 254),
826 ENTITY("tilde", 732),
827 ENTITY("times", 215),
828 ENTITY("trade", 8482),
829 ENTITY("uArr", 8657),
830 ENTITY("uacute", 250),
831 ENTITY("uarr", 8593),
832 ENTITY("ucirc", 251),
833 ENTITY("ugrave", 249),
834 ENTITY("uml", 168),
835 ENTITY("upsih", 978),
836 ENTITY("upsilon", 965),
837 ENTITY("uuml", 252),
838 ENTITY("weierp", 8472),
839 ENTITY("xi", 958),
840 ENTITY("yacute", 253),
841 ENTITY("yen", 165),
842 ENTITY("yuml", 255),
843 ENTITY("zeta", 950),
844 ENTITY("zwj", 8205),
845 ENTITY("zwnj", 8204),
846 {NULL, 0}};
847 #undef ENTITY
848 static size_t substitutions_cnt = 0;
849
850 if (substitutions_cnt == 0)
851 while (substitutions[substitutions_cnt].code != 0)
852 substitutions_cnt++;
853
854 wxHtmlEntityInfo *info = NULL;
855 #ifdef __WXWINCE__
856 // bsearch crashes under WinCE for some reason
857 size_t i;
858 for (i = 0; i < substitutions_cnt; i++)
859 {
860 if (entity == substitutions[i].name)
861 {
862 info = & substitutions[i];
863 break;
864 }
865 }
866 #else
867 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
868 substitutions_cnt,
869 sizeof(wxHtmlEntityInfo),
870 wxHtmlEntityCompare);
871 #endif
872 if (info)
873 code = info->code;
874 }
875
876 if (code == 0)
877 return 0;
878 else
879 return GetCharForCode(code);
880 }
881
882 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
883 const wxString& url) const
884 {
885 return m_FS ? m_FS->OpenFile(url) : NULL;
886
887 }
888
889
890 //-----------------------------------------------------------------------------
891 // wxHtmlParser::ExtractCharsetInformation
892 //-----------------------------------------------------------------------------
893
894 class wxMetaTagParser : public wxHtmlParser
895 {
896 public:
897 wxMetaTagParser() { }
898
899 wxObject* GetProduct() { return NULL; }
900
901 protected:
902 virtual void AddText(const wxString& WXUNUSED(txt)) {}
903
904 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
905 };
906
907 class wxMetaTagHandler : public wxHtmlTagHandler
908 {
909 public:
910 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
911 wxString GetSupportedTags() { return wxT("META,BODY"); }
912 bool HandleTag(const wxHtmlTag& tag);
913
914 private:
915 wxString *m_retval;
916
917 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
918 };
919
920 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
921 {
922 if (tag.GetName() == _T("BODY"))
923 {
924 m_Parser->StopParsing();
925 return false;
926 }
927
928 if (tag.HasParam(_T("HTTP-EQUIV")) &&
929 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
930 tag.HasParam(_T("CONTENT")))
931 {
932 wxString content = tag.GetParam(_T("CONTENT")).Lower();
933 if (content.Left(19) == _T("text/html; charset="))
934 {
935 *m_retval = content.Mid(19);
936 m_Parser->StopParsing();
937 }
938 }
939 return false;
940 }
941
942
943 /*static*/
944 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
945 {
946 wxString charset;
947 wxMetaTagParser *parser = new wxMetaTagParser();
948 if(parser)
949 {
950 parser->AddTagHandler(new wxMetaTagHandler(&charset));
951 parser->Parse(markup);
952 delete parser;
953 }
954 return charset;
955 }
956
957 /* static */
958 bool
959 wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
960 wxString::const_iterator end)
961 {
962 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
963
964 wxString::const_iterator p = start;
965
966 // comments begin with "<!--" in HTML 4.0
967 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
968 {
969 // not a comment at all
970 return false;
971 }
972
973 // skip the start of the comment tag in any case, if we don't find the
974 // closing tag we should ignore broken markup
975 start = p;
976
977 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
978 // comment delimiter and the closing tag character (section 3.2.4 of
979 // http://www.w3.org/TR/html401/)
980 int dashes = 0;
981 while ( ++p < end )
982 {
983 const wxChar c = *p;
984
985 if ( (c == wxT(' ') || c == wxT('\n') ||
986 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
987 {
988 // ignore white space before potential tag end
989 continue;
990 }
991
992 if ( c == wxT('>') && dashes >= 2 )
993 {
994 // found end of comment
995 start = p;
996 break;
997 }
998
999 if ( c == wxT('-') )
1000 dashes++;
1001 else
1002 dashes = 0;
1003 }
1004
1005 return true;
1006 }
1007
1008 #endif // wxUSE_HTML