]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
ignore bakefile-generated test.dsw too
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
211dfedd 32#include "wx/arrimpl.cpp"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
25271309
VZ
41const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
50 wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
51 int m_pos, m_lng;
52};
53
54WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
17a1ebd1 55WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
5526e819 56
481c879b 57class wxHtmlParserState
211dfedd 58{
481c879b 59public:
211dfedd
VS
60 wxHtmlTag *m_curTag;
61 wxHtmlTag *m_tags;
62 wxHtmlTextPieces *m_textPieces;
63 int m_curTextPiece;
64 wxString m_source;
65 wxHtmlParserState *m_nextState;
66};
5526e819
VS
67
68//-----------------------------------------------------------------------------
69// wxHtmlParser
70//-----------------------------------------------------------------------------
71
72IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
73
04dbb646 74wxHtmlParser::wxHtmlParser()
211dfedd 75 : wxObject(), m_HandlersHash(wxKEY_STRING),
daa616fc
VS
76 m_FS(NULL), m_HandlersStack(NULL)
77{
78 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
79 m_Tags = NULL;
80 m_CurTag = NULL;
81 m_TextPieces = NULL;
82 m_CurTextPiece = 0;
83 m_SavedStates = NULL;
daa616fc
VS
84}
85
86wxHtmlParser::~wxHtmlParser()
87{
0beefa20
VS
88 while (RestoreState()) {}
89 DestroyDOMTree();
222ed1d6
MB
90
91 if (m_HandlersStack)
92 {
93 wxList& tmp = *m_HandlersStack;
94 wxList::iterator it, en;
95 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
96 delete (wxHashTable*)*it;
97 tmp.clear();
98 }
daa616fc
VS
99 delete m_HandlersStack;
100 m_HandlersHash.Clear();
222ed1d6 101 WX_CLEAR_LIST(wxList, m_HandlersList);
daa616fc
VS
102 delete m_entitiesParser;
103}
5526e819
VS
104
105wxObject* wxHtmlParser::Parse(const wxString& source)
106{
5526e819
VS
107 InitParser(source);
108 DoParsing();
2b5f62a0 109 wxObject *result = GetProduct();
5526e819
VS
110 DoneParser();
111 return result;
112}
113
5526e819
VS
114void wxHtmlParser::InitParser(const wxString& source)
115{
1309ba6c 116 SetSource(source);
d1da8872 117 m_stopParsing = false;
5526e819 118}
1309ba6c 119
5526e819
VS
120void wxHtmlParser::DoneParser()
121{
211dfedd 122 DestroyDOMTree();
5526e819
VS
123}
124
1309ba6c
VS
125void wxHtmlParser::SetSource(const wxString& src)
126{
211dfedd 127 DestroyDOMTree();
1309ba6c 128 m_Source = src;
211dfedd
VS
129 CreateDOMTree();
130 m_CurTag = NULL;
131 m_CurTextPiece = 0;
1309ba6c 132}
5526e819 133
211dfedd 134void wxHtmlParser::CreateDOMTree()
5526e819 135{
211dfedd
VS
136 wxHtmlTagsCache cache(m_Source);
137 m_TextPieces = new wxHtmlTextPieces;
93763ad5 138 CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
211dfedd
VS
139 m_CurTextPiece = 0;
140}
5526e819 141
7c6cd4a8
VS
142extern bool wxIsCDATAElement(const wxChar *tag);
143
211dfedd
VS
144void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
145 int begin_pos, int end_pos,
146 wxHtmlTagsCache *cache)
147{
148 if (end_pos <= begin_pos) return;
5526e819 149
211dfedd
VS
150 wxChar c;
151 int i = begin_pos;
152 int textBeginning = begin_pos;
d699f48b 153
7c6cd4a8
VS
154 // If the tag contains CDATA text, we include the text between beginning
155 // and ending tag verbosely. Setting i=end_pos will skip to the very
156 // end of this function where text piece is added, bypassing any child
157 // tags parsing (CDATA element can't have child elements by definition):
158 if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
159 {
160 i = end_pos;
161 }
162
04dbb646 163 while (i < end_pos)
4f9297b0 164 {
211dfedd 165 c = m_Source.GetChar(i);
5526e819 166
211dfedd
VS
167 if (c == wxT('<'))
168 {
169 // add text to m_TextPieces:
170 if (i - textBeginning > 0)
171 m_TextPieces->Add(
172 wxHtmlTextPiece(textBeginning, i - textBeginning));
173
174 // if it is a comment, skip it:
4609ee2e
VZ
175 wxString::const_iterator iter = m_Source.begin() + i;
176 if ( SkipCommentTag(iter, m_Source.end()) )
211dfedd 177 {
4609ee2e
VZ
178 textBeginning =
179 i = iter - m_Source.begin() + 1; // skip closing '>' too
211dfedd 180 }
d699f48b 181
211dfedd
VS
182 // add another tag to the tree:
183 else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
d1da8872 184 {
211dfedd 185 wxHtmlTag *chd;
d699f48b
KB
186 if (cur)
187 chd = new wxHtmlTag(cur, m_Source,
211dfedd 188 i, end_pos, cache, m_entitiesParser);
d699f48b 189 else
211dfedd
VS
190 {
191 chd = new wxHtmlTag(NULL, m_Source,
192 i, end_pos, cache, m_entitiesParser);
d699f48b 193 if (!m_Tags)
211dfedd 194 {
d699f48b 195 // if this is the first tag to be created make the root
211dfedd
VS
196 // m_Tags point to it:
197 m_Tags = chd;
198 }
199 else
200 {
d699f48b 201 // if there is already a root tag add this tag as
211dfedd
VS
202 // the last sibling:
203 chd->m_Prev = m_Tags->GetLastSibling();
204 chd->m_Prev->m_Next = chd;
205 }
206 }
207
208 if (chd->HasEnding())
209 {
210 CreateDOMSubTree(chd,
d699f48b 211 chd->GetBeginPos(), chd->GetEndPos1(),
211dfedd
VS
212 cache);
213 i = chd->GetEndPos2();
214 }
215 else
216 i = chd->GetBeginPos();
d1da8872 217
211dfedd
VS
218 textBeginning = i;
219 }
220
221 // ... or skip ending tag:
d699f48b 222 else
211dfedd
VS
223 {
224 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
225 textBeginning = i+1;
5526e819 226 }
5526e819 227 }
211dfedd 228 else i++;
5526e819
VS
229 }
230
211dfedd
VS
231 // add remaining text to m_TextPieces:
232 if (end_pos - textBeginning > 0)
233 m_TextPieces->Add(
234 wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
235}
236
237void wxHtmlParser::DestroyDOMTree()
238{
239 wxHtmlTag *t1, *t2;
240 t1 = m_Tags;
241 while (t1)
242 {
243 t2 = t1->GetNextSibling();
244 delete t1;
245 t1 = t2;
246 }
247 m_Tags = m_CurTag = NULL;
248
249 delete m_TextPieces;
250 m_TextPieces = NULL;
251}
252
d699f48b 253void wxHtmlParser::DoParsing()
211dfedd
VS
254{
255 m_CurTag = m_Tags;
256 m_CurTextPiece = 0;
93763ad5 257 DoParsing(0, m_Source.length());
211dfedd
VS
258}
259
260void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
261{
262 if (end_pos <= begin_pos) return;
d699f48b 263
211dfedd
VS
264 wxHtmlTextPieces& pieces = *m_TextPieces;
265 size_t piecesCnt = pieces.GetCount();
d699f48b 266
211dfedd
VS
267 while (begin_pos < end_pos)
268 {
269 while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
270 m_CurTag = m_CurTag->GetNextTag();
d699f48b 271 while (m_CurTextPiece < piecesCnt &&
211dfedd
VS
272 pieces[m_CurTextPiece].m_pos < begin_pos)
273 m_CurTextPiece++;
274
d699f48b
KB
275 if (m_CurTextPiece < piecesCnt &&
276 (!m_CurTag ||
211dfedd
VS
277 pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
278 {
279 // Add text:
f23e92e7
VS
280 AddText(GetEntitiesParser()->Parse(
281 m_Source.Mid(pieces[m_CurTextPiece].m_pos,
282 pieces[m_CurTextPiece].m_lng)));
d699f48b 283 begin_pos = pieces[m_CurTextPiece].m_pos +
211dfedd
VS
284 pieces[m_CurTextPiece].m_lng;
285 m_CurTextPiece++;
286 }
287 else if (m_CurTag)
288 {
902725ee
WS
289 if (m_CurTag->HasEnding())
290 begin_pos = m_CurTag->GetEndPos2();
291 else
292 begin_pos = m_CurTag->GetBeginPos();
211dfedd
VS
293 wxHtmlTag *t = m_CurTag;
294 m_CurTag = m_CurTag->GetNextTag();
295 AddTag(*t);
2b5f62a0
VZ
296 if (m_stopParsing)
297 return;
211dfedd
VS
298 }
299 else break;
5526e819
VS
300 }
301}
302
5526e819
VS
303void wxHtmlParser::AddTag(const wxHtmlTag& tag)
304{
305 wxHtmlTagHandler *h;
d1da8872 306 bool inner = false;
5526e819
VS
307
308 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
309 if (h)
2b5f62a0 310 {
4f9297b0 311 inner = h->HandleTag(tag);
2b5f62a0
VZ
312 if (m_stopParsing)
313 return;
314 }
04dbb646 315 if (!inner)
4f9297b0 316 {
5526e819
VS
317 if (tag.HasEnding())
318 DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
319 }
320}
321
5526e819
VS
322void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
323{
4f9297b0 324 wxString s(handler->GetSupportedTags());
211dfedd 325 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 326
5526e819 327 while (tokenizer.HasMoreTokens())
470252df 328 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
5526e819
VS
329
330 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
331 m_HandlersList.Append(handler);
332
4f9297b0 333 handler->SetParser(this);
5526e819
VS
334}
335
fbfb8bcc 336void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 337{
211dfedd 338 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
339 wxString key;
340
04dbb646 341 if (m_HandlersStack == NULL)
4f9297b0 342 {
a7a4d01b 343 m_HandlersStack = new wxList;
a7a4d01b
VS
344 }
345
222ed1d6 346 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
a7a4d01b 347
04dbb646 348 while (tokenizer.HasMoreTokens())
4f9297b0 349 {
470252df 350 key = tokenizer.GetNextToken();
a7a4d01b
VS
351 m_HandlersHash.Delete(key);
352 m_HandlersHash.Put(key, handler);
353 }
354}
355
a7a4d01b
VS
356void wxHtmlParser::PopTagHandler()
357{
222ed1d6 358 wxList::compatibility_iterator first;
04dbb646 359
dfa4a244 360 if ( !m_HandlersStack ||
28b4db7f
VZ
361#if wxUSE_STL
362 !(first = m_HandlersStack->GetFirst())
363#else // !wxUSE_STL
364 ((first = m_HandlersStack->GetFirst()) == NULL)
365#endif // wxUSE_STL/!wxUSE_STL
366 )
f3c82859
VS
367 {
368 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
369 return;
370 }
4f9297b0 371 m_HandlersHash = *((wxHashTable*) first->GetData());
222ed1d6
MB
372 delete (wxHashTable*) first->GetData();
373 m_HandlersStack->Erase(first);
a7a4d01b
VS
374}
375
211dfedd
VS
376void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
377{
378 wxHtmlParserState *s = new wxHtmlParserState;
379
380 s->m_curTag = m_CurTag;
381 s->m_tags = m_Tags;
382 s->m_textPieces = m_TextPieces;
383 s->m_curTextPiece = m_CurTextPiece;
384 s->m_source = m_Source;
385
386 s->m_nextState = m_SavedStates;
387 m_SavedStates = s;
388
389 m_CurTag = NULL;
390 m_Tags = NULL;
391 m_TextPieces = NULL;
392 m_CurTextPiece = 0;
393 m_Source = wxEmptyString;
d699f48b 394
211dfedd
VS
395 SetSource(src);
396}
397
398bool wxHtmlParser::RestoreState()
399{
d1da8872 400 if (!m_SavedStates) return false;
d699f48b 401
0beefa20
VS
402 DestroyDOMTree();
403
211dfedd
VS
404 wxHtmlParserState *s = m_SavedStates;
405 m_SavedStates = s->m_nextState;
d699f48b 406
211dfedd
VS
407 m_CurTag = s->m_curTag;
408 m_Tags = s->m_tags;
409 m_TextPieces = s->m_textPieces;
410 m_CurTextPiece = s->m_curTextPiece;
411 m_Source = s->m_source;
d699f48b 412
211dfedd 413 delete s;
d1da8872 414 return true;
211dfedd
VS
415}
416
e7feeafa
VS
417wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
418{
419 return GetSource()->Mid(tag.GetBeginPos(),
420 tag.GetEndPos1() - tag.GetBeginPos());
421}
422
5526e819
VS
423//-----------------------------------------------------------------------------
424// wxHtmlTagHandler
425//-----------------------------------------------------------------------------
426
427IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 428
e7feeafa
VS
429void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
430{
431 // It is safe to temporarily change the source being parsed,
432 // provided we restore the state back after parsing
433 m_Parser->SetSourceAndSaveState(source);
434 m_Parser->DoParsing();
435 m_Parser->RestoreState();
436}
437
daa616fc
VS
438
439//-----------------------------------------------------------------------------
440// wxHtmlEntitiesParser
441//-----------------------------------------------------------------------------
442
443IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
444
445wxHtmlEntitiesParser::wxHtmlEntitiesParser()
446#if wxUSE_WCHAR_T && !wxUSE_UNICODE
447 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 448#endif
daa616fc
VS
449{
450}
451
452wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
453{
5438a566 454#if wxUSE_WCHAR_T && !wxUSE_UNICODE
daa616fc 455 delete m_conv;
5438a566 456#endif
daa616fc 457}
5526e819 458
daa616fc
VS
459void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
460{
461#if wxUSE_WCHAR_T && !wxUSE_UNICODE
2b5f62a0
VZ
462 if (encoding == m_encoding)
463 return;
464
daa616fc 465 delete m_conv;
2b5f62a0 466
daa616fc 467 m_encoding = encoding;
2b5f62a0
VZ
468 if (m_encoding == wxFONTENCODING_SYSTEM)
469 m_conv = NULL;
470 else
daa616fc 471 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
d699f48b
KB
472#else
473 (void) encoding;
daa616fc
VS
474#endif
475}
476
477wxString wxHtmlEntitiesParser::Parse(const wxString& input)
478{
479 const wxChar *c, *last;
480 const wxChar *in_str = input.c_str();
481 wxString output;
d1da8872 482
470252df 483 output.reserve(input.length());
04dbb646 484
daa616fc
VS
485 for (c = in_str, last = in_str; *c != wxT('\0'); c++)
486 {
487 if (*c == wxT('&'))
488 {
489 if (c - last > 0)
490 output.append(last, c - last);
9e2bd135
VZ
491 if ( *++c == wxT('\0') )
492 break;
d1da8872 493
daa616fc
VS
494 wxString entity;
495 const wxChar *ent_s = c;
470252df 496 wxChar entity_char;
d1da8872 497
daa616fc
VS
498 for (; (*c >= wxT('a') && *c <= wxT('z')) ||
499 (*c >= wxT('A') && *c <= wxT('Z')) ||
500 (*c >= wxT('0') && *c <= wxT('9')) ||
501 *c == wxT('_') || *c == wxT('#'); c++) {}
502 entity.append(ent_s, c - ent_s);
211dfedd
VS
503 if (*c != wxT(';')) c--;
504 last = c+1;
470252df
VS
505 entity_char = GetEntityChar(entity);
506 if (entity_char)
507 output << entity_char;
508 else
509 {
510 output.append(ent_s-1, c-ent_s+2);
25271309
VZ
511 wxLogTrace(wxTRACE_HTML_DEBUG,
512 wxT("Unrecognized HTML entity: '%s'"),
513 entity.c_str());
470252df 514 }
daa616fc
VS
515 }
516 }
517 if (*last != wxT('\0'))
518 output.append(last);
519 return output;
520}
521
2b5f62a0 522#if !wxUSE_UNICODE
daa616fc
VS
523wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code)
524{
2b5f62a0 525#if wxUSE_WCHAR_T
daa616fc
VS
526 char buf[2];
527 wchar_t wbuf[2];
528 wbuf[0] = (wchar_t)code;
529 wbuf[1] = 0;
530 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 531 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
532 return '?';
533 return buf[0];
534#else
535 return (code < 256) ? (wxChar)code : '?';
536#endif
537}
2b5f62a0 538#endif
daa616fc 539
19817fd3
VS
540struct wxHtmlEntityInfo
541{
542 const wxStringCharType *name;
543 unsigned code;
544};
545
546extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
547{
548#if wxUSE_UNICODE_UTF8
549 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
550#else
551 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
552#endif
553}
554
daa616fc
VS
555wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity)
556{
557 unsigned code = 0;
04dbb646 558
daa616fc
VS
559 if (entity[0] == wxT('#'))
560 {
561 const wxChar *ent_s = entity.c_str();
562 const wxChar *format;
04dbb646 563
daa616fc
VS
564 if (ent_s[1] == wxT('x') || ent_s[1] == wxT('X'))
565 {
566 format = wxT("%x");
567 ent_s++;
568 }
569 else
570 format = wxT("%u");
571 ent_s++;
572
573 if (wxSscanf(ent_s, format, &code) != 1)
574 code = 0;
575 }
576 else
577 {
19817fd3
VS
578 // store the literals in wx's internal representation (either char*
579 // in UTF-8 or wchar_t*) for best performance:
580 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
581
daa616fc 582 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
583 ENTITY("AElig", 198),
584 ENTITY("Aacute", 193),
585 ENTITY("Acirc", 194),
586 ENTITY("Agrave", 192),
587 ENTITY("Alpha", 913),
588 ENTITY("Aring", 197),
589 ENTITY("Atilde", 195),
590 ENTITY("Auml", 196),
591 ENTITY("Beta", 914),
592 ENTITY("Ccedil", 199),
593 ENTITY("Chi", 935),
594 ENTITY("Dagger", 8225),
595 ENTITY("Delta", 916),
596 ENTITY("ETH", 208),
597 ENTITY("Eacute", 201),
598 ENTITY("Ecirc", 202),
599 ENTITY("Egrave", 200),
600 ENTITY("Epsilon", 917),
601 ENTITY("Eta", 919),
602 ENTITY("Euml", 203),
603 ENTITY("Gamma", 915),
604 ENTITY("Iacute", 205),
605 ENTITY("Icirc", 206),
606 ENTITY("Igrave", 204),
607 ENTITY("Iota", 921),
608 ENTITY("Iuml", 207),
609 ENTITY("Kappa", 922),
610 ENTITY("Lambda", 923),
611 ENTITY("Mu", 924),
612 ENTITY("Ntilde", 209),
613 ENTITY("Nu", 925),
614 ENTITY("OElig", 338),
615 ENTITY("Oacute", 211),
616 ENTITY("Ocirc", 212),
617 ENTITY("Ograve", 210),
618 ENTITY("Omega", 937),
619 ENTITY("Omicron", 927),
620 ENTITY("Oslash", 216),
621 ENTITY("Otilde", 213),
622 ENTITY("Ouml", 214),
623 ENTITY("Phi", 934),
624 ENTITY("Pi", 928),
625 ENTITY("Prime", 8243),
626 ENTITY("Psi", 936),
627 ENTITY("Rho", 929),
628 ENTITY("Scaron", 352),
629 ENTITY("Sigma", 931),
630 ENTITY("THORN", 222),
631 ENTITY("Tau", 932),
632 ENTITY("Theta", 920),
633 ENTITY("Uacute", 218),
634 ENTITY("Ucirc", 219),
635 ENTITY("Ugrave", 217),
636 ENTITY("Upsilon", 933),
637 ENTITY("Uuml", 220),
638 ENTITY("Xi", 926),
639 ENTITY("Yacute", 221),
640 ENTITY("Yuml", 376),
641 ENTITY("Zeta", 918),
642 ENTITY("aacute", 225),
643 ENTITY("acirc", 226),
644 ENTITY("acute", 180),
645 ENTITY("aelig", 230),
646 ENTITY("agrave", 224),
647 ENTITY("alefsym", 8501),
648 ENTITY("alpha", 945),
649 ENTITY("amp", 38),
650 ENTITY("and", 8743),
651 ENTITY("ang", 8736),
652 ENTITY("aring", 229),
653 ENTITY("asymp", 8776),
654 ENTITY("atilde", 227),
655 ENTITY("auml", 228),
656 ENTITY("bdquo", 8222),
657 ENTITY("beta", 946),
658 ENTITY("brvbar", 166),
659 ENTITY("bull", 8226),
660 ENTITY("cap", 8745),
661 ENTITY("ccedil", 231),
662 ENTITY("cedil", 184),
663 ENTITY("cent", 162),
664 ENTITY("chi", 967),
665 ENTITY("circ", 710),
666 ENTITY("clubs", 9827),
667 ENTITY("cong", 8773),
668 ENTITY("copy", 169),
669 ENTITY("crarr", 8629),
670 ENTITY("cup", 8746),
671 ENTITY("curren", 164),
672 ENTITY("dArr", 8659),
673 ENTITY("dagger", 8224),
674 ENTITY("darr", 8595),
675 ENTITY("deg", 176),
676 ENTITY("delta", 948),
677 ENTITY("diams", 9830),
678 ENTITY("divide", 247),
679 ENTITY("eacute", 233),
680 ENTITY("ecirc", 234),
681 ENTITY("egrave", 232),
682 ENTITY("empty", 8709),
683 ENTITY("emsp", 8195),
684 ENTITY("ensp", 8194),
685 ENTITY("epsilon", 949),
686 ENTITY("equiv", 8801),
687 ENTITY("eta", 951),
688 ENTITY("eth", 240),
689 ENTITY("euml", 235),
690 ENTITY("euro", 8364),
691 ENTITY("exist", 8707),
692 ENTITY("fnof", 402),
693 ENTITY("forall", 8704),
694 ENTITY("frac12", 189),
695 ENTITY("frac14", 188),
696 ENTITY("frac34", 190),
697 ENTITY("frasl", 8260),
698 ENTITY("gamma", 947),
699 ENTITY("ge", 8805),
700 ENTITY("gt", 62),
701 ENTITY("hArr", 8660),
702 ENTITY("harr", 8596),
703 ENTITY("hearts", 9829),
704 ENTITY("hellip", 8230),
705 ENTITY("iacute", 237),
706 ENTITY("icirc", 238),
707 ENTITY("iexcl", 161),
708 ENTITY("igrave", 236),
709 ENTITY("image", 8465),
710 ENTITY("infin", 8734),
711 ENTITY("int", 8747),
712 ENTITY("iota", 953),
713 ENTITY("iquest", 191),
714 ENTITY("isin", 8712),
715 ENTITY("iuml", 239),
716 ENTITY("kappa", 954),
717 ENTITY("lArr", 8656),
718 ENTITY("lambda", 955),
719 ENTITY("lang", 9001),
720 ENTITY("laquo", 171),
721 ENTITY("larr", 8592),
722 ENTITY("lceil", 8968),
723 ENTITY("ldquo", 8220),
724 ENTITY("le", 8804),
725 ENTITY("lfloor", 8970),
726 ENTITY("lowast", 8727),
727 ENTITY("loz", 9674),
728 ENTITY("lrm", 8206),
729 ENTITY("lsaquo", 8249),
730 ENTITY("lsquo", 8216),
731 ENTITY("lt", 60),
732 ENTITY("macr", 175),
733 ENTITY("mdash", 8212),
734 ENTITY("micro", 181),
735 ENTITY("middot", 183),
736 ENTITY("minus", 8722),
737 ENTITY("mu", 956),
738 ENTITY("nabla", 8711),
739 ENTITY("nbsp", 160),
740 ENTITY("ndash", 8211),
741 ENTITY("ne", 8800),
742 ENTITY("ni", 8715),
743 ENTITY("not", 172),
744 ENTITY("notin", 8713),
745 ENTITY("nsub", 8836),
746 ENTITY("ntilde", 241),
747 ENTITY("nu", 957),
748 ENTITY("oacute", 243),
749 ENTITY("ocirc", 244),
750 ENTITY("oelig", 339),
751 ENTITY("ograve", 242),
752 ENTITY("oline", 8254),
753 ENTITY("omega", 969),
754 ENTITY("omicron", 959),
755 ENTITY("oplus", 8853),
756 ENTITY("or", 8744),
757 ENTITY("ordf", 170),
758 ENTITY("ordm", 186),
759 ENTITY("oslash", 248),
760 ENTITY("otilde", 245),
761 ENTITY("otimes", 8855),
762 ENTITY("ouml", 246),
763 ENTITY("para", 182),
764 ENTITY("part", 8706),
765 ENTITY("permil", 8240),
766 ENTITY("perp", 8869),
767 ENTITY("phi", 966),
768 ENTITY("pi", 960),
769 ENTITY("piv", 982),
770 ENTITY("plusmn", 177),
771 ENTITY("pound", 163),
772 ENTITY("prime", 8242),
773 ENTITY("prod", 8719),
774 ENTITY("prop", 8733),
775 ENTITY("psi", 968),
776 ENTITY("quot", 34),
777 ENTITY("rArr", 8658),
778 ENTITY("radic", 8730),
779 ENTITY("rang", 9002),
780 ENTITY("raquo", 187),
781 ENTITY("rarr", 8594),
782 ENTITY("rceil", 8969),
783 ENTITY("rdquo", 8221),
784 ENTITY("real", 8476),
785 ENTITY("reg", 174),
786 ENTITY("rfloor", 8971),
787 ENTITY("rho", 961),
788 ENTITY("rlm", 8207),
789 ENTITY("rsaquo", 8250),
790 ENTITY("rsquo", 8217),
791 ENTITY("sbquo", 8218),
792 ENTITY("scaron", 353),
793 ENTITY("sdot", 8901),
794 ENTITY("sect", 167),
795 ENTITY("shy", 173),
796 ENTITY("sigma", 963),
797 ENTITY("sigmaf", 962),
798 ENTITY("sim", 8764),
799 ENTITY("spades", 9824),
800 ENTITY("sub", 8834),
801 ENTITY("sube", 8838),
802 ENTITY("sum", 8721),
803 ENTITY("sup", 8835),
804 ENTITY("sup1", 185),
805 ENTITY("sup2", 178),
806 ENTITY("sup3", 179),
807 ENTITY("supe", 8839),
808 ENTITY("szlig", 223),
809 ENTITY("tau", 964),
810 ENTITY("there4", 8756),
811 ENTITY("theta", 952),
812 ENTITY("thetasym", 977),
813 ENTITY("thinsp", 8201),
814 ENTITY("thorn", 254),
815 ENTITY("tilde", 732),
816 ENTITY("times", 215),
817 ENTITY("trade", 8482),
818 ENTITY("uArr", 8657),
819 ENTITY("uacute", 250),
820 ENTITY("uarr", 8593),
821 ENTITY("ucirc", 251),
822 ENTITY("ugrave", 249),
823 ENTITY("uml", 168),
824 ENTITY("upsih", 978),
825 ENTITY("upsilon", 965),
826 ENTITY("uuml", 252),
827 ENTITY("weierp", 8472),
828 ENTITY("xi", 958),
829 ENTITY("yacute", 253),
830 ENTITY("yen", 165),
831 ENTITY("yuml", 255),
832 ENTITY("zeta", 950),
833 ENTITY("zwj", 8205),
834 ENTITY("zwnj", 8204),
daa616fc 835 {NULL, 0}};
19817fd3 836 #undef ENTITY
daa616fc 837 static size_t substitutions_cnt = 0;
04dbb646 838
daa616fc
VS
839 if (substitutions_cnt == 0)
840 while (substitutions[substitutions_cnt].code != 0)
841 substitutions_cnt++;
842
3919d530
JS
843 wxHtmlEntityInfo *info = NULL;
844#ifdef __WXWINCE__
845 // bsearch crashes under WinCE for some reason
846 size_t i;
847 for (i = 0; i < substitutions_cnt; i++)
848 {
849 if (entity == substitutions[i].name)
850 {
851 info = & substitutions[i];
852 break;
853 }
854 }
855#else
19817fd3 856 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
857 substitutions_cnt,
858 sizeof(wxHtmlEntityInfo),
90350682 859 wxHtmlEntityCompare);
3919d530 860#endif
daa616fc
VS
861 if (info)
862 code = info->code;
863 }
04dbb646 864
daa616fc 865 if (code == 0)
470252df 866 return 0;
daa616fc
VS
867 else
868 return GetCharForCode(code);
869}
870
d1da8872 871wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
6cc4e6b8
VS
872 const wxString& url) const
873{
e02ecf7c 874 return m_FS ? m_FS->OpenFile(url) : NULL;
d1da8872 875
6cc4e6b8
VS
876}
877
2b5f62a0
VZ
878
879//-----------------------------------------------------------------------------
880// wxHtmlParser::ExtractCharsetInformation
881//-----------------------------------------------------------------------------
882
883class wxMetaTagParser : public wxHtmlParser
884{
885public:
2eb10e2a
VZ
886 wxMetaTagParser() { }
887
2b5f62a0 888 wxObject* GetProduct() { return NULL; }
2eb10e2a 889
2b5f62a0 890protected:
5bce3e6f 891 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a
VZ
892
893 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
2b5f62a0
VZ
894};
895
896class wxMetaTagHandler : public wxHtmlTagHandler
897{
898public:
899 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
900 wxString GetSupportedTags() { return wxT("META,BODY"); }
901 bool HandleTag(const wxHtmlTag& tag);
902
903private:
904 wxString *m_retval;
2eb10e2a
VZ
905
906 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
2b5f62a0
VZ
907};
908
909bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
910{
911 if (tag.GetName() == _T("BODY"))
912 {
913 m_Parser->StopParsing();
d1da8872 914 return false;
2b5f62a0
VZ
915 }
916
917 if (tag.HasParam(_T("HTTP-EQUIV")) &&
13fd234c 918 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
2b5f62a0
VZ
919 tag.HasParam(_T("CONTENT")))
920 {
5af11a94 921 wxString content = tag.GetParam(_T("CONTENT")).Lower();
2b5f62a0
VZ
922 if (content.Left(19) == _T("text/html; charset="))
923 {
924 *m_retval = content.Mid(19);
925 m_Parser->StopParsing();
926 }
927 }
d1da8872 928 return false;
2b5f62a0
VZ
929}
930
931
932/*static*/
933wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
934{
935 wxString charset;
e7274ba2
WS
936 wxMetaTagParser *parser = new wxMetaTagParser();
937 if(parser)
938 {
939 parser->AddTagHandler(new wxMetaTagHandler(&charset));
940 parser->Parse(markup);
941 delete parser;
942 }
2b5f62a0
VZ
943 return charset;
944}
945
4609ee2e
VZ
946/* static */
947bool
948wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
949 wxString::const_iterator end)
950{
951 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
952
953 wxString::const_iterator p = start;
954
955 // comments begin with "<!--" in HTML 4.0
956 if ( end - p < 3 || *++p != '!' || *++p != '-' || *++p != '-' )
957 {
958 // not a comment at all
959 return false;
960 }
961
962 // skip the start of the comment tag in any case, if we don't find the
963 // closing tag we should ignore broken markup
964 start = p;
965
966 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
967 // comment delimiter and the closing tag character (section 3.2.4 of
968 // http://www.w3.org/TR/html401/)
969 int dashes = 0;
970 while ( ++p < end )
971 {
972 const wxChar c = *p;
973
974 if ( (c == wxT(' ') || c == wxT('\n') ||
975 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
976 {
977 // ignore white space before potential tag end
978 continue;
979 }
980
981 if ( c == wxT('>') && dashes >= 2 )
982 {
983 // found end of comment
984 start = p;
985 break;
986 }
987
988 if ( c == wxT('-') )
989 dashes++;
990 else
991 dashes = 0;
992 }
993
994 return true;
995}
996
997#endif // wxUSE_HTML