]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
remove gtk_window_set_type_hint from GetTooltipColors, it's not necessary and GDK_WIN...
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
3c47c047 32#include "wx/vector.h"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
25271309
VZ
41const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
3c47c047 50 wxHtmlTextPiece() {}
b1a3a964
VS
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
211dfedd
VS
55};
56
3c47c047
VS
57// NB: this is an empty class and not typedef because of forward declaration
58class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59{
60};
5526e819 61
481c879b 62class wxHtmlParserState
211dfedd 63{
481c879b 64public:
211dfedd
VS
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
b1a3a964 69 const wxString *m_source;
211dfedd
VS
70 wxHtmlParserState *m_nextState;
71};
5526e819
VS
72
73//-----------------------------------------------------------------------------
74// wxHtmlParser
75//-----------------------------------------------------------------------------
76
77IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
04dbb646 79wxHtmlParser::wxHtmlParser()
211dfedd 80 : wxObject(), m_HandlersHash(wxKEY_STRING),
daa616fc
VS
81 m_FS(NULL), m_HandlersStack(NULL)
82{
b1a3a964 83 m_Source = NULL;
daa616fc 84 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
daa616fc
VS
90}
91
92wxHtmlParser::~wxHtmlParser()
93{
0beefa20
VS
94 while (RestoreState()) {}
95 DestroyDOMTree();
222ed1d6
MB
96
97 if (m_HandlersStack)
98 {
99 wxList& tmp = *m_HandlersStack;
100 wxList::iterator it, en;
101 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
102 delete (wxHashTable*)*it;
103 tmp.clear();
104 }
daa616fc
VS
105 delete m_HandlersStack;
106 m_HandlersHash.Clear();
222ed1d6 107 WX_CLEAR_LIST(wxList, m_HandlersList);
daa616fc 108 delete m_entitiesParser;
b1a3a964 109 delete m_Source;
daa616fc 110}
5526e819
VS
111
112wxObject* wxHtmlParser::Parse(const wxString& source)
113{
5526e819
VS
114 InitParser(source);
115 DoParsing();
2b5f62a0 116 wxObject *result = GetProduct();
5526e819
VS
117 DoneParser();
118 return result;
119}
120
5526e819
VS
121void wxHtmlParser::InitParser(const wxString& source)
122{
1309ba6c 123 SetSource(source);
d1da8872 124 m_stopParsing = false;
5526e819 125}
1309ba6c 126
5526e819
VS
127void wxHtmlParser::DoneParser()
128{
211dfedd 129 DestroyDOMTree();
5526e819
VS
130}
131
1309ba6c
VS
132void wxHtmlParser::SetSource(const wxString& src)
133{
211dfedd 134 DestroyDOMTree();
b1a3a964
VS
135 // NB: this is allocated on heap because wxHtmlTag keeps a pointer to
136 // this string if WXWIN_COMPATIBILITY_2_8
137 delete m_Source;
138 m_Source = new wxString(src);
211dfedd
VS
139 CreateDOMTree();
140 m_CurTag = NULL;
141 m_CurTextPiece = 0;
1309ba6c 142}
5526e819 143
211dfedd 144void wxHtmlParser::CreateDOMTree()
5526e819 145{
b1a3a964 146 wxHtmlTagsCache cache(*m_Source);
211dfedd 147 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 148 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
149 m_CurTextPiece = 0;
150}
5526e819 151
b1a3a964 152extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 153
211dfedd 154void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
155 const wxString::const_iterator& begin_pos,
156 const wxString::const_iterator& end_pos,
211dfedd
VS
157 wxHtmlTagsCache *cache)
158{
b1a3a964
VS
159 if (end_pos <= begin_pos)
160 return;
5526e819 161
211dfedd 162 wxChar c;
b1a3a964
VS
163 wxString::const_iterator i = begin_pos;
164 wxString::const_iterator textBeginning = begin_pos;
d699f48b 165
7c6cd4a8
VS
166 // If the tag contains CDATA text, we include the text between beginning
167 // and ending tag verbosely. Setting i=end_pos will skip to the very
168 // end of this function where text piece is added, bypassing any child
169 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 170 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
171 {
172 i = end_pos;
173 }
174
04dbb646 175 while (i < end_pos)
4f9297b0 176 {
b1a3a964 177 c = *i;
5526e819 178
211dfedd
VS
179 if (c == wxT('<'))
180 {
181 // add text to m_TextPieces:
b1a3a964
VS
182 if (i > textBeginning)
183 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
184
185 // if it is a comment, skip it:
b1a3a964 186 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 187 {
b1a3a964 188 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 189 }
d699f48b 190
211dfedd 191 // add another tag to the tree:
b1a3a964 192 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 193 {
211dfedd 194 wxHtmlTag *chd;
d699f48b
KB
195 if (cur)
196 chd = new wxHtmlTag(cur, m_Source,
211dfedd 197 i, end_pos, cache, m_entitiesParser);
d699f48b 198 else
211dfedd
VS
199 {
200 chd = new wxHtmlTag(NULL, m_Source,
201 i, end_pos, cache, m_entitiesParser);
d699f48b 202 if (!m_Tags)
211dfedd 203 {
d699f48b 204 // if this is the first tag to be created make the root
211dfedd
VS
205 // m_Tags point to it:
206 m_Tags = chd;
207 }
208 else
209 {
d699f48b 210 // if there is already a root tag add this tag as
211dfedd
VS
211 // the last sibling:
212 chd->m_Prev = m_Tags->GetLastSibling();
213 chd->m_Prev->m_Next = chd;
214 }
215 }
216
217 if (chd->HasEnding())
218 {
219 CreateDOMSubTree(chd,
b1a3a964 220 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 221 cache);
b1a3a964 222 i = chd->GetEndIter2();
211dfedd
VS
223 }
224 else
b1a3a964 225 i = chd->GetBeginIter();
d1da8872 226
211dfedd
VS
227 textBeginning = i;
228 }
229
230 // ... or skip ending tag:
d699f48b 231 else
211dfedd 232 {
b1a3a964 233 while (i < end_pos && *i != wxT('>')) ++i;
211dfedd 234 textBeginning = i+1;
5526e819 235 }
5526e819 236 }
b1a3a964 237 else ++i;
5526e819
VS
238 }
239
211dfedd 240 // add remaining text to m_TextPieces:
b1a3a964
VS
241 if (end_pos > textBeginning)
242 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
243}
244
245void wxHtmlParser::DestroyDOMTree()
246{
247 wxHtmlTag *t1, *t2;
248 t1 = m_Tags;
249 while (t1)
250 {
251 t2 = t1->GetNextSibling();
252 delete t1;
253 t1 = t2;
254 }
255 m_Tags = m_CurTag = NULL;
256
257 delete m_TextPieces;
258 m_TextPieces = NULL;
259}
260
d699f48b 261void wxHtmlParser::DoParsing()
211dfedd
VS
262{
263 m_CurTag = m_Tags;
264 m_CurTextPiece = 0;
b1a3a964 265 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
266}
267
b1a3a964
VS
268void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
269 const wxString::const_iterator& end_pos)
211dfedd 270{
b1a3a964
VS
271 wxString::const_iterator begin_pos(begin_pos_);
272
273 if (end_pos <= begin_pos)
274 return;
d699f48b 275
211dfedd 276 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 277 size_t piecesCnt = pieces.size();
d699f48b 278
211dfedd
VS
279 while (begin_pos < end_pos)
280 {
b1a3a964 281 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 282 m_CurTag = m_CurTag->GetNextTag();
d699f48b 283 while (m_CurTextPiece < piecesCnt &&
b1a3a964 284 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
285 m_CurTextPiece++;
286
d699f48b
KB
287 if (m_CurTextPiece < piecesCnt &&
288 (!m_CurTag ||
b1a3a964 289 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
290 {
291 // Add text:
f23e92e7 292 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
293 wxString(pieces[m_CurTextPiece].m_start,
294 pieces[m_CurTextPiece].m_end)));
295 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
296 m_CurTextPiece++;
297 }
298 else if (m_CurTag)
299 {
902725ee 300 if (m_CurTag->HasEnding())
b1a3a964 301 begin_pos = m_CurTag->GetEndIter2();
902725ee 302 else
b1a3a964 303 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
304 wxHtmlTag *t = m_CurTag;
305 m_CurTag = m_CurTag->GetNextTag();
306 AddTag(*t);
2b5f62a0
VZ
307 if (m_stopParsing)
308 return;
211dfedd
VS
309 }
310 else break;
5526e819
VS
311 }
312}
313
5526e819
VS
314void wxHtmlParser::AddTag(const wxHtmlTag& tag)
315{
316 wxHtmlTagHandler *h;
d1da8872 317 bool inner = false;
5526e819
VS
318
319 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
320 if (h)
2b5f62a0 321 {
4f9297b0 322 inner = h->HandleTag(tag);
2b5f62a0
VZ
323 if (m_stopParsing)
324 return;
325 }
04dbb646 326 if (!inner)
4f9297b0 327 {
5526e819 328 if (tag.HasEnding())
b1a3a964 329 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
330 }
331}
332
5526e819
VS
333void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
334{
4f9297b0 335 wxString s(handler->GetSupportedTags());
211dfedd 336 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 337
5526e819 338 while (tokenizer.HasMoreTokens())
470252df 339 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
5526e819
VS
340
341 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
342 m_HandlersList.Append(handler);
343
4f9297b0 344 handler->SetParser(this);
5526e819
VS
345}
346
fbfb8bcc 347void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 348{
211dfedd 349 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
350 wxString key;
351
04dbb646 352 if (m_HandlersStack == NULL)
4f9297b0 353 {
a7a4d01b 354 m_HandlersStack = new wxList;
a7a4d01b
VS
355 }
356
222ed1d6 357 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
a7a4d01b 358
04dbb646 359 while (tokenizer.HasMoreTokens())
4f9297b0 360 {
470252df 361 key = tokenizer.GetNextToken();
a7a4d01b
VS
362 m_HandlersHash.Delete(key);
363 m_HandlersHash.Put(key, handler);
364 }
365}
366
a7a4d01b
VS
367void wxHtmlParser::PopTagHandler()
368{
222ed1d6 369 wxList::compatibility_iterator first;
04dbb646 370
dfa4a244 371 if ( !m_HandlersStack ||
28b4db7f
VZ
372#if wxUSE_STL
373 !(first = m_HandlersStack->GetFirst())
374#else // !wxUSE_STL
375 ((first = m_HandlersStack->GetFirst()) == NULL)
376#endif // wxUSE_STL/!wxUSE_STL
377 )
f3c82859
VS
378 {
379 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
380 return;
381 }
4f9297b0 382 m_HandlersHash = *((wxHashTable*) first->GetData());
222ed1d6
MB
383 delete (wxHashTable*) first->GetData();
384 m_HandlersStack->Erase(first);
a7a4d01b
VS
385}
386
211dfedd
VS
387void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
388{
389 wxHtmlParserState *s = new wxHtmlParserState;
390
391 s->m_curTag = m_CurTag;
392 s->m_tags = m_Tags;
393 s->m_textPieces = m_TextPieces;
394 s->m_curTextPiece = m_CurTextPiece;
395 s->m_source = m_Source;
396
397 s->m_nextState = m_SavedStates;
398 m_SavedStates = s;
399
400 m_CurTag = NULL;
401 m_Tags = NULL;
402 m_TextPieces = NULL;
403 m_CurTextPiece = 0;
b1a3a964 404 m_Source = NULL;
d699f48b 405
211dfedd
VS
406 SetSource(src);
407}
408
409bool wxHtmlParser::RestoreState()
410{
d1da8872 411 if (!m_SavedStates) return false;
d699f48b 412
0beefa20
VS
413 DestroyDOMTree();
414
211dfedd
VS
415 wxHtmlParserState *s = m_SavedStates;
416 m_SavedStates = s->m_nextState;
d699f48b 417
211dfedd
VS
418 m_CurTag = s->m_curTag;
419 m_Tags = s->m_tags;
420 m_TextPieces = s->m_textPieces;
421 m_CurTextPiece = s->m_curTextPiece;
422 m_Source = s->m_source;
d699f48b 423
211dfedd 424 delete s;
d1da8872 425 return true;
211dfedd
VS
426}
427
e7feeafa
VS
428wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
429{
b1a3a964 430 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
431}
432
5526e819
VS
433//-----------------------------------------------------------------------------
434// wxHtmlTagHandler
435//-----------------------------------------------------------------------------
436
437IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 438
e7feeafa
VS
439void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
440{
441 // It is safe to temporarily change the source being parsed,
442 // provided we restore the state back after parsing
443 m_Parser->SetSourceAndSaveState(source);
444 m_Parser->DoParsing();
445 m_Parser->RestoreState();
446}
447
daa616fc
VS
448
449//-----------------------------------------------------------------------------
450// wxHtmlEntitiesParser
451//-----------------------------------------------------------------------------
452
453IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
454
455wxHtmlEntitiesParser::wxHtmlEntitiesParser()
456#if wxUSE_WCHAR_T && !wxUSE_UNICODE
457 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 458#endif
daa616fc
VS
459{
460}
461
462wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
463{
5438a566 464#if wxUSE_WCHAR_T && !wxUSE_UNICODE
daa616fc 465 delete m_conv;
5438a566 466#endif
daa616fc 467}
5526e819 468
daa616fc
VS
469void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
470{
471#if wxUSE_WCHAR_T && !wxUSE_UNICODE
2b5f62a0
VZ
472 if (encoding == m_encoding)
473 return;
474
daa616fc 475 delete m_conv;
2b5f62a0 476
daa616fc 477 m_encoding = encoding;
2b5f62a0
VZ
478 if (m_encoding == wxFONTENCODING_SYSTEM)
479 m_conv = NULL;
480 else
daa616fc 481 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
d699f48b
KB
482#else
483 (void) encoding;
daa616fc
VS
484#endif
485}
486
96d665d2 487wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 488{
daa616fc 489 wxString output;
d1da8872 490
4f7e8fda
VS
491 const wxString::const_iterator end(input.end());
492 wxString::const_iterator c(input.begin());
493 wxString::const_iterator last(c);
04dbb646 494
4f7e8fda 495 for ( ; c < end; ++c )
daa616fc
VS
496 {
497 if (*c == wxT('&'))
498 {
4f7e8fda
VS
499 if ( output.empty() )
500 output.reserve(input.length());
501
daa616fc 502 if (c - last > 0)
4f7e8fda
VS
503 output.append(last, c);
504 if ( ++c == end )
9e2bd135 505 break;
d1da8872 506
daa616fc 507 wxString entity;
4f7e8fda 508 const wxString::const_iterator ent_s = c;
470252df 509 wxChar entity_char;
d1da8872 510
b1a3a964
VS
511 for ( ; c != end; ++c )
512 {
513 wxChar ch = *c;
514 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
515 (ch >= wxT('A') && ch <= wxT('Z')) ||
516 (ch >= wxT('0') && ch <= wxT('9')) ||
517 ch == wxT('_') || ch == wxT('#')) )
518 break;
519 }
520
4f7e8fda
VS
521 entity.append(ent_s, c);
522 if (c == end || *c != wxT(';')) --c;
211dfedd 523 last = c+1;
470252df
VS
524 entity_char = GetEntityChar(entity);
525 if (entity_char)
526 output << entity_char;
527 else
528 {
4f7e8fda 529 output.append(ent_s-1, c+1);
25271309 530 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
531 "Unrecognized HTML entity: '%s'",
532 entity);
470252df 533 }
daa616fc
VS
534 }
535 }
4f7e8fda
VS
536 if ( last == input.begin() ) // common case: no entity
537 return input;
538 if ( last != end )
539 output.append(last, end);
daa616fc
VS
540 return output;
541}
542
2b5f62a0 543#if !wxUSE_UNICODE
96d665d2 544wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 545{
2b5f62a0 546#if wxUSE_WCHAR_T
daa616fc
VS
547 char buf[2];
548 wchar_t wbuf[2];
549 wbuf[0] = (wchar_t)code;
550 wbuf[1] = 0;
551 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 552 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
553 return '?';
554 return buf[0];
555#else
556 return (code < 256) ? (wxChar)code : '?';
557#endif
558}
2b5f62a0 559#endif
daa616fc 560
19817fd3
VS
561struct wxHtmlEntityInfo
562{
563 const wxStringCharType *name;
564 unsigned code;
565};
566
567extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
568{
569#if wxUSE_UNICODE_UTF8
570 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
571#else
572 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
573#endif
574}
575
96d665d2 576wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
577{
578 unsigned code = 0;
04dbb646 579
daa616fc
VS
580 if (entity[0] == wxT('#'))
581 {
c471f7e1
VS
582 // NB: parsed value is a number, so it's OK to use wx_str(), internal
583 // representation is the same for numbers
584 const wxStringCharType *ent_s = entity.wx_str();
585 const wxStringCharType *format;
04dbb646 586
c471f7e1 587 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
daa616fc 588 {
c471f7e1 589 format = wxSTRING_TEXT("%x");
daa616fc
VS
590 ent_s++;
591 }
592 else
c471f7e1 593 format = wxSTRING_TEXT("%u");
daa616fc
VS
594 ent_s++;
595
596 if (wxSscanf(ent_s, format, &code) != 1)
597 code = 0;
598 }
599 else
600 {
19817fd3
VS
601 // store the literals in wx's internal representation (either char*
602 // in UTF-8 or wchar_t*) for best performance:
603 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
604
daa616fc 605 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
606 ENTITY("AElig", 198),
607 ENTITY("Aacute", 193),
608 ENTITY("Acirc", 194),
609 ENTITY("Agrave", 192),
610 ENTITY("Alpha", 913),
611 ENTITY("Aring", 197),
612 ENTITY("Atilde", 195),
613 ENTITY("Auml", 196),
614 ENTITY("Beta", 914),
615 ENTITY("Ccedil", 199),
616 ENTITY("Chi", 935),
617 ENTITY("Dagger", 8225),
618 ENTITY("Delta", 916),
619 ENTITY("ETH", 208),
620 ENTITY("Eacute", 201),
621 ENTITY("Ecirc", 202),
622 ENTITY("Egrave", 200),
623 ENTITY("Epsilon", 917),
624 ENTITY("Eta", 919),
625 ENTITY("Euml", 203),
626 ENTITY("Gamma", 915),
627 ENTITY("Iacute", 205),
628 ENTITY("Icirc", 206),
629 ENTITY("Igrave", 204),
630 ENTITY("Iota", 921),
631 ENTITY("Iuml", 207),
632 ENTITY("Kappa", 922),
633 ENTITY("Lambda", 923),
634 ENTITY("Mu", 924),
635 ENTITY("Ntilde", 209),
636 ENTITY("Nu", 925),
637 ENTITY("OElig", 338),
638 ENTITY("Oacute", 211),
639 ENTITY("Ocirc", 212),
640 ENTITY("Ograve", 210),
641 ENTITY("Omega", 937),
642 ENTITY("Omicron", 927),
643 ENTITY("Oslash", 216),
644 ENTITY("Otilde", 213),
645 ENTITY("Ouml", 214),
646 ENTITY("Phi", 934),
647 ENTITY("Pi", 928),
648 ENTITY("Prime", 8243),
649 ENTITY("Psi", 936),
650 ENTITY("Rho", 929),
651 ENTITY("Scaron", 352),
652 ENTITY("Sigma", 931),
653 ENTITY("THORN", 222),
654 ENTITY("Tau", 932),
655 ENTITY("Theta", 920),
656 ENTITY("Uacute", 218),
657 ENTITY("Ucirc", 219),
658 ENTITY("Ugrave", 217),
659 ENTITY("Upsilon", 933),
660 ENTITY("Uuml", 220),
661 ENTITY("Xi", 926),
662 ENTITY("Yacute", 221),
663 ENTITY("Yuml", 376),
664 ENTITY("Zeta", 918),
665 ENTITY("aacute", 225),
666 ENTITY("acirc", 226),
667 ENTITY("acute", 180),
668 ENTITY("aelig", 230),
669 ENTITY("agrave", 224),
670 ENTITY("alefsym", 8501),
671 ENTITY("alpha", 945),
672 ENTITY("amp", 38),
673 ENTITY("and", 8743),
674 ENTITY("ang", 8736),
675 ENTITY("aring", 229),
676 ENTITY("asymp", 8776),
677 ENTITY("atilde", 227),
678 ENTITY("auml", 228),
679 ENTITY("bdquo", 8222),
680 ENTITY("beta", 946),
681 ENTITY("brvbar", 166),
682 ENTITY("bull", 8226),
683 ENTITY("cap", 8745),
684 ENTITY("ccedil", 231),
685 ENTITY("cedil", 184),
686 ENTITY("cent", 162),
687 ENTITY("chi", 967),
688 ENTITY("circ", 710),
689 ENTITY("clubs", 9827),
690 ENTITY("cong", 8773),
691 ENTITY("copy", 169),
692 ENTITY("crarr", 8629),
693 ENTITY("cup", 8746),
694 ENTITY("curren", 164),
695 ENTITY("dArr", 8659),
696 ENTITY("dagger", 8224),
697 ENTITY("darr", 8595),
698 ENTITY("deg", 176),
699 ENTITY("delta", 948),
700 ENTITY("diams", 9830),
701 ENTITY("divide", 247),
702 ENTITY("eacute", 233),
703 ENTITY("ecirc", 234),
704 ENTITY("egrave", 232),
705 ENTITY("empty", 8709),
706 ENTITY("emsp", 8195),
707 ENTITY("ensp", 8194),
708 ENTITY("epsilon", 949),
709 ENTITY("equiv", 8801),
710 ENTITY("eta", 951),
711 ENTITY("eth", 240),
712 ENTITY("euml", 235),
713 ENTITY("euro", 8364),
714 ENTITY("exist", 8707),
715 ENTITY("fnof", 402),
716 ENTITY("forall", 8704),
717 ENTITY("frac12", 189),
718 ENTITY("frac14", 188),
719 ENTITY("frac34", 190),
720 ENTITY("frasl", 8260),
721 ENTITY("gamma", 947),
722 ENTITY("ge", 8805),
723 ENTITY("gt", 62),
724 ENTITY("hArr", 8660),
725 ENTITY("harr", 8596),
726 ENTITY("hearts", 9829),
727 ENTITY("hellip", 8230),
728 ENTITY("iacute", 237),
729 ENTITY("icirc", 238),
730 ENTITY("iexcl", 161),
731 ENTITY("igrave", 236),
732 ENTITY("image", 8465),
733 ENTITY("infin", 8734),
734 ENTITY("int", 8747),
735 ENTITY("iota", 953),
736 ENTITY("iquest", 191),
737 ENTITY("isin", 8712),
738 ENTITY("iuml", 239),
739 ENTITY("kappa", 954),
740 ENTITY("lArr", 8656),
741 ENTITY("lambda", 955),
742 ENTITY("lang", 9001),
743 ENTITY("laquo", 171),
744 ENTITY("larr", 8592),
745 ENTITY("lceil", 8968),
746 ENTITY("ldquo", 8220),
747 ENTITY("le", 8804),
748 ENTITY("lfloor", 8970),
749 ENTITY("lowast", 8727),
750 ENTITY("loz", 9674),
751 ENTITY("lrm", 8206),
752 ENTITY("lsaquo", 8249),
753 ENTITY("lsquo", 8216),
754 ENTITY("lt", 60),
755 ENTITY("macr", 175),
756 ENTITY("mdash", 8212),
757 ENTITY("micro", 181),
758 ENTITY("middot", 183),
759 ENTITY("minus", 8722),
760 ENTITY("mu", 956),
761 ENTITY("nabla", 8711),
762 ENTITY("nbsp", 160),
763 ENTITY("ndash", 8211),
764 ENTITY("ne", 8800),
765 ENTITY("ni", 8715),
766 ENTITY("not", 172),
767 ENTITY("notin", 8713),
768 ENTITY("nsub", 8836),
769 ENTITY("ntilde", 241),
770 ENTITY("nu", 957),
771 ENTITY("oacute", 243),
772 ENTITY("ocirc", 244),
773 ENTITY("oelig", 339),
774 ENTITY("ograve", 242),
775 ENTITY("oline", 8254),
776 ENTITY("omega", 969),
777 ENTITY("omicron", 959),
778 ENTITY("oplus", 8853),
779 ENTITY("or", 8744),
780 ENTITY("ordf", 170),
781 ENTITY("ordm", 186),
782 ENTITY("oslash", 248),
783 ENTITY("otilde", 245),
784 ENTITY("otimes", 8855),
785 ENTITY("ouml", 246),
786 ENTITY("para", 182),
787 ENTITY("part", 8706),
788 ENTITY("permil", 8240),
789 ENTITY("perp", 8869),
790 ENTITY("phi", 966),
791 ENTITY("pi", 960),
792 ENTITY("piv", 982),
793 ENTITY("plusmn", 177),
794 ENTITY("pound", 163),
795 ENTITY("prime", 8242),
796 ENTITY("prod", 8719),
797 ENTITY("prop", 8733),
798 ENTITY("psi", 968),
799 ENTITY("quot", 34),
800 ENTITY("rArr", 8658),
801 ENTITY("radic", 8730),
802 ENTITY("rang", 9002),
803 ENTITY("raquo", 187),
804 ENTITY("rarr", 8594),
805 ENTITY("rceil", 8969),
806 ENTITY("rdquo", 8221),
807 ENTITY("real", 8476),
808 ENTITY("reg", 174),
809 ENTITY("rfloor", 8971),
810 ENTITY("rho", 961),
811 ENTITY("rlm", 8207),
812 ENTITY("rsaquo", 8250),
813 ENTITY("rsquo", 8217),
814 ENTITY("sbquo", 8218),
815 ENTITY("scaron", 353),
816 ENTITY("sdot", 8901),
817 ENTITY("sect", 167),
818 ENTITY("shy", 173),
819 ENTITY("sigma", 963),
820 ENTITY("sigmaf", 962),
821 ENTITY("sim", 8764),
822 ENTITY("spades", 9824),
823 ENTITY("sub", 8834),
824 ENTITY("sube", 8838),
825 ENTITY("sum", 8721),
826 ENTITY("sup", 8835),
827 ENTITY("sup1", 185),
828 ENTITY("sup2", 178),
829 ENTITY("sup3", 179),
830 ENTITY("supe", 8839),
831 ENTITY("szlig", 223),
832 ENTITY("tau", 964),
833 ENTITY("there4", 8756),
834 ENTITY("theta", 952),
835 ENTITY("thetasym", 977),
836 ENTITY("thinsp", 8201),
837 ENTITY("thorn", 254),
838 ENTITY("tilde", 732),
839 ENTITY("times", 215),
840 ENTITY("trade", 8482),
841 ENTITY("uArr", 8657),
842 ENTITY("uacute", 250),
843 ENTITY("uarr", 8593),
844 ENTITY("ucirc", 251),
845 ENTITY("ugrave", 249),
846 ENTITY("uml", 168),
847 ENTITY("upsih", 978),
848 ENTITY("upsilon", 965),
849 ENTITY("uuml", 252),
850 ENTITY("weierp", 8472),
851 ENTITY("xi", 958),
852 ENTITY("yacute", 253),
853 ENTITY("yen", 165),
854 ENTITY("yuml", 255),
855 ENTITY("zeta", 950),
856 ENTITY("zwj", 8205),
857 ENTITY("zwnj", 8204),
daa616fc 858 {NULL, 0}};
19817fd3 859 #undef ENTITY
daa616fc 860 static size_t substitutions_cnt = 0;
04dbb646 861
daa616fc
VS
862 if (substitutions_cnt == 0)
863 while (substitutions[substitutions_cnt].code != 0)
864 substitutions_cnt++;
865
3919d530
JS
866 wxHtmlEntityInfo *info = NULL;
867#ifdef __WXWINCE__
868 // bsearch crashes under WinCE for some reason
869 size_t i;
870 for (i = 0; i < substitutions_cnt; i++)
871 {
872 if (entity == substitutions[i].name)
873 {
874 info = & substitutions[i];
875 break;
876 }
877 }
878#else
19817fd3 879 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
880 substitutions_cnt,
881 sizeof(wxHtmlEntityInfo),
90350682 882 wxHtmlEntityCompare);
3919d530 883#endif
daa616fc
VS
884 if (info)
885 code = info->code;
886 }
04dbb646 887
daa616fc 888 if (code == 0)
470252df 889 return 0;
daa616fc
VS
890 else
891 return GetCharForCode(code);
892}
893
d1da8872 894wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
6cc4e6b8
VS
895 const wxString& url) const
896{
e02ecf7c 897 return m_FS ? m_FS->OpenFile(url) : NULL;
d1da8872 898
6cc4e6b8
VS
899}
900
2b5f62a0
VZ
901
902//-----------------------------------------------------------------------------
903// wxHtmlParser::ExtractCharsetInformation
904//-----------------------------------------------------------------------------
905
906class wxMetaTagParser : public wxHtmlParser
907{
908public:
2eb10e2a
VZ
909 wxMetaTagParser() { }
910
2b5f62a0 911 wxObject* GetProduct() { return NULL; }
2eb10e2a 912
2b5f62a0 913protected:
5bce3e6f 914 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a
VZ
915
916 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
2b5f62a0
VZ
917};
918
919class wxMetaTagHandler : public wxHtmlTagHandler
920{
921public:
922 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
923 wxString GetSupportedTags() { return wxT("META,BODY"); }
924 bool HandleTag(const wxHtmlTag& tag);
925
926private:
927 wxString *m_retval;
2eb10e2a
VZ
928
929 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
2b5f62a0
VZ
930};
931
932bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
933{
934 if (tag.GetName() == _T("BODY"))
935 {
936 m_Parser->StopParsing();
d1da8872 937 return false;
2b5f62a0
VZ
938 }
939
940 if (tag.HasParam(_T("HTTP-EQUIV")) &&
13fd234c 941 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
2b5f62a0
VZ
942 tag.HasParam(_T("CONTENT")))
943 {
5af11a94 944 wxString content = tag.GetParam(_T("CONTENT")).Lower();
2b5f62a0
VZ
945 if (content.Left(19) == _T("text/html; charset="))
946 {
947 *m_retval = content.Mid(19);
948 m_Parser->StopParsing();
949 }
950 }
d1da8872 951 return false;
2b5f62a0
VZ
952}
953
954
955/*static*/
956wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
957{
958 wxString charset;
e7274ba2
WS
959 wxMetaTagParser *parser = new wxMetaTagParser();
960 if(parser)
961 {
962 parser->AddTagHandler(new wxMetaTagHandler(&charset));
963 parser->Parse(markup);
964 delete parser;
965 }
2b5f62a0
VZ
966 return charset;
967}
968
4609ee2e
VZ
969/* static */
970bool
971wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
972 wxString::const_iterator end)
973{
974 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
975
976 wxString::const_iterator p = start;
977
978 // comments begin with "<!--" in HTML 4.0
95ebbfe1 979 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
4609ee2e
VZ
980 {
981 // not a comment at all
982 return false;
983 }
984
985 // skip the start of the comment tag in any case, if we don't find the
986 // closing tag we should ignore broken markup
987 start = p;
988
989 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
990 // comment delimiter and the closing tag character (section 3.2.4 of
991 // http://www.w3.org/TR/html401/)
992 int dashes = 0;
993 while ( ++p < end )
994 {
995 const wxChar c = *p;
996
997 if ( (c == wxT(' ') || c == wxT('\n') ||
998 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
999 {
1000 // ignore white space before potential tag end
1001 continue;
1002 }
1003
1004 if ( c == wxT('>') && dashes >= 2 )
1005 {
1006 // found end of comment
1007 start = p;
1008 break;
1009 }
1010
1011 if ( c == wxT('-') )
1012 dashes++;
1013 else
1014 dashes = 0;
1015 }
1016
1017 return true;
1018}
1019
#endif // wxUSE_HTML && wxUSE_STREAMS