]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
Always link with expat in monolithic build.
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
3c47c047 32#include "wx/vector.h"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
9a83f860 41const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
25271309 42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
3c47c047 50 wxHtmlTextPiece() {}
b1a3a964
VS
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
211dfedd
VS
55};
56
3c47c047
VS
57// NB: this is an empty class and not typedef because of forward declaration
58class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59{
60};
5526e819 61
481c879b 62class wxHtmlParserState
211dfedd 63{
481c879b 64public:
211dfedd
VS
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
b1a3a964 69 const wxString *m_source;
211dfedd
VS
70 wxHtmlParserState *m_nextState;
71};
5526e819
VS
72
73//-----------------------------------------------------------------------------
74// wxHtmlParser
75//-----------------------------------------------------------------------------
76
77IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
04dbb646 79wxHtmlParser::wxHtmlParser()
2826ef0c
VS
80 : wxObject(),
81 m_FS(NULL)
daa616fc 82{
b1a3a964 83 m_Source = NULL;
daa616fc 84 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
daa616fc
VS
90}
91
92wxHtmlParser::~wxHtmlParser()
93{
0beefa20
VS
94 while (RestoreState()) {}
95 DestroyDOMTree();
222ed1d6 96
2826ef0c
VS
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
daa616fc 99 delete m_entitiesParser;
b1a3a964 100 delete m_Source;
daa616fc 101}
5526e819
VS
102
103wxObject* wxHtmlParser::Parse(const wxString& source)
104{
5526e819
VS
105 InitParser(source);
106 DoParsing();
2b5f62a0 107 wxObject *result = GetProduct();
5526e819
VS
108 DoneParser();
109 return result;
110}
111
5526e819
VS
112void wxHtmlParser::InitParser(const wxString& source)
113{
1309ba6c 114 SetSource(source);
d1da8872 115 m_stopParsing = false;
5526e819 116}
1309ba6c 117
5526e819
VS
118void wxHtmlParser::DoneParser()
119{
211dfedd 120 DestroyDOMTree();
5526e819
VS
121}
122
1309ba6c
VS
123void wxHtmlParser::SetSource(const wxString& src)
124{
211dfedd 125 DestroyDOMTree();
d989875a
VS
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
b1a3a964
VS
133 delete m_Source;
134 m_Source = new wxString(src);
211dfedd
VS
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
1309ba6c 138}
5526e819 139
211dfedd 140void wxHtmlParser::CreateDOMTree()
5526e819 141{
b1a3a964 142 wxHtmlTagsCache cache(*m_Source);
211dfedd 143 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
145 m_CurTextPiece = 0;
146}
5526e819 147
b1a3a964 148extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 149
211dfedd 150void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
211dfedd
VS
153 wxHtmlTagsCache *cache)
154{
b1a3a964
VS
155 if (end_pos <= begin_pos)
156 return;
5526e819 157
211dfedd 158 wxChar c;
b1a3a964
VS
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
d699f48b 161
7c6cd4a8
VS
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
167 {
168 i = end_pos;
169 }
170
04dbb646 171 while (i < end_pos)
4f9297b0 172 {
b1a3a964 173 c = *i;
5526e819 174
211dfedd
VS
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
b1a3a964
VS
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
180
181 // if it is a comment, skip it:
b1a3a964 182 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 183 {
b1a3a964 184 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 185 }
d699f48b 186
211dfedd 187 // add another tag to the tree:
b1a3a964 188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 189 {
211dfedd 190 wxHtmlTag *chd;
d699f48b
KB
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
211dfedd 193 i, end_pos, cache, m_entitiesParser);
d699f48b 194 else
211dfedd
VS
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
d699f48b 198 if (!m_Tags)
211dfedd 199 {
d699f48b 200 // if this is the first tag to be created make the root
211dfedd
VS
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
d699f48b 206 // if there is already a root tag add this tag as
211dfedd
VS
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
b1a3a964 216 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 217 cache);
b1a3a964 218 i = chd->GetEndIter2();
211dfedd
VS
219 }
220 else
b1a3a964 221 i = chd->GetBeginIter();
d1da8872 222
211dfedd
VS
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
d699f48b 227 else
211dfedd 228 {
b1a3a964 229 while (i < end_pos && *i != wxT('>')) ++i;
36258204 230 textBeginning = i < end_pos ? i+1 : i;
5526e819 231 }
5526e819 232 }
b1a3a964 233 else ++i;
5526e819
VS
234 }
235
211dfedd 236 // add remaining text to m_TextPieces:
b1a3a964
VS
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
239}
240
241void wxHtmlParser::DestroyDOMTree()
242{
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
5276b0a5 253 wxDELETE(m_TextPieces);
211dfedd
VS
254}
255
d699f48b 256void wxHtmlParser::DoParsing()
211dfedd
VS
257{
258 m_CurTag = m_Tags;
259 m_CurTextPiece = 0;
b1a3a964 260 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
261}
262
b1a3a964
VS
263void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
264 const wxString::const_iterator& end_pos)
211dfedd 265{
b1a3a964
VS
266 wxString::const_iterator begin_pos(begin_pos_);
267
268 if (end_pos <= begin_pos)
269 return;
d699f48b 270
211dfedd 271 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 272 size_t piecesCnt = pieces.size();
d699f48b 273
211dfedd
VS
274 while (begin_pos < end_pos)
275 {
b1a3a964 276 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 277 m_CurTag = m_CurTag->GetNextTag();
d699f48b 278 while (m_CurTextPiece < piecesCnt &&
b1a3a964 279 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
280 m_CurTextPiece++;
281
d699f48b
KB
282 if (m_CurTextPiece < piecesCnt &&
283 (!m_CurTag ||
b1a3a964 284 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
285 {
286 // Add text:
f23e92e7 287 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
288 wxString(pieces[m_CurTextPiece].m_start,
289 pieces[m_CurTextPiece].m_end)));
290 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
291 m_CurTextPiece++;
292 }
293 else if (m_CurTag)
294 {
902725ee 295 if (m_CurTag->HasEnding())
b1a3a964 296 begin_pos = m_CurTag->GetEndIter2();
902725ee 297 else
b1a3a964 298 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
299 wxHtmlTag *t = m_CurTag;
300 m_CurTag = m_CurTag->GetNextTag();
301 AddTag(*t);
2b5f62a0
VZ
302 if (m_stopParsing)
303 return;
211dfedd
VS
304 }
305 else break;
5526e819
VS
306 }
307}
308
5526e819
VS
309void wxHtmlParser::AddTag(const wxHtmlTag& tag)
310{
d1da8872 311 bool inner = false;
5526e819 312
2826ef0c
VS
313 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
314 if (h != m_HandlersHash.end())
2b5f62a0 315 {
2826ef0c 316 inner = h->second->HandleTag(tag);
2b5f62a0
VZ
317 if (m_stopParsing)
318 return;
319 }
04dbb646 320 if (!inner)
4f9297b0 321 {
5526e819 322 if (tag.HasEnding())
b1a3a964 323 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
324 }
325}
326
5526e819
VS
327void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
328{
4f9297b0 329 wxString s(handler->GetSupportedTags());
211dfedd 330 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 331
5526e819 332 while (tokenizer.HasMoreTokens())
2826ef0c 333 m_HandlersHash[tokenizer.GetNextToken()] = handler;
5526e819 334
2826ef0c 335 m_HandlersSet.insert(handler);
5526e819 336
4f9297b0 337 handler->SetParser(this);
5526e819
VS
338}
339
fbfb8bcc 340void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 341{
211dfedd 342 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
343 wxString key;
344
2826ef0c 345 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
a7a4d01b 346
04dbb646 347 while (tokenizer.HasMoreTokens())
4f9297b0 348 {
470252df 349 key = tokenizer.GetNextToken();
2826ef0c 350 m_HandlersHash[key] = handler;
a7a4d01b
VS
351 }
352}
353
a7a4d01b
VS
354void wxHtmlParser::PopTagHandler()
355{
2826ef0c
VS
356 wxCHECK_RET( !m_HandlersStack.empty(),
357 "attempt to remove HTML tag handler from empty stack" );
358
359 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
360 m_HandlersStack.pop_back();
361 m_HandlersHash = *prev;
362 delete prev;
a7a4d01b
VS
363}
364
211dfedd
VS
365void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
366{
367 wxHtmlParserState *s = new wxHtmlParserState;
368
369 s->m_curTag = m_CurTag;
370 s->m_tags = m_Tags;
371 s->m_textPieces = m_TextPieces;
372 s->m_curTextPiece = m_CurTextPiece;
373 s->m_source = m_Source;
374
375 s->m_nextState = m_SavedStates;
376 m_SavedStates = s;
377
378 m_CurTag = NULL;
379 m_Tags = NULL;
380 m_TextPieces = NULL;
381 m_CurTextPiece = 0;
b1a3a964 382 m_Source = NULL;
d699f48b 383
211dfedd
VS
384 SetSource(src);
385}
386
387bool wxHtmlParser::RestoreState()
388{
d1da8872 389 if (!m_SavedStates) return false;
d699f48b 390
0beefa20 391 DestroyDOMTree();
d989875a 392 delete m_Source;
0beefa20 393
211dfedd
VS
394 wxHtmlParserState *s = m_SavedStates;
395 m_SavedStates = s->m_nextState;
d699f48b 396
211dfedd
VS
397 m_CurTag = s->m_curTag;
398 m_Tags = s->m_tags;
399 m_TextPieces = s->m_textPieces;
400 m_CurTextPiece = s->m_curTextPiece;
401 m_Source = s->m_source;
d699f48b 402
211dfedd 403 delete s;
d1da8872 404 return true;
211dfedd
VS
405}
406
e7feeafa
VS
407wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
408{
b1a3a964 409 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
410}
411
5526e819
VS
412//-----------------------------------------------------------------------------
413// wxHtmlTagHandler
414//-----------------------------------------------------------------------------
415
416IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 417
e7feeafa
VS
418void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
419{
420 // It is safe to temporarily change the source being parsed,
421 // provided we restore the state back after parsing
422 m_Parser->SetSourceAndSaveState(source);
423 m_Parser->DoParsing();
424 m_Parser->RestoreState();
425}
426
daa616fc
VS
427
428//-----------------------------------------------------------------------------
429// wxHtmlEntitiesParser
430//-----------------------------------------------------------------------------
431
432IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
433
434wxHtmlEntitiesParser::wxHtmlEntitiesParser()
8d94819c 435#if !wxUSE_UNICODE
daa616fc 436 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 437#endif
daa616fc
VS
438{
439}
440
441wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
442{
8d94819c 443#if !wxUSE_UNICODE
daa616fc 444 delete m_conv;
5438a566 445#endif
daa616fc 446}
5526e819 447
8d94819c 448#if !wxUSE_UNICODE
daa616fc
VS
449void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
450{
2b5f62a0
VZ
451 if (encoding == m_encoding)
452 return;
453
daa616fc 454 delete m_conv;
2b5f62a0 455
daa616fc 456 m_encoding = encoding;
2b5f62a0
VZ
457 if (m_encoding == wxFONTENCODING_SYSTEM)
458 m_conv = NULL;
459 else
daa616fc 460 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
daa616fc 461}
8d94819c 462#endif // !wxUSE_UNICODE
daa616fc 463
96d665d2 464wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 465{
daa616fc 466 wxString output;
d1da8872 467
4f7e8fda
VS
468 const wxString::const_iterator end(input.end());
469 wxString::const_iterator c(input.begin());
470 wxString::const_iterator last(c);
04dbb646 471
4f7e8fda 472 for ( ; c < end; ++c )
daa616fc
VS
473 {
474 if (*c == wxT('&'))
475 {
4f7e8fda
VS
476 if ( output.empty() )
477 output.reserve(input.length());
478
daa616fc 479 if (c - last > 0)
4f7e8fda
VS
480 output.append(last, c);
481 if ( ++c == end )
9e2bd135 482 break;
d1da8872 483
daa616fc 484 wxString entity;
4f7e8fda 485 const wxString::const_iterator ent_s = c;
470252df 486 wxChar entity_char;
d1da8872 487
b1a3a964
VS
488 for ( ; c != end; ++c )
489 {
490 wxChar ch = *c;
491 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
492 (ch >= wxT('A') && ch <= wxT('Z')) ||
493 (ch >= wxT('0') && ch <= wxT('9')) ||
494 ch == wxT('_') || ch == wxT('#')) )
495 break;
496 }
497
4f7e8fda
VS
498 entity.append(ent_s, c);
499 if (c == end || *c != wxT(';')) --c;
211dfedd 500 last = c+1;
470252df
VS
501 entity_char = GetEntityChar(entity);
502 if (entity_char)
503 output << entity_char;
504 else
505 {
4f7e8fda 506 output.append(ent_s-1, c+1);
25271309 507 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
508 "Unrecognized HTML entity: '%s'",
509 entity);
470252df 510 }
daa616fc
VS
511 }
512 }
4f7e8fda
VS
513 if ( last == input.begin() ) // common case: no entity
514 return input;
515 if ( last != end )
516 output.append(last, end);
daa616fc
VS
517 return output;
518}
519
2b5f62a0 520#if !wxUSE_UNICODE
96d665d2 521wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 522{
daa616fc
VS
523 char buf[2];
524 wchar_t wbuf[2];
525 wbuf[0] = (wchar_t)code;
526 wbuf[1] = 0;
527 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 528 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
529 return '?';
530 return buf[0];
daa616fc 531}
2b5f62a0 532#endif
daa616fc 533
19817fd3
VS
534struct wxHtmlEntityInfo
535{
536 const wxStringCharType *name;
537 unsigned code;
538};
539
540extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
541{
542#if wxUSE_UNICODE_UTF8
543 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
544#else
545 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
546#endif
547}
548
96d665d2 549wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
550{
551 unsigned code = 0;
04dbb646 552
c0213e2a
VS
553 if (entity.empty())
554 return 0; // invalid entity reference
555
daa616fc
VS
556 if (entity[0] == wxT('#'))
557 {
c471f7e1
VS
558 // NB: parsed value is a number, so it's OK to use wx_str(), internal
559 // representation is the same for numbers
560 const wxStringCharType *ent_s = entity.wx_str();
561 const wxStringCharType *format;
04dbb646 562
d9359369 563 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
daa616fc 564 {
d9359369 565 format = wxS("%x");
daa616fc
VS
566 ent_s++;
567 }
568 else
d9359369 569 format = wxS("%u");
daa616fc
VS
570 ent_s++;
571
572 if (wxSscanf(ent_s, format, &code) != 1)
573 code = 0;
574 }
575 else
576 {
19817fd3
VS
577 // store the literals in wx's internal representation (either char*
578 // in UTF-8 or wchar_t*) for best performance:
d9359369 579 #define ENTITY(name, code) { wxS(name), code }
19817fd3 580
daa616fc 581 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
582 ENTITY("AElig", 198),
583 ENTITY("Aacute", 193),
584 ENTITY("Acirc", 194),
585 ENTITY("Agrave", 192),
586 ENTITY("Alpha", 913),
587 ENTITY("Aring", 197),
588 ENTITY("Atilde", 195),
589 ENTITY("Auml", 196),
590 ENTITY("Beta", 914),
591 ENTITY("Ccedil", 199),
592 ENTITY("Chi", 935),
593 ENTITY("Dagger", 8225),
594 ENTITY("Delta", 916),
595 ENTITY("ETH", 208),
596 ENTITY("Eacute", 201),
597 ENTITY("Ecirc", 202),
598 ENTITY("Egrave", 200),
599 ENTITY("Epsilon", 917),
600 ENTITY("Eta", 919),
601 ENTITY("Euml", 203),
602 ENTITY("Gamma", 915),
603 ENTITY("Iacute", 205),
604 ENTITY("Icirc", 206),
605 ENTITY("Igrave", 204),
606 ENTITY("Iota", 921),
607 ENTITY("Iuml", 207),
608 ENTITY("Kappa", 922),
609 ENTITY("Lambda", 923),
610 ENTITY("Mu", 924),
611 ENTITY("Ntilde", 209),
612 ENTITY("Nu", 925),
613 ENTITY("OElig", 338),
614 ENTITY("Oacute", 211),
615 ENTITY("Ocirc", 212),
616 ENTITY("Ograve", 210),
617 ENTITY("Omega", 937),
618 ENTITY("Omicron", 927),
619 ENTITY("Oslash", 216),
620 ENTITY("Otilde", 213),
621 ENTITY("Ouml", 214),
622 ENTITY("Phi", 934),
623 ENTITY("Pi", 928),
624 ENTITY("Prime", 8243),
625 ENTITY("Psi", 936),
626 ENTITY("Rho", 929),
627 ENTITY("Scaron", 352),
628 ENTITY("Sigma", 931),
629 ENTITY("THORN", 222),
630 ENTITY("Tau", 932),
631 ENTITY("Theta", 920),
632 ENTITY("Uacute", 218),
633 ENTITY("Ucirc", 219),
634 ENTITY("Ugrave", 217),
635 ENTITY("Upsilon", 933),
636 ENTITY("Uuml", 220),
637 ENTITY("Xi", 926),
638 ENTITY("Yacute", 221),
639 ENTITY("Yuml", 376),
640 ENTITY("Zeta", 918),
641 ENTITY("aacute", 225),
642 ENTITY("acirc", 226),
643 ENTITY("acute", 180),
644 ENTITY("aelig", 230),
645 ENTITY("agrave", 224),
646 ENTITY("alefsym", 8501),
647 ENTITY("alpha", 945),
648 ENTITY("amp", 38),
649 ENTITY("and", 8743),
650 ENTITY("ang", 8736),
319106d6 651 ENTITY("apos", 39),
19817fd3
VS
652 ENTITY("aring", 229),
653 ENTITY("asymp", 8776),
654 ENTITY("atilde", 227),
655 ENTITY("auml", 228),
656 ENTITY("bdquo", 8222),
657 ENTITY("beta", 946),
658 ENTITY("brvbar", 166),
659 ENTITY("bull", 8226),
660 ENTITY("cap", 8745),
661 ENTITY("ccedil", 231),
662 ENTITY("cedil", 184),
663 ENTITY("cent", 162),
664 ENTITY("chi", 967),
665 ENTITY("circ", 710),
666 ENTITY("clubs", 9827),
667 ENTITY("cong", 8773),
668 ENTITY("copy", 169),
669 ENTITY("crarr", 8629),
670 ENTITY("cup", 8746),
671 ENTITY("curren", 164),
672 ENTITY("dArr", 8659),
673 ENTITY("dagger", 8224),
674 ENTITY("darr", 8595),
675 ENTITY("deg", 176),
676 ENTITY("delta", 948),
677 ENTITY("diams", 9830),
678 ENTITY("divide", 247),
679 ENTITY("eacute", 233),
680 ENTITY("ecirc", 234),
681 ENTITY("egrave", 232),
682 ENTITY("empty", 8709),
683 ENTITY("emsp", 8195),
684 ENTITY("ensp", 8194),
685 ENTITY("epsilon", 949),
686 ENTITY("equiv", 8801),
687 ENTITY("eta", 951),
688 ENTITY("eth", 240),
689 ENTITY("euml", 235),
690 ENTITY("euro", 8364),
691 ENTITY("exist", 8707),
692 ENTITY("fnof", 402),
693 ENTITY("forall", 8704),
694 ENTITY("frac12", 189),
695 ENTITY("frac14", 188),
696 ENTITY("frac34", 190),
697 ENTITY("frasl", 8260),
698 ENTITY("gamma", 947),
699 ENTITY("ge", 8805),
700 ENTITY("gt", 62),
701 ENTITY("hArr", 8660),
702 ENTITY("harr", 8596),
703 ENTITY("hearts", 9829),
704 ENTITY("hellip", 8230),
705 ENTITY("iacute", 237),
706 ENTITY("icirc", 238),
707 ENTITY("iexcl", 161),
708 ENTITY("igrave", 236),
709 ENTITY("image", 8465),
710 ENTITY("infin", 8734),
711 ENTITY("int", 8747),
712 ENTITY("iota", 953),
713 ENTITY("iquest", 191),
714 ENTITY("isin", 8712),
715 ENTITY("iuml", 239),
716 ENTITY("kappa", 954),
717 ENTITY("lArr", 8656),
718 ENTITY("lambda", 955),
719 ENTITY("lang", 9001),
720 ENTITY("laquo", 171),
721 ENTITY("larr", 8592),
722 ENTITY("lceil", 8968),
723 ENTITY("ldquo", 8220),
724 ENTITY("le", 8804),
725 ENTITY("lfloor", 8970),
726 ENTITY("lowast", 8727),
727 ENTITY("loz", 9674),
728 ENTITY("lrm", 8206),
729 ENTITY("lsaquo", 8249),
730 ENTITY("lsquo", 8216),
731 ENTITY("lt", 60),
732 ENTITY("macr", 175),
733 ENTITY("mdash", 8212),
734 ENTITY("micro", 181),
735 ENTITY("middot", 183),
736 ENTITY("minus", 8722),
737 ENTITY("mu", 956),
738 ENTITY("nabla", 8711),
739 ENTITY("nbsp", 160),
740 ENTITY("ndash", 8211),
741 ENTITY("ne", 8800),
742 ENTITY("ni", 8715),
743 ENTITY("not", 172),
744 ENTITY("notin", 8713),
745 ENTITY("nsub", 8836),
746 ENTITY("ntilde", 241),
747 ENTITY("nu", 957),
748 ENTITY("oacute", 243),
749 ENTITY("ocirc", 244),
750 ENTITY("oelig", 339),
751 ENTITY("ograve", 242),
752 ENTITY("oline", 8254),
753 ENTITY("omega", 969),
754 ENTITY("omicron", 959),
755 ENTITY("oplus", 8853),
756 ENTITY("or", 8744),
757 ENTITY("ordf", 170),
758 ENTITY("ordm", 186),
759 ENTITY("oslash", 248),
760 ENTITY("otilde", 245),
761 ENTITY("otimes", 8855),
762 ENTITY("ouml", 246),
763 ENTITY("para", 182),
764 ENTITY("part", 8706),
765 ENTITY("permil", 8240),
766 ENTITY("perp", 8869),
767 ENTITY("phi", 966),
768 ENTITY("pi", 960),
769 ENTITY("piv", 982),
770 ENTITY("plusmn", 177),
771 ENTITY("pound", 163),
772 ENTITY("prime", 8242),
773 ENTITY("prod", 8719),
774 ENTITY("prop", 8733),
775 ENTITY("psi", 968),
776 ENTITY("quot", 34),
777 ENTITY("rArr", 8658),
778 ENTITY("radic", 8730),
779 ENTITY("rang", 9002),
780 ENTITY("raquo", 187),
781 ENTITY("rarr", 8594),
782 ENTITY("rceil", 8969),
783 ENTITY("rdquo", 8221),
784 ENTITY("real", 8476),
785 ENTITY("reg", 174),
786 ENTITY("rfloor", 8971),
787 ENTITY("rho", 961),
788 ENTITY("rlm", 8207),
789 ENTITY("rsaquo", 8250),
790 ENTITY("rsquo", 8217),
791 ENTITY("sbquo", 8218),
792 ENTITY("scaron", 353),
793 ENTITY("sdot", 8901),
794 ENTITY("sect", 167),
795 ENTITY("shy", 173),
796 ENTITY("sigma", 963),
797 ENTITY("sigmaf", 962),
798 ENTITY("sim", 8764),
799 ENTITY("spades", 9824),
800 ENTITY("sub", 8834),
801 ENTITY("sube", 8838),
802 ENTITY("sum", 8721),
803 ENTITY("sup", 8835),
804 ENTITY("sup1", 185),
805 ENTITY("sup2", 178),
806 ENTITY("sup3", 179),
807 ENTITY("supe", 8839),
808 ENTITY("szlig", 223),
809 ENTITY("tau", 964),
810 ENTITY("there4", 8756),
811 ENTITY("theta", 952),
812 ENTITY("thetasym", 977),
813 ENTITY("thinsp", 8201),
814 ENTITY("thorn", 254),
815 ENTITY("tilde", 732),
816 ENTITY("times", 215),
817 ENTITY("trade", 8482),
818 ENTITY("uArr", 8657),
819 ENTITY("uacute", 250),
820 ENTITY("uarr", 8593),
821 ENTITY("ucirc", 251),
822 ENTITY("ugrave", 249),
823 ENTITY("uml", 168),
824 ENTITY("upsih", 978),
825 ENTITY("upsilon", 965),
826 ENTITY("uuml", 252),
827 ENTITY("weierp", 8472),
828 ENTITY("xi", 958),
829 ENTITY("yacute", 253),
830 ENTITY("yen", 165),
831 ENTITY("yuml", 255),
832 ENTITY("zeta", 950),
833 ENTITY("zwj", 8205),
834 ENTITY("zwnj", 8204),
daa616fc 835 {NULL, 0}};
19817fd3 836 #undef ENTITY
daa616fc 837 static size_t substitutions_cnt = 0;
04dbb646 838
daa616fc
VS
839 if (substitutions_cnt == 0)
840 while (substitutions[substitutions_cnt].code != 0)
841 substitutions_cnt++;
842
e822d1bd 843 wxHtmlEntityInfo *info;
3919d530
JS
844#ifdef __WXWINCE__
845 // bsearch crashes under WinCE for some reason
e822d1bd 846 info = NULL;
3919d530
JS
847 size_t i;
848 for (i = 0; i < substitutions_cnt; i++)
849 {
850 if (entity == substitutions[i].name)
851 {
852 info = & substitutions[i];
853 break;
854 }
855 }
856#else
19817fd3 857 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
858 substitutions_cnt,
859 sizeof(wxHtmlEntityInfo),
90350682 860 wxHtmlEntityCompare);
3919d530 861#endif
daa616fc
VS
862 if (info)
863 code = info->code;
864 }
04dbb646 865
daa616fc 866 if (code == 0)
470252df 867 return 0;
daa616fc
VS
868 else
869 return GetCharForCode(code);
870}
871
948c6134 872wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType type,
6cc4e6b8
VS
873 const wxString& url) const
874{
948c6134
MW
875 int flags = wxFS_READ;
876 if (type == wxHTML_URL_IMAGE)
877 flags |= wxFS_SEEKABLE;
878
879 return m_FS ? m_FS->OpenFile(url, flags) : NULL;
d1da8872 880
6cc4e6b8
VS
881}
882
2b5f62a0
VZ
883
884//-----------------------------------------------------------------------------
885// wxHtmlParser::ExtractCharsetInformation
886//-----------------------------------------------------------------------------
887
888class wxMetaTagParser : public wxHtmlParser
889{
890public:
2eb10e2a
VZ
891 wxMetaTagParser() { }
892
2b5f62a0 893 wxObject* GetProduct() { return NULL; }
2eb10e2a 894
2b5f62a0 895protected:
5bce3e6f 896 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a 897
c0c133e1 898 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
2b5f62a0
VZ
899};
900
901class wxMetaTagHandler : public wxHtmlTagHandler
902{
903public:
904 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
905 wxString GetSupportedTags() { return wxT("META,BODY"); }
906 bool HandleTag(const wxHtmlTag& tag);
907
908private:
909 wxString *m_retval;
2eb10e2a 910
c0c133e1 911 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
2b5f62a0
VZ
912};
913
914bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
915{
9a83f860 916 if (tag.GetName() == wxT("BODY"))
2b5f62a0
VZ
917 {
918 m_Parser->StopParsing();
d1da8872 919 return false;
2b5f62a0
VZ
920 }
921
9a83f860
VZ
922 if (tag.HasParam(wxT("HTTP-EQUIV")) &&
923 tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
924 tag.HasParam(wxT("CONTENT")))
2b5f62a0 925 {
9a83f860
VZ
926 wxString content = tag.GetParam(wxT("CONTENT")).Lower();
927 if (content.Left(19) == wxT("text/html; charset="))
2b5f62a0
VZ
928 {
929 *m_retval = content.Mid(19);
930 m_Parser->StopParsing();
931 }
932 }
d1da8872 933 return false;
2b5f62a0
VZ
934}
935
936
937/*static*/
938wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
939{
940 wxString charset;
e7274ba2
WS
941 wxMetaTagParser *parser = new wxMetaTagParser();
942 if(parser)
943 {
944 parser->AddTagHandler(new wxMetaTagHandler(&charset));
945 parser->Parse(markup);
946 delete parser;
947 }
2b5f62a0
VZ
948 return charset;
949}
950
4609ee2e
VZ
951/* static */
952bool
953wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
954 wxString::const_iterator end)
955{
9a83f860 956 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
4609ee2e
VZ
957
958 wxString::const_iterator p = start;
959
960 // comments begin with "<!--" in HTML 4.0
36258204 961 if ( end - start < 4 || *++p != '!' || *++p != '-' || *++p != '-' )
4609ee2e
VZ
962 {
963 // not a comment at all
964 return false;
965 }
966
967 // skip the start of the comment tag in any case, if we don't find the
968 // closing tag we should ignore broken markup
969 start = p;
970
971 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
972 // comment delimiter and the closing tag character (section 3.2.4 of
973 // http://www.w3.org/TR/html401/)
974 int dashes = 0;
975 while ( ++p < end )
976 {
977 const wxChar c = *p;
978
979 if ( (c == wxT(' ') || c == wxT('\n') ||
980 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
981 {
982 // ignore white space before potential tag end
983 continue;
984 }
985
986 if ( c == wxT('>') && dashes >= 2 )
987 {
988 // found end of comment
989 start = p;
990 break;
991 }
992
993 if ( c == wxT('-') )
994 dashes++;
995 else
996 dashes = 0;
997 }
998
999 return true;
1000}
1001
1002#endif // wxUSE_HTML