]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
Fix PCH-less compilation after recent changes.
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
3c47c047 32#include "wx/vector.h"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
9a83f860 41const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
25271309 42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
3c47c047 50 wxHtmlTextPiece() {}
b1a3a964
VS
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
211dfedd
VS
55};
56
3c47c047
VS
57// NB: this is an empty class and not typedef because of forward declaration
58class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59{
60};
5526e819 61
481c879b 62class wxHtmlParserState
211dfedd 63{
481c879b 64public:
211dfedd
VS
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
b1a3a964 69 const wxString *m_source;
211dfedd
VS
70 wxHtmlParserState *m_nextState;
71};
5526e819
VS
72
73//-----------------------------------------------------------------------------
74// wxHtmlParser
75//-----------------------------------------------------------------------------
76
77IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
04dbb646 79wxHtmlParser::wxHtmlParser()
2826ef0c
VS
80 : wxObject(),
81 m_FS(NULL)
daa616fc 82{
b1a3a964 83 m_Source = NULL;
daa616fc 84 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
daa616fc
VS
90}
91
92wxHtmlParser::~wxHtmlParser()
93{
0beefa20
VS
94 while (RestoreState()) {}
95 DestroyDOMTree();
222ed1d6 96
2826ef0c
VS
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
daa616fc 99 delete m_entitiesParser;
b1a3a964 100 delete m_Source;
daa616fc 101}
5526e819
VS
102
103wxObject* wxHtmlParser::Parse(const wxString& source)
104{
5526e819
VS
105 InitParser(source);
106 DoParsing();
2b5f62a0 107 wxObject *result = GetProduct();
5526e819
VS
108 DoneParser();
109 return result;
110}
111
5526e819
VS
112void wxHtmlParser::InitParser(const wxString& source)
113{
1309ba6c 114 SetSource(source);
d1da8872 115 m_stopParsing = false;
5526e819 116}
1309ba6c 117
5526e819
VS
118void wxHtmlParser::DoneParser()
119{
211dfedd 120 DestroyDOMTree();
5526e819
VS
121}
122
1309ba6c
VS
123void wxHtmlParser::SetSource(const wxString& src)
124{
211dfedd 125 DestroyDOMTree();
d989875a
VS
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
b1a3a964
VS
133 delete m_Source;
134 m_Source = new wxString(src);
211dfedd
VS
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
1309ba6c 138}
5526e819 139
211dfedd 140void wxHtmlParser::CreateDOMTree()
5526e819 141{
b1a3a964 142 wxHtmlTagsCache cache(*m_Source);
211dfedd 143 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
145 m_CurTextPiece = 0;
146}
5526e819 147
b1a3a964 148extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 149
211dfedd 150void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
211dfedd
VS
153 wxHtmlTagsCache *cache)
154{
b1a3a964
VS
155 if (end_pos <= begin_pos)
156 return;
5526e819 157
211dfedd 158 wxChar c;
b1a3a964
VS
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
d699f48b 161
7c6cd4a8
VS
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
167 {
168 i = end_pos;
169 }
170
04dbb646 171 while (i < end_pos)
4f9297b0 172 {
b1a3a964 173 c = *i;
5526e819 174
211dfedd
VS
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
b1a3a964
VS
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
180
181 // if it is a comment, skip it:
b1a3a964 182 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 183 {
b1a3a964 184 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 185 }
d699f48b 186
211dfedd 187 // add another tag to the tree:
b1a3a964 188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 189 {
211dfedd 190 wxHtmlTag *chd;
d699f48b
KB
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
211dfedd 193 i, end_pos, cache, m_entitiesParser);
d699f48b 194 else
211dfedd
VS
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
d699f48b 198 if (!m_Tags)
211dfedd 199 {
d699f48b 200 // if this is the first tag to be created make the root
211dfedd
VS
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
d699f48b 206 // if there is already a root tag add this tag as
211dfedd
VS
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
b1a3a964 216 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 217 cache);
b1a3a964 218 i = chd->GetEndIter2();
211dfedd
VS
219 }
220 else
b1a3a964 221 i = chd->GetBeginIter();
d1da8872 222
211dfedd
VS
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
d699f48b 227 else
211dfedd 228 {
b1a3a964 229 while (i < end_pos && *i != wxT('>')) ++i;
211dfedd 230 textBeginning = i+1;
5526e819 231 }
5526e819 232 }
b1a3a964 233 else ++i;
5526e819
VS
234 }
235
211dfedd 236 // add remaining text to m_TextPieces:
b1a3a964
VS
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
239}
240
241void wxHtmlParser::DestroyDOMTree()
242{
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
253 delete m_TextPieces;
254 m_TextPieces = NULL;
255}
256
d699f48b 257void wxHtmlParser::DoParsing()
211dfedd
VS
258{
259 m_CurTag = m_Tags;
260 m_CurTextPiece = 0;
b1a3a964 261 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
262}
263
b1a3a964
VS
264void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
265 const wxString::const_iterator& end_pos)
211dfedd 266{
b1a3a964
VS
267 wxString::const_iterator begin_pos(begin_pos_);
268
269 if (end_pos <= begin_pos)
270 return;
d699f48b 271
211dfedd 272 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 273 size_t piecesCnt = pieces.size();
d699f48b 274
211dfedd
VS
275 while (begin_pos < end_pos)
276 {
b1a3a964 277 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 278 m_CurTag = m_CurTag->GetNextTag();
d699f48b 279 while (m_CurTextPiece < piecesCnt &&
b1a3a964 280 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
281 m_CurTextPiece++;
282
d699f48b
KB
283 if (m_CurTextPiece < piecesCnt &&
284 (!m_CurTag ||
b1a3a964 285 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
286 {
287 // Add text:
f23e92e7 288 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
289 wxString(pieces[m_CurTextPiece].m_start,
290 pieces[m_CurTextPiece].m_end)));
291 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
292 m_CurTextPiece++;
293 }
294 else if (m_CurTag)
295 {
902725ee 296 if (m_CurTag->HasEnding())
b1a3a964 297 begin_pos = m_CurTag->GetEndIter2();
902725ee 298 else
b1a3a964 299 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
300 wxHtmlTag *t = m_CurTag;
301 m_CurTag = m_CurTag->GetNextTag();
302 AddTag(*t);
2b5f62a0
VZ
303 if (m_stopParsing)
304 return;
211dfedd
VS
305 }
306 else break;
5526e819
VS
307 }
308}
309
5526e819
VS
310void wxHtmlParser::AddTag(const wxHtmlTag& tag)
311{
d1da8872 312 bool inner = false;
5526e819 313
2826ef0c
VS
314 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
315 if (h != m_HandlersHash.end())
2b5f62a0 316 {
2826ef0c 317 inner = h->second->HandleTag(tag);
2b5f62a0
VZ
318 if (m_stopParsing)
319 return;
320 }
04dbb646 321 if (!inner)
4f9297b0 322 {
5526e819 323 if (tag.HasEnding())
b1a3a964 324 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
325 }
326}
327
5526e819
VS
328void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
329{
4f9297b0 330 wxString s(handler->GetSupportedTags());
211dfedd 331 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 332
5526e819 333 while (tokenizer.HasMoreTokens())
2826ef0c 334 m_HandlersHash[tokenizer.GetNextToken()] = handler;
5526e819 335
2826ef0c 336 m_HandlersSet.insert(handler);
5526e819 337
4f9297b0 338 handler->SetParser(this);
5526e819
VS
339}
340
fbfb8bcc 341void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 342{
211dfedd 343 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
344 wxString key;
345
2826ef0c 346 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
a7a4d01b 347
04dbb646 348 while (tokenizer.HasMoreTokens())
4f9297b0 349 {
470252df 350 key = tokenizer.GetNextToken();
2826ef0c 351 m_HandlersHash[key] = handler;
a7a4d01b
VS
352 }
353}
354
a7a4d01b
VS
355void wxHtmlParser::PopTagHandler()
356{
2826ef0c
VS
357 wxCHECK_RET( !m_HandlersStack.empty(),
358 "attempt to remove HTML tag handler from empty stack" );
359
360 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
361 m_HandlersStack.pop_back();
362 m_HandlersHash = *prev;
363 delete prev;
a7a4d01b
VS
364}
365
211dfedd
VS
366void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
367{
368 wxHtmlParserState *s = new wxHtmlParserState;
369
370 s->m_curTag = m_CurTag;
371 s->m_tags = m_Tags;
372 s->m_textPieces = m_TextPieces;
373 s->m_curTextPiece = m_CurTextPiece;
374 s->m_source = m_Source;
375
376 s->m_nextState = m_SavedStates;
377 m_SavedStates = s;
378
379 m_CurTag = NULL;
380 m_Tags = NULL;
381 m_TextPieces = NULL;
382 m_CurTextPiece = 0;
b1a3a964 383 m_Source = NULL;
d699f48b 384
211dfedd
VS
385 SetSource(src);
386}
387
388bool wxHtmlParser::RestoreState()
389{
d1da8872 390 if (!m_SavedStates) return false;
d699f48b 391
0beefa20 392 DestroyDOMTree();
d989875a 393 delete m_Source;
0beefa20 394
211dfedd
VS
395 wxHtmlParserState *s = m_SavedStates;
396 m_SavedStates = s->m_nextState;
d699f48b 397
211dfedd
VS
398 m_CurTag = s->m_curTag;
399 m_Tags = s->m_tags;
400 m_TextPieces = s->m_textPieces;
401 m_CurTextPiece = s->m_curTextPiece;
402 m_Source = s->m_source;
d699f48b 403
211dfedd 404 delete s;
d1da8872 405 return true;
211dfedd
VS
406}
407
e7feeafa
VS
408wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
409{
b1a3a964 410 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
411}
412
5526e819
VS
413//-----------------------------------------------------------------------------
414// wxHtmlTagHandler
415//-----------------------------------------------------------------------------
416
417IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 418
e7feeafa
VS
419void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
420{
421 // It is safe to temporarily change the source being parsed,
422 // provided we restore the state back after parsing
423 m_Parser->SetSourceAndSaveState(source);
424 m_Parser->DoParsing();
425 m_Parser->RestoreState();
426}
427
daa616fc
VS
428
429//-----------------------------------------------------------------------------
430// wxHtmlEntitiesParser
431//-----------------------------------------------------------------------------
432
433IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
434
435wxHtmlEntitiesParser::wxHtmlEntitiesParser()
8d94819c 436#if !wxUSE_UNICODE
daa616fc 437 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 438#endif
daa616fc
VS
439{
440}
441
442wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
443{
8d94819c 444#if !wxUSE_UNICODE
daa616fc 445 delete m_conv;
5438a566 446#endif
daa616fc 447}
5526e819 448
8d94819c 449#if !wxUSE_UNICODE
daa616fc
VS
450void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
451{
2b5f62a0
VZ
452 if (encoding == m_encoding)
453 return;
454
daa616fc 455 delete m_conv;
2b5f62a0 456
daa616fc 457 m_encoding = encoding;
2b5f62a0
VZ
458 if (m_encoding == wxFONTENCODING_SYSTEM)
459 m_conv = NULL;
460 else
daa616fc 461 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
daa616fc 462}
8d94819c 463#endif // !wxUSE_UNICODE
daa616fc 464
96d665d2 465wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 466{
daa616fc 467 wxString output;
d1da8872 468
4f7e8fda
VS
469 const wxString::const_iterator end(input.end());
470 wxString::const_iterator c(input.begin());
471 wxString::const_iterator last(c);
04dbb646 472
4f7e8fda 473 for ( ; c < end; ++c )
daa616fc
VS
474 {
475 if (*c == wxT('&'))
476 {
4f7e8fda
VS
477 if ( output.empty() )
478 output.reserve(input.length());
479
daa616fc 480 if (c - last > 0)
4f7e8fda
VS
481 output.append(last, c);
482 if ( ++c == end )
9e2bd135 483 break;
d1da8872 484
daa616fc 485 wxString entity;
4f7e8fda 486 const wxString::const_iterator ent_s = c;
470252df 487 wxChar entity_char;
d1da8872 488
b1a3a964
VS
489 for ( ; c != end; ++c )
490 {
491 wxChar ch = *c;
492 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
493 (ch >= wxT('A') && ch <= wxT('Z')) ||
494 (ch >= wxT('0') && ch <= wxT('9')) ||
495 ch == wxT('_') || ch == wxT('#')) )
496 break;
497 }
498
4f7e8fda
VS
499 entity.append(ent_s, c);
500 if (c == end || *c != wxT(';')) --c;
211dfedd 501 last = c+1;
470252df
VS
502 entity_char = GetEntityChar(entity);
503 if (entity_char)
504 output << entity_char;
505 else
506 {
4f7e8fda 507 output.append(ent_s-1, c+1);
25271309 508 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
509 "Unrecognized HTML entity: '%s'",
510 entity);
470252df 511 }
daa616fc
VS
512 }
513 }
4f7e8fda
VS
514 if ( last == input.begin() ) // common case: no entity
515 return input;
516 if ( last != end )
517 output.append(last, end);
daa616fc
VS
518 return output;
519}
520
2b5f62a0 521#if !wxUSE_UNICODE
96d665d2 522wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 523{
daa616fc
VS
524 char buf[2];
525 wchar_t wbuf[2];
526 wbuf[0] = (wchar_t)code;
527 wbuf[1] = 0;
528 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 529 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
530 return '?';
531 return buf[0];
daa616fc 532}
2b5f62a0 533#endif
daa616fc 534
19817fd3
VS
535struct wxHtmlEntityInfo
536{
537 const wxStringCharType *name;
538 unsigned code;
539};
540
541extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
542{
543#if wxUSE_UNICODE_UTF8
544 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
545#else
546 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
547#endif
548}
549
96d665d2 550wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
551{
552 unsigned code = 0;
04dbb646 553
c0213e2a
VS
554 if (entity.empty())
555 return 0; // invalid entity reference
556
daa616fc
VS
557 if (entity[0] == wxT('#'))
558 {
c471f7e1
VS
559 // NB: parsed value is a number, so it's OK to use wx_str(), internal
560 // representation is the same for numbers
561 const wxStringCharType *ent_s = entity.wx_str();
562 const wxStringCharType *format;
04dbb646 563
d9359369 564 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
daa616fc 565 {
d9359369 566 format = wxS("%x");
daa616fc
VS
567 ent_s++;
568 }
569 else
d9359369 570 format = wxS("%u");
daa616fc
VS
571 ent_s++;
572
573 if (wxSscanf(ent_s, format, &code) != 1)
574 code = 0;
575 }
576 else
577 {
19817fd3
VS
578 // store the literals in wx's internal representation (either char*
579 // in UTF-8 or wchar_t*) for best performance:
d9359369 580 #define ENTITY(name, code) { wxS(name), code }
19817fd3 581
daa616fc 582 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
583 ENTITY("AElig", 198),
584 ENTITY("Aacute", 193),
585 ENTITY("Acirc", 194),
586 ENTITY("Agrave", 192),
587 ENTITY("Alpha", 913),
588 ENTITY("Aring", 197),
589 ENTITY("Atilde", 195),
590 ENTITY("Auml", 196),
591 ENTITY("Beta", 914),
592 ENTITY("Ccedil", 199),
593 ENTITY("Chi", 935),
594 ENTITY("Dagger", 8225),
595 ENTITY("Delta", 916),
596 ENTITY("ETH", 208),
597 ENTITY("Eacute", 201),
598 ENTITY("Ecirc", 202),
599 ENTITY("Egrave", 200),
600 ENTITY("Epsilon", 917),
601 ENTITY("Eta", 919),
602 ENTITY("Euml", 203),
603 ENTITY("Gamma", 915),
604 ENTITY("Iacute", 205),
605 ENTITY("Icirc", 206),
606 ENTITY("Igrave", 204),
607 ENTITY("Iota", 921),
608 ENTITY("Iuml", 207),
609 ENTITY("Kappa", 922),
610 ENTITY("Lambda", 923),
611 ENTITY("Mu", 924),
612 ENTITY("Ntilde", 209),
613 ENTITY("Nu", 925),
614 ENTITY("OElig", 338),
615 ENTITY("Oacute", 211),
616 ENTITY("Ocirc", 212),
617 ENTITY("Ograve", 210),
618 ENTITY("Omega", 937),
619 ENTITY("Omicron", 927),
620 ENTITY("Oslash", 216),
621 ENTITY("Otilde", 213),
622 ENTITY("Ouml", 214),
623 ENTITY("Phi", 934),
624 ENTITY("Pi", 928),
625 ENTITY("Prime", 8243),
626 ENTITY("Psi", 936),
627 ENTITY("Rho", 929),
628 ENTITY("Scaron", 352),
629 ENTITY("Sigma", 931),
630 ENTITY("THORN", 222),
631 ENTITY("Tau", 932),
632 ENTITY("Theta", 920),
633 ENTITY("Uacute", 218),
634 ENTITY("Ucirc", 219),
635 ENTITY("Ugrave", 217),
636 ENTITY("Upsilon", 933),
637 ENTITY("Uuml", 220),
638 ENTITY("Xi", 926),
639 ENTITY("Yacute", 221),
640 ENTITY("Yuml", 376),
641 ENTITY("Zeta", 918),
642 ENTITY("aacute", 225),
643 ENTITY("acirc", 226),
644 ENTITY("acute", 180),
645 ENTITY("aelig", 230),
646 ENTITY("agrave", 224),
647 ENTITY("alefsym", 8501),
648 ENTITY("alpha", 945),
649 ENTITY("amp", 38),
650 ENTITY("and", 8743),
651 ENTITY("ang", 8736),
319106d6 652 ENTITY("apos", 39),
19817fd3
VS
653 ENTITY("aring", 229),
654 ENTITY("asymp", 8776),
655 ENTITY("atilde", 227),
656 ENTITY("auml", 228),
657 ENTITY("bdquo", 8222),
658 ENTITY("beta", 946),
659 ENTITY("brvbar", 166),
660 ENTITY("bull", 8226),
661 ENTITY("cap", 8745),
662 ENTITY("ccedil", 231),
663 ENTITY("cedil", 184),
664 ENTITY("cent", 162),
665 ENTITY("chi", 967),
666 ENTITY("circ", 710),
667 ENTITY("clubs", 9827),
668 ENTITY("cong", 8773),
669 ENTITY("copy", 169),
670 ENTITY("crarr", 8629),
671 ENTITY("cup", 8746),
672 ENTITY("curren", 164),
673 ENTITY("dArr", 8659),
674 ENTITY("dagger", 8224),
675 ENTITY("darr", 8595),
676 ENTITY("deg", 176),
677 ENTITY("delta", 948),
678 ENTITY("diams", 9830),
679 ENTITY("divide", 247),
680 ENTITY("eacute", 233),
681 ENTITY("ecirc", 234),
682 ENTITY("egrave", 232),
683 ENTITY("empty", 8709),
684 ENTITY("emsp", 8195),
685 ENTITY("ensp", 8194),
686 ENTITY("epsilon", 949),
687 ENTITY("equiv", 8801),
688 ENTITY("eta", 951),
689 ENTITY("eth", 240),
690 ENTITY("euml", 235),
691 ENTITY("euro", 8364),
692 ENTITY("exist", 8707),
693 ENTITY("fnof", 402),
694 ENTITY("forall", 8704),
695 ENTITY("frac12", 189),
696 ENTITY("frac14", 188),
697 ENTITY("frac34", 190),
698 ENTITY("frasl", 8260),
699 ENTITY("gamma", 947),
700 ENTITY("ge", 8805),
701 ENTITY("gt", 62),
702 ENTITY("hArr", 8660),
703 ENTITY("harr", 8596),
704 ENTITY("hearts", 9829),
705 ENTITY("hellip", 8230),
706 ENTITY("iacute", 237),
707 ENTITY("icirc", 238),
708 ENTITY("iexcl", 161),
709 ENTITY("igrave", 236),
710 ENTITY("image", 8465),
711 ENTITY("infin", 8734),
712 ENTITY("int", 8747),
713 ENTITY("iota", 953),
714 ENTITY("iquest", 191),
715 ENTITY("isin", 8712),
716 ENTITY("iuml", 239),
717 ENTITY("kappa", 954),
718 ENTITY("lArr", 8656),
719 ENTITY("lambda", 955),
720 ENTITY("lang", 9001),
721 ENTITY("laquo", 171),
722 ENTITY("larr", 8592),
723 ENTITY("lceil", 8968),
724 ENTITY("ldquo", 8220),
725 ENTITY("le", 8804),
726 ENTITY("lfloor", 8970),
727 ENTITY("lowast", 8727),
728 ENTITY("loz", 9674),
729 ENTITY("lrm", 8206),
730 ENTITY("lsaquo", 8249),
731 ENTITY("lsquo", 8216),
732 ENTITY("lt", 60),
733 ENTITY("macr", 175),
734 ENTITY("mdash", 8212),
735 ENTITY("micro", 181),
736 ENTITY("middot", 183),
737 ENTITY("minus", 8722),
738 ENTITY("mu", 956),
739 ENTITY("nabla", 8711),
740 ENTITY("nbsp", 160),
741 ENTITY("ndash", 8211),
742 ENTITY("ne", 8800),
743 ENTITY("ni", 8715),
744 ENTITY("not", 172),
745 ENTITY("notin", 8713),
746 ENTITY("nsub", 8836),
747 ENTITY("ntilde", 241),
748 ENTITY("nu", 957),
749 ENTITY("oacute", 243),
750 ENTITY("ocirc", 244),
751 ENTITY("oelig", 339),
752 ENTITY("ograve", 242),
753 ENTITY("oline", 8254),
754 ENTITY("omega", 969),
755 ENTITY("omicron", 959),
756 ENTITY("oplus", 8853),
757 ENTITY("or", 8744),
758 ENTITY("ordf", 170),
759 ENTITY("ordm", 186),
760 ENTITY("oslash", 248),
761 ENTITY("otilde", 245),
762 ENTITY("otimes", 8855),
763 ENTITY("ouml", 246),
764 ENTITY("para", 182),
765 ENTITY("part", 8706),
766 ENTITY("permil", 8240),
767 ENTITY("perp", 8869),
768 ENTITY("phi", 966),
769 ENTITY("pi", 960),
770 ENTITY("piv", 982),
771 ENTITY("plusmn", 177),
772 ENTITY("pound", 163),
773 ENTITY("prime", 8242),
774 ENTITY("prod", 8719),
775 ENTITY("prop", 8733),
776 ENTITY("psi", 968),
777 ENTITY("quot", 34),
778 ENTITY("rArr", 8658),
779 ENTITY("radic", 8730),
780 ENTITY("rang", 9002),
781 ENTITY("raquo", 187),
782 ENTITY("rarr", 8594),
783 ENTITY("rceil", 8969),
784 ENTITY("rdquo", 8221),
785 ENTITY("real", 8476),
786 ENTITY("reg", 174),
787 ENTITY("rfloor", 8971),
788 ENTITY("rho", 961),
789 ENTITY("rlm", 8207),
790 ENTITY("rsaquo", 8250),
791 ENTITY("rsquo", 8217),
792 ENTITY("sbquo", 8218),
793 ENTITY("scaron", 353),
794 ENTITY("sdot", 8901),
795 ENTITY("sect", 167),
796 ENTITY("shy", 173),
797 ENTITY("sigma", 963),
798 ENTITY("sigmaf", 962),
799 ENTITY("sim", 8764),
800 ENTITY("spades", 9824),
801 ENTITY("sub", 8834),
802 ENTITY("sube", 8838),
803 ENTITY("sum", 8721),
804 ENTITY("sup", 8835),
805 ENTITY("sup1", 185),
806 ENTITY("sup2", 178),
807 ENTITY("sup3", 179),
808 ENTITY("supe", 8839),
809 ENTITY("szlig", 223),
810 ENTITY("tau", 964),
811 ENTITY("there4", 8756),
812 ENTITY("theta", 952),
813 ENTITY("thetasym", 977),
814 ENTITY("thinsp", 8201),
815 ENTITY("thorn", 254),
816 ENTITY("tilde", 732),
817 ENTITY("times", 215),
818 ENTITY("trade", 8482),
819 ENTITY("uArr", 8657),
820 ENTITY("uacute", 250),
821 ENTITY("uarr", 8593),
822 ENTITY("ucirc", 251),
823 ENTITY("ugrave", 249),
824 ENTITY("uml", 168),
825 ENTITY("upsih", 978),
826 ENTITY("upsilon", 965),
827 ENTITY("uuml", 252),
828 ENTITY("weierp", 8472),
829 ENTITY("xi", 958),
830 ENTITY("yacute", 253),
831 ENTITY("yen", 165),
832 ENTITY("yuml", 255),
833 ENTITY("zeta", 950),
834 ENTITY("zwj", 8205),
835 ENTITY("zwnj", 8204),
daa616fc 836 {NULL, 0}};
19817fd3 837 #undef ENTITY
daa616fc 838 static size_t substitutions_cnt = 0;
04dbb646 839
daa616fc
VS
840 if (substitutions_cnt == 0)
841 while (substitutions[substitutions_cnt].code != 0)
842 substitutions_cnt++;
843
e822d1bd 844 wxHtmlEntityInfo *info;
3919d530
JS
845#ifdef __WXWINCE__
846 // bsearch crashes under WinCE for some reason
e822d1bd 847 info = NULL;
3919d530
JS
848 size_t i;
849 for (i = 0; i < substitutions_cnt; i++)
850 {
851 if (entity == substitutions[i].name)
852 {
853 info = & substitutions[i];
854 break;
855 }
856 }
857#else
19817fd3 858 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
859 substitutions_cnt,
860 sizeof(wxHtmlEntityInfo),
90350682 861 wxHtmlEntityCompare);
3919d530 862#endif
daa616fc
VS
863 if (info)
864 code = info->code;
865 }
04dbb646 866
daa616fc 867 if (code == 0)
470252df 868 return 0;
daa616fc
VS
869 else
870 return GetCharForCode(code);
871}
872
d1da8872 873wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
6cc4e6b8
VS
874 const wxString& url) const
875{
e02ecf7c 876 return m_FS ? m_FS->OpenFile(url) : NULL;
d1da8872 877
6cc4e6b8
VS
878}
879
2b5f62a0
VZ
880
881//-----------------------------------------------------------------------------
882// wxHtmlParser::ExtractCharsetInformation
883//-----------------------------------------------------------------------------
884
885class wxMetaTagParser : public wxHtmlParser
886{
887public:
2eb10e2a
VZ
888 wxMetaTagParser() { }
889
2b5f62a0 890 wxObject* GetProduct() { return NULL; }
2eb10e2a 891
2b5f62a0 892protected:
5bce3e6f 893 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a 894
c0c133e1 895 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
2b5f62a0
VZ
896};
897
898class wxMetaTagHandler : public wxHtmlTagHandler
899{
900public:
901 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
902 wxString GetSupportedTags() { return wxT("META,BODY"); }
903 bool HandleTag(const wxHtmlTag& tag);
904
905private:
906 wxString *m_retval;
2eb10e2a 907
c0c133e1 908 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
2b5f62a0
VZ
909};
910
911bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
912{
9a83f860 913 if (tag.GetName() == wxT("BODY"))
2b5f62a0
VZ
914 {
915 m_Parser->StopParsing();
d1da8872 916 return false;
2b5f62a0
VZ
917 }
918
9a83f860
VZ
919 if (tag.HasParam(wxT("HTTP-EQUIV")) &&
920 tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
921 tag.HasParam(wxT("CONTENT")))
2b5f62a0 922 {
9a83f860
VZ
923 wxString content = tag.GetParam(wxT("CONTENT")).Lower();
924 if (content.Left(19) == wxT("text/html; charset="))
2b5f62a0
VZ
925 {
926 *m_retval = content.Mid(19);
927 m_Parser->StopParsing();
928 }
929 }
d1da8872 930 return false;
2b5f62a0
VZ
931}
932
933
934/*static*/
935wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
936{
937 wxString charset;
e7274ba2
WS
938 wxMetaTagParser *parser = new wxMetaTagParser();
939 if(parser)
940 {
941 parser->AddTagHandler(new wxMetaTagHandler(&charset));
942 parser->Parse(markup);
943 delete parser;
944 }
2b5f62a0
VZ
945 return charset;
946}
947
4609ee2e
VZ
948/* static */
949bool
950wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
951 wxString::const_iterator end)
952{
9a83f860 953 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
4609ee2e
VZ
954
955 wxString::const_iterator p = start;
956
957 // comments begin with "<!--" in HTML 4.0
95ebbfe1 958 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
4609ee2e
VZ
959 {
960 // not a comment at all
961 return false;
962 }
963
964 // skip the start of the comment tag in any case, if we don't find the
965 // closing tag we should ignore broken markup
966 start = p;
967
968 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
969 // comment delimiter and the closing tag character (section 3.2.4 of
970 // http://www.w3.org/TR/html401/)
971 int dashes = 0;
972 while ( ++p < end )
973 {
974 const wxChar c = *p;
975
976 if ( (c == wxT(' ') || c == wxT('\n') ||
977 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
978 {
979 // ignore white space before potential tag end
980 continue;
981 }
982
983 if ( c == wxT('>') && dashes >= 2 )
984 {
985 // found end of comment
986 start = p;
987 break;
988 }
989
990 if ( c == wxT('-') )
991 dashes++;
992 else
993 dashes = 0;
994 }
995
996 return true;
997}
998
999#endif // wxUSE_HTML && wxUSE_STREAMS