]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
fix bug with wxCSConv("ASCII")
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
3c47c047 32#include "wx/vector.h"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
25271309
VZ
41const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
3c47c047 50 wxHtmlTextPiece() {}
b1a3a964
VS
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
211dfedd
VS
55};
56
3c47c047
VS
57// NB: this is an empty class and not typedef because of forward declaration
58class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59{
60};
5526e819 61
481c879b 62class wxHtmlParserState
211dfedd 63{
481c879b 64public:
211dfedd
VS
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
b1a3a964 69 const wxString *m_source;
211dfedd
VS
70 wxHtmlParserState *m_nextState;
71};
5526e819
VS
72
73//-----------------------------------------------------------------------------
74// wxHtmlParser
75//-----------------------------------------------------------------------------
76
77IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
04dbb646 79wxHtmlParser::wxHtmlParser()
2826ef0c
VS
80 : wxObject(),
81 m_FS(NULL)
daa616fc 82{
b1a3a964 83 m_Source = NULL;
daa616fc 84 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
daa616fc
VS
90}
91
92wxHtmlParser::~wxHtmlParser()
93{
0beefa20
VS
94 while (RestoreState()) {}
95 DestroyDOMTree();
222ed1d6 96
2826ef0c
VS
97 WX_CLEAR_ARRAY(m_HandlersStack);
98 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
daa616fc 99 delete m_entitiesParser;
b1a3a964 100 delete m_Source;
daa616fc 101}
5526e819
VS
102
103wxObject* wxHtmlParser::Parse(const wxString& source)
104{
5526e819
VS
105 InitParser(source);
106 DoParsing();
2b5f62a0 107 wxObject *result = GetProduct();
5526e819
VS
108 DoneParser();
109 return result;
110}
111
5526e819
VS
112void wxHtmlParser::InitParser(const wxString& source)
113{
1309ba6c 114 SetSource(source);
d1da8872 115 m_stopParsing = false;
5526e819 116}
1309ba6c 117
5526e819
VS
118void wxHtmlParser::DoneParser()
119{
211dfedd 120 DestroyDOMTree();
5526e819
VS
121}
122
1309ba6c
VS
123void wxHtmlParser::SetSource(const wxString& src)
124{
211dfedd 125 DestroyDOMTree();
d989875a
VS
126 // NB: This is allocated on heap because wxHtmlTag uses iterators and
127 // making a copy of m_Source string in SetSourceAndSaveState() and
128 // RestoreState() would invalidate them (because wxString::m_impl's
129 // memory would change completely twice and iterators use pointers
130 // into it). So instead, we keep the string object intact and only
131 // store/restore pointer to it, for which we need it to be allocated
132 // on the heap.
b1a3a964
VS
133 delete m_Source;
134 m_Source = new wxString(src);
211dfedd
VS
135 CreateDOMTree();
136 m_CurTag = NULL;
137 m_CurTextPiece = 0;
1309ba6c 138}
5526e819 139
211dfedd 140void wxHtmlParser::CreateDOMTree()
5526e819 141{
b1a3a964 142 wxHtmlTagsCache cache(*m_Source);
211dfedd 143 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 144 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
145 m_CurTextPiece = 0;
146}
5526e819 147
b1a3a964 148extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 149
211dfedd 150void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
151 const wxString::const_iterator& begin_pos,
152 const wxString::const_iterator& end_pos,
211dfedd
VS
153 wxHtmlTagsCache *cache)
154{
b1a3a964
VS
155 if (end_pos <= begin_pos)
156 return;
5526e819 157
211dfedd 158 wxChar c;
b1a3a964
VS
159 wxString::const_iterator i = begin_pos;
160 wxString::const_iterator textBeginning = begin_pos;
d699f48b 161
7c6cd4a8
VS
162 // If the tag contains CDATA text, we include the text between beginning
163 // and ending tag verbosely. Setting i=end_pos will skip to the very
164 // end of this function where text piece is added, bypassing any child
165 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 166 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
167 {
168 i = end_pos;
169 }
170
04dbb646 171 while (i < end_pos)
4f9297b0 172 {
b1a3a964 173 c = *i;
5526e819 174
211dfedd
VS
175 if (c == wxT('<'))
176 {
177 // add text to m_TextPieces:
b1a3a964
VS
178 if (i > textBeginning)
179 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
180
181 // if it is a comment, skip it:
b1a3a964 182 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 183 {
b1a3a964 184 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 185 }
d699f48b 186
211dfedd 187 // add another tag to the tree:
b1a3a964 188 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 189 {
211dfedd 190 wxHtmlTag *chd;
d699f48b
KB
191 if (cur)
192 chd = new wxHtmlTag(cur, m_Source,
211dfedd 193 i, end_pos, cache, m_entitiesParser);
d699f48b 194 else
211dfedd
VS
195 {
196 chd = new wxHtmlTag(NULL, m_Source,
197 i, end_pos, cache, m_entitiesParser);
d699f48b 198 if (!m_Tags)
211dfedd 199 {
d699f48b 200 // if this is the first tag to be created make the root
211dfedd
VS
201 // m_Tags point to it:
202 m_Tags = chd;
203 }
204 else
205 {
d699f48b 206 // if there is already a root tag add this tag as
211dfedd
VS
207 // the last sibling:
208 chd->m_Prev = m_Tags->GetLastSibling();
209 chd->m_Prev->m_Next = chd;
210 }
211 }
212
213 if (chd->HasEnding())
214 {
215 CreateDOMSubTree(chd,
b1a3a964 216 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 217 cache);
b1a3a964 218 i = chd->GetEndIter2();
211dfedd
VS
219 }
220 else
b1a3a964 221 i = chd->GetBeginIter();
d1da8872 222
211dfedd
VS
223 textBeginning = i;
224 }
225
226 // ... or skip ending tag:
d699f48b 227 else
211dfedd 228 {
b1a3a964 229 while (i < end_pos && *i != wxT('>')) ++i;
211dfedd 230 textBeginning = i+1;
5526e819 231 }
5526e819 232 }
b1a3a964 233 else ++i;
5526e819
VS
234 }
235
211dfedd 236 // add remaining text to m_TextPieces:
b1a3a964
VS
237 if (end_pos > textBeginning)
238 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
239}
240
241void wxHtmlParser::DestroyDOMTree()
242{
243 wxHtmlTag *t1, *t2;
244 t1 = m_Tags;
245 while (t1)
246 {
247 t2 = t1->GetNextSibling();
248 delete t1;
249 t1 = t2;
250 }
251 m_Tags = m_CurTag = NULL;
252
253 delete m_TextPieces;
254 m_TextPieces = NULL;
255}
256
d699f48b 257void wxHtmlParser::DoParsing()
211dfedd
VS
258{
259 m_CurTag = m_Tags;
260 m_CurTextPiece = 0;
b1a3a964 261 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
262}
263
b1a3a964
VS
264void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
265 const wxString::const_iterator& end_pos)
211dfedd 266{
b1a3a964
VS
267 wxString::const_iterator begin_pos(begin_pos_);
268
269 if (end_pos <= begin_pos)
270 return;
d699f48b 271
211dfedd 272 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 273 size_t piecesCnt = pieces.size();
d699f48b 274
211dfedd
VS
275 while (begin_pos < end_pos)
276 {
b1a3a964 277 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 278 m_CurTag = m_CurTag->GetNextTag();
d699f48b 279 while (m_CurTextPiece < piecesCnt &&
b1a3a964 280 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
281 m_CurTextPiece++;
282
d699f48b
KB
283 if (m_CurTextPiece < piecesCnt &&
284 (!m_CurTag ||
b1a3a964 285 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
286 {
287 // Add text:
f23e92e7 288 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
289 wxString(pieces[m_CurTextPiece].m_start,
290 pieces[m_CurTextPiece].m_end)));
291 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
292 m_CurTextPiece++;
293 }
294 else if (m_CurTag)
295 {
902725ee 296 if (m_CurTag->HasEnding())
b1a3a964 297 begin_pos = m_CurTag->GetEndIter2();
902725ee 298 else
b1a3a964 299 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
300 wxHtmlTag *t = m_CurTag;
301 m_CurTag = m_CurTag->GetNextTag();
302 AddTag(*t);
2b5f62a0
VZ
303 if (m_stopParsing)
304 return;
211dfedd
VS
305 }
306 else break;
5526e819
VS
307 }
308}
309
5526e819
VS
310void wxHtmlParser::AddTag(const wxHtmlTag& tag)
311{
d1da8872 312 bool inner = false;
5526e819 313
2826ef0c
VS
314 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
315 if (h != m_HandlersHash.end())
2b5f62a0 316 {
2826ef0c 317 inner = h->second->HandleTag(tag);
2b5f62a0
VZ
318 if (m_stopParsing)
319 return;
320 }
04dbb646 321 if (!inner)
4f9297b0 322 {
5526e819 323 if (tag.HasEnding())
b1a3a964 324 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
325 }
326}
327
5526e819
VS
328void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
329{
4f9297b0 330 wxString s(handler->GetSupportedTags());
211dfedd 331 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 332
5526e819 333 while (tokenizer.HasMoreTokens())
2826ef0c 334 m_HandlersHash[tokenizer.GetNextToken()] = handler;
5526e819 335
2826ef0c 336 m_HandlersSet.insert(handler);
5526e819 337
4f9297b0 338 handler->SetParser(this);
5526e819
VS
339}
340
fbfb8bcc 341void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 342{
211dfedd 343 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
344 wxString key;
345
2826ef0c 346 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
a7a4d01b 347
04dbb646 348 while (tokenizer.HasMoreTokens())
4f9297b0 349 {
470252df 350 key = tokenizer.GetNextToken();
2826ef0c 351 m_HandlersHash[key] = handler;
a7a4d01b
VS
352 }
353}
354
a7a4d01b
VS
355void wxHtmlParser::PopTagHandler()
356{
2826ef0c
VS
357 wxCHECK_RET( !m_HandlersStack.empty(),
358 "attempt to remove HTML tag handler from empty stack" );
359
360 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
361 m_HandlersStack.pop_back();
362 m_HandlersHash = *prev;
363 delete prev;
a7a4d01b
VS
364}
365
211dfedd
VS
366void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
367{
368 wxHtmlParserState *s = new wxHtmlParserState;
369
370 s->m_curTag = m_CurTag;
371 s->m_tags = m_Tags;
372 s->m_textPieces = m_TextPieces;
373 s->m_curTextPiece = m_CurTextPiece;
374 s->m_source = m_Source;
375
376 s->m_nextState = m_SavedStates;
377 m_SavedStates = s;
378
379 m_CurTag = NULL;
380 m_Tags = NULL;
381 m_TextPieces = NULL;
382 m_CurTextPiece = 0;
b1a3a964 383 m_Source = NULL;
d699f48b 384
211dfedd
VS
385 SetSource(src);
386}
387
388bool wxHtmlParser::RestoreState()
389{
d1da8872 390 if (!m_SavedStates) return false;
d699f48b 391
0beefa20 392 DestroyDOMTree();
d989875a 393 delete m_Source;
0beefa20 394
211dfedd
VS
395 wxHtmlParserState *s = m_SavedStates;
396 m_SavedStates = s->m_nextState;
d699f48b 397
211dfedd
VS
398 m_CurTag = s->m_curTag;
399 m_Tags = s->m_tags;
400 m_TextPieces = s->m_textPieces;
401 m_CurTextPiece = s->m_curTextPiece;
402 m_Source = s->m_source;
d699f48b 403
211dfedd 404 delete s;
d1da8872 405 return true;
211dfedd
VS
406}
407
e7feeafa
VS
408wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
409{
b1a3a964 410 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
411}
412
5526e819
VS
413//-----------------------------------------------------------------------------
414// wxHtmlTagHandler
415//-----------------------------------------------------------------------------
416
417IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 418
e7feeafa
VS
419void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
420{
421 // It is safe to temporarily change the source being parsed,
422 // provided we restore the state back after parsing
423 m_Parser->SetSourceAndSaveState(source);
424 m_Parser->DoParsing();
425 m_Parser->RestoreState();
426}
427
daa616fc
VS
428
429//-----------------------------------------------------------------------------
430// wxHtmlEntitiesParser
431//-----------------------------------------------------------------------------
432
433IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
434
435wxHtmlEntitiesParser::wxHtmlEntitiesParser()
436#if wxUSE_WCHAR_T && !wxUSE_UNICODE
437 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 438#endif
daa616fc
VS
439{
440}
441
442wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
443{
5438a566 444#if wxUSE_WCHAR_T && !wxUSE_UNICODE
daa616fc 445 delete m_conv;
5438a566 446#endif
daa616fc 447}
5526e819 448
daa616fc
VS
449void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
450{
451#if wxUSE_WCHAR_T && !wxUSE_UNICODE
2b5f62a0
VZ
452 if (encoding == m_encoding)
453 return;
454
daa616fc 455 delete m_conv;
2b5f62a0 456
daa616fc 457 m_encoding = encoding;
2b5f62a0
VZ
458 if (m_encoding == wxFONTENCODING_SYSTEM)
459 m_conv = NULL;
460 else
daa616fc 461 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
d699f48b
KB
462#else
463 (void) encoding;
daa616fc
VS
464#endif
465}
466
96d665d2 467wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 468{
daa616fc 469 wxString output;
d1da8872 470
4f7e8fda
VS
471 const wxString::const_iterator end(input.end());
472 wxString::const_iterator c(input.begin());
473 wxString::const_iterator last(c);
04dbb646 474
4f7e8fda 475 for ( ; c < end; ++c )
daa616fc
VS
476 {
477 if (*c == wxT('&'))
478 {
4f7e8fda
VS
479 if ( output.empty() )
480 output.reserve(input.length());
481
daa616fc 482 if (c - last > 0)
4f7e8fda
VS
483 output.append(last, c);
484 if ( ++c == end )
9e2bd135 485 break;
d1da8872 486
daa616fc 487 wxString entity;
4f7e8fda 488 const wxString::const_iterator ent_s = c;
470252df 489 wxChar entity_char;
d1da8872 490
b1a3a964
VS
491 for ( ; c != end; ++c )
492 {
493 wxChar ch = *c;
494 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
495 (ch >= wxT('A') && ch <= wxT('Z')) ||
496 (ch >= wxT('0') && ch <= wxT('9')) ||
497 ch == wxT('_') || ch == wxT('#')) )
498 break;
499 }
500
4f7e8fda
VS
501 entity.append(ent_s, c);
502 if (c == end || *c != wxT(';')) --c;
211dfedd 503 last = c+1;
470252df
VS
504 entity_char = GetEntityChar(entity);
505 if (entity_char)
506 output << entity_char;
507 else
508 {
4f7e8fda 509 output.append(ent_s-1, c+1);
25271309 510 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
511 "Unrecognized HTML entity: '%s'",
512 entity);
470252df 513 }
daa616fc
VS
514 }
515 }
4f7e8fda
VS
516 if ( last == input.begin() ) // common case: no entity
517 return input;
518 if ( last != end )
519 output.append(last, end);
daa616fc
VS
520 return output;
521}
522
2b5f62a0 523#if !wxUSE_UNICODE
96d665d2 524wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 525{
2b5f62a0 526#if wxUSE_WCHAR_T
daa616fc
VS
527 char buf[2];
528 wchar_t wbuf[2];
529 wbuf[0] = (wchar_t)code;
530 wbuf[1] = 0;
531 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 532 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
533 return '?';
534 return buf[0];
535#else
536 return (code < 256) ? (wxChar)code : '?';
537#endif
538}
2b5f62a0 539#endif
daa616fc 540
19817fd3
VS
541struct wxHtmlEntityInfo
542{
543 const wxStringCharType *name;
544 unsigned code;
545};
546
547extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
548{
549#if wxUSE_UNICODE_UTF8
550 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
551#else
552 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
553#endif
554}
555
96d665d2 556wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
557{
558 unsigned code = 0;
04dbb646 559
daa616fc
VS
560 if (entity[0] == wxT('#'))
561 {
c471f7e1
VS
562 // NB: parsed value is a number, so it's OK to use wx_str(), internal
563 // representation is the same for numbers
564 const wxStringCharType *ent_s = entity.wx_str();
565 const wxStringCharType *format;
04dbb646 566
d9359369 567 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
daa616fc 568 {
d9359369 569 format = wxS("%x");
daa616fc
VS
570 ent_s++;
571 }
572 else
d9359369 573 format = wxS("%u");
daa616fc
VS
574 ent_s++;
575
576 if (wxSscanf(ent_s, format, &code) != 1)
577 code = 0;
578 }
579 else
580 {
19817fd3
VS
581 // store the literals in wx's internal representation (either char*
582 // in UTF-8 or wchar_t*) for best performance:
d9359369 583 #define ENTITY(name, code) { wxS(name), code }
19817fd3 584
daa616fc 585 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
586 ENTITY("AElig", 198),
587 ENTITY("Aacute", 193),
588 ENTITY("Acirc", 194),
589 ENTITY("Agrave", 192),
590 ENTITY("Alpha", 913),
591 ENTITY("Aring", 197),
592 ENTITY("Atilde", 195),
593 ENTITY("Auml", 196),
594 ENTITY("Beta", 914),
595 ENTITY("Ccedil", 199),
596 ENTITY("Chi", 935),
597 ENTITY("Dagger", 8225),
598 ENTITY("Delta", 916),
599 ENTITY("ETH", 208),
600 ENTITY("Eacute", 201),
601 ENTITY("Ecirc", 202),
602 ENTITY("Egrave", 200),
603 ENTITY("Epsilon", 917),
604 ENTITY("Eta", 919),
605 ENTITY("Euml", 203),
606 ENTITY("Gamma", 915),
607 ENTITY("Iacute", 205),
608 ENTITY("Icirc", 206),
609 ENTITY("Igrave", 204),
610 ENTITY("Iota", 921),
611 ENTITY("Iuml", 207),
612 ENTITY("Kappa", 922),
613 ENTITY("Lambda", 923),
614 ENTITY("Mu", 924),
615 ENTITY("Ntilde", 209),
616 ENTITY("Nu", 925),
617 ENTITY("OElig", 338),
618 ENTITY("Oacute", 211),
619 ENTITY("Ocirc", 212),
620 ENTITY("Ograve", 210),
621 ENTITY("Omega", 937),
622 ENTITY("Omicron", 927),
623 ENTITY("Oslash", 216),
624 ENTITY("Otilde", 213),
625 ENTITY("Ouml", 214),
626 ENTITY("Phi", 934),
627 ENTITY("Pi", 928),
628 ENTITY("Prime", 8243),
629 ENTITY("Psi", 936),
630 ENTITY("Rho", 929),
631 ENTITY("Scaron", 352),
632 ENTITY("Sigma", 931),
633 ENTITY("THORN", 222),
634 ENTITY("Tau", 932),
635 ENTITY("Theta", 920),
636 ENTITY("Uacute", 218),
637 ENTITY("Ucirc", 219),
638 ENTITY("Ugrave", 217),
639 ENTITY("Upsilon", 933),
640 ENTITY("Uuml", 220),
641 ENTITY("Xi", 926),
642 ENTITY("Yacute", 221),
643 ENTITY("Yuml", 376),
644 ENTITY("Zeta", 918),
645 ENTITY("aacute", 225),
646 ENTITY("acirc", 226),
647 ENTITY("acute", 180),
648 ENTITY("aelig", 230),
649 ENTITY("agrave", 224),
650 ENTITY("alefsym", 8501),
651 ENTITY("alpha", 945),
652 ENTITY("amp", 38),
653 ENTITY("and", 8743),
654 ENTITY("ang", 8736),
655 ENTITY("aring", 229),
656 ENTITY("asymp", 8776),
657 ENTITY("atilde", 227),
658 ENTITY("auml", 228),
659 ENTITY("bdquo", 8222),
660 ENTITY("beta", 946),
661 ENTITY("brvbar", 166),
662 ENTITY("bull", 8226),
663 ENTITY("cap", 8745),
664 ENTITY("ccedil", 231),
665 ENTITY("cedil", 184),
666 ENTITY("cent", 162),
667 ENTITY("chi", 967),
668 ENTITY("circ", 710),
669 ENTITY("clubs", 9827),
670 ENTITY("cong", 8773),
671 ENTITY("copy", 169),
672 ENTITY("crarr", 8629),
673 ENTITY("cup", 8746),
674 ENTITY("curren", 164),
675 ENTITY("dArr", 8659),
676 ENTITY("dagger", 8224),
677 ENTITY("darr", 8595),
678 ENTITY("deg", 176),
679 ENTITY("delta", 948),
680 ENTITY("diams", 9830),
681 ENTITY("divide", 247),
682 ENTITY("eacute", 233),
683 ENTITY("ecirc", 234),
684 ENTITY("egrave", 232),
685 ENTITY("empty", 8709),
686 ENTITY("emsp", 8195),
687 ENTITY("ensp", 8194),
688 ENTITY("epsilon", 949),
689 ENTITY("equiv", 8801),
690 ENTITY("eta", 951),
691 ENTITY("eth", 240),
692 ENTITY("euml", 235),
693 ENTITY("euro", 8364),
694 ENTITY("exist", 8707),
695 ENTITY("fnof", 402),
696 ENTITY("forall", 8704),
697 ENTITY("frac12", 189),
698 ENTITY("frac14", 188),
699 ENTITY("frac34", 190),
700 ENTITY("frasl", 8260),
701 ENTITY("gamma", 947),
702 ENTITY("ge", 8805),
703 ENTITY("gt", 62),
704 ENTITY("hArr", 8660),
705 ENTITY("harr", 8596),
706 ENTITY("hearts", 9829),
707 ENTITY("hellip", 8230),
708 ENTITY("iacute", 237),
709 ENTITY("icirc", 238),
710 ENTITY("iexcl", 161),
711 ENTITY("igrave", 236),
712 ENTITY("image", 8465),
713 ENTITY("infin", 8734),
714 ENTITY("int", 8747),
715 ENTITY("iota", 953),
716 ENTITY("iquest", 191),
717 ENTITY("isin", 8712),
718 ENTITY("iuml", 239),
719 ENTITY("kappa", 954),
720 ENTITY("lArr", 8656),
721 ENTITY("lambda", 955),
722 ENTITY("lang", 9001),
723 ENTITY("laquo", 171),
724 ENTITY("larr", 8592),
725 ENTITY("lceil", 8968),
726 ENTITY("ldquo", 8220),
727 ENTITY("le", 8804),
728 ENTITY("lfloor", 8970),
729 ENTITY("lowast", 8727),
730 ENTITY("loz", 9674),
731 ENTITY("lrm", 8206),
732 ENTITY("lsaquo", 8249),
733 ENTITY("lsquo", 8216),
734 ENTITY("lt", 60),
735 ENTITY("macr", 175),
736 ENTITY("mdash", 8212),
737 ENTITY("micro", 181),
738 ENTITY("middot", 183),
739 ENTITY("minus", 8722),
740 ENTITY("mu", 956),
741 ENTITY("nabla", 8711),
742 ENTITY("nbsp", 160),
743 ENTITY("ndash", 8211),
744 ENTITY("ne", 8800),
745 ENTITY("ni", 8715),
746 ENTITY("not", 172),
747 ENTITY("notin", 8713),
748 ENTITY("nsub", 8836),
749 ENTITY("ntilde", 241),
750 ENTITY("nu", 957),
751 ENTITY("oacute", 243),
752 ENTITY("ocirc", 244),
753 ENTITY("oelig", 339),
754 ENTITY("ograve", 242),
755 ENTITY("oline", 8254),
756 ENTITY("omega", 969),
757 ENTITY("omicron", 959),
758 ENTITY("oplus", 8853),
759 ENTITY("or", 8744),
760 ENTITY("ordf", 170),
761 ENTITY("ordm", 186),
762 ENTITY("oslash", 248),
763 ENTITY("otilde", 245),
764 ENTITY("otimes", 8855),
765 ENTITY("ouml", 246),
766 ENTITY("para", 182),
767 ENTITY("part", 8706),
768 ENTITY("permil", 8240),
769 ENTITY("perp", 8869),
770 ENTITY("phi", 966),
771 ENTITY("pi", 960),
772 ENTITY("piv", 982),
773 ENTITY("plusmn", 177),
774 ENTITY("pound", 163),
775 ENTITY("prime", 8242),
776 ENTITY("prod", 8719),
777 ENTITY("prop", 8733),
778 ENTITY("psi", 968),
779 ENTITY("quot", 34),
780 ENTITY("rArr", 8658),
781 ENTITY("radic", 8730),
782 ENTITY("rang", 9002),
783 ENTITY("raquo", 187),
784 ENTITY("rarr", 8594),
785 ENTITY("rceil", 8969),
786 ENTITY("rdquo", 8221),
787 ENTITY("real", 8476),
788 ENTITY("reg", 174),
789 ENTITY("rfloor", 8971),
790 ENTITY("rho", 961),
791 ENTITY("rlm", 8207),
792 ENTITY("rsaquo", 8250),
793 ENTITY("rsquo", 8217),
794 ENTITY("sbquo", 8218),
795 ENTITY("scaron", 353),
796 ENTITY("sdot", 8901),
797 ENTITY("sect", 167),
798 ENTITY("shy", 173),
799 ENTITY("sigma", 963),
800 ENTITY("sigmaf", 962),
801 ENTITY("sim", 8764),
802 ENTITY("spades", 9824),
803 ENTITY("sub", 8834),
804 ENTITY("sube", 8838),
805 ENTITY("sum", 8721),
806 ENTITY("sup", 8835),
807 ENTITY("sup1", 185),
808 ENTITY("sup2", 178),
809 ENTITY("sup3", 179),
810 ENTITY("supe", 8839),
811 ENTITY("szlig", 223),
812 ENTITY("tau", 964),
813 ENTITY("there4", 8756),
814 ENTITY("theta", 952),
815 ENTITY("thetasym", 977),
816 ENTITY("thinsp", 8201),
817 ENTITY("thorn", 254),
818 ENTITY("tilde", 732),
819 ENTITY("times", 215),
820 ENTITY("trade", 8482),
821 ENTITY("uArr", 8657),
822 ENTITY("uacute", 250),
823 ENTITY("uarr", 8593),
824 ENTITY("ucirc", 251),
825 ENTITY("ugrave", 249),
826 ENTITY("uml", 168),
827 ENTITY("upsih", 978),
828 ENTITY("upsilon", 965),
829 ENTITY("uuml", 252),
830 ENTITY("weierp", 8472),
831 ENTITY("xi", 958),
832 ENTITY("yacute", 253),
833 ENTITY("yen", 165),
834 ENTITY("yuml", 255),
835 ENTITY("zeta", 950),
836 ENTITY("zwj", 8205),
837 ENTITY("zwnj", 8204),
daa616fc 838 {NULL, 0}};
19817fd3 839 #undef ENTITY
daa616fc 840 static size_t substitutions_cnt = 0;
04dbb646 841
daa616fc
VS
842 if (substitutions_cnt == 0)
843 while (substitutions[substitutions_cnt].code != 0)
844 substitutions_cnt++;
845
3919d530
JS
846 wxHtmlEntityInfo *info = NULL;
847#ifdef __WXWINCE__
848 // bsearch crashes under WinCE for some reason
849 size_t i;
850 for (i = 0; i < substitutions_cnt; i++)
851 {
852 if (entity == substitutions[i].name)
853 {
854 info = & substitutions[i];
855 break;
856 }
857 }
858#else
19817fd3 859 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
860 substitutions_cnt,
861 sizeof(wxHtmlEntityInfo),
90350682 862 wxHtmlEntityCompare);
3919d530 863#endif
daa616fc
VS
864 if (info)
865 code = info->code;
866 }
04dbb646 867
daa616fc 868 if (code == 0)
470252df 869 return 0;
daa616fc
VS
870 else
871 return GetCharForCode(code);
872}
873
d1da8872 874wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
6cc4e6b8
VS
875 const wxString& url) const
876{
e02ecf7c 877 return m_FS ? m_FS->OpenFile(url) : NULL;
d1da8872 878
6cc4e6b8
VS
879}
880
2b5f62a0
VZ
881
882//-----------------------------------------------------------------------------
883// wxHtmlParser::ExtractCharsetInformation
884//-----------------------------------------------------------------------------
885
886class wxMetaTagParser : public wxHtmlParser
887{
888public:
2eb10e2a
VZ
889 wxMetaTagParser() { }
890
2b5f62a0 891 wxObject* GetProduct() { return NULL; }
2eb10e2a 892
2b5f62a0 893protected:
5bce3e6f 894 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a
VZ
895
896 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
2b5f62a0
VZ
897};
898
899class wxMetaTagHandler : public wxHtmlTagHandler
900{
901public:
902 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
903 wxString GetSupportedTags() { return wxT("META,BODY"); }
904 bool HandleTag(const wxHtmlTag& tag);
905
906private:
907 wxString *m_retval;
2eb10e2a
VZ
908
909 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
2b5f62a0
VZ
910};
911
912bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
913{
914 if (tag.GetName() == _T("BODY"))
915 {
916 m_Parser->StopParsing();
d1da8872 917 return false;
2b5f62a0
VZ
918 }
919
920 if (tag.HasParam(_T("HTTP-EQUIV")) &&
13fd234c 921 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
2b5f62a0
VZ
922 tag.HasParam(_T("CONTENT")))
923 {
5af11a94 924 wxString content = tag.GetParam(_T("CONTENT")).Lower();
2b5f62a0
VZ
925 if (content.Left(19) == _T("text/html; charset="))
926 {
927 *m_retval = content.Mid(19);
928 m_Parser->StopParsing();
929 }
930 }
d1da8872 931 return false;
2b5f62a0
VZ
932}
933
934
935/*static*/
936wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
937{
938 wxString charset;
e7274ba2
WS
939 wxMetaTagParser *parser = new wxMetaTagParser();
940 if(parser)
941 {
942 parser->AddTagHandler(new wxMetaTagHandler(&charset));
943 parser->Parse(markup);
944 delete parser;
945 }
2b5f62a0
VZ
946 return charset;
947}
948
4609ee2e
VZ
949/* static */
950bool
951wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
952 wxString::const_iterator end)
953{
954 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
955
956 wxString::const_iterator p = start;
957
958 // comments begin with "<!--" in HTML 4.0
95ebbfe1 959 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
4609ee2e
VZ
960 {
961 // not a comment at all
962 return false;
963 }
964
965 // skip the start of the comment tag in any case, if we don't find the
966 // closing tag we should ignore broken markup
967 start = p;
968
969 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
970 // comment delimiter and the closing tag character (section 3.2.4 of
971 // http://www.w3.org/TR/html401/)
972 int dashes = 0;
973 while ( ++p < end )
974 {
975 const wxChar c = *p;
976
977 if ( (c == wxT(' ') || c == wxT('\n') ||
978 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
979 {
980 // ignore white space before potential tag end
981 continue;
982 }
983
984 if ( c == wxT('>') && dashes >= 2 )
985 {
986 // found end of comment
987 start = p;
988 break;
989 }
990
991 if ( c == wxT('-') )
992 dashes++;
993 else
994 dashes = 0;
995 }
996
997 return true;
998}
999
1000#endif // wxUSE_HTML