]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
wxRTC: fixed guidelines overwriting adjacent cell borders; corrected capitalisation...
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
5// Copyright: (c) 1999 Vaclav Slavik
65571936 6// Licence: wxWindows licence
5526e819
VS
7/////////////////////////////////////////////////////////////////////////////
8
3096bd2f 9#include "wx/wxprec.h"
5526e819 10
2b5f62a0 11#ifdef __BORLANDC__
93763ad5 12 #pragma hdrstop
5526e819
VS
13#endif
14
93763ad5
WS
15#if wxUSE_HTML && wxUSE_STREAMS
16
b4f4d3dd 17#ifndef WX_PRECOMP
ad9835c9 18 #include "wx/dynarray.h"
04dbb646
VZ
19 #include "wx/log.h"
20 #include "wx/intl.h"
670f9935 21 #include "wx/app.h"
193d0c93 22 #include "wx/wxcrtvararg.h"
5526e819
VS
23#endif
24
69941f05
VS
25#include "wx/tokenzr.h"
26#include "wx/wfstream.h"
27#include "wx/url.h"
daa616fc 28#include "wx/fontmap.h"
69941f05
VS
29#include "wx/html/htmldefs.h"
30#include "wx/html/htmlpars.h"
3c47c047 31#include "wx/vector.h"
5526e819 32
7127d129
RR
33#ifdef __WXWINCE__
34 #include "wx/msw/wince/missing.h" // for bsearch()
35#endif
34fdf762
VS
36
37// DLL options compatibility check:
34fdf762 38WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 39
9a83f860 40const wxChar *wxTRACE_HTML_DEBUG = wxT("htmldebug");
25271309 41
211dfedd
VS
42//-----------------------------------------------------------------------------
43// wxHtmlParser helpers
44//-----------------------------------------------------------------------------
45
46class wxHtmlTextPiece
47{
48public:
3c47c047 49 wxHtmlTextPiece() {}
b1a3a964
VS
50 wxHtmlTextPiece(const wxString::const_iterator& start,
51 const wxString::const_iterator& end)
52 : m_start(start), m_end(end) {}
53 wxString::const_iterator m_start, m_end;
211dfedd
VS
54};
55
3c47c047
VS
56// NB: this is an empty class and not typedef because of forward declaration
57class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
58{
59};
5526e819 60
481c879b 61class wxHtmlParserState
211dfedd 62{
481c879b 63public:
211dfedd
VS
64 wxHtmlTag *m_curTag;
65 wxHtmlTag *m_tags;
66 wxHtmlTextPieces *m_textPieces;
67 int m_curTextPiece;
b1a3a964 68 const wxString *m_source;
211dfedd
VS
69 wxHtmlParserState *m_nextState;
70};
5526e819
VS
71
72//-----------------------------------------------------------------------------
73// wxHtmlParser
74//-----------------------------------------------------------------------------
75
76IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
77
04dbb646 78wxHtmlParser::wxHtmlParser()
2826ef0c
VS
79 : wxObject(),
80 m_FS(NULL)
daa616fc 81{
b1a3a964 82 m_Source = NULL;
daa616fc 83 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
84 m_Tags = NULL;
85 m_CurTag = NULL;
86 m_TextPieces = NULL;
87 m_CurTextPiece = 0;
88 m_SavedStates = NULL;
daa616fc
VS
89}
90
91wxHtmlParser::~wxHtmlParser()
92{
0beefa20
VS
93 while (RestoreState()) {}
94 DestroyDOMTree();
222ed1d6 95
2826ef0c
VS
96 WX_CLEAR_ARRAY(m_HandlersStack);
97 WX_CLEAR_HASH_SET(wxHtmlTagHandlersSet, m_HandlersSet);
daa616fc 98 delete m_entitiesParser;
b1a3a964 99 delete m_Source;
daa616fc 100}
5526e819
VS
101
102wxObject* wxHtmlParser::Parse(const wxString& source)
103{
5526e819
VS
104 InitParser(source);
105 DoParsing();
2b5f62a0 106 wxObject *result = GetProduct();
5526e819
VS
107 DoneParser();
108 return result;
109}
110
5526e819
VS
111void wxHtmlParser::InitParser(const wxString& source)
112{
1309ba6c 113 SetSource(source);
d1da8872 114 m_stopParsing = false;
5526e819 115}
1309ba6c 116
5526e819
VS
117void wxHtmlParser::DoneParser()
118{
211dfedd 119 DestroyDOMTree();
5526e819
VS
120}
121
1309ba6c
VS
122void wxHtmlParser::SetSource(const wxString& src)
123{
211dfedd 124 DestroyDOMTree();
d989875a
VS
125 // NB: This is allocated on heap because wxHtmlTag uses iterators and
126 // making a copy of m_Source string in SetSourceAndSaveState() and
127 // RestoreState() would invalidate them (because wxString::m_impl's
128 // memory would change completely twice and iterators use pointers
129 // into it). So instead, we keep the string object intact and only
130 // store/restore pointer to it, for which we need it to be allocated
131 // on the heap.
b1a3a964
VS
132 delete m_Source;
133 m_Source = new wxString(src);
211dfedd
VS
134 CreateDOMTree();
135 m_CurTag = NULL;
136 m_CurTextPiece = 0;
1309ba6c 137}
5526e819 138
211dfedd 139void wxHtmlParser::CreateDOMTree()
5526e819 140{
b1a3a964 141 wxHtmlTagsCache cache(*m_Source);
211dfedd 142 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 143 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
144 m_CurTextPiece = 0;
145}
5526e819 146
b1a3a964 147extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 148
211dfedd 149void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
150 const wxString::const_iterator& begin_pos,
151 const wxString::const_iterator& end_pos,
211dfedd
VS
152 wxHtmlTagsCache *cache)
153{
b1a3a964
VS
154 if (end_pos <= begin_pos)
155 return;
5526e819 156
211dfedd 157 wxChar c;
b1a3a964
VS
158 wxString::const_iterator i = begin_pos;
159 wxString::const_iterator textBeginning = begin_pos;
d699f48b 160
7c6cd4a8
VS
161 // If the tag contains CDATA text, we include the text between beginning
162 // and ending tag verbosely. Setting i=end_pos will skip to the very
163 // end of this function where text piece is added, bypassing any child
164 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 165 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
166 {
167 i = end_pos;
168 }
169
04dbb646 170 while (i < end_pos)
4f9297b0 171 {
b1a3a964 172 c = *i;
5526e819 173
211dfedd
VS
174 if (c == wxT('<'))
175 {
176 // add text to m_TextPieces:
b1a3a964
VS
177 if (i > textBeginning)
178 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
179
180 // if it is a comment, skip it:
b1a3a964 181 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 182 {
b1a3a964 183 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 184 }
d699f48b 185
211dfedd 186 // add another tag to the tree:
b1a3a964 187 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 188 {
211dfedd 189 wxHtmlTag *chd;
d699f48b
KB
190 if (cur)
191 chd = new wxHtmlTag(cur, m_Source,
211dfedd 192 i, end_pos, cache, m_entitiesParser);
d699f48b 193 else
211dfedd
VS
194 {
195 chd = new wxHtmlTag(NULL, m_Source,
196 i, end_pos, cache, m_entitiesParser);
d699f48b 197 if (!m_Tags)
211dfedd 198 {
d699f48b 199 // if this is the first tag to be created make the root
211dfedd
VS
200 // m_Tags point to it:
201 m_Tags = chd;
202 }
203 else
204 {
d699f48b 205 // if there is already a root tag add this tag as
211dfedd
VS
206 // the last sibling:
207 chd->m_Prev = m_Tags->GetLastSibling();
208 chd->m_Prev->m_Next = chd;
209 }
210 }
211
212 if (chd->HasEnding())
213 {
214 CreateDOMSubTree(chd,
b1a3a964 215 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 216 cache);
b1a3a964 217 i = chd->GetEndIter2();
211dfedd
VS
218 }
219 else
b1a3a964 220 i = chd->GetBeginIter();
d1da8872 221
211dfedd
VS
222 textBeginning = i;
223 }
224
225 // ... or skip ending tag:
d699f48b 226 else
211dfedd 227 {
b1a3a964 228 while (i < end_pos && *i != wxT('>')) ++i;
36258204 229 textBeginning = i < end_pos ? i+1 : i;
5526e819 230 }
5526e819 231 }
b1a3a964 232 else ++i;
5526e819
VS
233 }
234
211dfedd 235 // add remaining text to m_TextPieces:
b1a3a964
VS
236 if (end_pos > textBeginning)
237 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
238}
239
240void wxHtmlParser::DestroyDOMTree()
241{
242 wxHtmlTag *t1, *t2;
243 t1 = m_Tags;
244 while (t1)
245 {
246 t2 = t1->GetNextSibling();
247 delete t1;
248 t1 = t2;
249 }
250 m_Tags = m_CurTag = NULL;
251
5276b0a5 252 wxDELETE(m_TextPieces);
211dfedd
VS
253}
254
d699f48b 255void wxHtmlParser::DoParsing()
211dfedd
VS
256{
257 m_CurTag = m_Tags;
258 m_CurTextPiece = 0;
b1a3a964 259 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
260}
261
b1a3a964
VS
262void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
263 const wxString::const_iterator& end_pos)
211dfedd 264{
b1a3a964
VS
265 wxString::const_iterator begin_pos(begin_pos_);
266
267 if (end_pos <= begin_pos)
268 return;
d699f48b 269
211dfedd 270 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 271 size_t piecesCnt = pieces.size();
d699f48b 272
211dfedd
VS
273 while (begin_pos < end_pos)
274 {
b1a3a964 275 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 276 m_CurTag = m_CurTag->GetNextTag();
d699f48b 277 while (m_CurTextPiece < piecesCnt &&
b1a3a964 278 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
279 m_CurTextPiece++;
280
d699f48b
KB
281 if (m_CurTextPiece < piecesCnt &&
282 (!m_CurTag ||
b1a3a964 283 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
284 {
285 // Add text:
f23e92e7 286 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
287 wxString(pieces[m_CurTextPiece].m_start,
288 pieces[m_CurTextPiece].m_end)));
289 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
290 m_CurTextPiece++;
291 }
292 else if (m_CurTag)
293 {
902725ee 294 if (m_CurTag->HasEnding())
b1a3a964 295 begin_pos = m_CurTag->GetEndIter2();
902725ee 296 else
b1a3a964 297 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
298 wxHtmlTag *t = m_CurTag;
299 m_CurTag = m_CurTag->GetNextTag();
300 AddTag(*t);
2b5f62a0
VZ
301 if (m_stopParsing)
302 return;
211dfedd
VS
303 }
304 else break;
5526e819
VS
305 }
306}
307
5526e819
VS
308void wxHtmlParser::AddTag(const wxHtmlTag& tag)
309{
d1da8872 310 bool inner = false;
5526e819 311
2826ef0c
VS
312 wxHtmlTagHandlersHash::const_iterator h = m_HandlersHash.find(tag.GetName());
313 if (h != m_HandlersHash.end())
2b5f62a0 314 {
2826ef0c 315 inner = h->second->HandleTag(tag);
2b5f62a0
VZ
316 if (m_stopParsing)
317 return;
318 }
04dbb646 319 if (!inner)
4f9297b0 320 {
5526e819 321 if (tag.HasEnding())
b1a3a964 322 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
323 }
324}
325
5526e819
VS
326void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
327{
4f9297b0 328 wxString s(handler->GetSupportedTags());
211dfedd 329 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 330
5526e819 331 while (tokenizer.HasMoreTokens())
2826ef0c 332 m_HandlersHash[tokenizer.GetNextToken()] = handler;
5526e819 333
2826ef0c 334 m_HandlersSet.insert(handler);
5526e819 335
4f9297b0 336 handler->SetParser(this);
5526e819
VS
337}
338
fbfb8bcc 339void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 340{
211dfedd 341 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
342 wxString key;
343
2826ef0c 344 m_HandlersStack.push_back(new wxHtmlTagHandlersHash(m_HandlersHash));
a7a4d01b 345
04dbb646 346 while (tokenizer.HasMoreTokens())
4f9297b0 347 {
470252df 348 key = tokenizer.GetNextToken();
2826ef0c 349 m_HandlersHash[key] = handler;
a7a4d01b
VS
350 }
351}
352
a7a4d01b
VS
353void wxHtmlParser::PopTagHandler()
354{
2826ef0c
VS
355 wxCHECK_RET( !m_HandlersStack.empty(),
356 "attempt to remove HTML tag handler from empty stack" );
357
358 wxHtmlTagHandlersHash *prev = m_HandlersStack.back();
359 m_HandlersStack.pop_back();
360 m_HandlersHash = *prev;
361 delete prev;
a7a4d01b
VS
362}
363
211dfedd
VS
364void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
365{
366 wxHtmlParserState *s = new wxHtmlParserState;
367
368 s->m_curTag = m_CurTag;
369 s->m_tags = m_Tags;
370 s->m_textPieces = m_TextPieces;
371 s->m_curTextPiece = m_CurTextPiece;
372 s->m_source = m_Source;
373
374 s->m_nextState = m_SavedStates;
375 m_SavedStates = s;
376
377 m_CurTag = NULL;
378 m_Tags = NULL;
379 m_TextPieces = NULL;
380 m_CurTextPiece = 0;
b1a3a964 381 m_Source = NULL;
d699f48b 382
211dfedd
VS
383 SetSource(src);
384}
385
386bool wxHtmlParser::RestoreState()
387{
d1da8872 388 if (!m_SavedStates) return false;
d699f48b 389
0beefa20 390 DestroyDOMTree();
d989875a 391 delete m_Source;
0beefa20 392
211dfedd
VS
393 wxHtmlParserState *s = m_SavedStates;
394 m_SavedStates = s->m_nextState;
d699f48b 395
211dfedd
VS
396 m_CurTag = s->m_curTag;
397 m_Tags = s->m_tags;
398 m_TextPieces = s->m_textPieces;
399 m_CurTextPiece = s->m_curTextPiece;
400 m_Source = s->m_source;
d699f48b 401
211dfedd 402 delete s;
d1da8872 403 return true;
211dfedd
VS
404}
405
e7feeafa
VS
406wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
407{
b1a3a964 408 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
409}
410
5526e819
VS
411//-----------------------------------------------------------------------------
412// wxHtmlTagHandler
413//-----------------------------------------------------------------------------
414
415IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 416
e7feeafa
VS
417void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
418{
419 // It is safe to temporarily change the source being parsed,
420 // provided we restore the state back after parsing
421 m_Parser->SetSourceAndSaveState(source);
422 m_Parser->DoParsing();
423 m_Parser->RestoreState();
424}
425
daa616fc
VS
426
427//-----------------------------------------------------------------------------
428// wxHtmlEntitiesParser
429//-----------------------------------------------------------------------------
430
431IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
432
433wxHtmlEntitiesParser::wxHtmlEntitiesParser()
8d94819c 434#if !wxUSE_UNICODE
daa616fc 435 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 436#endif
daa616fc
VS
437{
438}
439
440wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
441{
8d94819c 442#if !wxUSE_UNICODE
daa616fc 443 delete m_conv;
5438a566 444#endif
daa616fc 445}
5526e819 446
8d94819c 447#if !wxUSE_UNICODE
daa616fc
VS
448void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
449{
2b5f62a0
VZ
450 if (encoding == m_encoding)
451 return;
452
daa616fc 453 delete m_conv;
2b5f62a0 454
daa616fc 455 m_encoding = encoding;
2b5f62a0
VZ
456 if (m_encoding == wxFONTENCODING_SYSTEM)
457 m_conv = NULL;
458 else
daa616fc 459 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
daa616fc 460}
8d94819c 461#endif // !wxUSE_UNICODE
daa616fc 462
96d665d2 463wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 464{
daa616fc 465 wxString output;
d1da8872 466
4f7e8fda
VS
467 const wxString::const_iterator end(input.end());
468 wxString::const_iterator c(input.begin());
469 wxString::const_iterator last(c);
04dbb646 470
4f7e8fda 471 for ( ; c < end; ++c )
daa616fc
VS
472 {
473 if (*c == wxT('&'))
474 {
4f7e8fda
VS
475 if ( output.empty() )
476 output.reserve(input.length());
477
daa616fc 478 if (c - last > 0)
4f7e8fda
VS
479 output.append(last, c);
480 if ( ++c == end )
9e2bd135 481 break;
d1da8872 482
daa616fc 483 wxString entity;
4f7e8fda 484 const wxString::const_iterator ent_s = c;
470252df 485 wxChar entity_char;
d1da8872 486
b1a3a964
VS
487 for ( ; c != end; ++c )
488 {
489 wxChar ch = *c;
490 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
491 (ch >= wxT('A') && ch <= wxT('Z')) ||
492 (ch >= wxT('0') && ch <= wxT('9')) ||
493 ch == wxT('_') || ch == wxT('#')) )
494 break;
495 }
496
4f7e8fda
VS
497 entity.append(ent_s, c);
498 if (c == end || *c != wxT(';')) --c;
211dfedd 499 last = c+1;
470252df
VS
500 entity_char = GetEntityChar(entity);
501 if (entity_char)
502 output << entity_char;
503 else
504 {
4f7e8fda 505 output.append(ent_s-1, c+1);
25271309 506 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
507 "Unrecognized HTML entity: '%s'",
508 entity);
470252df 509 }
daa616fc
VS
510 }
511 }
4f7e8fda
VS
512 if ( last == input.begin() ) // common case: no entity
513 return input;
514 if ( last != end )
515 output.append(last, end);
daa616fc
VS
516 return output;
517}
518
2b5f62a0 519#if !wxUSE_UNICODE
96d665d2 520wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 521{
daa616fc
VS
522 char buf[2];
523 wchar_t wbuf[2];
524 wbuf[0] = (wchar_t)code;
525 wbuf[1] = 0;
526 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 527 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
528 return '?';
529 return buf[0];
daa616fc 530}
2b5f62a0 531#endif
daa616fc 532
19817fd3
VS
533struct wxHtmlEntityInfo
534{
535 const wxStringCharType *name;
536 unsigned code;
537};
538
539extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
540{
541#if wxUSE_UNICODE_UTF8
542 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
543#else
544 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
545#endif
546}
547
96d665d2 548wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
549{
550 unsigned code = 0;
04dbb646 551
c0213e2a
VS
552 if (entity.empty())
553 return 0; // invalid entity reference
554
daa616fc
VS
555 if (entity[0] == wxT('#'))
556 {
c471f7e1
VS
557 // NB: parsed value is a number, so it's OK to use wx_str(), internal
558 // representation is the same for numbers
559 const wxStringCharType *ent_s = entity.wx_str();
560 const wxStringCharType *format;
04dbb646 561
d9359369 562 if (ent_s[1] == wxS('x') || ent_s[1] == wxS('X'))
daa616fc 563 {
d9359369 564 format = wxS("%x");
daa616fc
VS
565 ent_s++;
566 }
567 else
d9359369 568 format = wxS("%u");
daa616fc
VS
569 ent_s++;
570
571 if (wxSscanf(ent_s, format, &code) != 1)
572 code = 0;
573 }
574 else
575 {
19817fd3
VS
576 // store the literals in wx's internal representation (either char*
577 // in UTF-8 or wchar_t*) for best performance:
d9359369 578 #define ENTITY(name, code) { wxS(name), code }
19817fd3 579
daa616fc 580 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
581 ENTITY("AElig", 198),
582 ENTITY("Aacute", 193),
583 ENTITY("Acirc", 194),
584 ENTITY("Agrave", 192),
585 ENTITY("Alpha", 913),
586 ENTITY("Aring", 197),
587 ENTITY("Atilde", 195),
588 ENTITY("Auml", 196),
589 ENTITY("Beta", 914),
590 ENTITY("Ccedil", 199),
591 ENTITY("Chi", 935),
592 ENTITY("Dagger", 8225),
593 ENTITY("Delta", 916),
594 ENTITY("ETH", 208),
595 ENTITY("Eacute", 201),
596 ENTITY("Ecirc", 202),
597 ENTITY("Egrave", 200),
598 ENTITY("Epsilon", 917),
599 ENTITY("Eta", 919),
600 ENTITY("Euml", 203),
601 ENTITY("Gamma", 915),
602 ENTITY("Iacute", 205),
603 ENTITY("Icirc", 206),
604 ENTITY("Igrave", 204),
605 ENTITY("Iota", 921),
606 ENTITY("Iuml", 207),
607 ENTITY("Kappa", 922),
608 ENTITY("Lambda", 923),
609 ENTITY("Mu", 924),
610 ENTITY("Ntilde", 209),
611 ENTITY("Nu", 925),
612 ENTITY("OElig", 338),
613 ENTITY("Oacute", 211),
614 ENTITY("Ocirc", 212),
615 ENTITY("Ograve", 210),
616 ENTITY("Omega", 937),
617 ENTITY("Omicron", 927),
618 ENTITY("Oslash", 216),
619 ENTITY("Otilde", 213),
620 ENTITY("Ouml", 214),
621 ENTITY("Phi", 934),
622 ENTITY("Pi", 928),
623 ENTITY("Prime", 8243),
624 ENTITY("Psi", 936),
625 ENTITY("Rho", 929),
626 ENTITY("Scaron", 352),
627 ENTITY("Sigma", 931),
628 ENTITY("THORN", 222),
629 ENTITY("Tau", 932),
630 ENTITY("Theta", 920),
631 ENTITY("Uacute", 218),
632 ENTITY("Ucirc", 219),
633 ENTITY("Ugrave", 217),
634 ENTITY("Upsilon", 933),
635 ENTITY("Uuml", 220),
636 ENTITY("Xi", 926),
637 ENTITY("Yacute", 221),
638 ENTITY("Yuml", 376),
639 ENTITY("Zeta", 918),
640 ENTITY("aacute", 225),
641 ENTITY("acirc", 226),
642 ENTITY("acute", 180),
643 ENTITY("aelig", 230),
644 ENTITY("agrave", 224),
645 ENTITY("alefsym", 8501),
646 ENTITY("alpha", 945),
647 ENTITY("amp", 38),
648 ENTITY("and", 8743),
649 ENTITY("ang", 8736),
319106d6 650 ENTITY("apos", 39),
19817fd3
VS
651 ENTITY("aring", 229),
652 ENTITY("asymp", 8776),
653 ENTITY("atilde", 227),
654 ENTITY("auml", 228),
655 ENTITY("bdquo", 8222),
656 ENTITY("beta", 946),
657 ENTITY("brvbar", 166),
658 ENTITY("bull", 8226),
659 ENTITY("cap", 8745),
660 ENTITY("ccedil", 231),
661 ENTITY("cedil", 184),
662 ENTITY("cent", 162),
663 ENTITY("chi", 967),
664 ENTITY("circ", 710),
665 ENTITY("clubs", 9827),
666 ENTITY("cong", 8773),
667 ENTITY("copy", 169),
668 ENTITY("crarr", 8629),
669 ENTITY("cup", 8746),
670 ENTITY("curren", 164),
671 ENTITY("dArr", 8659),
672 ENTITY("dagger", 8224),
673 ENTITY("darr", 8595),
674 ENTITY("deg", 176),
675 ENTITY("delta", 948),
676 ENTITY("diams", 9830),
677 ENTITY("divide", 247),
678 ENTITY("eacute", 233),
679 ENTITY("ecirc", 234),
680 ENTITY("egrave", 232),
681 ENTITY("empty", 8709),
682 ENTITY("emsp", 8195),
683 ENTITY("ensp", 8194),
684 ENTITY("epsilon", 949),
685 ENTITY("equiv", 8801),
686 ENTITY("eta", 951),
687 ENTITY("eth", 240),
688 ENTITY("euml", 235),
689 ENTITY("euro", 8364),
690 ENTITY("exist", 8707),
691 ENTITY("fnof", 402),
692 ENTITY("forall", 8704),
693 ENTITY("frac12", 189),
694 ENTITY("frac14", 188),
695 ENTITY("frac34", 190),
696 ENTITY("frasl", 8260),
697 ENTITY("gamma", 947),
698 ENTITY("ge", 8805),
699 ENTITY("gt", 62),
700 ENTITY("hArr", 8660),
701 ENTITY("harr", 8596),
702 ENTITY("hearts", 9829),
703 ENTITY("hellip", 8230),
704 ENTITY("iacute", 237),
705 ENTITY("icirc", 238),
706 ENTITY("iexcl", 161),
707 ENTITY("igrave", 236),
708 ENTITY("image", 8465),
709 ENTITY("infin", 8734),
710 ENTITY("int", 8747),
711 ENTITY("iota", 953),
712 ENTITY("iquest", 191),
713 ENTITY("isin", 8712),
714 ENTITY("iuml", 239),
715 ENTITY("kappa", 954),
716 ENTITY("lArr", 8656),
717 ENTITY("lambda", 955),
718 ENTITY("lang", 9001),
719 ENTITY("laquo", 171),
720 ENTITY("larr", 8592),
721 ENTITY("lceil", 8968),
722 ENTITY("ldquo", 8220),
723 ENTITY("le", 8804),
724 ENTITY("lfloor", 8970),
725 ENTITY("lowast", 8727),
726 ENTITY("loz", 9674),
727 ENTITY("lrm", 8206),
728 ENTITY("lsaquo", 8249),
729 ENTITY("lsquo", 8216),
730 ENTITY("lt", 60),
731 ENTITY("macr", 175),
732 ENTITY("mdash", 8212),
733 ENTITY("micro", 181),
734 ENTITY("middot", 183),
735 ENTITY("minus", 8722),
736 ENTITY("mu", 956),
737 ENTITY("nabla", 8711),
738 ENTITY("nbsp", 160),
739 ENTITY("ndash", 8211),
740 ENTITY("ne", 8800),
741 ENTITY("ni", 8715),
742 ENTITY("not", 172),
743 ENTITY("notin", 8713),
744 ENTITY("nsub", 8836),
745 ENTITY("ntilde", 241),
746 ENTITY("nu", 957),
747 ENTITY("oacute", 243),
748 ENTITY("ocirc", 244),
749 ENTITY("oelig", 339),
750 ENTITY("ograve", 242),
751 ENTITY("oline", 8254),
752 ENTITY("omega", 969),
753 ENTITY("omicron", 959),
754 ENTITY("oplus", 8853),
755 ENTITY("or", 8744),
756 ENTITY("ordf", 170),
757 ENTITY("ordm", 186),
758 ENTITY("oslash", 248),
759 ENTITY("otilde", 245),
760 ENTITY("otimes", 8855),
761 ENTITY("ouml", 246),
762 ENTITY("para", 182),
763 ENTITY("part", 8706),
764 ENTITY("permil", 8240),
765 ENTITY("perp", 8869),
766 ENTITY("phi", 966),
767 ENTITY("pi", 960),
768 ENTITY("piv", 982),
769 ENTITY("plusmn", 177),
770 ENTITY("pound", 163),
771 ENTITY("prime", 8242),
772 ENTITY("prod", 8719),
773 ENTITY("prop", 8733),
774 ENTITY("psi", 968),
775 ENTITY("quot", 34),
776 ENTITY("rArr", 8658),
777 ENTITY("radic", 8730),
778 ENTITY("rang", 9002),
779 ENTITY("raquo", 187),
780 ENTITY("rarr", 8594),
781 ENTITY("rceil", 8969),
782 ENTITY("rdquo", 8221),
783 ENTITY("real", 8476),
784 ENTITY("reg", 174),
785 ENTITY("rfloor", 8971),
786 ENTITY("rho", 961),
787 ENTITY("rlm", 8207),
788 ENTITY("rsaquo", 8250),
789 ENTITY("rsquo", 8217),
790 ENTITY("sbquo", 8218),
791 ENTITY("scaron", 353),
792 ENTITY("sdot", 8901),
793 ENTITY("sect", 167),
794 ENTITY("shy", 173),
795 ENTITY("sigma", 963),
796 ENTITY("sigmaf", 962),
797 ENTITY("sim", 8764),
798 ENTITY("spades", 9824),
799 ENTITY("sub", 8834),
800 ENTITY("sube", 8838),
801 ENTITY("sum", 8721),
802 ENTITY("sup", 8835),
803 ENTITY("sup1", 185),
804 ENTITY("sup2", 178),
805 ENTITY("sup3", 179),
806 ENTITY("supe", 8839),
807 ENTITY("szlig", 223),
808 ENTITY("tau", 964),
809 ENTITY("there4", 8756),
810 ENTITY("theta", 952),
811 ENTITY("thetasym", 977),
812 ENTITY("thinsp", 8201),
813 ENTITY("thorn", 254),
814 ENTITY("tilde", 732),
815 ENTITY("times", 215),
816 ENTITY("trade", 8482),
817 ENTITY("uArr", 8657),
818 ENTITY("uacute", 250),
819 ENTITY("uarr", 8593),
820 ENTITY("ucirc", 251),
821 ENTITY("ugrave", 249),
822 ENTITY("uml", 168),
823 ENTITY("upsih", 978),
824 ENTITY("upsilon", 965),
825 ENTITY("uuml", 252),
826 ENTITY("weierp", 8472),
827 ENTITY("xi", 958),
828 ENTITY("yacute", 253),
829 ENTITY("yen", 165),
830 ENTITY("yuml", 255),
831 ENTITY("zeta", 950),
832 ENTITY("zwj", 8205),
833 ENTITY("zwnj", 8204),
daa616fc 834 {NULL, 0}};
19817fd3 835 #undef ENTITY
daa616fc 836 static size_t substitutions_cnt = 0;
04dbb646 837
daa616fc
VS
838 if (substitutions_cnt == 0)
839 while (substitutions[substitutions_cnt].code != 0)
840 substitutions_cnt++;
841
e822d1bd 842 wxHtmlEntityInfo *info;
3919d530
JS
843#ifdef __WXWINCE__
844 // bsearch crashes under WinCE for some reason
e822d1bd 845 info = NULL;
3919d530
JS
846 size_t i;
847 for (i = 0; i < substitutions_cnt; i++)
848 {
849 if (entity == substitutions[i].name)
850 {
851 info = & substitutions[i];
852 break;
853 }
854 }
855#else
19817fd3 856 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
857 substitutions_cnt,
858 sizeof(wxHtmlEntityInfo),
90350682 859 wxHtmlEntityCompare);
3919d530 860#endif
daa616fc
VS
861 if (info)
862 code = info->code;
863 }
04dbb646 864
daa616fc 865 if (code == 0)
470252df 866 return 0;
daa616fc
VS
867 else
868 return GetCharForCode(code);
869}
870
948c6134 871wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType type,
6cc4e6b8
VS
872 const wxString& url) const
873{
948c6134
MW
874 int flags = wxFS_READ;
875 if (type == wxHTML_URL_IMAGE)
876 flags |= wxFS_SEEKABLE;
877
878 return m_FS ? m_FS->OpenFile(url, flags) : NULL;
d1da8872 879
6cc4e6b8
VS
880}
881
2b5f62a0
VZ
882
883//-----------------------------------------------------------------------------
884// wxHtmlParser::ExtractCharsetInformation
885//-----------------------------------------------------------------------------
886
887class wxMetaTagParser : public wxHtmlParser
888{
889public:
2eb10e2a
VZ
890 wxMetaTagParser() { }
891
2b5f62a0 892 wxObject* GetProduct() { return NULL; }
2eb10e2a 893
2b5f62a0 894protected:
5bce3e6f 895 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a 896
c0c133e1 897 wxDECLARE_NO_COPY_CLASS(wxMetaTagParser);
2b5f62a0
VZ
898};
899
900class wxMetaTagHandler : public wxHtmlTagHandler
901{
902public:
903 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
904 wxString GetSupportedTags() { return wxT("META,BODY"); }
905 bool HandleTag(const wxHtmlTag& tag);
906
907private:
908 wxString *m_retval;
2eb10e2a 909
c0c133e1 910 wxDECLARE_NO_COPY_CLASS(wxMetaTagHandler);
2b5f62a0
VZ
911};
912
913bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
914{
9a83f860 915 if (tag.GetName() == wxT("BODY"))
2b5f62a0
VZ
916 {
917 m_Parser->StopParsing();
d1da8872 918 return false;
2b5f62a0
VZ
919 }
920
9a83f860
VZ
921 if (tag.HasParam(wxT("HTTP-EQUIV")) &&
922 tag.GetParam(wxT("HTTP-EQUIV")).IsSameAs(wxT("Content-Type"), false) &&
923 tag.HasParam(wxT("CONTENT")))
2b5f62a0 924 {
9a83f860
VZ
925 wxString content = tag.GetParam(wxT("CONTENT")).Lower();
926 if (content.Left(19) == wxT("text/html; charset="))
2b5f62a0
VZ
927 {
928 *m_retval = content.Mid(19);
929 m_Parser->StopParsing();
930 }
931 }
d1da8872 932 return false;
2b5f62a0
VZ
933}
934
935
936/*static*/
937wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
938{
939 wxString charset;
e7274ba2
WS
940 wxMetaTagParser *parser = new wxMetaTagParser();
941 if(parser)
942 {
943 parser->AddTagHandler(new wxMetaTagHandler(&charset));
944 parser->Parse(markup);
945 delete parser;
946 }
2b5f62a0
VZ
947 return charset;
948}
949
4609ee2e
VZ
950/* static */
951bool
952wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
953 wxString::const_iterator end)
954{
9a83f860 955 wxASSERT_MSG( *start == '<', wxT("should be called on the tag start") );
4609ee2e
VZ
956
957 wxString::const_iterator p = start;
958
e4f762f4
VS
959 // Comments begin with "<!--" in HTML 4.0; anything shorter or not containing
960 // these characters is not a comment and we're not going to skip it.
961 if ( ++p == end || *p != '!' )
962 return false;
963 if ( ++p == end || *p != '-' )
964 return false;
965 if ( ++p == end || *p != '-' )
966 return false;
4609ee2e
VZ
967
968 // skip the start of the comment tag in any case, if we don't find the
969 // closing tag we should ignore broken markup
970 start = p;
971
972 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
973 // comment delimiter and the closing tag character (section 3.2.4 of
974 // http://www.w3.org/TR/html401/)
975 int dashes = 0;
976 while ( ++p < end )
977 {
978 const wxChar c = *p;
979
980 if ( (c == wxT(' ') || c == wxT('\n') ||
981 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
982 {
983 // ignore white space before potential tag end
984 continue;
985 }
986
987 if ( c == wxT('>') && dashes >= 2 )
988 {
989 // found end of comment
990 start = p;
991 break;
992 }
993
994 if ( c == wxT('-') )
995 dashes++;
996 else
997 dashes = 0;
998 }
999
1000 return true;
1001}
1002
1003#endif // wxUSE_HTML && wxUSE_STREAMS