]> git.saurik.com Git - wxWidgets.git/blame - src/html/htmlpars.cpp
use fallback encoding in wxConvAuto when input is not in UTF-8
[wxWidgets.git] / src / html / htmlpars.cpp
CommitLineData
5526e819 1/////////////////////////////////////////////////////////////////////////////
93763ad5 2// Name: src/html/htmlpars.cpp
5526e819
VS
3// Purpose: wxHtmlParser class (generic parser)
4// Author: Vaclav Slavik
69941f05 5// RCS-ID: $Id$
5526e819 6// Copyright: (c) 1999 Vaclav Slavik
65571936 7// Licence: wxWindows licence
5526e819
VS
8/////////////////////////////////////////////////////////////////////////////
9
3096bd2f 10#include "wx/wxprec.h"
5526e819 11
2b5f62a0 12#ifdef __BORLANDC__
93763ad5 13 #pragma hdrstop
5526e819
VS
14#endif
15
93763ad5
WS
16#if wxUSE_HTML && wxUSE_STREAMS
17
b4f4d3dd 18#ifndef WX_PRECOMP
ad9835c9 19 #include "wx/dynarray.h"
04dbb646
VZ
20 #include "wx/log.h"
21 #include "wx/intl.h"
670f9935 22 #include "wx/app.h"
193d0c93 23 #include "wx/wxcrtvararg.h"
5526e819
VS
24#endif
25
69941f05
VS
26#include "wx/tokenzr.h"
27#include "wx/wfstream.h"
28#include "wx/url.h"
daa616fc 29#include "wx/fontmap.h"
69941f05
VS
30#include "wx/html/htmldefs.h"
31#include "wx/html/htmlpars.h"
3c47c047 32#include "wx/vector.h"
5526e819 33
7127d129
RR
34#ifdef __WXWINCE__
35 #include "wx/msw/wince/missing.h" // for bsearch()
36#endif
34fdf762
VS
37
38// DLL options compatibility check:
34fdf762 39WX_CHECK_BUILD_OPTIONS("wxHTML")
34fdf762 40
25271309
VZ
41const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
42
211dfedd
VS
43//-----------------------------------------------------------------------------
44// wxHtmlParser helpers
45//-----------------------------------------------------------------------------
46
47class wxHtmlTextPiece
48{
49public:
3c47c047 50 wxHtmlTextPiece() {}
b1a3a964
VS
51 wxHtmlTextPiece(const wxString::const_iterator& start,
52 const wxString::const_iterator& end)
53 : m_start(start), m_end(end) {}
54 wxString::const_iterator m_start, m_end;
211dfedd
VS
55};
56
3c47c047
VS
57// NB: this is an empty class and not typedef because of forward declaration
58class wxHtmlTextPieces : public wxVector<wxHtmlTextPiece>
59{
60};
5526e819 61
481c879b 62class wxHtmlParserState
211dfedd 63{
481c879b 64public:
211dfedd
VS
65 wxHtmlTag *m_curTag;
66 wxHtmlTag *m_tags;
67 wxHtmlTextPieces *m_textPieces;
68 int m_curTextPiece;
b1a3a964 69 const wxString *m_source;
211dfedd
VS
70 wxHtmlParserState *m_nextState;
71};
5526e819
VS
72
73//-----------------------------------------------------------------------------
74// wxHtmlParser
75//-----------------------------------------------------------------------------
76
77IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
78
04dbb646 79wxHtmlParser::wxHtmlParser()
211dfedd 80 : wxObject(), m_HandlersHash(wxKEY_STRING),
daa616fc
VS
81 m_FS(NULL), m_HandlersStack(NULL)
82{
b1a3a964 83 m_Source = NULL;
daa616fc 84 m_entitiesParser = new wxHtmlEntitiesParser;
211dfedd
VS
85 m_Tags = NULL;
86 m_CurTag = NULL;
87 m_TextPieces = NULL;
88 m_CurTextPiece = 0;
89 m_SavedStates = NULL;
daa616fc
VS
90}
91
92wxHtmlParser::~wxHtmlParser()
93{
0beefa20
VS
94 while (RestoreState()) {}
95 DestroyDOMTree();
222ed1d6
MB
96
97 if (m_HandlersStack)
98 {
99 wxList& tmp = *m_HandlersStack;
100 wxList::iterator it, en;
101 for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
102 delete (wxHashTable*)*it;
103 tmp.clear();
104 }
daa616fc
VS
105 delete m_HandlersStack;
106 m_HandlersHash.Clear();
222ed1d6 107 WX_CLEAR_LIST(wxList, m_HandlersList);
daa616fc 108 delete m_entitiesParser;
b1a3a964 109 delete m_Source;
daa616fc 110}
5526e819
VS
111
112wxObject* wxHtmlParser::Parse(const wxString& source)
113{
5526e819
VS
114 InitParser(source);
115 DoParsing();
2b5f62a0 116 wxObject *result = GetProduct();
5526e819
VS
117 DoneParser();
118 return result;
119}
120
5526e819
VS
121void wxHtmlParser::InitParser(const wxString& source)
122{
1309ba6c 123 SetSource(source);
d1da8872 124 m_stopParsing = false;
5526e819 125}
1309ba6c 126
5526e819
VS
127void wxHtmlParser::DoneParser()
128{
211dfedd 129 DestroyDOMTree();
5526e819
VS
130}
131
1309ba6c
VS
132void wxHtmlParser::SetSource(const wxString& src)
133{
211dfedd 134 DestroyDOMTree();
d989875a
VS
135 // NB: This is allocated on heap because wxHtmlTag uses iterators and
136 // making a copy of m_Source string in SetSourceAndSaveState() and
137 // RestoreState() would invalidate them (because wxString::m_impl's
138 // memory would change completely twice and iterators use pointers
139 // into it). So instead, we keep the string object intact and only
140 // store/restore pointer to it, for which we need it to be allocated
141 // on the heap.
b1a3a964
VS
142 delete m_Source;
143 m_Source = new wxString(src);
211dfedd
VS
144 CreateDOMTree();
145 m_CurTag = NULL;
146 m_CurTextPiece = 0;
1309ba6c 147}
5526e819 148
211dfedd 149void wxHtmlParser::CreateDOMTree()
5526e819 150{
b1a3a964 151 wxHtmlTagsCache cache(*m_Source);
211dfedd 152 m_TextPieces = new wxHtmlTextPieces;
b1a3a964 153 CreateDOMSubTree(NULL, m_Source->begin(), m_Source->end(), &cache);
211dfedd
VS
154 m_CurTextPiece = 0;
155}
5526e819 156
b1a3a964 157extern bool wxIsCDATAElement(const wxString& tag);
7c6cd4a8 158
211dfedd 159void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
b1a3a964
VS
160 const wxString::const_iterator& begin_pos,
161 const wxString::const_iterator& end_pos,
211dfedd
VS
162 wxHtmlTagsCache *cache)
163{
b1a3a964
VS
164 if (end_pos <= begin_pos)
165 return;
5526e819 166
211dfedd 167 wxChar c;
b1a3a964
VS
168 wxString::const_iterator i = begin_pos;
169 wxString::const_iterator textBeginning = begin_pos;
d699f48b 170
7c6cd4a8
VS
171 // If the tag contains CDATA text, we include the text between beginning
172 // and ending tag verbosely. Setting i=end_pos will skip to the very
173 // end of this function where text piece is added, bypassing any child
174 // tags parsing (CDATA element can't have child elements by definition):
b1a3a964 175 if (cur != NULL && wxIsCDATAElement(cur->GetName()))
7c6cd4a8
VS
176 {
177 i = end_pos;
178 }
179
04dbb646 180 while (i < end_pos)
4f9297b0 181 {
b1a3a964 182 c = *i;
5526e819 183
211dfedd
VS
184 if (c == wxT('<'))
185 {
186 // add text to m_TextPieces:
b1a3a964
VS
187 if (i > textBeginning)
188 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, i));
211dfedd
VS
189
190 // if it is a comment, skip it:
b1a3a964 191 if ( SkipCommentTag(i, m_Source->end()) )
211dfedd 192 {
b1a3a964 193 textBeginning = i = i + 1; // skip closing '>' too
211dfedd 194 }
d699f48b 195
211dfedd 196 // add another tag to the tree:
b1a3a964 197 else if (i < end_pos-1 && *(i+1) != wxT('/'))
d1da8872 198 {
211dfedd 199 wxHtmlTag *chd;
d699f48b
KB
200 if (cur)
201 chd = new wxHtmlTag(cur, m_Source,
211dfedd 202 i, end_pos, cache, m_entitiesParser);
d699f48b 203 else
211dfedd
VS
204 {
205 chd = new wxHtmlTag(NULL, m_Source,
206 i, end_pos, cache, m_entitiesParser);
d699f48b 207 if (!m_Tags)
211dfedd 208 {
d699f48b 209 // if this is the first tag to be created make the root
211dfedd
VS
210 // m_Tags point to it:
211 m_Tags = chd;
212 }
213 else
214 {
d699f48b 215 // if there is already a root tag add this tag as
211dfedd
VS
216 // the last sibling:
217 chd->m_Prev = m_Tags->GetLastSibling();
218 chd->m_Prev->m_Next = chd;
219 }
220 }
221
222 if (chd->HasEnding())
223 {
224 CreateDOMSubTree(chd,
b1a3a964 225 chd->GetBeginIter(), chd->GetEndIter1(),
211dfedd 226 cache);
b1a3a964 227 i = chd->GetEndIter2();
211dfedd
VS
228 }
229 else
b1a3a964 230 i = chd->GetBeginIter();
d1da8872 231
211dfedd
VS
232 textBeginning = i;
233 }
234
235 // ... or skip ending tag:
d699f48b 236 else
211dfedd 237 {
b1a3a964 238 while (i < end_pos && *i != wxT('>')) ++i;
211dfedd 239 textBeginning = i+1;
5526e819 240 }
5526e819 241 }
b1a3a964 242 else ++i;
5526e819
VS
243 }
244
211dfedd 245 // add remaining text to m_TextPieces:
b1a3a964
VS
246 if (end_pos > textBeginning)
247 m_TextPieces->push_back(wxHtmlTextPiece(textBeginning, end_pos));
211dfedd
VS
248}
249
250void wxHtmlParser::DestroyDOMTree()
251{
252 wxHtmlTag *t1, *t2;
253 t1 = m_Tags;
254 while (t1)
255 {
256 t2 = t1->GetNextSibling();
257 delete t1;
258 t1 = t2;
259 }
260 m_Tags = m_CurTag = NULL;
261
262 delete m_TextPieces;
263 m_TextPieces = NULL;
264}
265
d699f48b 266void wxHtmlParser::DoParsing()
211dfedd
VS
267{
268 m_CurTag = m_Tags;
269 m_CurTextPiece = 0;
b1a3a964 270 DoParsing(m_Source->begin(), m_Source->end());
211dfedd
VS
271}
272
b1a3a964
VS
273void wxHtmlParser::DoParsing(const wxString::const_iterator& begin_pos_,
274 const wxString::const_iterator& end_pos)
211dfedd 275{
b1a3a964
VS
276 wxString::const_iterator begin_pos(begin_pos_);
277
278 if (end_pos <= begin_pos)
279 return;
d699f48b 280
211dfedd 281 wxHtmlTextPieces& pieces = *m_TextPieces;
3c47c047 282 size_t piecesCnt = pieces.size();
d699f48b 283
211dfedd
VS
284 while (begin_pos < end_pos)
285 {
b1a3a964 286 while (m_CurTag && m_CurTag->GetBeginIter() < begin_pos)
211dfedd 287 m_CurTag = m_CurTag->GetNextTag();
d699f48b 288 while (m_CurTextPiece < piecesCnt &&
b1a3a964 289 pieces[m_CurTextPiece].m_start < begin_pos)
211dfedd
VS
290 m_CurTextPiece++;
291
d699f48b
KB
292 if (m_CurTextPiece < piecesCnt &&
293 (!m_CurTag ||
b1a3a964 294 pieces[m_CurTextPiece].m_start < m_CurTag->GetBeginIter()))
211dfedd
VS
295 {
296 // Add text:
f23e92e7 297 AddText(GetEntitiesParser()->Parse(
b1a3a964
VS
298 wxString(pieces[m_CurTextPiece].m_start,
299 pieces[m_CurTextPiece].m_end)));
300 begin_pos = pieces[m_CurTextPiece].m_end;
211dfedd
VS
301 m_CurTextPiece++;
302 }
303 else if (m_CurTag)
304 {
902725ee 305 if (m_CurTag->HasEnding())
b1a3a964 306 begin_pos = m_CurTag->GetEndIter2();
902725ee 307 else
b1a3a964 308 begin_pos = m_CurTag->GetBeginIter();
211dfedd
VS
309 wxHtmlTag *t = m_CurTag;
310 m_CurTag = m_CurTag->GetNextTag();
311 AddTag(*t);
2b5f62a0
VZ
312 if (m_stopParsing)
313 return;
211dfedd
VS
314 }
315 else break;
5526e819
VS
316 }
317}
318
5526e819
VS
319void wxHtmlParser::AddTag(const wxHtmlTag& tag)
320{
321 wxHtmlTagHandler *h;
d1da8872 322 bool inner = false;
5526e819
VS
323
324 h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
325 if (h)
2b5f62a0 326 {
4f9297b0 327 inner = h->HandleTag(tag);
2b5f62a0
VZ
328 if (m_stopParsing)
329 return;
330 }
04dbb646 331 if (!inner)
4f9297b0 332 {
5526e819 333 if (tag.HasEnding())
b1a3a964 334 DoParsing(tag.GetBeginIter(), tag.GetEndIter1());
5526e819
VS
335 }
336}
337
5526e819
VS
338void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
339{
4f9297b0 340 wxString s(handler->GetSupportedTags());
211dfedd 341 wxStringTokenizer tokenizer(s, wxT(", "));
5526e819 342
5526e819 343 while (tokenizer.HasMoreTokens())
470252df 344 m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
5526e819
VS
345
346 if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
347 m_HandlersList.Append(handler);
348
4f9297b0 349 handler->SetParser(this);
5526e819
VS
350}
351
fbfb8bcc 352void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
a7a4d01b 353{
211dfedd 354 wxStringTokenizer tokenizer(tags, wxT(", "));
a7a4d01b
VS
355 wxString key;
356
04dbb646 357 if (m_HandlersStack == NULL)
4f9297b0 358 {
a7a4d01b 359 m_HandlersStack = new wxList;
a7a4d01b
VS
360 }
361
222ed1d6 362 m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
a7a4d01b 363
04dbb646 364 while (tokenizer.HasMoreTokens())
4f9297b0 365 {
470252df 366 key = tokenizer.GetNextToken();
a7a4d01b
VS
367 m_HandlersHash.Delete(key);
368 m_HandlersHash.Put(key, handler);
369 }
370}
371
a7a4d01b
VS
372void wxHtmlParser::PopTagHandler()
373{
222ed1d6 374 wxList::compatibility_iterator first;
04dbb646 375
dfa4a244 376 if ( !m_HandlersStack ||
28b4db7f
VZ
377#if wxUSE_STL
378 !(first = m_HandlersStack->GetFirst())
379#else // !wxUSE_STL
380 ((first = m_HandlersStack->GetFirst()) == NULL)
381#endif // wxUSE_STL/!wxUSE_STL
382 )
f3c82859
VS
383 {
384 wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
385 return;
386 }
4f9297b0 387 m_HandlersHash = *((wxHashTable*) first->GetData());
222ed1d6
MB
388 delete (wxHashTable*) first->GetData();
389 m_HandlersStack->Erase(first);
a7a4d01b
VS
390}
391
211dfedd
VS
392void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
393{
394 wxHtmlParserState *s = new wxHtmlParserState;
395
396 s->m_curTag = m_CurTag;
397 s->m_tags = m_Tags;
398 s->m_textPieces = m_TextPieces;
399 s->m_curTextPiece = m_CurTextPiece;
400 s->m_source = m_Source;
401
402 s->m_nextState = m_SavedStates;
403 m_SavedStates = s;
404
405 m_CurTag = NULL;
406 m_Tags = NULL;
407 m_TextPieces = NULL;
408 m_CurTextPiece = 0;
b1a3a964 409 m_Source = NULL;
d699f48b 410
211dfedd
VS
411 SetSource(src);
412}
413
414bool wxHtmlParser::RestoreState()
415{
d1da8872 416 if (!m_SavedStates) return false;
d699f48b 417
0beefa20 418 DestroyDOMTree();
d989875a 419 delete m_Source;
0beefa20 420
211dfedd
VS
421 wxHtmlParserState *s = m_SavedStates;
422 m_SavedStates = s->m_nextState;
d699f48b 423
211dfedd
VS
424 m_CurTag = s->m_curTag;
425 m_Tags = s->m_tags;
426 m_TextPieces = s->m_textPieces;
427 m_CurTextPiece = s->m_curTextPiece;
428 m_Source = s->m_source;
d699f48b 429
211dfedd 430 delete s;
d1da8872 431 return true;
211dfedd
VS
432}
433
e7feeafa
VS
434wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
435{
b1a3a964 436 return wxString(tag.GetBeginIter(), tag.GetEndIter1());
e7feeafa
VS
437}
438
5526e819
VS
439//-----------------------------------------------------------------------------
440// wxHtmlTagHandler
441//-----------------------------------------------------------------------------
442
443IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
daa616fc 444
e7feeafa
VS
445void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
446{
447 // It is safe to temporarily change the source being parsed,
448 // provided we restore the state back after parsing
449 m_Parser->SetSourceAndSaveState(source);
450 m_Parser->DoParsing();
451 m_Parser->RestoreState();
452}
453
daa616fc
VS
454
455//-----------------------------------------------------------------------------
456// wxHtmlEntitiesParser
457//-----------------------------------------------------------------------------
458
459IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
460
461wxHtmlEntitiesParser::wxHtmlEntitiesParser()
462#if wxUSE_WCHAR_T && !wxUSE_UNICODE
463 : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
223d09f6 464#endif
daa616fc
VS
465{
466}
467
468wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
469{
5438a566 470#if wxUSE_WCHAR_T && !wxUSE_UNICODE
daa616fc 471 delete m_conv;
5438a566 472#endif
daa616fc 473}
5526e819 474
daa616fc
VS
475void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
476{
477#if wxUSE_WCHAR_T && !wxUSE_UNICODE
2b5f62a0
VZ
478 if (encoding == m_encoding)
479 return;
480
daa616fc 481 delete m_conv;
2b5f62a0 482
daa616fc 483 m_encoding = encoding;
2b5f62a0
VZ
484 if (m_encoding == wxFONTENCODING_SYSTEM)
485 m_conv = NULL;
486 else
daa616fc 487 m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
d699f48b
KB
488#else
489 (void) encoding;
daa616fc
VS
490#endif
491}
492
96d665d2 493wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
daa616fc 494{
daa616fc 495 wxString output;
d1da8872 496
4f7e8fda
VS
497 const wxString::const_iterator end(input.end());
498 wxString::const_iterator c(input.begin());
499 wxString::const_iterator last(c);
04dbb646 500
4f7e8fda 501 for ( ; c < end; ++c )
daa616fc
VS
502 {
503 if (*c == wxT('&'))
504 {
4f7e8fda
VS
505 if ( output.empty() )
506 output.reserve(input.length());
507
daa616fc 508 if (c - last > 0)
4f7e8fda
VS
509 output.append(last, c);
510 if ( ++c == end )
9e2bd135 511 break;
d1da8872 512
daa616fc 513 wxString entity;
4f7e8fda 514 const wxString::const_iterator ent_s = c;
470252df 515 wxChar entity_char;
d1da8872 516
b1a3a964
VS
517 for ( ; c != end; ++c )
518 {
519 wxChar ch = *c;
520 if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
521 (ch >= wxT('A') && ch <= wxT('Z')) ||
522 (ch >= wxT('0') && ch <= wxT('9')) ||
523 ch == wxT('_') || ch == wxT('#')) )
524 break;
525 }
526
4f7e8fda
VS
527 entity.append(ent_s, c);
528 if (c == end || *c != wxT(';')) --c;
211dfedd 529 last = c+1;
470252df
VS
530 entity_char = GetEntityChar(entity);
531 if (entity_char)
532 output << entity_char;
533 else
534 {
4f7e8fda 535 output.append(ent_s-1, c+1);
25271309 536 wxLogTrace(wxTRACE_HTML_DEBUG,
4f7e8fda
VS
537 "Unrecognized HTML entity: '%s'",
538 entity);
470252df 539 }
daa616fc
VS
540 }
541 }
4f7e8fda
VS
542 if ( last == input.begin() ) // common case: no entity
543 return input;
544 if ( last != end )
545 output.append(last, end);
daa616fc
VS
546 return output;
547}
548
2b5f62a0 549#if !wxUSE_UNICODE
96d665d2 550wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code) const
daa616fc 551{
2b5f62a0 552#if wxUSE_WCHAR_T
daa616fc
VS
553 char buf[2];
554 wchar_t wbuf[2];
555 wbuf[0] = (wchar_t)code;
556 wbuf[1] = 0;
557 wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
eaf1a1d9 558 if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
daa616fc
VS
559 return '?';
560 return buf[0];
561#else
562 return (code < 256) ? (wxChar)code : '?';
563#endif
564}
2b5f62a0 565#endif
daa616fc 566
19817fd3
VS
567struct wxHtmlEntityInfo
568{
569 const wxStringCharType *name;
570 unsigned code;
571};
572
573extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
574{
575#if wxUSE_UNICODE_UTF8
576 return strcmp((char*)key, ((wxHtmlEntityInfo*)item)->name);
577#else
578 return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
579#endif
580}
581
96d665d2 582wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity) const
daa616fc
VS
583{
584 unsigned code = 0;
04dbb646 585
daa616fc
VS
586 if (entity[0] == wxT('#'))
587 {
c471f7e1
VS
588 // NB: parsed value is a number, so it's OK to use wx_str(), internal
589 // representation is the same for numbers
590 const wxStringCharType *ent_s = entity.wx_str();
591 const wxStringCharType *format;
04dbb646 592
c471f7e1 593 if (ent_s[1] == wxSTRING_TEXT('x') || ent_s[1] == wxSTRING_TEXT('X'))
daa616fc 594 {
c471f7e1 595 format = wxSTRING_TEXT("%x");
daa616fc
VS
596 ent_s++;
597 }
598 else
c471f7e1 599 format = wxSTRING_TEXT("%u");
daa616fc
VS
600 ent_s++;
601
602 if (wxSscanf(ent_s, format, &code) != 1)
603 code = 0;
604 }
605 else
606 {
19817fd3
VS
607 // store the literals in wx's internal representation (either char*
608 // in UTF-8 or wchar_t*) for best performance:
609 #define ENTITY(name, code) { wxSTRING_TEXT(name), code }
610
daa616fc 611 static wxHtmlEntityInfo substitutions[] = {
19817fd3
VS
612 ENTITY("AElig", 198),
613 ENTITY("Aacute", 193),
614 ENTITY("Acirc", 194),
615 ENTITY("Agrave", 192),
616 ENTITY("Alpha", 913),
617 ENTITY("Aring", 197),
618 ENTITY("Atilde", 195),
619 ENTITY("Auml", 196),
620 ENTITY("Beta", 914),
621 ENTITY("Ccedil", 199),
622 ENTITY("Chi", 935),
623 ENTITY("Dagger", 8225),
624 ENTITY("Delta", 916),
625 ENTITY("ETH", 208),
626 ENTITY("Eacute", 201),
627 ENTITY("Ecirc", 202),
628 ENTITY("Egrave", 200),
629 ENTITY("Epsilon", 917),
630 ENTITY("Eta", 919),
631 ENTITY("Euml", 203),
632 ENTITY("Gamma", 915),
633 ENTITY("Iacute", 205),
634 ENTITY("Icirc", 206),
635 ENTITY("Igrave", 204),
636 ENTITY("Iota", 921),
637 ENTITY("Iuml", 207),
638 ENTITY("Kappa", 922),
639 ENTITY("Lambda", 923),
640 ENTITY("Mu", 924),
641 ENTITY("Ntilde", 209),
642 ENTITY("Nu", 925),
643 ENTITY("OElig", 338),
644 ENTITY("Oacute", 211),
645 ENTITY("Ocirc", 212),
646 ENTITY("Ograve", 210),
647 ENTITY("Omega", 937),
648 ENTITY("Omicron", 927),
649 ENTITY("Oslash", 216),
650 ENTITY("Otilde", 213),
651 ENTITY("Ouml", 214),
652 ENTITY("Phi", 934),
653 ENTITY("Pi", 928),
654 ENTITY("Prime", 8243),
655 ENTITY("Psi", 936),
656 ENTITY("Rho", 929),
657 ENTITY("Scaron", 352),
658 ENTITY("Sigma", 931),
659 ENTITY("THORN", 222),
660 ENTITY("Tau", 932),
661 ENTITY("Theta", 920),
662 ENTITY("Uacute", 218),
663 ENTITY("Ucirc", 219),
664 ENTITY("Ugrave", 217),
665 ENTITY("Upsilon", 933),
666 ENTITY("Uuml", 220),
667 ENTITY("Xi", 926),
668 ENTITY("Yacute", 221),
669 ENTITY("Yuml", 376),
670 ENTITY("Zeta", 918),
671 ENTITY("aacute", 225),
672 ENTITY("acirc", 226),
673 ENTITY("acute", 180),
674 ENTITY("aelig", 230),
675 ENTITY("agrave", 224),
676 ENTITY("alefsym", 8501),
677 ENTITY("alpha", 945),
678 ENTITY("amp", 38),
679 ENTITY("and", 8743),
680 ENTITY("ang", 8736),
681 ENTITY("aring", 229),
682 ENTITY("asymp", 8776),
683 ENTITY("atilde", 227),
684 ENTITY("auml", 228),
685 ENTITY("bdquo", 8222),
686 ENTITY("beta", 946),
687 ENTITY("brvbar", 166),
688 ENTITY("bull", 8226),
689 ENTITY("cap", 8745),
690 ENTITY("ccedil", 231),
691 ENTITY("cedil", 184),
692 ENTITY("cent", 162),
693 ENTITY("chi", 967),
694 ENTITY("circ", 710),
695 ENTITY("clubs", 9827),
696 ENTITY("cong", 8773),
697 ENTITY("copy", 169),
698 ENTITY("crarr", 8629),
699 ENTITY("cup", 8746),
700 ENTITY("curren", 164),
701 ENTITY("dArr", 8659),
702 ENTITY("dagger", 8224),
703 ENTITY("darr", 8595),
704 ENTITY("deg", 176),
705 ENTITY("delta", 948),
706 ENTITY("diams", 9830),
707 ENTITY("divide", 247),
708 ENTITY("eacute", 233),
709 ENTITY("ecirc", 234),
710 ENTITY("egrave", 232),
711 ENTITY("empty", 8709),
712 ENTITY("emsp", 8195),
713 ENTITY("ensp", 8194),
714 ENTITY("epsilon", 949),
715 ENTITY("equiv", 8801),
716 ENTITY("eta", 951),
717 ENTITY("eth", 240),
718 ENTITY("euml", 235),
719 ENTITY("euro", 8364),
720 ENTITY("exist", 8707),
721 ENTITY("fnof", 402),
722 ENTITY("forall", 8704),
723 ENTITY("frac12", 189),
724 ENTITY("frac14", 188),
725 ENTITY("frac34", 190),
726 ENTITY("frasl", 8260),
727 ENTITY("gamma", 947),
728 ENTITY("ge", 8805),
729 ENTITY("gt", 62),
730 ENTITY("hArr", 8660),
731 ENTITY("harr", 8596),
732 ENTITY("hearts", 9829),
733 ENTITY("hellip", 8230),
734 ENTITY("iacute", 237),
735 ENTITY("icirc", 238),
736 ENTITY("iexcl", 161),
737 ENTITY("igrave", 236),
738 ENTITY("image", 8465),
739 ENTITY("infin", 8734),
740 ENTITY("int", 8747),
741 ENTITY("iota", 953),
742 ENTITY("iquest", 191),
743 ENTITY("isin", 8712),
744 ENTITY("iuml", 239),
745 ENTITY("kappa", 954),
746 ENTITY("lArr", 8656),
747 ENTITY("lambda", 955),
748 ENTITY("lang", 9001),
749 ENTITY("laquo", 171),
750 ENTITY("larr", 8592),
751 ENTITY("lceil", 8968),
752 ENTITY("ldquo", 8220),
753 ENTITY("le", 8804),
754 ENTITY("lfloor", 8970),
755 ENTITY("lowast", 8727),
756 ENTITY("loz", 9674),
757 ENTITY("lrm", 8206),
758 ENTITY("lsaquo", 8249),
759 ENTITY("lsquo", 8216),
760 ENTITY("lt", 60),
761 ENTITY("macr", 175),
762 ENTITY("mdash", 8212),
763 ENTITY("micro", 181),
764 ENTITY("middot", 183),
765 ENTITY("minus", 8722),
766 ENTITY("mu", 956),
767 ENTITY("nabla", 8711),
768 ENTITY("nbsp", 160),
769 ENTITY("ndash", 8211),
770 ENTITY("ne", 8800),
771 ENTITY("ni", 8715),
772 ENTITY("not", 172),
773 ENTITY("notin", 8713),
774 ENTITY("nsub", 8836),
775 ENTITY("ntilde", 241),
776 ENTITY("nu", 957),
777 ENTITY("oacute", 243),
778 ENTITY("ocirc", 244),
779 ENTITY("oelig", 339),
780 ENTITY("ograve", 242),
781 ENTITY("oline", 8254),
782 ENTITY("omega", 969),
783 ENTITY("omicron", 959),
784 ENTITY("oplus", 8853),
785 ENTITY("or", 8744),
786 ENTITY("ordf", 170),
787 ENTITY("ordm", 186),
788 ENTITY("oslash", 248),
789 ENTITY("otilde", 245),
790 ENTITY("otimes", 8855),
791 ENTITY("ouml", 246),
792 ENTITY("para", 182),
793 ENTITY("part", 8706),
794 ENTITY("permil", 8240),
795 ENTITY("perp", 8869),
796 ENTITY("phi", 966),
797 ENTITY("pi", 960),
798 ENTITY("piv", 982),
799 ENTITY("plusmn", 177),
800 ENTITY("pound", 163),
801 ENTITY("prime", 8242),
802 ENTITY("prod", 8719),
803 ENTITY("prop", 8733),
804 ENTITY("psi", 968),
805 ENTITY("quot", 34),
806 ENTITY("rArr", 8658),
807 ENTITY("radic", 8730),
808 ENTITY("rang", 9002),
809 ENTITY("raquo", 187),
810 ENTITY("rarr", 8594),
811 ENTITY("rceil", 8969),
812 ENTITY("rdquo", 8221),
813 ENTITY("real", 8476),
814 ENTITY("reg", 174),
815 ENTITY("rfloor", 8971),
816 ENTITY("rho", 961),
817 ENTITY("rlm", 8207),
818 ENTITY("rsaquo", 8250),
819 ENTITY("rsquo", 8217),
820 ENTITY("sbquo", 8218),
821 ENTITY("scaron", 353),
822 ENTITY("sdot", 8901),
823 ENTITY("sect", 167),
824 ENTITY("shy", 173),
825 ENTITY("sigma", 963),
826 ENTITY("sigmaf", 962),
827 ENTITY("sim", 8764),
828 ENTITY("spades", 9824),
829 ENTITY("sub", 8834),
830 ENTITY("sube", 8838),
831 ENTITY("sum", 8721),
832 ENTITY("sup", 8835),
833 ENTITY("sup1", 185),
834 ENTITY("sup2", 178),
835 ENTITY("sup3", 179),
836 ENTITY("supe", 8839),
837 ENTITY("szlig", 223),
838 ENTITY("tau", 964),
839 ENTITY("there4", 8756),
840 ENTITY("theta", 952),
841 ENTITY("thetasym", 977),
842 ENTITY("thinsp", 8201),
843 ENTITY("thorn", 254),
844 ENTITY("tilde", 732),
845 ENTITY("times", 215),
846 ENTITY("trade", 8482),
847 ENTITY("uArr", 8657),
848 ENTITY("uacute", 250),
849 ENTITY("uarr", 8593),
850 ENTITY("ucirc", 251),
851 ENTITY("ugrave", 249),
852 ENTITY("uml", 168),
853 ENTITY("upsih", 978),
854 ENTITY("upsilon", 965),
855 ENTITY("uuml", 252),
856 ENTITY("weierp", 8472),
857 ENTITY("xi", 958),
858 ENTITY("yacute", 253),
859 ENTITY("yen", 165),
860 ENTITY("yuml", 255),
861 ENTITY("zeta", 950),
862 ENTITY("zwj", 8205),
863 ENTITY("zwnj", 8204),
daa616fc 864 {NULL, 0}};
19817fd3 865 #undef ENTITY
daa616fc 866 static size_t substitutions_cnt = 0;
04dbb646 867
daa616fc
VS
868 if (substitutions_cnt == 0)
869 while (substitutions[substitutions_cnt].code != 0)
870 substitutions_cnt++;
871
3919d530
JS
872 wxHtmlEntityInfo *info = NULL;
873#ifdef __WXWINCE__
874 // bsearch crashes under WinCE for some reason
875 size_t i;
876 for (i = 0; i < substitutions_cnt; i++)
877 {
878 if (entity == substitutions[i].name)
879 {
880 info = & substitutions[i];
881 break;
882 }
883 }
884#else
19817fd3 885 info = (wxHtmlEntityInfo*) bsearch(entity.wx_str(), substitutions,
daa616fc
VS
886 substitutions_cnt,
887 sizeof(wxHtmlEntityInfo),
90350682 888 wxHtmlEntityCompare);
3919d530 889#endif
daa616fc
VS
890 if (info)
891 code = info->code;
892 }
04dbb646 893
daa616fc 894 if (code == 0)
470252df 895 return 0;
daa616fc
VS
896 else
897 return GetCharForCode(code);
898}
899
d1da8872 900wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
6cc4e6b8
VS
901 const wxString& url) const
902{
e02ecf7c 903 return m_FS ? m_FS->OpenFile(url) : NULL;
d1da8872 904
6cc4e6b8
VS
905}
906
2b5f62a0
VZ
907
908//-----------------------------------------------------------------------------
909// wxHtmlParser::ExtractCharsetInformation
910//-----------------------------------------------------------------------------
911
912class wxMetaTagParser : public wxHtmlParser
913{
914public:
2eb10e2a
VZ
915 wxMetaTagParser() { }
916
2b5f62a0 917 wxObject* GetProduct() { return NULL; }
2eb10e2a 918
2b5f62a0 919protected:
5bce3e6f 920 virtual void AddText(const wxString& WXUNUSED(txt)) {}
2eb10e2a
VZ
921
922 DECLARE_NO_COPY_CLASS(wxMetaTagParser)
2b5f62a0
VZ
923};
924
925class wxMetaTagHandler : public wxHtmlTagHandler
926{
927public:
928 wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
929 wxString GetSupportedTags() { return wxT("META,BODY"); }
930 bool HandleTag(const wxHtmlTag& tag);
931
932private:
933 wxString *m_retval;
2eb10e2a
VZ
934
935 DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
2b5f62a0
VZ
936};
937
938bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
939{
940 if (tag.GetName() == _T("BODY"))
941 {
942 m_Parser->StopParsing();
d1da8872 943 return false;
2b5f62a0
VZ
944 }
945
946 if (tag.HasParam(_T("HTTP-EQUIV")) &&
13fd234c 947 tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
2b5f62a0
VZ
948 tag.HasParam(_T("CONTENT")))
949 {
5af11a94 950 wxString content = tag.GetParam(_T("CONTENT")).Lower();
2b5f62a0
VZ
951 if (content.Left(19) == _T("text/html; charset="))
952 {
953 *m_retval = content.Mid(19);
954 m_Parser->StopParsing();
955 }
956 }
d1da8872 957 return false;
2b5f62a0
VZ
958}
959
960
961/*static*/
962wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
963{
964 wxString charset;
e7274ba2
WS
965 wxMetaTagParser *parser = new wxMetaTagParser();
966 if(parser)
967 {
968 parser->AddTagHandler(new wxMetaTagHandler(&charset));
969 parser->Parse(markup);
970 delete parser;
971 }
2b5f62a0
VZ
972 return charset;
973}
974
4609ee2e
VZ
975/* static */
976bool
977wxHtmlParser::SkipCommentTag(wxString::const_iterator& start,
978 wxString::const_iterator end)
979{
980 wxASSERT_MSG( *start == '<', _T("should be called on the tag start") );
981
982 wxString::const_iterator p = start;
983
984 // comments begin with "<!--" in HTML 4.0
95ebbfe1 985 if ( p > end - 3 || *++p != '!' || *++p != '-' || *++p != '-' )
4609ee2e
VZ
986 {
987 // not a comment at all
988 return false;
989 }
990
991 // skip the start of the comment tag in any case, if we don't find the
992 // closing tag we should ignore broken markup
993 start = p;
994
995 // comments end with "--[ \t\r\n]*>", i.e. white space is allowed between
996 // comment delimiter and the closing tag character (section 3.2.4 of
997 // http://www.w3.org/TR/html401/)
998 int dashes = 0;
999 while ( ++p < end )
1000 {
1001 const wxChar c = *p;
1002
1003 if ( (c == wxT(' ') || c == wxT('\n') ||
1004 c == wxT('\r') || c == wxT('\t')) && dashes >= 2 )
1005 {
1006 // ignore white space before potential tag end
1007 continue;
1008 }
1009
1010 if ( c == wxT('>') && dashes >= 2 )
1011 {
1012 // found end of comment
1013 start = p;
1014 break;
1015 }
1016
1017 if ( c == wxT('-') )
1018 dashes++;
1019 else
1020 dashes = 0;
1021 }
1022
1023 return true;
1024}
1025
1026#endif // wxUSE_HTML