fixed wxHTML parsing to run in O(n) even in UTF8 build
[wxWidgets.git] / src / html / htmltag.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmltag.cpp
3 // Purpose: wxHtmlTag class (represents single tag)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML
17
18 #include "wx/html/htmltag.h"
19
20 #ifndef WX_PRECOMP
21 #include "wx/colour.h"
22 #include "wx/wxcrtvararg.h"
23 #endif
24
25 #include "wx/html/htmlpars.h"
26 #include "wx/vector.h"
27
28 #include <stdio.h> // for vsscanf
29 #include <stdarg.h>
30
31 //-----------------------------------------------------------------------------
32 // wxHtmlTagsCache
33 //-----------------------------------------------------------------------------
34
35 struct wxHtmlCacheItem
36 {
37 // this is "pos" value passed to wxHtmlTag's constructor.
38 // it is position of '<' character of the tag
39 wxString::const_iterator Key;
40
41 // Tag type
42 enum Type
43 {
44 Type_Normal, // normal tag with a matching ending tag
45 Type_NoMatchingEndingTag, // there's no ending tag for this tag
46 Type_EndingTag // this is ending tag </..>
47 };
48 Type type;
49
50 // end positions for the tag:
51 // end1 is '<' of ending tag,
52 // end2 is '>' or both are
53 wxString::const_iterator End1, End2;
54
55 // name of this tag
56 wxChar *Name;
57 };
58
59 // NB: this is an empty class and not typedef because of forward declaration
60 class wxHtmlTagsCacheData : public wxVector<wxHtmlCacheItem>
61 {
62 };
63
64 bool wxIsCDATAElement(const wxChar *tag)
65 {
66 return (wxStrcmp(tag, _T("SCRIPT")) == 0) ||
67 (wxStrcmp(tag, _T("STYLE")) == 0);
68 }
69
70 bool wxIsCDATAElement(const wxString& tag)
71 {
72 return (wxStrcmp(tag.wx_str(), wxSTRING_TEXT("SCRIPT")) == 0) ||
73 (wxStrcmp(tag.wx_str(), wxSTRING_TEXT("STYLE")) == 0);
74 }
75
76 wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
77 {
78 m_Cache = new wxHtmlTagsCacheData;
79 m_CachePos = 0;
80
81 wxChar tagBuffer[256];
82
83 const wxString::const_iterator end = source.end();
84 for ( wxString::const_iterator pos = source.begin(); pos < end; ++pos )
85 {
86 if (*pos == wxT('<')) // tag found:
87 {
88 // don't cache comment tags
89 if ( wxHtmlParser::SkipCommentTag(pos, source.end()) )
90 continue;
91
92 size_t tg = Cache().size();
93 Cache().push_back(wxHtmlCacheItem());
94
95 wxString::const_iterator stpos = pos++;
96 Cache()[tg].Key = stpos;
97
98 int i;
99 for ( i = 0;
100 pos < end && i < (int)WXSIZEOF(tagBuffer) - 1 &&
101 *pos != wxT('>') && !wxIsspace(*pos);
102 ++i, ++pos )
103 {
104 tagBuffer[i] = (wxChar)wxToupper(*pos);
105 }
106 tagBuffer[i] = _T('\0');
107
108 Cache()[tg].Name = new wxChar[i+1];
109 memcpy(Cache()[tg].Name, tagBuffer, (i+1)*sizeof(wxChar));
110
111 while (pos < end && *pos != wxT('>'))
112 ++pos;
113
114 if ((stpos+1) < end && *(stpos+1) == wxT('/')) // ending tag:
115 {
116 Cache()[tg].type = wxHtmlCacheItem::Type_EndingTag;
117 // find matching begin tag:
118 for (i = tg; i >= 0; i--)
119 {
120 if ((Cache()[i].type == wxHtmlCacheItem::Type_NoMatchingEndingTag) && (wxStrcmp(Cache()[i].Name, tagBuffer+1) == 0))
121 {
122 Cache()[i].type = wxHtmlCacheItem::Type_Normal;
123 Cache()[i].End1 = stpos;
124 Cache()[i].End2 = pos + 1;
125 break;
126 }
127 }
128 }
129 else
130 {
131 Cache()[tg].type = wxHtmlCacheItem::Type_NoMatchingEndingTag;
132
133 if (wxIsCDATAElement(tagBuffer))
134 {
135 // store the orig pos in case we are missing the closing
136 // tag (see below)
137 const wxString::const_iterator old_pos = pos;
138 bool foundCloseTag = false;
139
140 // find next matching tag
141 int tag_len = wxStrlen(tagBuffer);
142 while (pos < end)
143 {
144 // find the ending tag
145 while (pos + 1 < end &&
146 (*pos != '<' || *(pos+1) != '/'))
147 ++pos;
148 if (*pos == '<')
149 ++pos;
150
151 // see if it matches
152 int match_pos = 0;
153 while (pos < end && match_pos < tag_len )
154 {
155 wxChar c = *pos;
156 if ( c == '>' || c == '<' )
157 break;
158
159 // cast to wxChar needed to suppress warning in
160 // Unicode build
161 if ((wxChar)wxToupper(c) == tagBuffer[match_pos])
162 {
163 ++match_pos;
164 }
165 else if (c == wxT(' ') || c == wxT('\n') ||
166 c == wxT('\r') || c == wxT('\t'))
167 {
168 // need to skip over these
169 }
170 else
171 {
172 match_pos = 0;
173 }
174 ++pos;
175 }
176
177 // found a match
178 if (match_pos == tag_len)
179 {
180 pos = pos - tag_len - 3;
181 foundCloseTag = true;
182 break;
183 }
184 else // keep looking for the closing tag
185 {
186 ++pos;
187 }
188 }
189 if (!foundCloseTag)
190 {
191 // we didn't find closing tag; this means the markup
192 // is incorrect and the best thing we can do is to
193 // ignore the unclosed tag and continue parsing as if
194 // it didn't exist:
195 pos = old_pos;
196 }
197 }
198 }
199 }
200 }
201
202 // ok, we're done, now we'll free .Name members of cache - we don't need it anymore:
203 for ( wxHtmlTagsCacheData::iterator i = Cache().begin();
204 i != Cache().end(); ++i )
205 {
206 delete[] i->Name;
207 i->Name = NULL;
208 }
209 }
210
211 wxHtmlTagsCache::~wxHtmlTagsCache()
212 {
213 delete m_Cache;
214 }
215
216 void wxHtmlTagsCache::QueryTag(const wxString::const_iterator& at,
217 const wxString::const_iterator& inputEnd,
218 wxString::const_iterator *end1,
219 wxString::const_iterator *end2,
220 bool *hasEnding)
221 {
222 if (Cache().empty())
223 return;
224
225 if (Cache()[m_CachePos].Key != at)
226 {
227 int delta = (at < Cache()[m_CachePos].Key) ? -1 : 1;
228 do
229 {
230 m_CachePos += delta;
231
232 if ( m_CachePos < 0 || m_CachePos >= (int)Cache().size() )
233 {
234 if ( m_CachePos < 0 )
235 m_CachePos = 0;
236 else
237 m_CachePos = Cache().size() - 1;
238 // something is very wrong with HTML, give up by returning an
239 // impossibly large value which is going to be ignored by the
240 // caller
241 *end1 =
242 *end2 = inputEnd;
243 *hasEnding = true;
244 return;
245 }
246 }
247 while (Cache()[m_CachePos].Key != at);
248 }
249 *end1 = Cache()[m_CachePos].End1;
250 *end2 = Cache()[m_CachePos].End2;
251 *hasEnding = (Cache()[m_CachePos].type == wxHtmlCacheItem::Type_Normal);
252 }
253
254
255
256
257 //-----------------------------------------------------------------------------
258 // wxHtmlTag
259 //-----------------------------------------------------------------------------
260
261 wxHtmlTag::wxHtmlTag(wxHtmlTag *parent,
262 const wxString *source,
263 const wxString::const_iterator& pos,
264 const wxString::const_iterator& end_pos,
265 wxHtmlTagsCache *cache,
266 wxHtmlEntitiesParser *entParser)
267 {
268 /* Setup DOM relations */
269
270 m_Next = NULL;
271 m_FirstChild = m_LastChild = NULL;
272 m_Parent = parent;
273 if (parent)
274 {
275 m_Prev = m_Parent->m_LastChild;
276 if (m_Prev == NULL)
277 m_Parent->m_FirstChild = this;
278 else
279 m_Prev->m_Next = this;
280 m_Parent->m_LastChild = this;
281 }
282 else
283 m_Prev = NULL;
284
285 /* Find parameters and their values: */
286
287 wxChar c;
288
289 // fill-in name, params and begin pos:
290 wxString::const_iterator i(pos+1);
291
292 // find tag's name and convert it to uppercase:
293 while ((i < end_pos) &&
294 ((c = *(i++)) != wxT(' ') && c != wxT('\r') &&
295 c != wxT('\n') && c != wxT('\t') &&
296 c != wxT('>')))
297 {
298 if ((c >= wxT('a')) && (c <= wxT('z')))
299 c -= (wxT('a') - wxT('A'));
300 m_Name << c;
301 }
302
303 // if the tag has parameters, read them and "normalize" them,
304 // i.e. convert to uppercase, replace whitespaces by spaces and
305 // remove whitespaces around '=':
306 if (*(i-1) != wxT('>'))
307 {
308 #define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \
309 c == wxT('\n') || c == wxT('\t'))
310 wxString pname, pvalue;
311 wxChar quote;
312 enum
313 {
314 ST_BEFORE_NAME = 1,
315 ST_NAME,
316 ST_BEFORE_EQ,
317 ST_BEFORE_VALUE,
318 ST_VALUE
319 } state;
320
321 quote = 0;
322 state = ST_BEFORE_NAME;
323 while (i < end_pos)
324 {
325 c = *(i++);
326
327 if (c == wxT('>') && !(state == ST_VALUE && quote != 0))
328 {
329 if (state == ST_BEFORE_EQ || state == ST_NAME)
330 {
331 m_ParamNames.Add(pname);
332 m_ParamValues.Add(wxGetEmptyString());
333 }
334 else if (state == ST_VALUE && quote == 0)
335 {
336 m_ParamNames.Add(pname);
337 if (entParser)
338 m_ParamValues.Add(entParser->Parse(pvalue));
339 else
340 m_ParamValues.Add(pvalue);
341 }
342 break;
343 }
344 switch (state)
345 {
346 case ST_BEFORE_NAME:
347 if (!IS_WHITE(c))
348 {
349 pname = c;
350 state = ST_NAME;
351 }
352 break;
353 case ST_NAME:
354 if (IS_WHITE(c))
355 state = ST_BEFORE_EQ;
356 else if (c == wxT('='))
357 state = ST_BEFORE_VALUE;
358 else
359 pname << c;
360 break;
361 case ST_BEFORE_EQ:
362 if (c == wxT('='))
363 state = ST_BEFORE_VALUE;
364 else if (!IS_WHITE(c))
365 {
366 m_ParamNames.Add(pname);
367 m_ParamValues.Add(wxGetEmptyString());
368 pname = c;
369 state = ST_NAME;
370 }
371 break;
372 case ST_BEFORE_VALUE:
373 if (!IS_WHITE(c))
374 {
375 if (c == wxT('"') || c == wxT('\''))
376 quote = c, pvalue = wxGetEmptyString();
377 else
378 quote = 0, pvalue = c;
379 state = ST_VALUE;
380 }
381 break;
382 case ST_VALUE:
383 if ((quote != 0 && c == quote) ||
384 (quote == 0 && IS_WHITE(c)))
385 {
386 m_ParamNames.Add(pname);
387 if (quote == 0)
388 {
389 // VS: backward compatibility, no real reason,
390 // but wxHTML code relies on this... :(
391 pvalue.MakeUpper();
392 }
393 if (entParser)
394 m_ParamValues.Add(entParser->Parse(pvalue));
395 else
396 m_ParamValues.Add(pvalue);
397 state = ST_BEFORE_NAME;
398 }
399 else
400 pvalue << c;
401 break;
402 }
403 }
404
405 #undef IS_WHITE
406 }
407 m_Begin = i;
408 cache->QueryTag(pos, source->end(), &m_End1, &m_End2, &m_hasEnding);
409 if (m_End1 > end_pos) m_End1 = end_pos;
410 if (m_End2 > end_pos) m_End2 = end_pos;
411
412 #if WXWIN_COMPATIBILITY_2_8
413 m_sourceStart = source->begin();
414 #endif
415 }
416
417 wxHtmlTag::~wxHtmlTag()
418 {
419 wxHtmlTag *t1, *t2;
420 t1 = m_FirstChild;
421 while (t1)
422 {
423 t2 = t1->GetNextSibling();
424 delete t1;
425 t1 = t2;
426 }
427 }
428
429 bool wxHtmlTag::HasParam(const wxString& par) const
430 {
431 return (m_ParamNames.Index(par, false) != wxNOT_FOUND);
432 }
433
434 wxString wxHtmlTag::GetParam(const wxString& par, bool with_commas) const
435 {
436 int index = m_ParamNames.Index(par, false);
437 if (index == wxNOT_FOUND)
438 return wxGetEmptyString();
439 if (with_commas)
440 {
441 // VS: backward compatibility, seems to be never used by wxHTML...
442 wxString s;
443 s << wxT('"') << m_ParamValues[index] << wxT('"');
444 return s;
445 }
446 else
447 return m_ParamValues[index];
448 }
449
450 int wxHtmlTag::ScanParam(const wxString& par,
451 const char *format,
452 void *param) const
453 {
454 wxString parval = GetParam(par);
455 return wxSscanf(parval, format, param);
456 }
457
458 int wxHtmlTag::ScanParam(const wxString& par,
459 const wchar_t *format,
460 void *param) const
461 {
462 wxString parval = GetParam(par);
463 return wxSscanf(parval, format, param);
464 }
465
466 bool wxHtmlTag::GetParamAsColour(const wxString& par, wxColour *clr) const
467 {
468 wxCHECK_MSG( clr, false, _T("invalid colour argument") );
469
470 wxString str = GetParam(par);
471
472 // handle colours defined in HTML 4.0 first:
473 if (str.length() > 1 && str[0] != _T('#'))
474 {
475 #define HTML_COLOUR(name, r, g, b) \
476 if (str.IsSameAs(wxSTRING_TEXT(name), false)) \
477 { clr->Set(r, g, b); return true; }
478 HTML_COLOUR("black", 0x00,0x00,0x00)
479 HTML_COLOUR("silver", 0xC0,0xC0,0xC0)
480 HTML_COLOUR("gray", 0x80,0x80,0x80)
481 HTML_COLOUR("white", 0xFF,0xFF,0xFF)
482 HTML_COLOUR("maroon", 0x80,0x00,0x00)
483 HTML_COLOUR("red", 0xFF,0x00,0x00)
484 HTML_COLOUR("purple", 0x80,0x00,0x80)
485 HTML_COLOUR("fuchsia", 0xFF,0x00,0xFF)
486 HTML_COLOUR("green", 0x00,0x80,0x00)
487 HTML_COLOUR("lime", 0x00,0xFF,0x00)
488 HTML_COLOUR("olive", 0x80,0x80,0x00)
489 HTML_COLOUR("yellow", 0xFF,0xFF,0x00)
490 HTML_COLOUR("navy", 0x00,0x00,0x80)
491 HTML_COLOUR("blue", 0x00,0x00,0xFF)
492 HTML_COLOUR("teal", 0x00,0x80,0x80)
493 HTML_COLOUR("aqua", 0x00,0xFF,0xFF)
494 #undef HTML_COLOUR
495 }
496
497 // then try to parse #rrggbb representations or set from other well
498 // known names (note that this doesn't strictly conform to HTML spec,
499 // but it doesn't do real harm -- but it *must* be done after the standard
500 // colors are handled above):
501 if (clr->Set(str))
502 return true;
503
504 return false;
505 }
506
507 bool wxHtmlTag::GetParamAsInt(const wxString& par, int *clr) const
508 {
509 if (!HasParam(par)) return false;
510 long i;
511 bool succ = GetParam(par).ToLong(&i);
512 *clr = (int)i;
513 return succ;
514 }
515
516 wxString wxHtmlTag::GetAllParams() const
517 {
518 // VS: this function is for backward compatibility only,
519 // never used by wxHTML
520 wxString s;
521 size_t cnt = m_ParamNames.GetCount();
522 for (size_t i = 0; i < cnt; i++)
523 {
524 s << m_ParamNames[i];
525 s << wxT('=');
526 if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND)
527 s << wxT('\'') << m_ParamValues[i] << wxT('\'');
528 else
529 s << wxT('"') << m_ParamValues[i] << wxT('"');
530 }
531 return s;
532 }
533
534 wxHtmlTag *wxHtmlTag::GetFirstSibling() const
535 {
536 if (m_Parent)
537 return m_Parent->m_FirstChild;
538 else
539 {
540 wxHtmlTag *cur = (wxHtmlTag*)this;
541 while (cur->m_Prev)
542 cur = cur->m_Prev;
543 return cur;
544 }
545 }
546
547 wxHtmlTag *wxHtmlTag::GetLastSibling() const
548 {
549 if (m_Parent)
550 return m_Parent->m_LastChild;
551 else
552 {
553 wxHtmlTag *cur = (wxHtmlTag*)this;
554 while (cur->m_Next)
555 cur = cur->m_Next;
556 return cur;
557 }
558 }
559
560 wxHtmlTag *wxHtmlTag::GetNextTag() const
561 {
562 if (m_FirstChild) return m_FirstChild;
563 if (m_Next) return m_Next;
564 wxHtmlTag *cur = m_Parent;
565 if (!cur) return NULL;
566 while (cur->m_Parent && !cur->m_Next)
567 cur = cur->m_Parent;
568 return cur->m_Next;
569 }
570
571 #endif