Don't crash on malformed HTML in wxHTML font tag handler.
[wxWidgets.git] / src / html / htmltag.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/html/htmltag.cpp
3 // Purpose: wxHtmlTag class (represents single tag)
4 // Author: Vaclav Slavik
5 // RCS-ID: $Id$
6 // Copyright: (c) 1999 Vaclav Slavik
7 // Licence: wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9
10 #include "wx/wxprec.h"
11
12 #ifdef __BORLANDC__
13 #pragma hdrstop
14 #endif
15
16 #if wxUSE_HTML
17
18 #include "wx/html/htmltag.h"
19
20 #ifndef WX_PRECOMP
21 #include "wx/colour.h"
22 #include "wx/wxcrtvararg.h"
23 #endif
24
25 #include "wx/html/htmlpars.h"
26 #include "wx/html/styleparams.h"
27
28 #include "wx/vector.h"
29
30 #include <stdio.h> // for vsscanf
31 #include <stdarg.h>
32
33 //-----------------------------------------------------------------------------
34 // wxHtmlTagsCache
35 //-----------------------------------------------------------------------------
36
37 struct wxHtmlCacheItem
38 {
39 // this is "pos" value passed to wxHtmlTag's constructor.
40 // it is position of '<' character of the tag
41 wxString::const_iterator Key;
42
43 // Tag type
44 enum Type
45 {
46 Type_Normal, // normal tag with a matching ending tag
47 Type_NoMatchingEndingTag, // there's no ending tag for this tag
48 Type_EndingTag // this is ending tag </..>
49 };
50 Type type;
51
52 // end positions for the tag:
53 // end1 is '<' of ending tag,
54 // end2 is '>' or both are
55 wxString::const_iterator End1, End2;
56
57 // name of this tag
58 wxChar *Name;
59 };
60
61 // NB: this is an empty class and not typedef because of forward declaration
62 class wxHtmlTagsCacheData : public wxVector<wxHtmlCacheItem>
63 {
64 };
65
66 bool wxIsCDATAElement(const wxChar *tag)
67 {
68 return (wxStrcmp(tag, wxT("SCRIPT")) == 0) ||
69 (wxStrcmp(tag, wxT("STYLE")) == 0);
70 }
71
72 bool wxIsCDATAElement(const wxString& tag)
73 {
74 return (wxStrcmp(tag.wx_str(), wxS("SCRIPT")) == 0) ||
75 (wxStrcmp(tag.wx_str(), wxS("STYLE")) == 0);
76 }
77
78 wxHtmlTagsCache::wxHtmlTagsCache(const wxString& source)
79 {
80 m_Cache = new wxHtmlTagsCacheData;
81 m_CachePos = 0;
82
83 wxChar tagBuffer[256];
84
85 const wxString::const_iterator end = source.end();
86 for ( wxString::const_iterator pos = source.begin(); pos < end; ++pos )
87 {
88 if (*pos == wxT('<')) // tag found:
89 {
90 // don't cache comment tags
91 if ( wxHtmlParser::SkipCommentTag(pos, source.end()) )
92 continue;
93
94 size_t tg = Cache().size();
95 Cache().push_back(wxHtmlCacheItem());
96
97 wxString::const_iterator stpos = pos++;
98 Cache()[tg].Key = stpos;
99
100 int i;
101 for ( i = 0;
102 pos < end && i < (int)WXSIZEOF(tagBuffer) - 1 &&
103 *pos != wxT('>') && !wxIsspace(*pos);
104 ++i, ++pos )
105 {
106 tagBuffer[i] = (wxChar)wxToupper(*pos);
107 }
108 tagBuffer[i] = wxT('\0');
109
110 Cache()[tg].Name = new wxChar[i+1];
111 memcpy(Cache()[tg].Name, tagBuffer, (i+1)*sizeof(wxChar));
112
113 while (pos < end && *pos != wxT('>'))
114 ++pos;
115
116 if ((stpos+1) < end && *(stpos+1) == wxT('/')) // ending tag:
117 {
118 Cache()[tg].type = wxHtmlCacheItem::Type_EndingTag;
119 // find matching begin tag:
120 for (i = tg; i >= 0; i--)
121 {
122 if ((Cache()[i].type == wxHtmlCacheItem::Type_NoMatchingEndingTag) && (wxStrcmp(Cache()[i].Name, tagBuffer+1) == 0))
123 {
124 Cache()[i].type = wxHtmlCacheItem::Type_Normal;
125 Cache()[i].End1 = stpos;
126 Cache()[i].End2 = pos + 1;
127 break;
128 }
129 }
130 }
131 else
132 {
133 Cache()[tg].type = wxHtmlCacheItem::Type_NoMatchingEndingTag;
134
135 if (wxIsCDATAElement(tagBuffer))
136 {
137 // store the orig pos in case we are missing the closing
138 // tag (see below)
139 const wxString::const_iterator old_pos = pos;
140 bool foundCloseTag = false;
141
142 // find next matching tag
143 int tag_len = wxStrlen(tagBuffer);
144 while (pos < end)
145 {
146 // find the ending tag
147 while (pos + 1 < end &&
148 (*pos != '<' || *(pos+1) != '/'))
149 ++pos;
150 if (*pos == '<')
151 ++pos;
152
153 // see if it matches
154 int match_pos = 0;
155 while (pos < end && match_pos < tag_len )
156 {
157 wxChar c = *pos;
158 if ( c == '>' || c == '<' )
159 break;
160
161 // cast to wxChar needed to suppress warning in
162 // Unicode build
163 if ((wxChar)wxToupper(c) == tagBuffer[match_pos])
164 {
165 ++match_pos;
166 }
167 else if (c == wxT(' ') || c == wxT('\n') ||
168 c == wxT('\r') || c == wxT('\t'))
169 {
170 // need to skip over these
171 }
172 else
173 {
174 match_pos = 0;
175 }
176 ++pos;
177 }
178
179 // found a match
180 if (match_pos == tag_len)
181 {
182 pos = pos - tag_len - 3;
183 foundCloseTag = true;
184 break;
185 }
186 else // keep looking for the closing tag
187 {
188 ++pos;
189 }
190 }
191 if (!foundCloseTag)
192 {
193 // we didn't find closing tag; this means the markup
194 // is incorrect and the best thing we can do is to
195 // ignore the unclosed tag and continue parsing as if
196 // it didn't exist:
197 pos = old_pos;
198 }
199 }
200 }
201 }
202 }
203
204 // ok, we're done, now we'll free .Name members of cache - we don't need it anymore:
205 for ( wxHtmlTagsCacheData::iterator i = Cache().begin();
206 i != Cache().end(); ++i )
207 {
208 wxDELETEA(i->Name);
209 }
210 }
211
212 wxHtmlTagsCache::~wxHtmlTagsCache()
213 {
214 delete m_Cache;
215 }
216
217 void wxHtmlTagsCache::QueryTag(const wxString::const_iterator& at,
218 const wxString::const_iterator& inputEnd,
219 wxString::const_iterator *end1,
220 wxString::const_iterator *end2,
221 bool *hasEnding)
222 {
223 if (Cache().empty())
224 return;
225
226 if (Cache()[m_CachePos].Key != at)
227 {
228 int delta = (at < Cache()[m_CachePos].Key) ? -1 : 1;
229 do
230 {
231 m_CachePos += delta;
232
233 if ( m_CachePos < 0 || m_CachePos >= (int)Cache().size() )
234 {
235 if ( m_CachePos < 0 )
236 m_CachePos = 0;
237 else
238 m_CachePos = Cache().size() - 1;
239 // something is very wrong with HTML, give up by returning an
240 // impossibly large value which is going to be ignored by the
241 // caller
242 *end1 =
243 *end2 = inputEnd;
244 *hasEnding = true;
245 return;
246 }
247 }
248 while (Cache()[m_CachePos].Key != at);
249 }
250
251 switch ( Cache()[m_CachePos].type )
252 {
253 case wxHtmlCacheItem::Type_Normal:
254 *end1 = Cache()[m_CachePos].End1;
255 *end2 = Cache()[m_CachePos].End2;
256 *hasEnding = true;
257 break;
258
259 case wxHtmlCacheItem::Type_EndingTag:
260 wxFAIL_MSG("QueryTag called for ending tag - can't be");
261 // but if it does happen, fall through, better than crashing
262
263 case wxHtmlCacheItem::Type_NoMatchingEndingTag:
264 // If input HTML is invalid and there's no closing tag for this
265 // one, pretend that it runs all the way to the end of input
266 *end1 = inputEnd;
267 *end2 = inputEnd;
268 *hasEnding = false;
269 break;
270 }
271 }
272
273
274
275
276 //-----------------------------------------------------------------------------
277 // wxHtmlTag
278 //-----------------------------------------------------------------------------
279
280 wxHtmlTag::wxHtmlTag(wxHtmlTag *parent,
281 const wxString *source,
282 const wxString::const_iterator& pos,
283 const wxString::const_iterator& end_pos,
284 wxHtmlTagsCache *cache,
285 wxHtmlEntitiesParser *entParser)
286 {
287 /* Setup DOM relations */
288
289 m_Next = NULL;
290 m_FirstChild = m_LastChild = NULL;
291 m_Parent = parent;
292 if (parent)
293 {
294 m_Prev = m_Parent->m_LastChild;
295 if (m_Prev == NULL)
296 m_Parent->m_FirstChild = this;
297 else
298 m_Prev->m_Next = this;
299 m_Parent->m_LastChild = this;
300 }
301 else
302 m_Prev = NULL;
303
304 /* Find parameters and their values: */
305
306 wxChar c wxDUMMY_INITIALIZE(0);
307
308 // fill-in name, params and begin pos:
309 wxString::const_iterator i(pos+1);
310
311 // find tag's name and convert it to uppercase:
312 while ((i < end_pos) &&
313 ((c = *(i++)) != wxT(' ') && c != wxT('\r') &&
314 c != wxT('\n') && c != wxT('\t') &&
315 c != wxT('>') && c != wxT('/')))
316 {
317 if ((c >= wxT('a')) && (c <= wxT('z')))
318 c -= (wxT('a') - wxT('A'));
319 m_Name << c;
320 }
321
322 // if the tag has parameters, read them and "normalize" them,
323 // i.e. convert to uppercase, replace whitespaces by spaces and
324 // remove whitespaces around '=':
325 if (*(i-1) != wxT('>'))
326 {
327 #define IS_WHITE(c) (c == wxT(' ') || c == wxT('\r') || \
328 c == wxT('\n') || c == wxT('\t'))
329 wxString pname, pvalue;
330 wxChar quote;
331 enum
332 {
333 ST_BEFORE_NAME = 1,
334 ST_NAME,
335 ST_BEFORE_EQ,
336 ST_BEFORE_VALUE,
337 ST_VALUE
338 } state;
339
340 quote = 0;
341 state = ST_BEFORE_NAME;
342 while (i < end_pos)
343 {
344 c = *(i++);
345
346 if (c == wxT('>') && !(state == ST_VALUE && quote != 0))
347 {
348 if (state == ST_BEFORE_EQ || state == ST_NAME)
349 {
350 m_ParamNames.Add(pname);
351 m_ParamValues.Add(wxGetEmptyString());
352 }
353 else if (state == ST_VALUE && quote == 0)
354 {
355 m_ParamNames.Add(pname);
356 if (entParser)
357 m_ParamValues.Add(entParser->Parse(pvalue));
358 else
359 m_ParamValues.Add(pvalue);
360 }
361 break;
362 }
363 switch (state)
364 {
365 case ST_BEFORE_NAME:
366 if (!IS_WHITE(c))
367 {
368 pname = c;
369 state = ST_NAME;
370 }
371 break;
372 case ST_NAME:
373 if (IS_WHITE(c))
374 state = ST_BEFORE_EQ;
375 else if (c == wxT('='))
376 state = ST_BEFORE_VALUE;
377 else
378 pname << c;
379 break;
380 case ST_BEFORE_EQ:
381 if (c == wxT('='))
382 state = ST_BEFORE_VALUE;
383 else if (!IS_WHITE(c))
384 {
385 m_ParamNames.Add(pname);
386 m_ParamValues.Add(wxGetEmptyString());
387 pname = c;
388 state = ST_NAME;
389 }
390 break;
391 case ST_BEFORE_VALUE:
392 if (!IS_WHITE(c))
393 {
394 if (c == wxT('"') || c == wxT('\''))
395 quote = c, pvalue = wxGetEmptyString();
396 else
397 quote = 0, pvalue = c;
398 state = ST_VALUE;
399 }
400 break;
401 case ST_VALUE:
402 if ((quote != 0 && c == quote) ||
403 (quote == 0 && IS_WHITE(c)))
404 {
405 m_ParamNames.Add(pname);
406 if (quote == 0)
407 {
408 // VS: backward compatibility, no real reason,
409 // but wxHTML code relies on this... :(
410 pvalue.MakeUpper();
411 }
412 if (entParser)
413 m_ParamValues.Add(entParser->Parse(pvalue));
414 else
415 m_ParamValues.Add(pvalue);
416 state = ST_BEFORE_NAME;
417 }
418 else
419 pvalue << c;
420 break;
421 }
422 }
423
424 #undef IS_WHITE
425 }
426 m_Begin = i;
427 cache->QueryTag(pos, source->end(), &m_End1, &m_End2, &m_hasEnding);
428 if (m_End1 > end_pos) m_End1 = end_pos;
429 if (m_End2 > end_pos) m_End2 = end_pos;
430
431 #if WXWIN_COMPATIBILITY_2_8
432 m_sourceStart = source->begin();
433 #endif
434
435 // Try to parse any style parameters that can be handled simply by
436 // converting them to the equivalent HTML 3 attributes: this is a far cry
437 // from perfect but better than nothing.
438 static const struct EquivAttr
439 {
440 const char *style;
441 const char *attr;
442 } equivAttrs[] =
443 {
444 { "text-align", "ALIGN" },
445 { "width", "WIDTH" },
446 { "vertical-align", "VALIGN" },
447 { "background", "BGCOLOR" },
448 };
449
450 wxHtmlStyleParams styleParams(*this);
451 for ( unsigned n = 0; n < WXSIZEOF(equivAttrs); n++ )
452 {
453 const EquivAttr& ea = equivAttrs[n];
454 if ( styleParams.HasParam(ea.style) && !HasParam(ea.attr) )
455 {
456 m_ParamNames.Add(ea.attr);
457 m_ParamValues.Add(styleParams.GetParam(ea.style));
458 }
459 }
460 }
461
462 wxHtmlTag::~wxHtmlTag()
463 {
464 wxHtmlTag *t1, *t2;
465 t1 = m_FirstChild;
466 while (t1)
467 {
468 t2 = t1->GetNextSibling();
469 delete t1;
470 t1 = t2;
471 }
472 }
473
474 bool wxHtmlTag::HasParam(const wxString& par) const
475 {
476 return (m_ParamNames.Index(par, false) != wxNOT_FOUND);
477 }
478
479 wxString wxHtmlTag::GetParam(const wxString& par, bool with_quotes) const
480 {
481 int index = m_ParamNames.Index(par, false);
482 if (index == wxNOT_FOUND)
483 return wxGetEmptyString();
484 if (with_quotes)
485 {
486 // VS: backward compatibility, seems to be never used by wxHTML...
487 wxString s;
488 s << wxT('"') << m_ParamValues[index] << wxT('"');
489 return s;
490 }
491 else
492 return m_ParamValues[index];
493 }
494
495 int wxHtmlTag::ScanParam(const wxString& par,
496 const char *format,
497 void *param) const
498 {
499 wxString parval = GetParam(par);
500 return wxSscanf(parval, format, param);
501 }
502
503 int wxHtmlTag::ScanParam(const wxString& par,
504 const wchar_t *format,
505 void *param) const
506 {
507 wxString parval = GetParam(par);
508 return wxSscanf(parval, format, param);
509 }
510
511 /* static */
512 bool wxHtmlTag::ParseAsColour(const wxString& str, wxColour *clr)
513 {
514 wxCHECK_MSG( clr, false, wxT("invalid colour argument") );
515
516 // handle colours defined in HTML 4.0 first:
517 if (str.length() > 1 && str[0] != wxT('#'))
518 {
519 #define HTML_COLOUR(name, r, g, b) \
520 if (str.IsSameAs(wxS(name), false)) \
521 { clr->Set(r, g, b); return true; }
522 HTML_COLOUR("black", 0x00,0x00,0x00)
523 HTML_COLOUR("silver", 0xC0,0xC0,0xC0)
524 HTML_COLOUR("gray", 0x80,0x80,0x80)
525 HTML_COLOUR("white", 0xFF,0xFF,0xFF)
526 HTML_COLOUR("maroon", 0x80,0x00,0x00)
527 HTML_COLOUR("red", 0xFF,0x00,0x00)
528 HTML_COLOUR("purple", 0x80,0x00,0x80)
529 HTML_COLOUR("fuchsia", 0xFF,0x00,0xFF)
530 HTML_COLOUR("green", 0x00,0x80,0x00)
531 HTML_COLOUR("lime", 0x00,0xFF,0x00)
532 HTML_COLOUR("olive", 0x80,0x80,0x00)
533 HTML_COLOUR("yellow", 0xFF,0xFF,0x00)
534 HTML_COLOUR("navy", 0x00,0x00,0x80)
535 HTML_COLOUR("blue", 0x00,0x00,0xFF)
536 HTML_COLOUR("teal", 0x00,0x80,0x80)
537 HTML_COLOUR("aqua", 0x00,0xFF,0xFF)
538 #undef HTML_COLOUR
539 }
540
541 // then try to parse #rrggbb representations or set from other well
542 // known names (note that this doesn't strictly conform to HTML spec,
543 // but it doesn't do real harm -- but it *must* be done after the standard
544 // colors are handled above):
545 if (clr->Set(str))
546 return true;
547
548 return false;
549 }
550
551 bool wxHtmlTag::GetParamAsColour(const wxString& par, wxColour *clr) const
552 {
553 const wxString str = GetParam(par);
554 return !str.empty() && ParseAsColour(str, clr);
555 }
556
557 bool wxHtmlTag::GetParamAsInt(const wxString& par, int *clr) const
558 {
559 if ( !HasParam(par) )
560 return false;
561
562 long i;
563 if ( !GetParam(par).ToLong(&i) )
564 return false;
565
566 *clr = (int)i;
567 return true;
568 }
569
570 wxString wxHtmlTag::GetAllParams() const
571 {
572 // VS: this function is for backward compatibility only,
573 // never used by wxHTML
574 wxString s;
575 size_t cnt = m_ParamNames.GetCount();
576 for (size_t i = 0; i < cnt; i++)
577 {
578 s << m_ParamNames[i];
579 s << wxT('=');
580 if (m_ParamValues[i].Find(wxT('"')) != wxNOT_FOUND)
581 s << wxT('\'') << m_ParamValues[i] << wxT('\'');
582 else
583 s << wxT('"') << m_ParamValues[i] << wxT('"');
584 }
585 return s;
586 }
587
588 wxHtmlTag *wxHtmlTag::GetFirstSibling() const
589 {
590 if (m_Parent)
591 return m_Parent->m_FirstChild;
592 else
593 {
594 wxHtmlTag *cur = (wxHtmlTag*)this;
595 while (cur->m_Prev)
596 cur = cur->m_Prev;
597 return cur;
598 }
599 }
600
601 wxHtmlTag *wxHtmlTag::GetLastSibling() const
602 {
603 if (m_Parent)
604 return m_Parent->m_LastChild;
605 else
606 {
607 wxHtmlTag *cur = (wxHtmlTag*)this;
608 while (cur->m_Next)
609 cur = cur->m_Next;
610 return cur;
611 }
612 }
613
614 wxHtmlTag *wxHtmlTag::GetNextTag() const
615 {
616 if (m_FirstChild) return m_FirstChild;
617 if (m_Next) return m_Next;
618 wxHtmlTag *cur = m_Parent;
619 if (!cur) return NULL;
620 while (cur->m_Parent && !cur->m_Next)
621 cur = cur->m_Parent;
622 return cur->m_Next;
623 }
624
625 #endif