]> git.saurik.com Git - wxWidgets.git/blob - src/common/markupparser.cpp
Implement GetSelectedHTML for the ie and gtk webkit backends and document.
[wxWidgets.git] / src / common / markupparser.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/markupparser.cpp
3 // Purpose: Implementation of wxMarkupParser.
4 // Author: Vadim Zeitlin
5 // Created: 2011-02-16
6 // RCS-ID: $Id: $
7 // Copyright: (c) 2011 Vadim Zeitlin <vadim@wxwidgets.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #if wxUSE_MARKUP
27
28 #ifndef WX_PRECOMP
29 #include "wx/log.h"
30 #endif
31
32 #include "wx/private/markupparser.h"
33
34 #include "wx/stack.h"
35
36 namespace
37 {
38
39 // ----------------------------------------------------------------------------
40 // constants
41 // ----------------------------------------------------------------------------
42
// Description of one of the entities predefined by XML 1.0.
struct XMLEntity
{
    // Entity name, without the leading '&' and the trailing ';'.
    const char *name;

    // Length of the name, must be kept equal to strlen(name).
    int len;

    // The character which the entity stands for.
    char value;
};

// Table of all the predefined XML 1.0 entities, it is used both for
// recognizing the entities when parsing and for escaping special characters
// when quoting.
const XMLEntity xmlEntities[] =
{
    { "lt",   2, '<'  },
    { "gt",   2, '>'  },
    { "amp",  3, '&'  },
    { "apos", 4, '\'' },
    { "quot", 4, '"'  },
};
57
58 // ----------------------------------------------------------------------------
59 // helper functions
60 // ----------------------------------------------------------------------------
61
62 wxString
63 ExtractUntil(char ch, wxString::const_iterator& it, wxString::const_iterator end)
64 {
65 wxString str;
66 for ( ; it != end; ++it )
67 {
68 if ( *it == ch )
69 return str;
70
71 str += *it;
72 }
73
74 // Return empty string to indicate that we didn't find ch at all.
75 return wxString();
76 }
77
78 } // anonymous namespace
79
80 // ============================================================================
81 // wxMarkupParser implementation
82 // ============================================================================
83
84 wxString
85 wxMarkupParser::ParseAttrs(wxString attrs, TagAndAttrs& tagAndAttrs)
86 {
87 if ( tagAndAttrs.name.CmpNoCase("span") != 0 && !attrs.empty() )
88 {
89 return wxString::Format("tag \"%s\" can't have attributes",
90 tagAndAttrs.name);
91 }
92
93 // TODO: Parse more attributes described at
94 // http://library.gnome.org/devel/pango/stable/PangoMarkupFormat.html
95 // and at least ignore them gracefully instead of giving errors (but
96 // quite a few of them could be supported as well, notable font_desc).
97
98 wxMarkupSpanAttributes& spanAttrs = tagAndAttrs.attrs;
99
100 while ( !attrs.empty() )
101 {
102 wxString rest;
103 const wxString attr = attrs.BeforeFirst(' ', &rest);
104 attrs = rest;
105
106 // The "original" versions are used for error messages only.
107 wxString valueOrig;
108 const wxString nameOrig = attr.BeforeFirst('=', &valueOrig);
109
110 const wxString name = nameOrig.Lower();
111 wxString value = valueOrig.Lower();
112
113 // All attributes values must be quoted.
114 if ( value.length() < 2 ||
115 (value[0] != value.Last()) ||
116 (value[0] != '"' && value[0] != '\'') )
117 {
118 return wxString::Format("bad quoting for value of \"%s\"",
119 nameOrig);
120 }
121
122 value.assign(value, 1, value.length() - 2);
123
124 if ( name == "foreground" || name == "fgcolor" || name == "color" )
125 {
126 spanAttrs.m_fgCol = value;
127 }
128 else if ( name == "background" || name == "bgcolor" )
129 {
130 spanAttrs.m_bgCol = value;
131 }
132 else if ( name == "font_family" || name == "face" )
133 {
134 spanAttrs.m_fontFace = value;
135 }
136 else if ( name == "font_weight" || name == "weight" )
137 {
138 unsigned long weight;
139
140 if ( value == "ultralight" || value == "light" || value == "normal" )
141 spanAttrs.m_isBold = wxMarkupSpanAttributes::No;
142 else if ( value == "bold" || value == "ultrabold" || value == "heavy" )
143 spanAttrs.m_isBold = wxMarkupSpanAttributes::Yes;
144 else if ( value.ToULong(&weight) )
145 spanAttrs.m_isBold = weight >= 600 ? wxMarkupSpanAttributes::Yes
146 : wxMarkupSpanAttributes::No;
147 else
148 return wxString::Format("invalid font weight \"%s\"", valueOrig);
149 }
150 else if ( name == "font_style" || name == "style" )
151 {
152 if ( value == "normal" )
153 spanAttrs.m_isItalic = wxMarkupSpanAttributes::No;
154 else if ( value == "oblique" || value == "italic" )
155 spanAttrs.m_isItalic = wxMarkupSpanAttributes::Yes;
156 else
157 return wxString::Format("invalid font style \"%s\"", valueOrig);
158 }
159 else if ( name == "size" )
160 {
161 unsigned long size;
162 if ( value.ToULong(&size) )
163 {
164 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_PointParts;
165 spanAttrs.m_fontSize = size;
166 }
167 else if ( value == "smaller" || value == "larger" )
168 {
169 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_Relative;
170 spanAttrs.m_fontSize = value == "smaller" ? -1 : +1;
171 }
172 else // Must be a CSS-like size specification
173 {
174 int cssSize = 1;
175 wxString rest;
176 if ( value.StartsWith("xx-", &rest) )
177 cssSize = 3;
178 else if ( value.StartsWith("x-", &rest) )
179 cssSize = 2;
180 else if ( value == "medium" )
181 cssSize = 0;
182 else
183 rest = value;
184
185 if ( cssSize != 0 )
186 {
187 if ( rest == "small" )
188 cssSize = -cssSize;
189 else if ( rest != "large" )
190 return wxString::Format("invalid font size \"%s\"",
191 valueOrig);
192 }
193
194 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_Symbolic;
195 spanAttrs.m_fontSize = cssSize;
196 }
197 }
198 }
199
200 return wxString();
201 }
202
203 bool wxMarkupParser::OutputTag(const TagAndAttrs& tagAndAttrs, bool start)
204 {
205 if ( tagAndAttrs.name.CmpNoCase("span") == 0 )
206 {
207 if ( start )
208 m_output.OnSpanStart(tagAndAttrs.attrs);
209 else
210 m_output.OnSpanEnd(tagAndAttrs.attrs);
211
212 return true;
213 }
214 else // non-span tag
215 {
216 static const struct TagHandler
217 {
218 const char *name;
219 void (wxMarkupParserOutput::*startFunc)();
220 void (wxMarkupParserOutput::*endFunc)();
221 } tagHandlers[] =
222 {
223 { "b", &wxMarkupParserOutput::OnBoldStart,
224 &wxMarkupParserOutput::OnBoldEnd },
225 { "i", &wxMarkupParserOutput::OnItalicStart,
226 &wxMarkupParserOutput::OnItalicEnd },
227 { "u", &wxMarkupParserOutput::OnUnderlinedStart,
228 &wxMarkupParserOutput::OnUnderlinedEnd },
229 { "s", &wxMarkupParserOutput::OnStrikethroughStart,
230 &wxMarkupParserOutput::OnStrikethroughEnd },
231 { "big", &wxMarkupParserOutput::OnBigStart,
232 &wxMarkupParserOutput::OnBigEnd },
233 { "small", &wxMarkupParserOutput::OnSmallStart,
234 &wxMarkupParserOutput::OnSmallEnd },
235 { "tt", &wxMarkupParserOutput::OnTeletypeStart,
236 &wxMarkupParserOutput::OnTeletypeEnd },
237 };
238
239 for ( unsigned n = 0; n < WXSIZEOF(tagHandlers); n++ )
240 {
241 const TagHandler& h = tagHandlers[n];
242
243 if ( tagAndAttrs.name.CmpNoCase(h.name) == 0 )
244 {
245 if ( start )
246 (m_output.*(h.startFunc))();
247 else
248 (m_output.*(h.endFunc))();
249
250 return true;
251 }
252 }
253 }
254
255 // Unknown tag name.
256 return false;
257 }
258
259 bool wxMarkupParser::Parse(const wxString& text)
260 {
261 // The stack containing the names and corresponding attributes (which are
262 // actually only used for <span> tags) of all of the currently opened tag
263 // or none if we're not inside any tag.
264 wxStack<TagAndAttrs> tags;
265
266 // Current run of text.
267 wxString current;
268
269 const wxString::const_iterator end = text.end();
270 for ( wxString::const_iterator it = text.begin(); it != end; ++it )
271 {
272 switch ( (*it).GetValue() )
273 {
274 case '<':
275 {
276 // Flush the text preceding the tag, if any.
277 if ( !current.empty() )
278 {
279 m_output.OnText(current);
280 current.clear();
281 }
282
283 // Remember the tag starting position for the error
284 // messages.
285 const size_t pos = it - text.begin();
286
287 bool start = true;
288 if ( ++it != end && *it == '/' )
289 {
290 start = false;
291 ++it;
292 }
293
294 const wxString tag = ExtractUntil('>', it, end);
295 if ( tag.empty() )
296 {
297 wxLogDebug("%s at %lu.",
298 it == end ? "Unclosed tag starting"
299 : "Empty tag",
300 pos);
301 return false;
302 }
303
304 if ( start )
305 {
306 wxString attrs;
307 const wxString name = tag.BeforeFirst(' ', &attrs);
308
309 TagAndAttrs tagAndAttrs(name);
310 const wxString err = ParseAttrs(attrs, tagAndAttrs);
311 if ( !err.empty() )
312 {
313 wxLogDebug("Bad attributes for \"%s\" "
314 "at %lu: %s.",
315 name, pos, err);
316 return false;
317 }
318
319 tags.push(tagAndAttrs);
320 }
321 else // end tag
322 {
323 if ( tags.empty() || tags.top().name != tag )
324 {
325 wxLogDebug("Unmatched closing tag \"%s\" at %lu.",
326 tag, pos);
327 return false;
328 }
329 }
330
331 if ( !OutputTag(tags.top(), start) )
332 {
333 wxLogDebug("Unknown tag at %lu.", pos);
334 return false;
335 }
336
337 if ( !start )
338 tags.pop();
339 }
340 break;
341
342 case '>':
343 wxLogDebug("'>' should be escaped as \"&gt\"; at %lu.",
344 it - text.begin());
345 break;
346
347 case '&':
348 // Processing is somewhat complicated: we need to recognize at
349 // least the "&lt;" entity to allow escaping left square
350 // brackets in the markup and, in fact, we recognize all of the
351 // standard XML entities for consistency with Pango markup
352 // parsing.
353 //
354 // However we also allow '&' to appear unescaped, i.e. directly
355 // and not as "&amp;" when it is used to introduce the mnemonic
356 // for the label. In this case we simply leave it alone.
357 //
358 // Notice that this logic makes it impossible to have a label
359 // with "lt;" inside it and using "l" as mnemonic but hopefully
360 // this shouldn't be a problem in practice.
361 {
362 const size_t pos = it - text.begin() + 1;
363
364 unsigned n;
365 for ( n = 0; n < WXSIZEOF(xmlEntities); n++ )
366 {
367 const XMLEntity& xmlEnt = xmlEntities[n];
368 if ( text.compare(pos, xmlEnt.len, xmlEnt.name) == 0
369 && text[pos + xmlEnt.len] == ';' )
370 {
371 // Escape the ampersands if needed to protect them
372 // from being interpreted as mnemonics indicators.
373 if ( xmlEnt.value == '&' )
374 current += "&&";
375 else
376 current += xmlEnt.value;
377
378 it += xmlEnt.len + 1; // +1 for '&' itself
379
380 break;
381 }
382 }
383
384 if ( n < WXSIZEOF(xmlEntities) )
385 break;
386 //else: fall through, '&' is not special
387 }
388
389 default:
390 current += *it;
391 }
392 }
393
394 if ( !tags.empty() )
395 {
396 wxLogDebug("Missing closing tag for \"%s\"", tags.top().name);
397 return false;
398 }
399
400 if ( !current.empty() )
401 m_output.OnText(current);
402
403 return true;
404 }
405
406 /* static */
407 wxString wxMarkupParser::Quote(const wxString& text)
408 {
409 wxString quoted;
410 quoted.reserve(text.length());
411
412 for ( wxString::const_iterator it = text.begin(); it != text.end(); ++it )
413 {
414 unsigned n;
415 for ( n = 0; n < WXSIZEOF(xmlEntities); n++ )
416 {
417 const XMLEntity& xmlEnt = xmlEntities[n];
418 if ( *it == xmlEnt.value )
419 {
420 quoted << '&' << xmlEnt.name << ';';
421 break;
422 }
423 }
424
425 if ( n == WXSIZEOF(xmlEntities) )
426 quoted += *it;
427 }
428
429 return quoted;
430 }
431
432 /* static */
433 wxString wxMarkupParser::Strip(const wxString& text)
434 {
435 class StripOutput : public wxMarkupParserOutput
436 {
437 public:
438 StripOutput() { }
439
440 const wxString& GetText() const { return m_text; }
441
442 virtual void OnText(const wxString& text) { m_text += text; }
443
444 virtual void OnBoldStart() { }
445 virtual void OnBoldEnd() { }
446
447 virtual void OnItalicStart() { }
448 virtual void OnItalicEnd() { }
449
450 virtual void OnUnderlinedStart() { }
451 virtual void OnUnderlinedEnd() { }
452
453 virtual void OnStrikethroughStart() { }
454 virtual void OnStrikethroughEnd() { }
455
456 virtual void OnBigStart() { }
457 virtual void OnBigEnd() { }
458
459 virtual void OnSmallStart() { }
460 virtual void OnSmallEnd() { }
461
462 virtual void OnTeletypeStart() { }
463 virtual void OnTeletypeEnd() { }
464
465 virtual void OnSpanStart(const wxMarkupSpanAttributes& WXUNUSED(a)) { }
466 virtual void OnSpanEnd(const wxMarkupSpanAttributes& WXUNUSED(a)) { }
467
468 private:
469 wxString m_text;
470 };
471
472 StripOutput output;
473 wxMarkupParser parser(output);
474 if ( !parser.Parse(text) )
475 return wxString();
476
477 return output.GetText();
478 }
479
480 #endif // wxUSE_MARKUP