Add a class for parsing simple markup.
[wxWidgets.git] / src / common / markupparser.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/markupparser.cpp
3 // Purpose: Implementation of wxMarkupParser.
4 // Author: Vadim Zeitlin
5 // Created: 2011-02-16
6 // RCS-ID: $Id: $
7 // Copyright: (c) 2011 Vadim Zeitlin <vadim@wxwidgets.org>
8 // Licence: wxWindows licence
9 ///////////////////////////////////////////////////////////////////////////////
10
11 // ============================================================================
12 // declarations
13 // ============================================================================
14
15 // ----------------------------------------------------------------------------
16 // headers
17 // ----------------------------------------------------------------------------
18
19 // for compilers that support precompilation, includes "wx.h".
20 #include "wx/wxprec.h"
21
22 #ifdef __BORLANDC__
23 #pragma hdrstop
24 #endif
25
26 #ifndef WX_PRECOMP
27 #endif // WX_PRECOMP
28
29 #include "wx/private/markupparser.h"
30
31 #include "wx/stack.h"
32
33 namespace
34 {
35
36 // ----------------------------------------------------------------------------
37 // constants
38 // ----------------------------------------------------------------------------
39
40 // Array containing the predefined XML 1.0 entities.
// Description of a single predefined entity: "&name;" in the markup stands
// for the character value.
struct XMLEntity
{
    const char *name;   // entity name, without the leading '&' and trailing ';'
    int len;            // == strlen(name)
    char value;         // the character the entity stands for
};

// The table itself, one row per entity.
const XMLEntity xmlEntities[] =
{
    { "lt",   2, '<'  },
    { "gt",   2, '>'  },
    { "amp",  3, '&'  },
    { "apos", 4, '\'' },
    { "quot", 4, '"'  },
};
54
55 // ----------------------------------------------------------------------------
56 // helper functions
57 // ----------------------------------------------------------------------------
58
59 wxString
60 ExtractUntil(char ch, wxString::const_iterator& it, wxString::const_iterator end)
61 {
62 wxString str;
63 for ( ; it != end; ++it )
64 {
65 if ( *it == ch )
66 return str;
67
68 str += *it;
69 }
70
71 // Return empty string to indicate that we didn't find ch at all.
72 return wxString();
73 }
74
75 } // anonymous namespace
76
77 // ============================================================================
78 // wxMarkupParser implementation
79 // ============================================================================
80
81 wxString
82 wxMarkupParser::ParseAttrs(wxString attrs, TagAndAttrs& tagAndAttrs)
83 {
84 if ( tagAndAttrs.name.CmpNoCase("span") != 0 && !attrs.empty() )
85 {
86 return wxString::Format("tag \"%s\" can't have attributes",
87 tagAndAttrs.name);
88 }
89
90 // TODO: Parse more attributes described at
91 // http://library.gnome.org/devel/pango/stable/PangoMarkupFormat.html
92 // and at least ignore them gracefully instead of giving errors (but
93 // quite a few of them could be supported as well, notable font_desc).
94
95 wxMarkupSpanAttributes& spanAttrs = tagAndAttrs.attrs;
96
97 while ( !attrs.empty() )
98 {
99 wxString rest;
100 const wxString attr = attrs.BeforeFirst(' ', &rest);
101 attrs = rest;
102
103 // The "original" versions are used for error messages only.
104 wxString valueOrig;
105 const wxString nameOrig = attr.BeforeFirst('=', &valueOrig);
106
107 const wxString name = nameOrig.Lower();
108 wxString value = valueOrig.Lower();
109
110 // All attributes values must be quoted.
111 if ( value.length() < 2 ||
112 (value[0] != value.Last()) ||
113 (value[0] != '"' && value[0] != '\'') )
114 {
115 return wxString::Format("bad quoting for value of \"%s\"",
116 nameOrig);
117 }
118
119 value.assign(value, 1, value.length() - 2);
120
121 if ( name == "foreground" || name == "fgcolor" || name == "color" )
122 {
123 spanAttrs.m_fgCol = value;
124 }
125 else if ( name == "background" || name == "bgcolor" )
126 {
127 spanAttrs.m_bgCol = value;
128 }
129 else if ( name == "font_family" || name == "face" )
130 {
131 spanAttrs.m_fontFace = value;
132 }
133 else if ( name == "font_weight" || name == "weight" )
134 {
135 unsigned long weight;
136
137 if ( value == "ultralight" || value == "light" || value == "normal" )
138 spanAttrs.m_isBold = wxMarkupSpanAttributes::No;
139 else if ( value == "bold" || value == "ultrabold" || value == "heavy" )
140 spanAttrs.m_isBold = wxMarkupSpanAttributes::Yes;
141 else if ( value.ToULong(&weight) )
142 spanAttrs.m_isBold = weight >= 600 ? wxMarkupSpanAttributes::Yes
143 : wxMarkupSpanAttributes::No;
144 else
145 return wxString::Format("invalid font weight \"%s\"", valueOrig);
146 }
147 else if ( name == "font_style" || name == "style" )
148 {
149 if ( value == "normal" )
150 spanAttrs.m_isItalic = wxMarkupSpanAttributes::No;
151 else if ( value == "oblique" || value == "italic" )
152 spanAttrs.m_isItalic = wxMarkupSpanAttributes::Yes;
153 else
154 return wxString::Format("invalid font style \"%s\"", valueOrig);
155 }
156 else if ( name == "size" )
157 {
158 unsigned long size;
159 if ( value.ToULong(&size) )
160 {
161 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_PointParts;
162 spanAttrs.m_fontSize = size;
163 }
164 else if ( value == "smaller" || value == "larger" )
165 {
166 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_Relative;
167 spanAttrs.m_fontSize = value == "smaller" ? -1 : +1;
168 }
169 else // Must be a CSS-like size specification
170 {
171 int cssSize = 1;
172 wxString rest;
173 if ( value.StartsWith("xx-", &rest) )
174 cssSize = 3;
175 else if ( value.StartsWith("x-", &rest) )
176 cssSize = 2;
177 else if ( value == "medium" )
178 cssSize = 0;
179 else
180 rest = value;
181
182 if ( cssSize != 0 )
183 {
184 if ( rest == "small" )
185 cssSize = -cssSize;
186 else if ( rest != "large" )
187 return wxString::Format("invalid font size \"%s\"",
188 valueOrig);
189 }
190
191 spanAttrs.m_sizeKind = wxMarkupSpanAttributes::Size_Symbolic;
192 spanAttrs.m_fontSize = cssSize;
193 }
194 }
195 }
196
197 return wxString();
198 }
199
200 bool wxMarkupParser::OutputTag(const TagAndAttrs& tagAndAttrs, bool start)
201 {
202 if ( tagAndAttrs.name.CmpNoCase("span") == 0 )
203 {
204 if ( start )
205 m_output.OnSpanStart(tagAndAttrs.attrs);
206 else
207 m_output.OnSpanEnd(tagAndAttrs.attrs);
208
209 return true;
210 }
211 else // non-span tag
212 {
213 static const struct TagHandler
214 {
215 const char *name;
216 void (wxMarkupParserOutput::*startFunc)();
217 void (wxMarkupParserOutput::*endFunc)();
218 } tagHandlers[] =
219 {
220 { "b", &wxMarkupParserOutput::OnBoldStart,
221 &wxMarkupParserOutput::OnBoldEnd },
222 { "i", &wxMarkupParserOutput::OnItalicStart,
223 &wxMarkupParserOutput::OnItalicEnd },
224 { "u", &wxMarkupParserOutput::OnUnderlinedStart,
225 &wxMarkupParserOutput::OnUnderlinedEnd },
226 { "s", &wxMarkupParserOutput::OnStrikethroughStart,
227 &wxMarkupParserOutput::OnStrikethroughEnd },
228 { "big", &wxMarkupParserOutput::OnBigStart,
229 &wxMarkupParserOutput::OnBigEnd },
230 { "small", &wxMarkupParserOutput::OnSmallStart,
231 &wxMarkupParserOutput::OnSmallEnd },
232 { "tt", &wxMarkupParserOutput::OnTeletypeStart,
233 &wxMarkupParserOutput::OnTeletypeEnd },
234 };
235
236 for ( unsigned n = 0; n < WXSIZEOF(tagHandlers); n++ )
237 {
238 const TagHandler& h = tagHandlers[n];
239
240 if ( tagAndAttrs.name.CmpNoCase(h.name) == 0 )
241 {
242 if ( start )
243 (m_output.*(h.startFunc))();
244 else
245 (m_output.*(h.endFunc))();
246
247 return true;
248 }
249 }
250 }
251
252 // Unknown tag name.
253 return false;
254 }
255
256 bool wxMarkupParser::Parse(const wxString& text)
257 {
258 // The stack containing the names and corresponding attributes (which are
259 // actually only used for <span> tags) of all of the currently opened tag
260 // or none if we're not inside any tag.
261 wxStack<TagAndAttrs> tags;
262
263 // Current run of text.
264 wxString current;
265
266 const wxString::const_iterator end = text.end();
267 for ( wxString::const_iterator it = text.begin(); it != end; ++it )
268 {
269 switch ( (*it).GetValue() )
270 {
271 case '<':
272 {
273 // Flush the text preceding the tag, if any.
274 if ( !current.empty() )
275 {
276 m_output.OnText(current);
277 current.clear();
278 }
279
280 // Remember the tag starting position for the error
281 // messages.
282 const size_t pos = it - text.begin();
283
284 bool start = true;
285 if ( ++it != end && *it == '/' )
286 {
287 start = false;
288 ++it;
289 }
290
291 const wxString tag = ExtractUntil('>', it, end);
292 if ( tag.empty() )
293 {
294 wxLogDebug("%s at %lu.",
295 it == end ? "Unclosed tag starting"
296 : "Empty tag",
297 pos);
298 return false;
299 }
300
301 if ( start )
302 {
303 wxString attrs;
304 const wxString name = tag.BeforeFirst(' ', &attrs);
305
306 TagAndAttrs tagAndAttrs(name);
307 const wxString err = ParseAttrs(attrs, tagAndAttrs);
308 if ( !err.empty() )
309 {
310 wxLogDebug("Bad attributes for \"%s\" "
311 "at %lu: %s.",
312 name, pos, err);
313 return false;
314 }
315
316 tags.push(tagAndAttrs);
317 }
318 else // end tag
319 {
320 if ( tags.empty() || tags.top().name != tag )
321 {
322 wxLogDebug("Unmatched closing tag \"%s\" at %lu.",
323 tag, pos);
324 return false;
325 }
326 }
327
328 if ( !OutputTag(tags.top(), start) )
329 {
330 wxLogDebug("Unknown tag at %lu.", pos);
331 return false;
332 }
333
334 if ( !start )
335 tags.pop();
336 }
337 break;
338
339 case '>':
340 wxLogDebug("'>' should be escaped as \"&gt\"; at %lu.",
341 it - text.begin());
342 break;
343
344 case '&':
345 // Processing is somewhat complicated: we need to recognize at
346 // least the "&lt;" entity to allow escaping left square
347 // brackets in the markup and, in fact, we recognize all of the
348 // standard XML entities for consistency with Pango markup
349 // parsing.
350 //
351 // However we also allow '&' to appear unescaped, i.e. directly
352 // and not as "&amp;" when it is used to introduce the mnemonic
353 // for the label. In this case we simply leave it alone.
354 //
355 // Notice that this logic makes it impossible to have a label
356 // with "lt;" inside it and using "l" as mnemonic but hopefully
357 // this shouldn't be a problem in practice.
358 {
359 const size_t pos = it - text.begin() + 1;
360
361 unsigned n;
362 for ( n = 0; n < WXSIZEOF(xmlEntities); n++ )
363 {
364 const XMLEntity& xmlEnt = xmlEntities[n];
365 if ( text.compare(pos, xmlEnt.len, xmlEnt.name) == 0
366 && text[pos + xmlEnt.len] == ';' )
367 {
368 // Escape the ampersands if needed to protect them
369 // from being interpreted as mnemonics indicators.
370 if ( xmlEnt.value == '&' )
371 current += "&&";
372 else
373 current += xmlEnt.value;
374
375 it += xmlEnt.len + 1; // +1 for '&' itself
376
377 break;
378 }
379 }
380
381 if ( n < WXSIZEOF(xmlEntities) )
382 break;
383 //else: fall through, '&' is not special
384 }
385
386 default:
387 current += *it;
388 }
389 }
390
391 if ( !tags.empty() )
392 {
393 wxLogDebug("Missing closing tag for \"%s\"", tags.top().name);
394 return false;
395 }
396
397 if ( !current.empty() )
398 m_output.OnText(current);
399
400 return true;
401 }
402
403 /* static */
404 wxString wxMarkupParser::Quote(const wxString& text)
405 {
406 wxString quoted;
407 quoted.reserve(text.length());
408
409 for ( wxString::const_iterator it = text.begin(); it != text.end(); ++it )
410 {
411 unsigned n;
412 for ( n = 0; n < WXSIZEOF(xmlEntities); n++ )
413 {
414 const XMLEntity& xmlEnt = xmlEntities[n];
415 if ( *it == xmlEnt.value )
416 {
417 quoted << '&' << xmlEnt.name << ';';
418 break;
419 }
420 }
421
422 if ( n == WXSIZEOF(xmlEntities) )
423 quoted += *it;
424 }
425
426 return quoted;
427 }