Unicode compilation fixes
[wxWidgets.git] / src / common / regex.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/regex.cpp
3 // Purpose: regular expression matching
4 // Author: Karsten Ballüder and Vadim Zeitlin
5 // Modified by:
6 // Created: 13.07.01
7 // RCS-ID: $Id$
8 // Copyright: (c) 2000 Karsten Ballüder <ballueder@gmx.net>
9 // 2001 Vadim Zeitlin <vadim@wxwindows.org>
10 // Licence: wxWindows licence
11 ///////////////////////////////////////////////////////////////////////////////
12
13 // ============================================================================
14 // declarations
15 // ============================================================================
16
17 // ----------------------------------------------------------------------------
18 // headers
19 // ----------------------------------------------------------------------------
20
21 #ifdef __GNUG__
22 #pragma implementation "regex.h"
23 #endif
24
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
27
28 #ifdef __BORLANDC__
29 #pragma hdrstop
30 #endif
31
32 #if wxUSE_REGEX
33
34 #ifndef WX_PRECOMP
35 #include "wx/object.h"
36 #include "wx/string.h"
37 #include "wx/log.h"
38 #include "wx/intl.h"
39 #endif //WX_PRECOMP
40
41 // FreeBSD requires this, it probably doesn't hurt for others
42 #ifdef __UNIX__
43 #include <sys/types.h>
44 #endif
45
46 #include <regex.h>
47
48 #include "wx/regex.h"
49
50 // ----------------------------------------------------------------------------
51 // private classes
52 // ----------------------------------------------------------------------------
53
54 // the real implementation of wxRegEx
55 class wxRegExImpl
56 {
57 public:
58 // ctor and dtor
59 wxRegExImpl();
60 ~wxRegExImpl();
61
62 // return TRUE if Compile() had been called successfully
63 bool IsValid() const { return m_isCompiled; }
64
65 // RE operations
66 bool Compile(const wxString& expr, int flags = 0);
67 bool Matches(const wxChar *str, int flags = 0) const;
68 bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
69 int Replace(wxString *pattern, const wxString& replacement,
70 size_t maxMatches = 0) const;
71
72 private:
73 // return the string containing the error message for the given err code
74 wxString GetErrorMsg(int errorcode) const;
75
76 // free the RE if compiled
77 void Free()
78 {
79 if ( IsValid() )
80 {
81 regfree(&m_RegEx);
82
83 m_isCompiled = FALSE;
84 }
85 }
86
87 // compiled RE
88 regex_t m_RegEx;
89
90 // the subexpressions data
91 regmatch_t *m_Matches;
92 size_t m_nMatches;
93
94 // TRUE if m_RegEx is valid
95 bool m_isCompiled;
96 };
97
98 // ============================================================================
99 // implementation
100 // ============================================================================
101
102 // ----------------------------------------------------------------------------
103 // wxRegExImpl
104 // ----------------------------------------------------------------------------
105
106 wxRegExImpl::wxRegExImpl()
107 {
108 m_isCompiled = FALSE;
109 m_Matches = NULL;
110 m_nMatches = 0;
111 }
112
113 wxRegExImpl::~wxRegExImpl()
114 {
115 Free();
116
117 delete [] m_Matches;
118 }
119
120 wxString wxRegExImpl::GetErrorMsg(int errorcode) const
121 {
122 wxString msg;
123
124 // first get the string length needed
125 int len = regerror(errorcode, &m_RegEx, NULL, 0);
126 if ( len > 0 )
127 {
128 len++;
129
130 #if wxUSE_UNICODE
131 wxCharBuffer buf(len);
132
133 (void)regerror(errorcode, &m_RegEx, (char *)buf.data(), len);
134
135 msg = buf.data();
136 #else // !Unicode
137 (void)regerror(errorcode, &m_RegEx, msg.GetWriteBuf(len), len);
138
139 msg.UngetWriteBuf();
140 #endif // Unicode/!Unicode
141 }
142 else // regerror() returned 0
143 {
144 msg = _("unknown error");
145 }
146
147 return msg;
148 }
149
150 bool wxRegExImpl::Compile(const wxString& expr, int flags)
151 {
152 Free();
153
154 // translate our flags to regcomp() ones
155 wxASSERT_MSG( !(flags &
156 ~(wxRE_BASIC | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
157 _T("unrecognized flags in wxRegEx::Compile") );
158
159 int flagsRE = 0;
160 if ( !(flags & wxRE_BASIC) )
161 flagsRE |= REG_EXTENDED;
162 if ( flags & wxRE_ICASE )
163 flagsRE |= REG_ICASE;
164 if ( flags & wxRE_NOSUB )
165 flagsRE |= REG_NOSUB;
166 if ( flags & wxRE_NEWLINE )
167 flagsRE |= REG_NEWLINE;
168
169 // compile it
170 int errorcode = regcomp(&m_RegEx, expr.mb_str(), flagsRE);
171 if ( errorcode )
172 {
173 wxLogError(_("Invalid regular expression '%s': %s"),
174 expr.c_str(), GetErrorMsg(errorcode).c_str());
175
176 m_isCompiled = FALSE;
177 }
178 else // ok
179 {
180 // don't allocate the matches array now, but do it later if necessary
181 if ( flags & wxRE_NOSUB )
182 {
183 // we don't need it at all
184 m_nMatches = 0;
185 }
186 else
187 {
188 // we will alloc the array later (only if really needed) but count
189 // the number of sub-expressions in the regex right now
190
191 // there is always one for the whole expression
192 m_nMatches = 1;
193
194 // and some more for bracketed subexperessions
195 const wxChar *cptr = expr.c_str();
196 wxChar prev = _T('\0');
197 while ( *cptr != _T('\0') )
198 {
199 // is this a subexpr start, i.e. "(" for extended regex or
200 // "\(" for a basic one?
201 if ( *cptr == _T('(') &&
202 (flags & wxRE_BASIC ? prev == _T('\\')
203 : prev != _T('\\')) )
204 {
205 m_nMatches++;
206 }
207
208 prev = *cptr;
209 cptr++;
210 }
211 }
212
213 m_isCompiled = TRUE;
214 }
215
216 return IsValid();
217 }
218
219 bool wxRegExImpl::Matches(const wxChar *str, int flags) const
220 {
221 wxCHECK_MSG( IsValid(), FALSE, _T("must successfully Compile() first") );
222
223 // translate our flags to regexec() ones
224 wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
225 _T("unrecognized flags in wxRegEx::Matches") );
226
227 int flagsRE = 0;
228 if ( flags & wxRE_NOTBOL )
229 flagsRE |= REG_NOTBOL;
230 if ( flags & wxRE_NOTEOL )
231 flagsRE |= REG_NOTEOL;
232
233 // allocate matches array if needed
234 wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
235 if ( !m_Matches && m_nMatches )
236 {
237 self->m_Matches = new regmatch_t[m_nMatches];
238 }
239
240 // do match it
241 int rc = regexec(&self->m_RegEx, wxConvertWX2MB(str), m_nMatches, m_Matches, flagsRE);
242
243 switch ( rc )
244 {
245 case 0:
246 // matched successfully
247 return TRUE;
248
249 default:
250 // an error occured
251 wxLogError(_("Failed to match '%s' in regular expression: %s"),
252 str, GetErrorMsg(rc).c_str());
253 // fall through
254
255 case REG_NOMATCH:
256 // no match
257 return FALSE;
258 }
259 }
260
261 bool wxRegExImpl::GetMatch(size_t *start, size_t *len, size_t index) const
262 {
263 wxCHECK_MSG( IsValid(), FALSE, _T("must successfully Compile() first") );
264 wxCHECK_MSG( m_Matches, FALSE, _T("can't use with wxRE_NOSUB") );
265 wxCHECK_MSG( index < m_nMatches, FALSE, _T("invalid match index") );
266
267 const regmatch_t& match = m_Matches[index];
268
269 if ( start )
270 *start = match.rm_so;
271 if ( len )
272 *len = match.rm_eo - match.rm_so;
273
274 return TRUE;
275 }
276
277 int wxRegExImpl::Replace(wxString *text,
278 const wxString& replacement,
279 size_t maxMatches) const
280 {
281 wxCHECK_MSG( text, -1, _T("NULL text in wxRegEx::Replace") );
282 wxCHECK_MSG( IsValid(), -1, _T("must successfully Compile() first") );
283
284 // the replacement text
285 wxString textNew;
286
287 // attempt at optimization: don't iterate over the string if it doesn't
288 // contain back references at all
289 bool mayHaveBackrefs =
290 replacement.find_first_of(_T("\\&")) != wxString::npos;
291
292 if ( !mayHaveBackrefs )
293 {
294 textNew = replacement;
295 }
296
297 // the position where we start looking for the match
298 //
299 // NB: initial version had a nasty bug because it used a wxChar* instead of
300 // an index but the problem is that replace() in the loop invalidates
301 // all pointers into the string so we have to use indices instead
302 size_t matchStart = 0;
303
304 // number of replacement made: we won't make more than maxMatches of them
305 // (unless maxMatches is 0 which doesn't limit the number of replacements)
306 size_t countRepl = 0;
307
308 // note that "^" shouldn't match after the first call to Matches() so we
309 // use wxRE_NOTBOL to prevent it from happening
310 while ( (!maxMatches || countRepl < maxMatches) &&
311 Matches(text->c_str() + matchStart, countRepl ? wxRE_NOTBOL : 0) )
312 {
313 // the string possibly contains back references: we need to calculate
314 // the replacement text anew after each match
315 if ( mayHaveBackrefs )
316 {
317 mayHaveBackrefs = FALSE;
318 textNew.clear();
319 textNew.reserve(replacement.length());
320
321 for ( const wxChar *p = replacement.c_str(); *p; p++ )
322 {
323 size_t index = (size_t)-1;
324
325 if ( *p == _T('\\') )
326 {
327 if ( wxIsdigit(*++p) )
328 {
329 // back reference
330 wxChar *end;
331 index = (size_t)wxStrtoul(p, &end, 10);
332 p = end - 1; // -1 to compensate for p++ in the loop
333 }
334 //else: backslash used as escape character
335 }
336 else if ( *p == _T('&') )
337 {
338 // treat this as "\0" for compatbility with ed and such
339 index = 0;
340 }
341
342 // do we have a back reference?
343 if ( index != (size_t)-1 )
344 {
345 // yes, get its text
346 size_t start, len;
347 if ( !GetMatch(&start, &len, index) )
348 {
349 wxFAIL_MSG( _T("invalid back reference") );
350
351 // just eat it...
352 }
353 else
354 {
355 textNew += wxString(text->c_str() + matchStart + start,
356 len);
357
358 mayHaveBackrefs = TRUE;
359 }
360 }
361 else // ordinary character
362 {
363 textNew += *p;
364 }
365 }
366 }
367
368 size_t start, len;
369 if ( !GetMatch(&start, &len) )
370 {
371 // we did have match as Matches() returned true above!
372 wxFAIL_MSG( _T("internal logic error in wxRegEx::Replace") );
373
374 return -1;
375 }
376
377 matchStart += start;
378 text->replace(matchStart, len, textNew);
379
380 countRepl++;
381
382 matchStart += textNew.length();
383 }
384
385 return countRepl;
386 }
387
388 // ----------------------------------------------------------------------------
389 // wxRegEx: all methods are mostly forwarded to wxRegExImpl
390 // ----------------------------------------------------------------------------
391
392 void wxRegEx::Init()
393 {
394 m_impl = NULL;
395 }
396
397
398 wxRegEx::~wxRegEx()
399 {
400 delete m_impl;
401 }
402
403 bool wxRegEx::Compile(const wxString& expr, int flags)
404 {
405 if ( !m_impl )
406 {
407 m_impl = new wxRegExImpl;
408 }
409
410 if ( !m_impl->Compile(expr, flags) )
411 {
412 // error message already given in wxRegExImpl::Compile
413 delete m_impl;
414 m_impl = NULL;
415
416 return FALSE;
417 }
418
419 return TRUE;
420 }
421
422 bool wxRegEx::Matches(const wxChar *str, int flags) const
423 {
424 wxCHECK_MSG( IsValid(), FALSE, _T("must successfully Compile() first") );
425
426 return m_impl->Matches(str, flags);
427 }
428
429 bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
430 {
431 wxCHECK_MSG( IsValid(), FALSE, _T("must successfully Compile() first") );
432
433 return m_impl->GetMatch(start, len, index);
434 }
435
436 wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
437 {
438 size_t start, len;
439 if ( !GetMatch(&start, &len, index) )
440 return wxEmptyString;
441
442 return text.Mid(start, len);
443 }
444
445 int wxRegEx::Replace(wxString *pattern,
446 const wxString& replacement,
447 size_t maxMatches) const
448 {
449 wxCHECK_MSG( IsValid(), -1, _T("must successfully Compile() first") );
450
451 return m_impl->Replace(pattern, replacement, maxMatches);
452 }
453
454 #endif // wxUSE_REGEX