]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2002-2003, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | * file name: regex.h | |
7 | * encoding: US-ASCII | |
8 | * indentation:4 | |
9 | * | |
10 | * created on: 2002oct22 | |
11 | * created by: Andy Heninger | |
12 | * | |
13 | * ICU Regular Expressions, API for C++ | |
14 | */ | |
15 | ||
16 | #ifndef REGEX_H | |
17 | #define REGEX_H | |
18 | ||
19 | ||
20 | /** | |
21 | * \file | |
22 | * \brief C++ API: Regular Expressions | |
23 | * | |
24 | * <h2>Regular Expression API</h2> | |
25 | * | |
26 | * <p>The ICU API for processing regular expressions consists of two classes, | |
27 | * <code>RegexPattern</code> and <code>RegexMatcher</code>. | |
28 | * <code>RegexPattern</code> objects represent a pre-processed, or compiled | |
29 | * regular expression. They are created from a regular expression pattern string, | |
30 | * and can be used to create <code>RegexMatcher</code> objects for the pattern.</p> | |
31 | * | |
32 | * <p>Class <code>RegexMatcher</code> bundles together a regular expression | |
33 | * pattern and a target string to which the search pattern will be applied. | |
34 | * <code>RegexMatcher</code> includes API for doing plain find or search | |
35 | * operations, for search and replace operations, and for obtaining detailed | |
36 | * information about bounds of a match. </p> | |
37 | */ | |
38 | ||
39 | #include "unicode/utypes.h" | |
40 | ||
41 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
42 | ||
43 | #include "unicode/uobject.h" | |
44 | #include "unicode/unistr.h" | |
45 | #include "unicode/parseerr.h" | |
46 | ||
47 | U_NAMESPACE_BEGIN | |
48 | ||
49 | ||
50 | // Forward Declarations... | |
51 | ||
52 | class RegexMatcher; | |
53 | class UVector; | |
54 | class UVector32; | |
55 | class UnicodeSet; | |
56 | struct REStackFrame; | |
57 | struct Regex8BitSet; | |
58 | ||
59 | ||
60 | /** | |
61 | * Flag constants selecting regular expression match modes; each is a distinct bit. | |
62 | * @draft ICU 2.4 | |
63 | */ | |
64 | enum { | |
65 | /** Case insensitive matching is enabled. @draft ICU 2.4 */ | |
66 | UREGEX_CASE_INSENSITIVE = 0x02, | |
67 | ||
68 | /** White space and comments are permitted within patterns. @draft ICU 2.4 */ | |
69 | UREGEX_COMMENTS = 0x04, | |
70 | ||
71 | /** Governs the behavior of "$" and "^". | |
72 | * When set, line terminators within the string are recognized; | |
73 | * when clear, they match only at the start and end of the input string. | |
74 | * @draft ICU 2.4 */ | |
75 | UREGEX_MULTILINE = 0x08, | |
76 | ||
77 | /** When set, '.' matches line terminators; when clear, '.' matching stops at line end. | |
78 | * @draft ICU 2.4 */ | |
79 | UREGEX_DOTALL = 0x20, | |
80 | ||
81 | /** Forces normalization of pattern and strings. @draft ICU 2.4 */ | |
82 | UREGEX_CANON_EQ = 0x80 | |
83 | }; | |
84 | ||
85 | ||
86 | ||
87 | /** | |
88 | * Class <code>RegexPattern</code> represents a compiled regular expression. It includes | |
89 | * factory methods for creating a RegexPattern object from the source (string) form | |
90 | * of a regular expression, methods for creating RegexMatchers that allow the pattern | |
91 | * to be applied to input text, and a few convenience methods for simple common | |
92 | * uses of regular expressions. | |
93 | * | |
94 | * <p>Class RegexPattern is not intended to be subclassed.</p> | |
95 | * | |
96 | * @draft ICU 2.4 | |
97 | */ | |
98 | class U_I18N_API RegexPattern: public UObject { | |
99 | public: | |
100 | ||
101 | /** | |
102 | * Default constructor. Create a RegexPattern object that refers to no actual | |
103 | * pattern. Not normally needed; RegexPattern objects are usually | |
104 | * created using the factory method <code>compile()</code>. | |
105 | * | |
106 | * @draft ICU 2.4 | |
107 | */ | |
108 | RegexPattern(); | |
109 | ||
110 | /** | |
111 | * Copy Constructor. Create a new RegexPattern object that is equivalent | |
112 | * to the source object. | |
113 | * @draft ICU 2.4 | |
114 | */ | |
115 | RegexPattern(const RegexPattern &source); | |
116 | ||
117 | /** | |
118 | * Destructor. Note that a RegexPattern object must persist so long as any | |
119 | * RegexMatcher objects that were created from the RegexPattern are active. | |
120 | * @draft ICU 2.4 | |
121 | */ | |
122 | virtual ~RegexPattern(); | |
123 | ||
124 | /** | |
125 | * Comparison operator. Two RegexPattern objects are considered equal if they | |
126 | * were constructed from identical source patterns using the same match flag | |
127 | * settings. | |
128 | * @param that a RegexPattern object to compare with "this". | |
129 | * @return TRUE if the objects are equivalent. | |
130 | * @draft ICU 2.4 | |
131 | */ | |
132 | UBool operator==(const RegexPattern& that) const; | |
133 | ||
134 | /** | |
135 | * Inequality operator; the logical negation of operator==. Two RegexPattern | |
136 | * objects are considered equal if they were constructed from identical source | |
137 | * patterns using the same match flag settings. | |
138 | * @param that a RegexPattern object to compare with "this". | |
139 | * @return TRUE if the objects are different. | |
140 | * @draft ICU 2.4 | |
141 | */ | |
142 | inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} | |
143 | ||
144 | /** | |
145 | * Assignment operator. After assignment, this RegexPattern will behave identically | |
146 | * to the source object. | |
147 | * @draft ICU 2.4 | |
148 | */ | |
149 | RegexPattern &operator =(const RegexPattern &source); | |
150 | ||
151 | /** | |
152 | * Create an exact copy of this RegexPattern object. Since RegexPattern is not | |
153 | * intended to be subclassed, <code>clone()</code> and the copy constructor are | |
154 | * equivalent operations. | |
155 | * @return the copy of this RegexPattern | |
156 | * @draft ICU 2.4 | |
157 | */ | |
158 | virtual RegexPattern *clone() const; | |
159 | ||
160 | ||
161 | /** | |
162 | * Compiles the regular expression in string form into a RegexPattern | |
163 | * object. These compile methods, rather than the constructors, are the usual | |
164 | * way that RegexPattern objects are created. | |
165 | * | |
166 | * <p>Note that RegexPattern objects must not be deleted while RegexMatcher | |
167 | * objects created from the pattern are active. RegexMatchers keep a pointer | |
168 | * back to their pattern, so premature deletion of the pattern is a | |
169 | * catastrophic error.</p> | |
170 | * | |
171 | * <p>All pattern match mode flags are set to their default values.</p> | |
172 | * | |
173 | * @param regex The regular expression to be compiled. | |
174 | * @param pe Receives the position (line and column numbers) of any error | |
175 | * within the regular expression. | |
176 | * @param status A reference to a UErrorCode to receive any errors. | |
177 | * @return A RegexPattern object for the compiled pattern. | |
178 | * | |
179 | * @draft ICU 2.4 | |
180 | */ | |
181 | static RegexPattern *compile( const UnicodeString &regex, | |
182 | UParseError &pe, | |
183 | UErrorCode &status); | |
184 | ||
185 | /** | |
186 | * Compiles the regular expression in string form into a RegexPattern | |
187 | * object using the specified match mode flags. These compile methods, | |
188 | * rather than the constructors, are the usual way that RegexPattern objects | |
189 | * are created. | |
190 | * | |
191 | * <p>Note that RegexPattern objects must not be deleted while RegexMatcher | |
192 | * objects created from the pattern are active. RegexMatchers keep a pointer | |
193 | * back to their pattern, so premature deletion of the pattern is a | |
194 | * catastrophic error.</p> | |
195 | * | |
196 | * @param regex The regular expression to be compiled. | |
197 | * @param flags The match mode flags to be used. | |
198 | * @param pe Receives the position (line and column numbers) of any error | |
199 | * within the regular expression. | |
200 | * @param status A reference to a UErrorCode to receive any errors. | |
201 | * @return A RegexPattern object for the compiled pattern. | |
202 | * | |
203 | * @draft ICU 2.4 | |
204 | */ | |
205 | static RegexPattern *compile( const UnicodeString &regex, | |
206 | uint32_t flags, | |
207 | UParseError &pe, | |
208 | UErrorCode &status); | |
209 | ||
210 | ||
211 | /** | |
212 | * Compiles the regular expression in string form into a RegexPattern | |
213 | * object using the specified match mode flags. These compile methods, | |
214 | * rather than the constructors, are the usual way that RegexPattern objects | |
215 | * are created. | |
216 | * | |
217 | * <p>Note that RegexPattern objects must not be deleted while RegexMatcher | |
218 | * objects created from the pattern are active. RegexMatchers keep a pointer | |
219 | * back to their pattern, so premature deletion of the pattern is a | |
220 | * catastrophic error.</p> | |
221 | * | |
222 | * @param regex The regular expression to be compiled. | |
223 | * @param flags The match mode flags to be used. | |
224 | * @param status A reference to a UErrorCode to receive any errors. | |
225 | * @return A RegexPattern object for the compiled pattern. | |
226 | * | |
227 | * @draft ICU 2.6 | |
228 | */ | |
229 | static RegexPattern *compile( const UnicodeString &regex, | |
230 | uint32_t flags, | |
231 | UErrorCode &status); | |
232 | ||
233 | ||
234 | /** | |
235 | * Get the match mode flags that were used when compiling this pattern. | |
236 | * @return the match mode flags | |
237 | * @draft ICU 2.4 | |
238 | */ | |
239 | virtual uint32_t flags() const; | |
240 | ||
241 | /** | |
242 | * Creates a RegexMatcher that will match the given input against this pattern. The | |
243 | * RegexMatcher can then be used to perform match, find or replace operations | |
244 | * on the input. Note that a RegexPattern object must not be deleted while | |
245 | * RegexMatchers created from it still exist and might possibly be used again. | |
246 | * | |
247 | * @param input The input string to which the regular expression will be applied. | |
248 | * @param status A reference to a UErrorCode to receive any errors. | |
249 | * @return A RegexMatcher object for this pattern and input. | |
250 | * | |
251 | * @draft ICU 2.4 | |
252 | */ | |
253 | virtual RegexMatcher *matcher(const UnicodeString &input, | |
254 | UErrorCode &status) const; | |
255 | ||
256 | ||
257 | /** | |
258 | * Creates a RegexMatcher that will match against this pattern. The | |
259 | * RegexMatcher can be used to perform match, find or replace operations. | |
260 | * Note that a RegexPattern object must not be deleted while | |
261 | * RegexMatchers created from it still exist and might possibly be used again. | |
262 | * | |
263 | * @param status A reference to a UErrorCode to receive any errors. | |
264 | * @return A RegexMatcher object for this pattern. | |
265 | * | |
266 | * @draft ICU 2.6 | |
267 | */ | |
268 | virtual RegexMatcher *matcher(UErrorCode &status) const; | |
269 | ||
270 | ||
271 | /** | |
272 | * Test whether a string matches a regular expression. This convenience function | |
273 | * both compiles the regular expression and applies it in a single operation. | |
274 | * Note that if the same pattern needs to be applied repeatedly, this method will be | |
275 | * less efficient than creating and reusing a RegexPattern object. | |
276 | * | |
277 | * @param regex The regular expression | |
278 | * @param input The string data to be matched | |
279 | * @param pe Receives the position of any syntax errors within the regular expression | |
280 | * @param status A reference to a UErrorCode to receive any errors. | |
281 | * @return True if the regular expression exactly matches the full input string. | |
282 | * | |
283 | * @draft ICU 2.4 | |
284 | */ | |
285 | static UBool matches(const UnicodeString &regex, | |
286 | const UnicodeString &input, | |
287 | UParseError &pe, | |
288 | UErrorCode &status); | |
289 | ||
290 | ||
291 | /** | |
292 | * Returns the regular expression from which this pattern was compiled. | |
293 | * @draft ICU 2.4 | |
294 | */ | |
295 | virtual UnicodeString pattern() const; | |
296 | ||
297 | ||
298 | /** | |
299 | * Split a string into fields. Somewhat like split() from Perl. | |
300 | * The pattern matches identify delimiters that separate the input | |
301 | * into fields. The input data between the matches becomes the | |
302 | * fields themselves. | |
303 | * <p> | |
304 | * For the best performance on split() operations, | |
305 | * <code>RegexMatcher::split</code> is preferable to this function.</p> | |
306 | * | |
307 | * @param input The string to be split into fields. The field delimiters | |
308 | * match the pattern (in the "this" object) | |
309 | * @param dest An array of UnicodeStrings to receive the results of the split. | |
310 | * This is an array of actual UnicodeString objects, not an | |
311 | * array of pointers to strings. Local (stack based) arrays can | |
312 | * work well here. | |
313 | * @param destCapacity The number of elements in the destination array. | |
314 | * If the number of fields found is less than destCapacity, the | |
315 | * extra strings in the destination array are not altered. | |
316 | * If the number of destination strings is less than the number | |
317 | * of fields, the trailing part of the input string, including any | |
318 | * field delimiters, is placed in the last destination string. | |
319 | * @param status A reference to a UErrorCode to receive any errors. | |
320 | * @return The number of fields into which the input string was split. | |
321 | * @draft ICU 2.4 | |
322 | */ | |
323 | virtual int32_t split(const UnicodeString &input, | |
324 | UnicodeString dest[], | |
325 | int32_t destCapacity, | |
326 | UErrorCode &status) const; | |
327 | ||
328 | ||
329 | ||
330 | /** | |
331 | * dump Debug function, displays the compiled form of a pattern. | |
332 | * @internal | |
333 | */ | |
334 | void dump() const; | |
335 | ||
336 | /** | |
337 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
338 | * | |
339 | * @draft ICU 2.4 | |
340 | */ | |
341 | virtual inline UClassID getDynamicClassID() const; | |
342 | ||
343 | /** | |
344 | * ICU "poor man's RTTI", returns a UClassID for this class. | |
345 | * | |
346 | * @draft ICU 2.4 | |
347 | */ | |
348 | static inline UClassID getStaticClassID(); | |
349 | ||
350 | private: | |
351 | // | |
352 | // Implementation Data | |
353 | // | |
354 | UnicodeString fPattern; // The original pattern string. | |
355 | uint32_t fFlags; // The flags used when compiling the pattern. | |
356 | // | |
357 | UVector32 *fCompiledPat; // The compiled pattern p-code. | |
358 | UnicodeString fLiteralText; // Any literal string data from the pattern, | |
359 | // after un-escaping, for use during the match. | |
360 | ||
361 | UVector *fSets; // Any UnicodeSets referenced from the pattern. | |
362 | Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) | |
363 | ||
364 | ||
365 | UErrorCode fDeferredStatus; // status if some prior error has left this | |
366 | // RegexPattern in an unusable state. | |
367 | ||
368 | int32_t fMinMatchLen; // Minimum Match Length. All matches will have length | |
369 | // >= this value. For some patterns, this calculated | |
370 | // value may be less than the true shortest | |
371 | // possible match. | |
372 | ||
373 | int32_t fFrameSize; // Size of a state stack frame in the | |
374 | // execution engine. | |
375 | ||
376 | int32_t fDataSize; // The size of the data needed by the pattern that | |
377 | // does not go on the state stack, but has just | |
378 | // a single copy per matcher. | |
379 | ||
380 | UVector32 *fGroupMap; // Map from capture group number to position of | |
381 | // the group's variables in the matcher stack frame. | |
382 | ||
383 | int32_t fMaxCaptureDigits; | |
384 | ||
385 | UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined | |
386 | // regex character classes, e.g. Word. | |
387 | ||
388 | Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only | |
389 | // sets for predefined regex classes. | |
390 | ||
391 | int32_t fStartType; // Info on how a match must start. | |
392 | int32_t fInitialStringIdx; // | |
393 | int32_t fInitialStringLen; | |
394 | UnicodeSet *fInitialChars; | |
395 | UChar32 fInitialChar; | |
396 | Regex8BitSet *fInitialChars8; | |
397 | ||
398 | /** | |
399 | * The address of this static class variable serves as this class's ID | |
400 | * for ICU "poor man's RTTI". | |
401 | */ | |
402 | static const char fgClassID; | |
403 | ||
404 | friend class RegexCompile; | |
405 | friend class RegexMatcher; | |
406 | ||
407 | // | |
408 | // Implementation Methods | |
409 | // | |
410 | void init(); // Common initialization, for use by constructors. | |
411 | void zap(); // Common cleanup | |
412 | void dumpOp(int32_t index) const; | |
413 | ||
414 | ||
415 | }; | |
416 | ||
417 | ||
418 | ||
419 | ||
420 | ||
421 | ||
422 | ||
423 | ||
424 | ||
425 | /** | |
426 | * Class RegexMatcher bundles together a regular expression pattern and | |
427 | * input text to which the expression can be applied. It includes methods | |
428 | * for testing for matches, and for find and replace operations. | |
429 | * | |
430 | * <p>Class RegexMatcher is not intended to be subclassed.</p> | |
431 | * | |
432 | * @draft ICU 2.4 | |
433 | */ | |
434 | class U_I18N_API RegexMatcher: public UObject { | |
435 | public: | |
436 | ||
437 | /** | |
438 | * Construct a RegexMatcher for a regular expression. | |
439 | * This is a convenience constructor that avoids the need to explicitly create | |
440 | * a RegexPattern object. Note that if several RegexMatchers need to be | |
441 | * created for the same expression, it will be more efficient to | |
442 | * separately create and cache a RegexPattern object, and use | |
443 | * its matcher() method to create the RegexMatcher objects. | |
444 | * | |
445 | * @param regexp The Regular Expression to be compiled. | |
446 | * @param flags Regular expression options, such as case insensitive matching. | |
447 | * @see UREGEX_CASE_INSENSITIVE | |
448 | * @param status Any errors are reported by setting this UErrorCode variable. | |
449 | * @draft ICU 2.6 | |
450 | */ | |
451 | RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status); | |
452 | ||
453 | /** | |
454 | * Construct a RegexMatcher for a regular expression and an input string. | |
455 | * This is a convenience constructor that avoids the need to explicitly create | |
456 | * a RegexPattern object. Note that if several RegexMatchers need to be | |
457 | * created for the same expression, it will be more efficient to | |
458 | * separately create and cache a RegexPattern object, and use | |
459 | * its matcher() method to create the RegexMatcher objects. | |
460 | * | |
461 | * @param regexp The Regular Expression to be compiled. | |
462 | * @param input The string to match | |
463 | * @param flags Regular expression options, such as case insensitive matching. | |
464 | * @see UREGEX_CASE_INSENSITIVE | |
465 | * @param status Any errors are reported by setting this UErrorCode variable. | |
466 | * @draft ICU 2.6 | |
467 | */ | |
468 | RegexMatcher(const UnicodeString &regexp, const UnicodeString &input, | |
469 | uint32_t flags, UErrorCode &status); | |
470 | ||
471 | ||
472 | /** | |
473 | * Destructor. | |
474 | * | |
475 | * @draft ICU 2.4 | |
476 | */ | |
477 | virtual ~RegexMatcher(); | |
478 | ||
479 | ||
480 | /** | |
481 | * Attempts to match the entire input string against the pattern. | |
482 | * @param status A reference to a UErrorCode to receive any errors. | |
483 | * @return TRUE if there is a match | |
484 | * @draft ICU 2.4 | |
485 | */ | |
486 | virtual UBool matches(UErrorCode &status); | |
487 | ||
488 | ||
489 | ||
490 | /** | |
491 | * Attempts to match the input string, starting from the beginning, against the pattern. | |
492 | * Like the matches() method, this function always starts at the beginning of the input string; | |
493 | * unlike that function, it does not require that the entire input string be matched. | |
494 | * | |
495 | * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, | |
496 | * <code>end()</code>, and <code>group()</code> functions.</p> | |
497 | * | |
498 | * @param status A reference to a UErrorCode to receive any errors. | |
499 | * @return TRUE if there is a match at the start of the input string. | |
500 | * @draft ICU 2.4 | |
501 | */ | |
502 | virtual UBool lookingAt(UErrorCode &status); | |
503 | ||
504 | ||
505 | /** | |
506 | * Find the next pattern match in the input string. | |
507 | * The find begins searching the input at the location following the end of | |
508 | * the previous match, or at the start of the string if there is no previous match. | |
509 | * If a match is found, <code>start(), end()</code> and <code>group()</code> | |
510 | * will provide more information regarding the match. | |
511 | * <p>Note that if the input string is changed by the application, | |
512 | * use find(startPos, status) instead of find(), because the saved starting | |
513 | * position may not be valid with the altered input string.</p> | |
514 | * @return TRUE if a match is found. | |
515 | * @draft ICU 2.4 | |
516 | */ | |
517 | virtual UBool find(); | |
518 | ||
519 | ||
520 | /** | |
521 | * Resets this RegexMatcher and then attempts to find the next substring of the | |
522 | * input string that matches the pattern, starting at the specified index. | |
523 | * | |
524 | * @param start the position in the input string to begin the search | |
525 | * @param status A reference to a UErrorCode to receive any errors. | |
526 | * @return TRUE if a match is found. | |
527 | * @draft ICU 2.4 | |
528 | */ | |
529 | virtual UBool find(int32_t start, UErrorCode &status); | |
530 | ||
531 | ||
532 | /** | |
533 | * Returns a string containing the text matched by the previous match. | |
534 | * If the pattern can match an empty string, an empty string may be returned. | |
535 | * @param status A reference to a UErrorCode to receive any errors. | |
536 | * Possible errors are U_REGEX_INVALID_STATE if no match | |
537 | * has been attempted or the last match failed. | |
538 | * @return a string containing the matched input text. | |
539 | * @draft ICU 2.4 | |
540 | */ | |
541 | virtual UnicodeString group(UErrorCode &status) const; | |
542 | ||
543 | ||
544 | /** | |
545 | * Returns a string containing the text captured by the given group | |
546 | * during the previous match operation. Group(0) is the entire match. | |
547 | * | |
548 | * @param groupNum the capture group number | |
549 | * @param status A reference to a UErrorCode to receive any errors. | |
550 | * Possible errors are U_REGEX_INVALID_STATE if no match | |
551 | * has been attempted or the last match failed and | |
552 | * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. | |
553 | * @return the captured text | |
554 | * @draft ICU 2.4 | |
555 | */ | |
556 | virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; | |
557 | ||
558 | ||
559 | /** | |
560 | * Returns the number of capturing groups in this matcher's pattern. | |
561 | * @return the number of capture groups | |
562 | * @draft ICU 2.4 | |
563 | */ | |
564 | virtual int32_t groupCount() const; | |
565 | ||
566 | ||
567 | /** | |
568 | * Returns the index in the input string of the start of the text matched | |
569 | * during the previous match operation. | |
570 | * @param status a reference to a UErrorCode to receive any errors. | |
571 | * @return The position in the input string of the start of the last match. | |
572 | * @draft ICU 2.4 | |
573 | */ | |
574 | virtual int32_t start(UErrorCode &status) const; | |
575 | ||
576 | ||
577 | /** | |
578 | * Returns the index in the input string of the start of the text matched by the | |
579 | * specified capture group during the previous match operation. Return -1 if | |
580 | * the capture group exists in the pattern, but was not part of the last match. | |
581 | * | |
582 | * @param group the capture group number | |
583 | * @param status A reference to a UErrorCode to receive any errors. Possible | |
584 | * errors are U_REGEX_INVALID_STATE if no match has been | |
585 | * attempted or the last match failed, and | |
586 | * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number | |
587 | * @return the start position of substring matched by the specified group. | |
588 | * @draft ICU 2.4 | |
589 | */ | |
590 | virtual int32_t start(int group, UErrorCode &status) const; | |
591 | ||
592 | ||
593 | /** | |
594 | * Returns the index in the input string of the character following the | |
595 | * text matched during the previous match operation. | |
596 | * @param status A reference to a UErrorCode to receive any errors. Possible | |
597 | * errors are U_REGEX_INVALID_STATE if no match has been | |
598 | * attempted or the last match failed. | |
599 | * @return the index of the last character matched, plus one. | |
600 | * @draft ICU 2.4 | |
601 | */ | |
602 | virtual int32_t end(UErrorCode &status) const; | |
603 | ||
604 | ||
605 | /** | |
606 | * Returns the index in the input string of the character following the | |
607 | * text matched by the specified capture group during the previous match operation. | |
608 | * @param group the capture group number | |
609 | * @param status A reference to a UErrorCode to receive any errors. Possible | |
610 | * errors are U_REGEX_INVALID_STATE if no match has been | |
611 | * attempted or the last match failed and | |
612 | * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number | |
613 | * @return the index of the last character, plus one, of the text | |
614 | * captured by the specified group during the previous match operation. | |
615 | * Return -1 if the capture group was not part of the match. | |
616 | * @draft ICU 2.4 | |
617 | */ | |
618 | virtual int32_t end(int group, UErrorCode &status) const; | |
619 | ||
620 | ||
621 | /** | |
622 | * Resets this matcher. The effect is to remove any memory of previous matches, | |
623 | * and to cause subsequent find() operations to begin at the beginning of | |
624 | * the input string. | |
625 | * | |
626 | * @return this RegexMatcher. | |
627 | * @draft ICU 2.4 | |
628 | */ | |
629 | virtual RegexMatcher &reset(); | |
630 | ||
631 | ||
632 | /** | |
633 | * Resets this matcher with a new input string. This allows instances of RegexMatcher | |
634 | * to be reused, which is more efficient than creating a new RegexMatcher for | |
635 | * each input string to be processed. | |
636 | * @return this RegexMatcher. | |
637 | * @draft ICU 2.4 | |
638 | */ | |
639 | virtual RegexMatcher &reset(const UnicodeString &input); | |
640 | ||
641 | ||
642 | /** | |
643 | * Returns the input string being matched. The returned string is not a copy, | |
644 | * but the live input string. It should not be altered or deleted. | |
645 | * @return the input string | |
646 | * @draft ICU 2.4 | |
647 | */ | |
648 | virtual const UnicodeString &input() const; | |
649 | ||
650 | ||
651 | /** | |
652 | * Returns the pattern that is interpreted by this matcher. | |
653 | * @return the RegexPattern for this RegexMatcher | |
654 | * @draft ICU 2.4 | |
655 | */ | |
656 | virtual const RegexPattern &pattern() const; | |
657 | ||
658 | ||
659 | /** | |
660 | * Replaces every substring of the input that matches the pattern | |
661 | * with the given replacement string. This is a convenience function that | |
662 | * provides a complete find-and-replace-all operation. | |
663 | * | |
664 | * This method first resets this matcher. It then scans the input string | |
665 | * looking for matches of the pattern. Input that is not part of any | |
666 | * match is left unchanged; each match is replaced in the result by the | |
667 | * replacement string. The replacement string may contain references to | |
668 | * capture groups. | |
669 | * | |
670 | * @param replacement a string containing the replacement text. | |
671 | * @param status a reference to a UErrorCode to receive any errors. | |
672 | * @return a string containing the results of the find and replace. | |
673 | * @draft ICU 2.4 | |
674 | */ | |
675 | virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); | |
676 | ||
677 | ||
678 | /** | |
679 | * Replaces the first substring of the input that matches | |
680 | * the pattern with the replacement string. This is a convenience | |
681 | * function that provides a complete find-and-replace operation. | |
682 | * | |
683 | * <p>This function first resets this RegexMatcher. It then scans the input string | |
684 | * looking for a match of the pattern. Input that is not part | |
685 | * of the match is appended directly to the result string; the match is replaced | |
686 | * in the result by the replacement string. The replacement string may contain | |
687 | * references to captured groups.</p> | |
688 | * | |
689 | * <p>The state of the matcher (the position at which a subsequent find() | |
690 | * would begin) after completing a replaceFirst() is not specified. The | |
691 | * RegexMatcher should be reset before doing additional find() operations.</p> | |
692 | * | |
693 | * @param replacement a string containing the replacement text. | |
694 | * @param status a reference to a UErrorCode to receive any errors. | |
695 | * @return a string containing the results of the find and replace. | |
696 | * @draft ICU 2.4 | |
697 | */ | |
698 | virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); | |
699 | ||
700 | /** | |
701 | * Implements a replace operation intended to be used as part of an | |
702 | * incremental find-and-replace. | |
703 | * | |
704 | * <p>The input string, starting from the end of the previous match and ending at | |
705 | * the start of the current match, is appended to the destination string. Then the | |
706 | * replacement string is appended to the output string, | |
707 | * including handling any substitutions of captured text.</p> | |
708 | * | |
709 | * <p>For simple, prepackaged, non-incremental find-and-replace | |
710 | * operations, see replaceFirst() or replaceAll().</p> | |
711 | * | |
712 | * @param dest A UnicodeString to which the results of the find-and-replace are appended. | |
713 | * @param replacement A UnicodeString that provides the text to be substituted for | |
714 | * the input text that matched the regexp pattern. The replacement | |
715 | * text may contain references to captured text from the | |
716 | * input. | |
717 | * @param status A reference to a UErrorCode to receive any errors. Possible | |
718 | * errors are U_REGEX_INVALID_STATE if no match has been | |
719 | * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR | |
720 | * if the replacement text specifies a capture group that | |
721 | * does not exist in the pattern. | |
722 | * | |
723 | * @return this RegexMatcher | |
724 | * @draft ICU 2.4 | |
725 | * | |
726 | */ | |
727 | virtual RegexMatcher &appendReplacement(UnicodeString &dest, | |
728 | const UnicodeString &replacement, UErrorCode &status); | |
729 | ||
730 | ||
731 | /** | |
732 | * As the final step in a find-and-replace operation, append the remainder | |
733 | * of the input string, starting at the position following the last match, | |
734 | * to the destination string. <code>appendTail()</code> is intended to be invoked after one | |
735 | * or more invocations of the <code>RegexMatcher::appendReplacement()</code>. | |
736 | * | |
737 | * @param dest A UnicodeString to which the results of the find-and-replace are appended. | |
738 | * @return the destination string. | |
739 | * @draft ICU 2.4 | |
740 | */ | |
741 | virtual UnicodeString &appendTail(UnicodeString &dest); | |
742 | ||
743 | ||
744 | ||
745 | /** | |
746 | * <p>Split a string into fields. Somewhat like split() from Perl. | |
747 | * The pattern matches identify delimiters that separate the input | |
748 | * into fields. The input data between the matches becomes the | |
749 | * fields themselves. The fields are stored in the caller-supplied | |
750 | * <code>dest</code> array.</p> | |
751 | * | |
752 | * @param input The string to be split into fields. The field delimiters | |
753 | * match the pattern (in the "this" object). This matcher | |
754 | * will be reset to this input string. | |
755 | * @param dest An array of UnicodeStrings to receive the results of the split. | |
756 | * This is an array of actual UnicodeString objects, not an | |
757 | * array of pointers to strings. Local (stack based) arrays can | |
758 | * work well here. | |
759 | * @param destCapacity The number of elements in the destination array. | |
760 | * If the number of fields found is less than destCapacity, the | |
761 | * extra strings in the destination array are not altered. | |
762 | * If the number of destination strings is less than the number | |
763 | * of fields, the trailing part of the input string, including any | |
764 | * field delimiters, is placed in the last destination string. | |
765 | * @param status A reference to a UErrorCode to receive any errors. | |
766 | * @return The number of fields into which the input string was split. | |
767 | * @draft ICU 2.6 | |
768 | */ | |
769 | virtual int32_t split(const UnicodeString &input, | |
770 | UnicodeString dest[], | |
771 | int32_t destCapacity, | |
772 | UErrorCode &status); | |
773 | ||
774 | ||
775 | ||
776 | /** | |
777 | * setTrace Debug function, enable/disable tracing of the matching engine. | |
778 | * For internal ICU development use only. DO NOT USE!!!! | |
779 | * @internal | |
780 | */ | |
781 | void setTrace(UBool state); | |
782 | ||
783 | ||
784 | /** | |
785 | * ICU "poor man's RTTI", returns a UClassID for this class. | |
786 | * @return The class ID for RegexMatcher: the address of a static class member. | |
787 | * @draft ICU 2.2 | |
788 | */ | |
789 | static inline UClassID getStaticClassID(); | |
790 | ||
791 | /** | |
792 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
793 | * @return For a RegexMatcher, the same value as getStaticClassID(). | |
794 | * @draft ICU 2.2 | |
795 | */ | |
796 | virtual inline UClassID getDynamicClassID() const; | |
797 | ||
798 | private: | |
799 | // Constructors and other object boilerplate are private. | |
800 | // Instances of RegexMatcher cannot be assigned, copied, cloned, etc. by client code. | |
801 | RegexMatcher(); // default constructor not implemented | |
802 | RegexMatcher(const RegexPattern *pat); // private: callable only from this class and its friend RegexPattern | |
803 | RegexMatcher(const RegexMatcher &other); // private: no copy construction by clients | |
804 | RegexMatcher &operator =(const RegexMatcher &rhs); // private: no assignment by clients | |
805 | friend class RegexPattern; // grants RegexPattern access to the private members of this class | |
806 | ||
807 | ||
808 | // | |
809 | // MatchAt This is the internal interface to the match engine itself. | |
810 | // Match status comes back in matcher member variables. | |
811 | // | |
812 | void MatchAt(int32_t startIdx, UErrorCode &status); | |
813 | inline void backTrack(int32_t &inputIdx, int32_t &patIdx); // NOTE(review): presumably restores both indices from saved match state - confirm | |
814 | UBool isWordBoundary(int32_t pos); // perform the \b (word boundary) test at pos | |
815 | REStackFrame *resetStack(); // NOTE(review): presumably reinitializes fStack and returns its initial frame - confirm | |
816 | inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, | |
817 | int32_t frameSize, UErrorCode &status); // NOTE(review): presumably saves state for later backtracking - confirm | |
818 | ||
819 | ||
820 | const RegexPattern *fPattern; // The pattern used by this matcher. | |
821 | RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and | |
822 | // should delete it when through. | |
823 | const UnicodeString *fInput; // NOTE(review): presumably the current input string, not owned - confirm | |
824 | | |
825 | UBool fMatch; // True if the last match was successful. | |
826 | int32_t fMatchStart; // Position of the start of the most recent match. | |
827 | int32_t fMatchEnd; // First position after the end of the most recent match. | |
828 | int32_t fLastMatchEnd; // First position after the end of the previous match. | |
829 | | |
830 | UVector32 *fStack; // NOTE(review): presumably the storage that holds the REStackFrames - confirm | |
831 | REStackFrame *fFrame; // After finding a match, the last active stack | |
832 | // frame, which will contain the capture group results. | |
833 | // NOT valid while match engine is running. | |
834 | | |
835 | int32_t *fData; // Data area for use by the compiled pattern. | |
836 | int32_t fSmallData[8]; // Used as the data area when it is large enough. | |
837 | | |
838 | UBool fTraceDebug; // Set true for debug tracing of match engine. | |
839 | | |
840 | UErrorCode fDeferredStatus; // Saved error state that cannot be immediately | |
841 | // reported, or that permanently disables this matcher. | |
842 | | |
843 | /** | |
844 | * The address of this static class variable serves as this class's ID | |
845 | * for ICU "poor man's RTTI". | |
846 | */ | |
847 | static const char fgClassID; | |
848 | ||
849 | ||
850 | }; | |
851 | ||
852 | inline UClassID RegexPattern::getStaticClassID() { return (UClassID)(&RegexPattern::fgClassID); } // class ID is the address of fgClassID | |
853 | inline UClassID RegexPattern::getDynamicClassID() const { return RegexPattern::getStaticClassID(); } // same ID as the static accessor | |
854 | ||
855 | inline UClassID RegexMatcher::getStaticClassID() { return (UClassID)(&RegexMatcher::fgClassID); } // class ID is the address of fgClassID | |
856 | inline UClassID RegexMatcher::getDynamicClassID() const { return RegexMatcher::getStaticClassID(); } // same ID as the static accessor | |
857 | ||
858 | ||
859 | U_NAMESPACE_END | |
860 | #endif // UCONFIG_NO_REGULAR_EXPRESSIONS | |
861 | #endif |