2 **********************************************************************
3 * Copyright (C) 2004-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
10 * created on: 2004mar09
11 * created by: Andy Heninger
13 * ICU Regular Expressions, API for C
18 * \brief C API: Regular Expressions
20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
26 #include "unicode/utext.h"
27 #include "unicode/utypes.h"
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
31 #include "unicode/localpointer.h"
32 #include "unicode/parseerr.h"
/* Forward declaration of the regular-expression handle structure; its members are not defined at this point in the header. */
34 struct URegularExpression
;
36 * Structure representing a compiled regular expression, plus the results
37 * of a match operation.
/* Lets callers write URegularExpression without the struct keyword. */
40 typedef struct URegularExpression URegularExpression
;
44 * Constants for Regular Expression Match Modes.
47 typedef enum URegexpFlag
{
49 #ifndef U_HIDE_DRAFT_API
50 /** Forces normalization of pattern and strings.
51 Not implemented yet, just a placeholder, hence draft.
53 UREGEX_CANON_EQ
= 128,
55 /** Enable case insensitive matching. @stable ICU 2.4 */
56 UREGEX_CASE_INSENSITIVE
= 2,
58 /** Allow white space and comments within patterns @stable ICU 2.4 */
61 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
65 /** If set, treat the entire pattern as a literal string.
66 * Metacharacters or escape sequences in the input sequence will be given
67 * no special meaning. Not implemented yet as of ICU 4.4.
69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact
70 * on matching when used in conjunction with this flag.
71 * The other flags become superfluous.
72 * TODO: say which escapes are still handled; anything Java does
73 * early (\\u) we should still do.
78 /** Control behavior of "$" and "^"
79 * If set, recognize line terminators within string,
80 * otherwise, match only at start and end of input string.
84 /** Unix-only line endings.
85 * When this mode is enabled, only \\u000a is recognized as a line ending
86 * in the behavior of ., ^, and $.
89 UREGEX_UNIX_LINES
= 1,
91 /** Unicode word boundaries.
92 * If set, \b uses the Unicode TR 29 definition of word boundaries.
93 * Warning: Unicode word boundaries are quite different from
94 * traditional regular expression word boundaries. See
95 * http://unicode.org/reports/tr29/#Word_Boundaries
100 /** Error on Unrecognized backslash escapes.
101 * If set, fail with an error on patterns that contain
102 * backslash-escaped ASCII letters without a known special
103 * meaning. If this flag is not set, these
104 * escaped letters represent themselves.
107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
= 512
112 * Open (compile) an ICU regular expression. Compiles the regular expression in
113 * string form into an internal representation using the specified match mode flags.
114 * The resulting regular expression handle can then be used to perform various
115 * matching operations.
118 * @param pattern The Regular Expression pattern to be compiled.
119 * @param patternLength The length of the pattern, or -1 if the pattern is
121 * @param flags Flags that alter the default matching behavior for
122 * the regular expression, UREGEX_CASE_INSENSITIVE, for
123 * example. For default behavior, set this parameter to zero.
124 * See <code>enum URegexpFlag</code>. All desired flags
125 * are bitwise-ORed together.
126 * @param pe Receives the position (line and column numbers) of any syntax
127 * error within the source regular expression string. If this
128 * information is not wanted, pass NULL for this parameter.
129 * @param status Receives error detected by this function.
133 U_STABLE URegularExpression
* U_EXPORT2
134 uregex_open( const UChar
*pattern
,
135 int32_t patternLength
,
141 * Open (compile) an ICU regular expression. Compiles the regular expression in
142 * string form into an internal representation using the specified match mode flags.
143 * The resulting regular expression handle can then be used to perform various
144 * matching operations.
146 * The contents of the pattern UText will be extracted and saved. Ownership of the
147 * UText struct itself remains with the caller. This is to match the behavior of
150 * @param pattern The Regular Expression pattern to be compiled.
151 * @param flags Flags that alter the default matching behavior for
152 * the regular expression, UREGEX_CASE_INSENSITIVE, for
153 * example. For default behavior, set this parameter to zero.
154 * See <code>enum URegexpFlag</code>. All desired flags
155 * are bitwise-ORed together.
156 * @param pe Receives the position (line and column numbers) of any syntax
157 * error within the source regular expression string. If this
158 * information is not wanted, pass NULL for this parameter.
159 * @param status Receives error detected by this function.
163 U_DRAFT URegularExpression
* U_EXPORT2
164 uregex_openUText(UText
*pattern
,
170 * Open (compile) an ICU regular expression. The resulting regular expression
171 * handle can then be used to perform various matching operations.
173 * This function is the same as uregex_open, except that the pattern
174 * is supplied as an 8 bit char * string in the default code page.
176 * @param pattern The Regular Expression pattern to be compiled,
178 * @param flags Flags that alter the default matching behavior for
179 * the regular expression, UREGEX_CASE_INSENSITIVE, for
180 * example. For default behavior, set this parameter to zero.
181 * See <code>enum URegexpFlag</code>. All desired flags
182 * are bitwise-ORed together.
183 * @param pe Receives the position (line and column numbers) of any syntax
184 * error within the source regular expression string. If this
185 * information is not wanted, pass NULL for this parameter.
186 * @param status Receives errors detected by this function.
187 * @return The URegularExpression object representing the compiled
192 #if !UCONFIG_NO_CONVERSION
193 U_STABLE URegularExpression
* U_EXPORT2
194 uregex_openC( const char *pattern
,
203 * Close the regular expression, recovering all resources (memory) it
206 * @param regexp The regular expression to be closed.
/**
 * Closes a regular expression handle, recovering the resources (memory) it holds.
 * @param regexp The regular expression to be closed.
 */
209 U_STABLE
void U_EXPORT2
210 uregex_close(URegularExpression
*regexp
);
212 #if U_SHOW_CPLUSPLUS_API
217 * \class LocalURegularExpressionPointer
218 * "Smart pointer" class, closes a URegularExpression via uregex_close().
219 * For most methods see the LocalPointerBase base class.
221 * @see LocalPointerBase
/* Defines LocalURegularExpressionPointer, a "smart pointer" class for URegularExpression that closes it via uregex_close(). */
225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer
, URegularExpression
, uregex_close
);
232 * Make a copy of a compiled regular expression. Cloning a regular
233 * expression is faster than opening a second instance from the source
234 * form of the expression, and requires less memory.
236 * Note that the current input string and the position of any matched text
237 * within it are not cloned; only the pattern itself and the
238 * match mode flags are copied.
240 * Cloning can be particularly useful to threaded applications that perform
241 * multiple match operations in parallel. Each concurrent RE
242 * operation requires its own instance of a URegularExpression.
244 * @param regexp The compiled regular expression to be cloned.
245 * @param status Receives indication of any errors encountered
246 * @return the cloned copy of the compiled regular expression.
/**
 * Makes a copy of a compiled regular expression. Only the pattern and its
 * match mode flags are copied; the current input string and match position are not.
 * @param regexp The compiled regular expression to be cloned.
 * @param status Receives indication of any errors encountered.
 * @return The cloned copy of the compiled regular expression.
 */
249 U_STABLE URegularExpression
* U_EXPORT2
250 uregex_clone(const URegularExpression
*regexp
, UErrorCode
*status
);
253 * Returns a pointer to the source form of the pattern for this regular expression.
254 * This function will work even if the pattern was originally specified as a UText.
256 * @param regexp The compiled regular expression.
257 * @param patLength This output parameter will be set to the length of the
258 * pattern string. A NULL pointer may be used here if the
259 * pattern length is not needed, as would be the case if
260 * the pattern is known in advance to be a NUL terminated
262 * @param status Receives errors detected by this function.
263 * @return a pointer to the pattern string. The storage for the string is
264 * owned by the regular expression object, and must not be
265 * altered or deleted by the application. The returned string
266 * will remain valid until the regular expression is closed.
269 U_STABLE
const UChar
* U_EXPORT2
270 uregex_pattern(const URegularExpression
*regexp
,
275 * Returns the source text of the pattern for this regular expression.
276 * This function will work even if the pattern was originally specified as a UChar string.
278 * @param regexp The compiled regular expression.
279 * @param status Receives errors detected by this function.
280 * @return the pattern text. The storage for the text is owned by the regular expression
281 * object, and must not be altered or deleted.
285 U_DRAFT UText
* U_EXPORT2
286 uregex_patternUText(const URegularExpression
*regexp
,
291 * Get the match mode flags that were specified when compiling this regular expression.
292 * @param status Receives errors detected by this function.
293 * @param regexp The compiled regular expression.
294 * @return The match mode flags
298 U_STABLE
int32_t U_EXPORT2
299 uregex_flags(const URegularExpression
*regexp
,
304 * Set the subject text string upon which the regular expression will look for matches.
305 * This function may be called any number of times, allowing the regular
306 * expression pattern to be applied to different strings.
308 * Regular expression matching operations work directly on the application's
309 * string data. No copy is made. The subject string data must not be
310 * altered after calling this function until after all regular expression
311 * operations involving this string data are completed.
313 * Zero length strings are permitted. In this case, no subsequent match
314 * operation will dereference the text string pointer.
316 * @param regexp The compiled regular expression.
317 * @param text The subject text string.
318 * @param textLength The length of the subject text, or -1 if the string
320 * @param status Receives errors detected by this function.
323 U_STABLE
void U_EXPORT2
324 uregex_setText(URegularExpression
*regexp
,
331 * Set the subject text string upon which the regular expression will look for matches.
332 * This function may be called any number of times, allowing the regular
333 * expression pattern to be applied to different strings.
335 * Regular expression matching operations work directly on the application's
336 * string data; only a shallow clone is made. The subject string data must not be
337 * altered after calling this function until after all regular expression
338 * operations involving this string data are completed.
340 * @param regexp The compiled regular expression.
341 * @param text The subject text string.
342 * @param status Receives errors detected by this function.
346 U_DRAFT
void U_EXPORT2
347 uregex_setUText(URegularExpression
*regexp
,
352 * Get the subject text that is currently associated with this
353 * regular expression object. If the input was supplied using uregex_setText(),
354 * that pointer will be returned. Otherwise, the characters in the input will
355 * be extracted to a buffer and returned. In either case, ownership remains
356 * with the regular expression object.
358 * This function will work even if the input was originally specified as a UText.
360 * @param regexp The compiled regular expression.
361 * @param textLength The length of the string is returned in this output parameter.
362 * A NULL pointer may be used here if the
363 * text length is not needed, as would be the case if
364 * the text is known in advance to be a NUL terminated
366 * @param status Receives errors detected by this function.
367 * @return Pointer to the subject text string currently associated with
368 * this regular expression.
371 U_STABLE
const UChar
* U_EXPORT2
372 uregex_getText(URegularExpression
*regexp
,
378 * Get the subject text that is currently associated with this
379 * regular expression object.
381 * This function will work even if the input was originally specified as a UChar string.
383 * @param regexp The compiled regular expression.
384 * @param dest A mutable UText in which to store the current input.
385 * If NULL, a new UText will be created as an immutable shallow clone
386 * of the actual input string.
387 * @param status Receives errors detected by this function.
388 * @return The subject text currently associated with this regular expression.
389 * If a pre-allocated UText was provided, it will always be used and returned.
393 U_DRAFT UText
* U_EXPORT2
394 uregex_getUText(URegularExpression
*regexp
,
399 * Attempts to match the input string against the pattern.
400 * To succeed, the match must extend to the end of the string,
401 * or cover the complete match region.
403 * If startIndex >= zero the match operation starts at the specified
404 * index and must extend to the end of the input string. Any region
405 * that has been specified is reset.
407 * If startIndex == -1 the match must cover the input region, or the entire
408 * input string if no region has been set. This directly corresponds to
409 * Matcher.matches() in Java
411 * @param regexp The compiled regular expression.
412 * @param startIndex The input string (native) index at which to begin matching, or -1
413 * to match the input Region.
414 * @param status Receives errors detected by this function.
415 * @return TRUE if there is a match
418 U_STABLE UBool U_EXPORT2
419 uregex_matches(URegularExpression
*regexp
,
424 * 64bit version of uregex_matches.
427 U_DRAFT UBool U_EXPORT2
428 uregex_matches64(URegularExpression
*regexp
,
433 * Attempts to match the input string, starting from the specified index, against the pattern.
434 * The match may be of any length, and is not required to extend to the end
435 * of the input string. Contrast with uregex_matches().
437 * <p>If startIndex is >= 0 any input region that was set for this
438 * URegularExpression is reset before the operation begins.
440 * <p>If the specified starting index == -1 the match begins at the start of the input
441 * region, or at the start of the full string if no region has been specified.
442 * This corresponds directly with Matcher.lookingAt() in Java.
444 * <p>If the match succeeds then more information can be obtained via the
445 * <code>uregex_start()</code>, <code>uregex_end()</code>,
446 * and <code>uregex_group()</code> functions.</p>
448 * @param regexp The compiled regular expression.
449 * @param startIndex The input string (native) index at which to begin matching, or
450 * -1 to match the Input Region
451 * @param status A reference to a UErrorCode to receive any errors.
452 * @return TRUE if there is a match.
455 U_STABLE UBool U_EXPORT2
456 uregex_lookingAt(URegularExpression
*regexp
,
461 * 64bit version of uregex_lookingAt.
464 U_DRAFT UBool U_EXPORT2
465 uregex_lookingAt64(URegularExpression
*regexp
,
470 * Find the first matching substring of the input string that matches the pattern.
471 * If startIndex is >= zero the search for a match begins at the specified index,
472 * and any match region is reset. This corresponds directly with
473 * Matcher.find(startIndex) in Java.
475 * If startIndex == -1 the search begins at the start of the input region,
476 * or at the start of the full string if no region has been specified.
478 * If a match is found, <code>uregex_start(), uregex_end()</code>, and
479 * <code>uregex_group()</code> will provide more information regarding the match.
481 * @param regexp The compiled regular expression.
482 * @param startIndex The position (native) in the input string to begin the search, or
483 * -1 to search within the Input Region.
484 * @param status A reference to a UErrorCode to receive any errors.
485 * @return TRUE if a match is found.
488 U_STABLE UBool U_EXPORT2
489 uregex_find(URegularExpression
*regexp
,
494 * 64bit version of uregex_find.
497 U_DRAFT UBool U_EXPORT2
498 uregex_find64(URegularExpression
*regexp
,
503 * Find the next pattern match in the input string. Begin searching
504 * the input at the location following the end of the previous match,
505 * or at the start of the string (or region) if there is no
506 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
507 * <code>uregex_group()</code> will provide more information regarding the match.
509 * @param regexp The compiled regular expression.
510 * @param status A reference to a UErrorCode to receive any errors.
511 * @return TRUE if a match is found.
515 U_STABLE UBool U_EXPORT2
516 uregex_findNext(URegularExpression
*regexp
,
520 * Get the number of capturing groups in this regular expression's pattern.
521 * @param regexp The compiled regular expression.
522 * @param status A reference to a UErrorCode to receive any errors.
523 * @return the number of capture groups
526 U_STABLE
int32_t U_EXPORT2
527 uregex_groupCount(URegularExpression
*regexp
,
530 /** Extract the string for the specified matching expression or subexpression.
531 * Group #0 is the complete string of matched text.
532 * Group #1 is the text matched by the first set of capturing parentheses.
534 * @param regexp The compiled regular expression.
535 * @param groupNum The capture group to extract. Group 0 is the complete
536 * match. The value of this parameter must be
537 * less than or equal to the number of capture groups in
539 * @param dest Buffer to receive the matching string data
540 * @param destCapacity Capacity of the dest buffer.
541 * @param status A reference to a UErrorCode to receive any errors.
542 * @return Length of matching data,
543 * or -1 if no applicable match.
546 U_STABLE
int32_t U_EXPORT2
547 uregex_group(URegularExpression
*regexp
,
550 int32_t destCapacity
,
554 /** Returns a shallow immutable clone of the entire input string. The returned UText current native index
555 * is set to the beginning of the requested capture group. The capture group length is also
556 * returned via groupLength.
557 * Group #0 is the complete string of matched text.
558 * Group #1 is the text matched by the first set of capturing parentheses.
560 * @param regexp The compiled regular expression.
561 * @param groupNum The capture group to extract. Group 0 is the complete
562 * match. The value of this parameter must be
563 * less than or equal to the number of capture groups in
565 * @param dest A mutable UText in which to store the current input.
566 * If NULL, a new UText will be created as an immutable shallow clone
567 * of the entire input string.
568 * @param groupLength The group length of the desired capture group.
569 * @param status A reference to a UErrorCode to receive any errors.
570 * @return The subject text currently associated with this regular expression.
571 * If a pre-allocated UText was provided, it will always be used and returned.
576 U_DRAFT UText
* U_EXPORT2
577 uregex_groupUText(URegularExpression
*regexp
,
580 int64_t *groupLength
,
584 /** Extract the string for the specified matching expression or subexpression.
585 * Group #0 is the complete string of matched text.
586 * Group #1 is the text matched by the first set of capturing parentheses.
588 * @param regexp The compiled regular expression.
589 * @param groupNum The capture group to extract. Group 0 is the complete
590 * match. The value of this parameter must be
591 * less than or equal to the number of capture groups in
593 * @param dest Mutable UText to receive the matching string data.
594 * If NULL, a new UText will be created (which may not be mutable).
595 * @param status A reference to a UErrorCode to receive any errors.
596 * @return The matching string data. If a pre-allocated UText was provided,
597 * it will always be used and returned.
599 * @internal ICU 4.4 technology preview
601 U_INTERNAL UText
* U_EXPORT2
602 uregex_groupUTextDeep(URegularExpression
*regexp
,
608 * Returns the index in the input string of the start of the text matched by the
609 * specified capture group during the previous match operation. Return -1 if
610 * the capture group was not part of the last match.
611 * Group #0 refers to the complete range of matched text.
612 * Group #1 refers to the text matched by the first set of capturing parentheses.
614 * @param regexp The compiled regular expression.
615 * @param groupNum The capture group number
616 * @param status A reference to a UErrorCode to receive any errors.
617 * @return the starting (native) position in the input of the text matched
618 * by the specified group.
621 U_STABLE
int32_t U_EXPORT2
622 uregex_start(URegularExpression
*regexp
,
627 * 64bit version of uregex_start.
630 U_DRAFT
int64_t U_EXPORT2
631 uregex_start64(URegularExpression
*regexp
,
636 * Returns the index in the input string of the position following the end
637 * of the text matched by the specified capture group.
638 * Return -1 if the capture group was not part of the last match.
639 * Group #0 refers to the complete range of matched text.
640 * Group #1 refers to the text matched by the first set of capturing parentheses.
642 * @param regexp The compiled regular expression.
643 * @param groupNum The capture group number
644 * @param status A reference to a UErrorCode to receive any errors.
645 * @return the (native) index of the position following the last matched character.
648 U_STABLE
int32_t U_EXPORT2
649 uregex_end(URegularExpression
*regexp
,
654 * 64bit version of uregex_end.
657 U_DRAFT
int64_t U_EXPORT2
658 uregex_end64(URegularExpression
*regexp
,
663 * Reset any saved state from the previous match. Has the effect of
664 * causing uregex_findNext to begin at the specified index, and causing
665 * uregex_start(), uregex_end() and uregex_group() to return an error
666 * indicating that there is no match information available. Clears any
667 * match region that may have been set.
669 * @param regexp The compiled regular expression.
670 * @param index The position (native) in the text at which a
671 * uregex_findNext() should begin searching.
672 * @param status A reference to a UErrorCode to receive any errors.
675 U_STABLE
void U_EXPORT2
676 uregex_reset(URegularExpression
*regexp
,
681 * 64bit version of uregex_reset.
684 U_DRAFT
void U_EXPORT2
685 uregex_reset64(URegularExpression
*regexp
,
689 /** Sets the limits of the matching region for this URegularExpression.
690 * The region is the part of the input string that will be considered when matching.
691 * Invoking this method resets any saved state from the previous match,
692 * then sets the region to start at the index specified by the start parameter
693 * and end at the index specified by the end parameter.
695 * Depending on the transparency and anchoring being used (see useTransparentBounds
696 * and useAnchoringBounds), certain constructs such as anchors may behave differently
697 * at or around the boundaries of the region
699 * The function will fail if start is greater than limit, or if either index
700 * is less than zero or greater than the length of the string being matched.
702 * @param regexp The compiled regular expression.
703 * @param regionStart The (native) index to begin searches at.
704 * @param regionLimit The (native) index to end searches at (exclusive).
705 * @param status A pointer to a UErrorCode to receive any errors.
708 U_STABLE
void U_EXPORT2
709 uregex_setRegion(URegularExpression
*regexp
,
715 * 64bit version of uregex_setRegion.
718 U_DRAFT
void U_EXPORT2
719 uregex_setRegion64(URegularExpression
*regexp
,
725 * Variation on uregex_setRegion to set the region and the start index in one call,
726 * so that setting the region does not reset the position for subsequent matches.
729 U_DRAFT
void U_EXPORT2
730 uregex_setRegionAndStart(URegularExpression
*regexp
,
737 * Reports the start index of the matching region. Any matches found are limited to
738 * the region bounded by regionStart (inclusive) and regionEnd (exclusive).
740 * @param regexp The compiled regular expression.
741 * @param status A pointer to a UErrorCode to receive any errors.
742 * @return The starting (native) index of this matcher's region.
745 U_STABLE
int32_t U_EXPORT2
746 uregex_regionStart(const URegularExpression
*regexp
,
750 * 64bit version of uregex_regionStart.
753 U_DRAFT
int64_t U_EXPORT2
754 uregex_regionStart64(const URegularExpression
*regexp
,
758 * Reports the end index (exclusive) of the matching region for this URegularExpression.
759 * Any matches found are limited to the region bounded by regionStart (inclusive)
760 * and regionEnd (exclusive).
762 * @param regexp The compiled regular expression.
763 * @param status A pointer to a UErrorCode to receive any errors.
764 * @return The ending point (native) of this matcher's region.
767 U_STABLE
int32_t U_EXPORT2
768 uregex_regionEnd(const URegularExpression
*regexp
,
772 * 64bit version of uregex_regionEnd.
775 U_DRAFT
int64_t U_EXPORT2
776 uregex_regionEnd64(const URegularExpression
*regexp
,
780 * Queries the transparency of region bounds for this URegularExpression.
781 * See useTransparentBounds for a description of transparent and opaque bounds.
782 * By default, matching boundaries are opaque.
784 * @param regexp The compiled regular expression.
785 * @param status A pointer to a UErrorCode to receive any errors.
786 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
789 U_STABLE UBool U_EXPORT2
790 uregex_hasTransparentBounds(const URegularExpression
*regexp
,
795 * Sets the transparency of region bounds for this URegularExpression.
796 * Invoking this function with an argument of TRUE will set matches to use transparent bounds.
797 * If the boolean argument is FALSE, then opaque bounds will be used.
799 * Using transparent bounds, the boundaries of the matching region are transparent
800 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
801 * see text beyond the boundaries of the region while checking for a match.
803 * With opaque bounds, no text outside of the matching region is visible to lookahead,
804 * lookbehind, and boundary matching constructs.
806 * By default, opaque bounds are used.
808 * @param regexp The compiled regular expression.
809 * @param b TRUE for transparent bounds; FALSE for opaque bounds
810 * @param status A pointer to a UErrorCode to receive any errors.
813 U_STABLE
void U_EXPORT2
814 uregex_useTransparentBounds(URegularExpression
*regexp
,
820 * Return true if this URegularExpression is using anchoring bounds.
821 * By default, anchoring region bounds are used.
823 * @param regexp The compiled regular expression.
824 * @param status A pointer to a UErrorCode to receive any errors.
825 * @return TRUE if this matcher is using anchoring bounds.
828 U_STABLE UBool U_EXPORT2
829 uregex_hasAnchoringBounds(const URegularExpression
*regexp
,
834 * Set whether this URegularExpression is using Anchoring Bounds for its region.
835 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
836 * and end of the region. Without Anchoring Bounds, anchors will only match at
837 * the positions they would in the complete text.
839 * Anchoring Bounds are the default for regions.
841 * @param regexp The compiled regular expression.
842 * @param b TRUE to enable anchoring bounds; FALSE to disable them.
843 * @param status A pointer to a UErrorCode to receive any errors.
846 U_STABLE
void U_EXPORT2
847 uregex_useAnchoringBounds(URegularExpression
*regexp
,
852 * Return TRUE if the most recent matching operation touched the
853 * end of the text being processed. In this case, additional input text could
854 * change the results of that match.
856 * @param regexp The compiled regular expression.
857 * @param status A pointer to a UErrorCode to receive any errors.
858 * @return TRUE if the most recent match hit the end of input
861 U_STABLE UBool U_EXPORT2
862 uregex_hitEnd(const URegularExpression
*regexp
,
866 * Return TRUE if the most recent match succeeded and additional input could cause
867 * it to fail. If this function returns false and a match was found, then more input
868 * might change the match but the match won't be lost. If a match was not found,
869 * then requireEnd has no meaning.
871 * @param regexp The compiled regular expression.
872 * @param status A pointer to a UErrorCode to receive any errors.
873 * @return TRUE if more input could cause the most recent match to no longer match.
876 U_STABLE UBool U_EXPORT2
877 uregex_requireEnd(const URegularExpression
*regexp
,
885 * Replaces every substring of the input that matches the pattern
886 * with the given replacement string. This is a convenience function that
887 * provides a complete find-and-replace-all operation.
889 * This method scans the input string looking for matches of the pattern.
890 * Input that is not part of any match is copied unchanged to the
891 * destination buffer. Matched regions are replaced in the output
892 * buffer by the replacement string. The replacement string may contain
893 * references to capture groups; these take the form of $1, $2, etc.
895 * @param regexp The compiled regular expression.
896 * @param replacementText A string containing the replacement text.
897 * @param replacementLength The length of the replacement string, or
898 * -1 if it is NUL terminated.
899 * @param destBuf A (UChar *) buffer that will receive the result.
900 * @param destCapacity The capacity of the destination buffer.
901 * @param status A reference to a UErrorCode to receive any errors.
902 * @return The length of the string resulting from the find
903 * and replace operation. In the event that the
904 * destination capacity is inadequate, the return value
905 * is still the full length of the untruncated string.
908 U_STABLE
int32_t U_EXPORT2
909 uregex_replaceAll(URegularExpression
*regexp
,
910 const UChar
*replacementText
,
911 int32_t replacementLength
,
913 int32_t destCapacity
,
917 * Replaces every substring of the input that matches the pattern
918 * with the given replacement string. This is a convenience function that
919 * provides a complete find-and-replace-all operation.
921 * This method scans the input string looking for matches of the pattern.
922 * Input that is not part of any match is copied unchanged to the
923 * destination buffer. Matched regions are replaced in the output
924 * buffer by the replacement string. The replacement string may contain
925 * references to capture groups; these take the form of $1, $2, etc.
927 * @param regexp The compiled regular expression.
928 * @param replacement A string containing the replacement text.
929 * @param dest A mutable UText that will receive the result.
930 * If NULL, a new UText will be created (which may not be mutable).
931 * @param status A reference to a UErrorCode to receive any errors.
932 * @return A UText containing the results of the find and replace.
933 * If a pre-allocated UText was provided, it will always be used and returned.
937 U_DRAFT UText
* U_EXPORT2
938 uregex_replaceAllUText(URegularExpression
*regexp
,
944 * Replaces the first substring of the input that matches the pattern
945 * with the given replacement string. This is a convenience function that
946 * provides a complete find-and-replace operation.
948 * This method scans the input string looking for a match of the pattern.
949 * All input that is not part of the match is copied unchanged to the
950 * destination buffer. The matched region is replaced in the output
951 * buffer by the replacement string. The replacement string may contain
952 * references to capture groups; these take the form of $1, $2, etc.
954 * @param regexp The compiled regular expression.
955 * @param replacementText A string containing the replacement text.
956 * @param replacementLength The length of the replacement string, or
957 * -1 if it is NUL terminated.
958 * @param destBuf A (UChar *) buffer that will receive the result.
959 * @param destCapacity The capacity of the destination buffer.
960 * @param status a reference to a UErrorCode to receive any errors.
961 * @return The length of the string resulting from the find
962 * and replace operation. In the event that the
963 * destination capacity is inadequate, the return value
964 * is still the full length of the untruncated string.
967 U_STABLE
int32_t U_EXPORT2
968 uregex_replaceFirst(URegularExpression
*regexp
,
969 const UChar
*replacementText
,
970 int32_t replacementLength
,
972 int32_t destCapacity
,
976 * Replaces the first substring of the input that matches the pattern
977 * with the given replacement string. This is a convenience function that
978 * provides a complete find-and-replace operation.
980 * This method scans the input string looking for a match of the pattern.
981 * All input that is not part of the match is copied unchanged to the
982 * destination buffer. The matched region is replaced in the output
983 * buffer by the replacement string. The replacement string may contain
984 * references to capture groups; these take the form of $1, $2, etc.
986 * @param regexp The compiled regular expression.
987 * @param replacement A string containing the replacement text.
988 * @param dest A mutable UText that will receive the result.
989 * If NULL, a new UText will be created (which may not be mutable).
990 * @param status A reference to a UErrorCode to receive any errors.
991 * @return A UText containing the results of the find and replace.
992 * If a pre-allocated UText was provided, it will always be used and returned.
996 U_DRAFT UText
* U_EXPORT2
997 uregex_replaceFirstUText(URegularExpression
*regexp
,
1000 UErrorCode
*status
);
1004 * Implements a replace operation intended to be used as part of an
1005 * incremental find-and-replace.
1007 * <p>The input string, starting from the end of the previous match and ending at
1008 * the start of the current match, is appended to the destination string. Then the
1009 * replacement string is appended to the output string,
1010 * including handling any substitutions of captured text.</p>
1012 * <p>A note on preflight computation of buffersize and error handling:
1013 * Calls to uregex_appendReplacement() and uregex_appendTail() are
1014 * designed to be chained, one after another, with the destination
1015 * buffer pointer and buffer capacity updated after each in preparation
1016 * for the next. If the destination buffer is exhausted partway through such a
1017 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal
1018 * ICU conventions are for a function to perform no action if it is
1019 * called with an error status, but for this one case, uregex_appendReplacement()
1020 * will operate normally so that buffer size computations will complete correctly.
1023 * <p>For simple, prepackaged, non-incremental find-and-replace
1024 * operations, see replaceFirst() or replaceAll().</p>
1026 * @param regexp The regular expression object.
1027 * @param replacementText The string that will replace the matched portion of the
1028 * input string as it is copied to the destination buffer.
1029 * The replacement text may contain references ($1, for
1030 * example) to capture groups from the match.
1031 * @param replacementLength The length of the replacement text string,
1032 * or -1 if the string is NUL terminated.
1033 * @param destBuf The buffer into which the results of the
1034 * find-and-replace are placed. On return, this pointer
1035 * will be updated to refer to the beginning of the
1036 * unused portion of buffer, leaving it in position for
1037 * a subsequent call to this function.
1038 * @param destCapacity The size of the output buffer. On return, this
1039 * parameter will be updated to reflect the space remaining
1040 * unused in the output buffer.
1041 * @param status A reference to a UErrorCode to receive any errors.
1042 * @return The length of the result string. In the event that
1043 * destCapacity is inadequate, the full length of the
1044 * untruncated output string is returned.
1049 U_STABLE
int32_t U_EXPORT2
1050 uregex_appendReplacement(URegularExpression
*regexp
,
1051 const UChar
*replacementText
,
1052 int32_t replacementLength
,
1054 int32_t *destCapacity
,
1055 UErrorCode
*status
);
1059 * Implements a replace operation intended to be used as part of an
1060 * incremental find-and-replace.
1062 * <p>The input string, starting from the end of the previous match and ending at
1063 * the start of the current match, is appended to the destination string. Then the
1064 * replacement string is appended to the output string,
1065 * including handling any substitutions of captured text.</p>
1067 * <p>For simple, prepackaged, non-incremental find-and-replace
1068 * operations, see replaceFirst() or replaceAll().</p>
1070 * @param regexp The regular expression object.
1071 * @param replacementText The string that will replace the matched portion of the
1072 * input string as it is copied to the destination buffer.
1073 * The replacement text may contain references ($1, for
1074 * example) to capture groups from the match.
1075 * @param dest A mutable UText that will receive the result. Must not be NULL.
1076 * @param status A reference to a UErrorCode to receive any errors.
1080 U_DRAFT
void U_EXPORT2
1081 uregex_appendReplacementUText(URegularExpression
*regexp
,
1082 UText
*replacementText
,
1084 UErrorCode
*status
);
1088 * As the final step in a find-and-replace operation, append the remainder
1089 * of the input string, starting at the position following the last match,
1090 * to the destination string. <code>uregex_appendTail()</code> is intended
1091 * to be invoked after one or more invocations of the
1092 * <code>uregex_appendReplacement()</code> function.
1094 * @param regexp The regular expression object. This is needed to
1095 * obtain the input string along with the position
1096 * of the last match within it.
1097 * @param destBuf The buffer in which the results of the
1098 * find-and-replace are placed. On return, the pointer
1099 * will be updated to refer to the beginning of the
1100 * unused portion of buffer.
1101 * @param destCapacity The size of the output buffer. On return, this
1102 * value will be updated to reflect the space remaining
1103 * unused in the output buffer.
1104 * @param status A reference to a UErrorCode to receive any errors.
1105 * @return The length of the result string. In the event that
1106 * destCapacity is inadequate, the full length of the
1107 * untruncated output string is returned.
1111 U_STABLE
int32_t U_EXPORT2
1112 uregex_appendTail(URegularExpression
*regexp
,
1114 int32_t *destCapacity
,
1115 UErrorCode
*status
);
1119 * As the final step in a find-and-replace operation, append the remainder
1120 * of the input string, starting at the position following the last match,
1121 * to the destination string. <code>uregex_appendTailUText()</code> is intended
1122 * to be invoked after one or more invocations of the
1123 * <code>uregex_appendReplacementUText()</code> function.
1125 * @param regexp The regular expression object. This is needed to
1126 * obtain the input string along with the position
1127 * of the last match within it.
1128 * @param dest A mutable UText that will receive the result. Must not be NULL.
1129 * @return The destination UText.
1133 U_DRAFT UText
* U_EXPORT2
1134 uregex_appendTailUText(URegularExpression
*regexp
,
1136 UErrorCode
*status
);
1141 * Split a string into fields. Somewhat like split() from Perl.
1142 * The pattern matches identify delimiters that separate the input
1143 * into fields. The input data between the matches becomes the
1144 * fields themselves.
1146 * Each of the fields is copied from the input string to the destination
1147 * buffer, and NUL terminated. The position of each field within
1148 * the destination buffer is returned in the destFields array.
1150 * Note: another choice for the design of this function would be to not
1151 * copy the resulting fields at all, but to return indexes and
1152 * lengths within the source text.
1153 * Advantages would be
1154 * o Faster. No Copying.
1155 * o Nothing extra needed when field data may contain embedded NUL chars.
1156 * o Less memory needed if working on large data.
1157 * Disadvantages would be
1158 * o Less consistent with C++ split, which copies into an
1159 * array of UnicodeStrings.
1160 * o No NUL termination, extracted fields would be less convenient
1161 * to use in most cases.
1162 * o Possible problems in the future, when support for Unicode Normalization
1163 * could cause the fields to not correspond exactly to
1164 * a range of the source text.
1166 * @param regexp The compiled regular expression.
1167 * @param destBuf A (UChar *) buffer to receive the fields that
1168 * are extracted from the input string. These
1169 * field pointers will refer to positions within the
1170 * destination buffer supplied by the caller. Any
1171 * extra positions within the destFields array will be set to NULL.
1173 * @param destCapacity The capacity of the destBuf.
1174 * @param requiredCapacity The actual capacity required of the destBuf.
1175 * If destCapacity is too small, requiredCapacity will return
1176 * the total capacity required to hold all of the output, and
1177 * a U_BUFFER_OVERFLOW_ERROR will be returned.
1178 * @param destFields An array to be filled with the position of each
1179 * of the extracted fields within destBuf.
1180 * @param destFieldsCapacity The number of elements in the destFields array.
1181 * If the number of fields found is less than destFieldsCapacity,
1182 * the extra destFields elements are set to zero.
1183 * If destFieldsCapacity is too small, the trailing part of the
1184 * input, including any field delimiters, is treated as if it
1185 * were the last field - it is copied to the destBuf, and
1186 * its position in the destBuf is stored in the last element
1187 * of destFields. This behavior mimics that of Perl. It is not
1188 * an error condition, and no error status is returned when all destField
1189 * positions are used.
1190 * @param status A reference to a UErrorCode to receive any errors.
1191 * @return The number of fields into which the input string was split.
1194 U_STABLE
int32_t U_EXPORT2
1195 uregex_split( URegularExpression
*regexp
,
1197 int32_t destCapacity
,
1198 int32_t *requiredCapacity
,
1199 UChar
*destFields
[],
1200 int32_t destFieldsCapacity
,
1201 UErrorCode
*status
);
1205 * Split a string into fields. Somewhat like split() from Perl.
1206 * The pattern matches identify delimiters that separate the input
1207 * into fields. The input data between the matches becomes the
1208 * fields themselves.
1210 * The behavior of this function is not very closely aligned with uregex_split();
1211 * instead, it is based on (and implemented directly on top of) the C++ split method.
1213 * @param regexp The compiled regular expression.
1214 * @param destFields An array of mutable UText structs to receive the results of the split.
1215 * If a field is NULL, a new UText is allocated to contain the results for
1216 * that field. This new UText is not guaranteed to be mutable.
1217 * @param destFieldsCapacity The number of elements in the destination array.
1218 * If the number of fields found is less than destFieldsCapacity, the
1219 * extra strings in the destination array are not altered.
1220 * If the number of destination strings is less than the number
1221 * of fields, the trailing part of the input string, including any
1222 * field delimiters, is placed in the last destination string.
1223 * This behavior mimics that of Perl. It is not an error condition, and no
1224 * error status is returned when all destField positions are used.
1225 * @param status A reference to a UErrorCode to receive any errors.
1226 * @return The number of fields into which the input string was split.
1230 U_DRAFT
int32_t U_EXPORT2
1231 uregex_splitUText(URegularExpression
*regexp
,
1232 UText
*destFields
[],
1233 int32_t destFieldsCapacity
,
1234 UErrorCode
*status
);
1240 * Set a processing time limit for match operations with this URegularExpression.
1242 * Some patterns, when matching certain strings, can run in exponential time.
1243 * For practical purposes, the match operation may appear to be in an infinite loop.
1245 * When a limit is set a match operation will fail with an error if the
1246 * limit is exceeded.
1248 * The units of the limit are steps of the match engine.
1249 * Correspondence with actual processor time will depend on the speed
1250 * of the processor and the details of the specific pattern, but will
1251 * typically be on the order of milliseconds.
1253 * By default, the matching time is not limited.
1256 * @param regexp The compiled regular expression.
1257 * @param limit The limit value, or 0 for no limit.
1258 * @param status A reference to a UErrorCode to receive any errors.
1261 U_STABLE
void U_EXPORT2
1262 uregex_setTimeLimit(URegularExpression
*regexp
,
1264 UErrorCode
*status
);
1267 * Get the time limit for matches with this URegularExpression.
1268 * A return value of zero indicates that there is no limit.
1270 * @param regexp The compiled regular expression.
1271 * @param status A reference to a UErrorCode to receive any errors.
1272 * @return the maximum allowed time for a match, in units of processing steps.
1275 U_STABLE
int32_t U_EXPORT2
1276 uregex_getTimeLimit(const URegularExpression
*regexp
,
1277 UErrorCode
*status
);
1280 * Set the amount of heap storage available for use by the match backtracking stack.
1282 * ICU uses a backtracking regular expression engine, with the backtrack stack
1283 * maintained on the heap. This function sets the limit to the amount of memory
1284 * that can be used for this purpose. A backtracking stack overflow will
1285 * result in an error from the match operation that caused it.
1287 * A limit is desirable because a malicious or poorly designed pattern can use
1288 * excessive memory, potentially crashing the process. A limit is enabled by default.
1291 * @param regexp The compiled regular expression.
1292 * @param limit The maximum size, in bytes, of the matching backtrack stack.
1293 * A value of zero means no limit.
1294 * The limit must be greater than or equal to zero.
1295 * @param status A reference to a UErrorCode to receive any errors.
1299 U_STABLE
void U_EXPORT2
1300 uregex_setStackLimit(URegularExpression
*regexp
,
1302 UErrorCode
*status
);
1305 * Get the size of the heap storage available for use by the backtracking stack.
1307 * @return the maximum backtracking stack size, in bytes, or zero if the
1308 * stack size is unlimited.
1311 U_STABLE
int32_t U_EXPORT2
1312 uregex_getStackLimit(const URegularExpression
*regexp
,
1313 UErrorCode
*status
);
1317 * Function pointer for a regular expression matching callback function.
1318 * When set, a callback function will be called periodically during matching
1319 * operations. If the callback function returns FALSE, the matching
1320 * operation will be terminated early.
1322 * Note: the callback function must not call other functions on this
1323 * URegularExpression.
1325 * @param context context pointer. The callback function will be invoked
1326 * with the context specified at the time that
1327 * uregex_setMatchCallback() is called.
1328 * @param steps the accumulated processing time, in match steps,
1329 * for this matching operation.
1330 * @return TRUE to continue the matching operation.
1331 * FALSE to terminate the matching operation.
1335 typedef UBool U_CALLCONV
URegexMatchCallback (
1336 const void *context
,
1341 * Set a callback function for this URegularExpression.
1342 * During matching operations the function will be called periodically,
1343 * giving the application the opportunity to terminate a long-running match operation.
1346 * @param regexp The compiled regular expression.
1347 * @param callback A pointer to the user-supplied callback function.
1348 * @param context User context pointer. The value supplied at the
1349 * time the callback function is set will be saved
1350 * and passed to the callback each time that it is called.
1351 * @param status A reference to a UErrorCode to receive any errors.
1354 U_STABLE
void U_EXPORT2
1355 uregex_setMatchCallback(URegularExpression
*regexp
,
1356 URegexMatchCallback
*callback
,
1357 const void *context
,
1358 UErrorCode
*status
);
1362 * Get the callback function for this URegularExpression.
1364 * @param regexp The compiled regular expression.
1365 * @param callback Out parameter, receives a pointer to the user-supplied
1366 * callback function.
1367 * @param context Out parameter, receives the user context pointer that
1368 * was set when uregex_setMatchCallback() was called.
1369 * @param status A reference to a UErrorCode to receive any errors.
1372 U_STABLE
void U_EXPORT2
1373 uregex_getMatchCallback(const URegularExpression
*regexp
,
1374 URegexMatchCallback
**callback
,
1375 const void **context
,
1376 UErrorCode
*status
);
1380 * Function pointer for a regular expression find callback function.
1382 * When set, a callback function will be called during a find operation
1383 * and for operations that depend on find, such as findNext, split and some replace
1384 * operations like replaceFirst.
1385 * The callback will usually be called after each attempt at a match, but this is not a
1386 * guarantee that the callback will be invoked at each character. For finds where the
1387 * match engine is invoked at each character, this may be close to true, but less likely
1388 * for more optimized loops where the pattern is known to only start, and the match
1389 * engine invoked, at certain characters.
1390 * When invoked, this callback will specify the index at which a match operation is about
1391 * to be attempted, giving the application the opportunity to terminate a long-running find operation.
1394 * If the callback function returns FALSE, the find operation will be terminated early.
1396 * Note: the callback function must not call other functions on this
1397 * URegularExpression
1399 * @param context context pointer. The callback function will be invoked
1400 * with the context specified at the time that
1401 * uregex_setFindProgressCallback() is called.
1402 * @param matchIndex the next index at which a match attempt will be made for this
1403 * find operation. If this callback interrupts the search, this is the
1404 * index at which a find/findNext operation may be re-initiated.
1405 * @return TRUE to continue the matching operation.
1406 * FALSE to terminate the matching operation.
1410 typedef UBool U_CALLCONV
URegexFindProgressCallback (
1411 const void *context
,
1412 int64_t matchIndex
);
1416 * Set the find progress callback function for this URegularExpression.
1418 * @param regexp The compiled regular expression.
1419 * @param callback A pointer to the user-supplied callback function.
1420 * @param context User context pointer. The value supplied at the
1421 * time the callback function is set will be saved
1422 * and passed to the callback each time that it is called.
1423 * @param status A reference to a UErrorCode to receive any errors.
1426 U_DRAFT
void U_EXPORT2
1427 uregex_setFindProgressCallback(URegularExpression
*regexp
,
1428 URegexFindProgressCallback
*callback
,
1429 const void *context
,
1430 UErrorCode
*status
);
1434 * Get the find progress callback function for this URegularExpression.
1436 * @param regexp The compiled regular expression.
1437 * @param callback Out parameter, receives a pointer to the user-supplied
1438 * callback function.
1439 * @param context Out parameter, receives the user context pointer that
1440 * was set when uregex_setFindProgressCallback() was called.
1441 * @param status A reference to a UErrorCode to receive any errors.
1444 U_DRAFT
void U_EXPORT2
1445 uregex_getFindProgressCallback(const URegularExpression
*regexp
,
1446 URegexFindProgressCallback
**callback
,
1447 const void **context
,
1448 UErrorCode
*status
);
1450 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
1451 #endif /* UREGEX_H */