]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ********************************************************************** | |
73c04bcf | 3 | * Copyright (C) 2004-2006, International Business Machines |
374ca955 A |
4 | * Corporation and others. All Rights Reserved. |
5 | ********************************************************************** | |
6 | * file name: uregex.h | |
7 | * encoding: US-ASCII | |
8 | * indentation:4 | |
9 | * | |
10 | * created on: 2004mar09 | |
11 | * created by: Andy Heninger | |
12 | * | |
13 | * ICU Regular Expressions, API for C | |
14 | */ | |
15 | ||
16 | /** | |
17 | * \file | |
18 | * \brief C API: Regular Expressions | |
19 | * | |
20 | * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> | |
21 | */ | |
22 | ||
23 | #ifndef UREGEX_H | |
24 | #define UREGEX_H | |
25 | ||
26 | #include "unicode/utypes.h" | |
27 | ||
28 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
29 | ||
30 | #include "unicode/parseerr.h" | |
31 | ||
374ca955 A |
32 | struct URegularExpression; |
33 | /** | |
34 | * Structure representing a compiled regular expression, plus the results | |
35 | * of a match operation. | |
73c04bcf | 36 | * @stable ICU 3.0 |
374ca955 A |
37 | */ |
38 | typedef struct URegularExpression URegularExpression; | |
39 | ||
374ca955 A |
40 | |
41 | /** | |
42 | * Constants for Regular Expression Match Modes. | |
43 | * @stable ICU 2.4 | |
44 | */ | |
45 | typedef enum URegexpFlag{ | |
374ca955 | 46 | |
73c04bcf A |
47 | #ifndef U_HIDE_DRAFT_API |
48 | /** Forces normalization of pattern and strings. | |
49 | Not implemented yet, just a placeholder, hence draft. | |
50 | @draft ICU 2.4 */ | |
51 | UREGEX_CANON_EQ = 128, | |
52 | #endif | |
374ca955 A |
53 | /** Enable case insensitive matching. @stable ICU 2.4 */ |
54 | UREGEX_CASE_INSENSITIVE = 2, | |
55 | ||
56 | /** Allow white space and comments within patterns @stable ICU 2.4 */ | |
57 | UREGEX_COMMENTS = 4, | |
58 | ||
59 | /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. | |
60 | * @stable ICU 2.4 */ | |
61 | UREGEX_DOTALL = 32, | |
62 | ||
63 | /** Control behavior of "$" and "^" | |
64 | * If set, recognize line terminators within string, | |
65 | * otherwise, match only at start and end of input string. | |
66 | * @stable ICU 2.4 */ | |
67 | UREGEX_MULTILINE = 8, | |
68 | ||
69 | /** Unicode word boundaries. | |
70 | * If set, \b uses the Unicode TR 29 definition of word boundaries. | |
71 | * Warning: Unicode word boundaries are quite different from | |
72 | * traditional regular expression word boundaries. See | |
73 | * http://unicode.org/reports/tr29/#Word_Boundaries | |
73c04bcf | 74 | * @stable ICU 2.8 |
374ca955 A |
75 | */ |
76 | UREGEX_UWORD = 256 | |
77 | } URegexpFlag; | |
78 | ||
79 | /** | |
80 | * Open (compile) an ICU regular expression. Compiles the regular expression in | |
81 | * string form into an internal representation using the specified match mode flags. | |
82 | * The resulting regular expression handle can then be used to perform various | |
83 | * matching operations. | |
84 | * | |
85 | * @param pattern The Regular Expression pattern to be compiled. | |
86 | * @param patternLength The length of the pattern, or -1 if the pattern is | |
87 | * NUL terminated. | |
88 | * @param flags Flags that alter the default matching behavior for | |
89 | * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
90 | * example. For default behavior, set this parameter to zero. | |
91 | * See <code>enum URegexpFlag</code>. All desired flags | |
92 | * are bitwise-ORed together. | |
93 | * @param pe Receives the position (line and column numbers) of any syntax | |
94 | * error within the source regular expression string. If this | |
95 | * information is not wanted, pass NULL for this parameter. | |
96 | * @param status Receives errors detected by this function. | |
73c04bcf | 97 | * @stable ICU 3.0 |
374ca955 A |
98 | * |
99 | */ | |
73c04bcf | 100 | U_STABLE URegularExpression * U_EXPORT2 |
374ca955 A |
101 | uregex_open( const UChar *pattern, |
102 | int32_t patternLength, | |
103 | uint32_t flags, | |
104 | UParseError *pe, | |
105 | UErrorCode *status); | |
106 | ||
107 | /** | |
108 | * Open (compile) an ICU regular expression. The resulting regular expression | |
109 | * handle can then be used to perform various matching operations. | |
110 | * <p> | |
111 | * This function is the same as uregex_open, except that the pattern | |
112 | * is supplied as an 8 bit char * string in the default code page. | |
113 | * | |
114 | * @param pattern The Regular Expression pattern to be compiled, | |
115 | * NUL terminated. | |
116 | * @param flags Flags that alter the default matching behavior for | |
117 | * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
118 | * example. For default behavior, set this parameter to zero. | |
119 | * See <code>enum URegexpFlag</code>. All desired flags | |
120 | * are bitwise-ORed together. | |
121 | * @param pe Receives the position (line and column numbers) of any syntax | |
122 | * error within the source regular expression string. If this | |
123 | * information is not wanted, pass NULL for this parameter. | |
124 | * @param status Receives errors detected by this function. | |
125 | * @return The URegularExpression object representing the compiled | |
126 | * pattern. | |
127 | * | |
73c04bcf | 128 | * @stable ICU 3.0 |
374ca955 | 129 | */ |
73c04bcf A |
130 | #if !UCONFIG_NO_CONVERSION |
131 | U_STABLE URegularExpression * U_EXPORT2 | |
374ca955 A |
132 | uregex_openC( const char *pattern, |
133 | uint32_t flags, | |
134 | UParseError *pe, | |
135 | UErrorCode *status); | |
73c04bcf | 136 | #endif |
374ca955 A |
137 | |
138 | ||
139 | ||
140 | /** | |
141 | * Close the regular expression, recovering all resources (memory) it | |
142 | * was holding. | |
143 | * | |
144 | * @param regexp The regular expression to be closed. | |
73c04bcf | 145 | * @stable ICU 3.0 |
374ca955 | 146 | */ |
73c04bcf | 147 | U_STABLE void U_EXPORT2 |
374ca955 A |
148 | uregex_close(URegularExpression *regexp); |
149 | ||
150 | /** | |
151 | * Make a copy of a compiled regular expression. Cloning a regular | |
152 | * expression is faster than opening a second instance from the source | |
153 | * form of the expression, and requires less memory. | |
154 | * <p> | |
155 | * Note that the current input string and the position of any matched text | |
156 | * within it are not cloned; only the pattern itself and the | |
157 | * match mode flags are copied. | |
158 | * <p> | |
159 | * Cloning can be particularly useful to threaded applications that perform | |
160 | * multiple match operations in parallel. Each concurrent RE | |
161 | * operation requires its own instance of a URegularExpression. | |
162 | * | |
163 | * @param regexp The compiled regular expression to be cloned. | |
164 | * @param status Receives indication of any errors encountered | |
165 | * @return the cloned copy of the compiled regular expression. | |
73c04bcf | 166 | * @stable ICU 3.0 |
374ca955 | 167 | */ |
73c04bcf | 168 | U_STABLE URegularExpression * U_EXPORT2 |
374ca955 A |
169 | uregex_clone(const URegularExpression *regexp, UErrorCode *status); |
170 | ||
171 | /** | |
172 | * Return a pointer to the source form of the pattern for this regular expression. | |
173 | * | |
174 | * @param regexp The compiled regular expression. | |
175 | * @param patLength This output parameter will be set to the length of the | |
176 | * pattern string. A NULL pointer may be used here if the | |
177 | * pattern length is not needed, as would be the case if | |
178 | * the pattern is known in advance to be a NUL terminated | |
179 | * string. | |
180 | * @param status Receives errors detected by this function. | |
181 | * @return a pointer to the pattern string. The storage for the string is | |
182 | * owned by the regular expression object, and must not be | |
183 | * altered or deleted by the application. The returned string | |
184 | * will remain valid until the regular expression is closed. | |
73c04bcf | 185 | * @stable ICU 3.0 |
374ca955 | 186 | */ |
73c04bcf | 187 | U_STABLE const UChar * U_EXPORT2 |
374ca955 A |
188 | uregex_pattern(const URegularExpression *regexp, |
189 | int32_t *patLength, | |
190 | UErrorCode *status); | |
191 | ||
192 | ||
193 | /** | |
194 | * Get the match mode flags that were specified when compiling this regular expression. | |
195 | * @param status Receives errors detected by this function. | |
196 | * @param regexp The compiled regular expression. | |
197 | * @return The match mode flags | |
198 | * @see URegexpFlag | |
73c04bcf | 199 | * @stable ICU 3.0 |
374ca955 | 200 | */ |
73c04bcf | 201 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
202 | uregex_flags(const URegularExpression *regexp, |
203 | UErrorCode *status); | |
204 | ||
205 | ||
206 | /** | |
207 | * Set the subject text string upon which the regular expression will look for matches. | |
208 | * This function may be called any number of times, allowing the regular | |
209 | * expression pattern to be applied to different strings. | |
210 | * <p> | |
211 | * Regular expression matching operations work directly on the application's | |
212 | * string data. No copy is made. The subject string data must not be | |
213 | * altered after calling this function until after all regular expression | |
214 | * operations involving this string data are completed. | |
215 | * <p> | |
216 | * Zero length strings are permitted. In this case, no subsequent match | |
217 | * operation will dereference the text string pointer. | |
218 | * | |
219 | * @param regexp The compiled regular expression. | |
220 | * @param text The subject text string. | |
221 | * @param textLength The length of the subject text, or -1 if the string | |
222 | * is NUL terminated. | |
223 | * @param status Receives errors detected by this function. | |
73c04bcf | 224 | * @stable ICU 3.0 |
374ca955 | 225 | */ |
73c04bcf | 226 | U_STABLE void U_EXPORT2 |
374ca955 A |
227 | uregex_setText(URegularExpression *regexp, |
228 | const UChar *text, | |
229 | int32_t textLength, | |
230 | UErrorCode *status); | |
231 | ||
232 | /** | |
233 | * Get the subject text that is currently associated with this | |
234 | * regular expression object. This simply returns whatever string | |
235 | * pointer was previously supplied via uregex_setText(). | |
236 | * | |
237 | * @param regexp The compiled regular expression. | |
238 | * @param textLength The length of the string is returned in this output parameter. | |
239 | * A NULL pointer may be used here if the | |
240 | * text length is not needed, as would be the case if | |
241 | * the text is known in advance to be a NUL terminated | |
242 | * string. | |
243 | * @param status Receives errors detected by this function. | |
244 | * @return Pointer to the subject text string currently associated with | |
245 | * this regular expression. | |
73c04bcf | 246 | * @stable ICU 3.0 |
374ca955 | 247 | */ |
73c04bcf | 248 | U_STABLE const UChar * U_EXPORT2 |
374ca955 A |
249 | uregex_getText(URegularExpression *regexp, |
250 | int32_t *textLength, | |
251 | UErrorCode *status); | |
252 | ||
253 | /** | |
254 | * Attempts to match the input string, beginning at startIndex, against the pattern. | |
255 | * To succeed, the match must extend to the end of the input string. | |
256 | * | |
257 | * @param regexp The compiled regular expression. | |
258 | * @param startIndex The input string index at which to begin matching. | |
259 | * @param status Receives errors detected by this function. | |
260 | * @return TRUE if there is a match | |
73c04bcf | 261 | * @stable ICU 3.0 |
374ca955 | 262 | */ |
73c04bcf | 263 | U_STABLE UBool U_EXPORT2 |
374ca955 A |
264 | uregex_matches(URegularExpression *regexp, |
265 | int32_t startIndex, | |
266 | UErrorCode *status); | |
267 | ||
268 | /** | |
269 | * Attempts to match the input string, starting from the specified index, against the pattern. | |
270 | * The match may be of any length, and is not required to extend to the end | |
271 | * of the input string. Contrast with uregex_matches(). | |
272 | * | |
273 | * <p>If the match succeeds then more information can be obtained via the | |
274 | * <code>uregex_start()</code>, <code>uregex_end()</code>, | |
275 | * and <code>uregex_group()</code> functions.</p> | |
276 | * | |
277 | * @param regexp The compiled regular expression. | |
278 | * @param startIndex The input string index at which to begin matching. | |
279 | * @param status A reference to a UErrorCode to receive any errors. | |
280 | * @return TRUE if there is a match. | |
73c04bcf | 281 | * @stable ICU 3.0 |
374ca955 | 282 | */ |
73c04bcf | 283 | U_STABLE UBool U_EXPORT2 |
374ca955 A |
284 | uregex_lookingAt(URegularExpression *regexp, |
285 | int32_t startIndex, | |
286 | UErrorCode *status); | |
287 | ||
288 | /** | |
289 | * Find the first matching substring of the input string that matches the pattern. | |
290 | * The search for a match begins at the specified index. | |
291 | * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
292 | * <code>uregex_group()</code> will provide more information regarding the match. | |
293 | * | |
294 | * @param regexp The compiled regular expression. | |
295 | * @param startIndex The position in the input string to begin the search | |
296 | * @param status A reference to a UErrorCode to receive any errors. | |
297 | * @return TRUE if a match is found. | |
73c04bcf | 298 | * @stable ICU 3.0 |
374ca955 | 299 | */ |
73c04bcf | 300 | U_STABLE UBool U_EXPORT2 |
374ca955 A |
301 | uregex_find(URegularExpression *regexp, |
302 | int32_t startIndex, | |
303 | UErrorCode *status); | |
304 | ||
305 | /** | |
306 | * Find the next pattern match in the input string. | |
307 | * Begin searching the input at the location following the end of | |
308 | * the previous match, or at the start of the string if there is no previous match. | |
309 | * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
310 | * <code>uregex_group()</code> will provide more information regarding the match. | |
311 | * | |
312 | * @param regexp The compiled regular expression. | |
313 | * @param status A reference to a UErrorCode to receive any errors. | |
314 | * @return TRUE if a match is found. | |
315 | * @see uregex_reset | |
73c04bcf | 316 | * @stable ICU 3.0 |
374ca955 | 317 | */ |
73c04bcf | 318 | U_STABLE UBool U_EXPORT2 |
374ca955 A |
319 | uregex_findNext(URegularExpression *regexp, |
320 | UErrorCode *status); | |
321 | ||
322 | /** | |
323 | * Get the number of capturing groups in this regular expression's pattern. | |
324 | * @param regexp The compiled regular expression. | |
325 | * @param status A reference to a UErrorCode to receive any errors. | |
326 | * @return the number of capture groups | |
73c04bcf | 327 | * @stable ICU 3.0 |
374ca955 | 328 | */ |
73c04bcf | 329 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
330 | uregex_groupCount(URegularExpression *regexp, |
331 | UErrorCode *status); | |
332 | ||
333 | /** Extract the string for the specified matching expression or subexpression. | |
334 | * Group #0 is the complete string of matched text. | |
335 | * Group #1 is the text matched by the first set of capturing parentheses. | |
336 | * | |
337 | * @param regexp The compiled regular expression. | |
338 | * @param groupNum The capture group to extract. Group 0 is the complete | |
339 | * match. The value of this parameter must be | |
340 | * less than or equal to the number of capture groups in | |
341 | * the pattern. | |
342 | * @param dest Buffer to receive the matching string data | |
343 | * @param destCapacity Capacity of the dest buffer. | |
344 | * @param status A reference to a UErrorCode to receive any errors. | |
345 | * @return Length of matching data, | |
346 | * or -1 if no applicable match. | |
73c04bcf | 347 | * @stable ICU 3.0 |
374ca955 | 348 | */ |
73c04bcf | 349 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
350 | uregex_group(URegularExpression *regexp, |
351 | int32_t groupNum, | |
352 | UChar *dest, | |
353 | int32_t destCapacity, | |
354 | UErrorCode *status); | |
355 | ||
356 | ||
357 | /** | |
358 | * Returns the index in the input string of the start of the text matched by the | |
359 | * specified capture group during the previous match operation. Return -1 if | |
360 | * the capture group was not part of the last match. | |
361 | * Group #0 refers to the complete range of matched text. | |
362 | * Group #1 refers to the text matched by the first set of capturing parentheses. | |
363 | * | |
364 | * @param regexp The compiled regular expression. | |
365 | * @param groupNum The capture group number | |
366 | * @param status A reference to a UErrorCode to receive any errors. | |
367 | * @return the starting position in the input of the text matched | |
368 | * by the specified group. | |
73c04bcf | 369 | * @stable ICU 3.0 |
374ca955 | 370 | */ |
73c04bcf | 371 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
372 | uregex_start(URegularExpression *regexp, |
373 | int32_t groupNum, | |
374 | UErrorCode *status); | |
375 | ||
376 | /** | |
377 | * Returns the index in the input string of the position following the end | |
378 | * of the text matched by the specified capture group. | |
379 | * Return -1 if the capture group was not part of the last match. | |
380 | * Group #0 refers to the complete range of matched text. | |
381 | * Group #1 refers to the text matched by the first set of capturing parentheses. | |
382 | * | |
383 | * @param regexp The compiled regular expression. | |
384 | * @param groupNum The capture group number | |
385 | * @param status A reference to a UErrorCode to receive any errors. | |
386 | * @return the index of the position following the last matched character. | |
73c04bcf | 387 | * @stable ICU 3.0 |
374ca955 | 388 | */ |
73c04bcf | 389 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
390 | uregex_end(URegularExpression *regexp, |
391 | int32_t groupNum, | |
392 | UErrorCode *status); | |
393 | ||
394 | /** | |
395 | * Reset any saved state from the previous match. Has the effect of | |
396 | * causing uregex_findNext to begin at the specified index, and causing | |
397 | * uregex_start(), uregex_end() and uregex_group() to return an error | |
398 | * indicating that there is no match information available. | |
399 | * | |
400 | * @param regexp The compiled regular expression. | |
401 | * @param index The position in the text at which a | |
402 | * uregex_findNext() should begin searching. | |
403 | * @param status A reference to a UErrorCode to receive any errors. | |
73c04bcf | 404 | * @stable ICU 3.0 |
374ca955 | 405 | */ |
73c04bcf | 406 | U_STABLE void U_EXPORT2 |
374ca955 A |
407 | uregex_reset(URegularExpression *regexp, |
408 | int32_t index, | |
409 | UErrorCode *status); | |
410 | ||
411 | /** | |
412 | * Replaces every substring of the input that matches the pattern | |
413 | * with the given replacement string. This is a convenience function that | |
414 | * provides a complete find-and-replace-all operation. | |
415 | * | |
416 | * This method scans the input string looking for matches of the pattern. | |
417 | * Input that is not part of any match is copied unchanged to the | |
418 | * destination buffer. Matched regions are replaced in the output | |
419 | * buffer by the replacement string. The replacement string may contain | |
420 | * references to capture groups; these take the form of $1, $2, etc. | |
421 | * | |
422 | * @param regexp The compiled regular expression. | |
423 | * @param replacementText A string containing the replacement text. | |
424 | * @param replacementLength The length of the replacement string, or | |
425 | * -1 if it is NUL terminated. | |
426 | * @param destBuf A (UChar *) buffer that will receive the result. | |
427 | * @param destCapacity The capacity of the destination buffer. | |
428 | * @param status A reference to a UErrorCode to receive any errors. | |
429 | * @return The length of the string resulting from the find | |
430 | * and replace operation. In the event that the | |
431 | * destination capacity is inadequate, the return value | |
432 | * is still the full length of the untruncated string. | |
73c04bcf | 433 | * @stable ICU 3.0 |
374ca955 | 434 | */ |
73c04bcf | 435 | U_STABLE int32_t U_EXPORT2 |
374ca955 | 436 | uregex_replaceAll(URegularExpression *regexp, |
73c04bcf | 437 | const UChar *replacementText, |
374ca955 A |
438 | int32_t replacementLength, |
439 | UChar *destBuf, | |
440 | int32_t destCapacity, | |
441 | UErrorCode *status); | |
442 | ||
443 | ||
444 | /** | |
445 | * Replaces the first substring of the input that matches the pattern | |
446 | * with the given replacement string. This is a convenience function that | |
447 | * provides a complete find-and-replace operation. | |
448 | * | |
449 | * This method scans the input string looking for a match of the pattern. | |
450 | * All input that is not part of the match is copied unchanged to the | |
451 | * destination buffer. The matched region is replaced in the output | |
452 | * buffer by the replacement string. The replacement string may contain | |
453 | * references to capture groups; these take the form of $1, $2, etc. | |
454 | * | |
455 | * @param regexp The compiled regular expression. | |
456 | * @param replacementText A string containing the replacement text. | |
457 | * @param replacementLength The length of the replacement string, or | |
458 | * -1 if it is NUL terminated. | |
459 | * @param destBuf A (UChar *) buffer that will receive the result. | |
460 | * @param destCapacity The capacity of the destination buffer. | |
461 | * @param status a reference to a UErrorCode to receive any errors. | |
462 | * @return The length of the string resulting from the find | |
463 | * and replace operation. In the event that the | |
464 | * destination capacity is inadequate, the return value | |
465 | * is still the full length of the untruncated string. | |
73c04bcf | 466 | * @stable ICU 3.0 |
374ca955 | 467 | */ |
73c04bcf | 468 | U_STABLE int32_t U_EXPORT2 |
374ca955 | 469 | uregex_replaceFirst(URegularExpression *regexp, |
73c04bcf | 470 | const UChar *replacementText, |
374ca955 A |
471 | int32_t replacementLength, |
472 | UChar *destBuf, | |
473 | int32_t destCapacity, | |
474 | UErrorCode *status); | |
475 | ||
476 | ||
477 | /** | |
478 | * Implements a replace operation intended to be used as part of an | |
479 | * incremental find-and-replace. | |
480 | * | |
481 | * <p>The input string, starting from the end of the previous match and ending at | |
482 | * the start of the current match, is appended to the destination string. Then the | |
483 | * replacement string is appended to the output string, | |
484 | * including handling any substitutions of captured text.</p> | |
485 | * | |
486 | * <p>A note on preflight computation of buffersize and error handling: | |
487 | * Calls to uregex_appendReplacement() and uregex_appendTail() are | |
488 | * designed to be chained, one after another, with the destination | |
489 | * buffer pointer and buffer capacity updated after each in preparation | |
490 | * for the next. If the destination buffer is exhausted partway through such a | |
491 | * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal | |
492 | * ICU conventions are for a function to perform no action if it is | |
493 | * called with an error status, but for this one case, uregex_appendReplacement() | |
494 | * will operate normally so that buffer size computations will complete | |
495 | * correctly.</p> | |
496 | * | |
497 | * <p>For simple, prepackaged, non-incremental find-and-replace | |
498 | * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> | |
499 | * | |
500 | * @param regexp The regular expression object. | |
501 | * @param replacementText The string that will replace the matched portion of the | |
502 | * input string as it is copied to the destination buffer. | |
503 | * The replacement text may contain references ($1, for | |
504 | * example) to capture groups from the match. | |
505 | * @param replacementLength The length of the replacement text string, | |
506 | * or -1 if the string is NUL terminated. | |
507 | * @param destBuf The buffer into which the results of the | |
508 | * find-and-replace are placed. On return, this pointer | |
509 | * will be updated to refer to the beginning of the | |
510 | * unused portion of buffer, leaving it in position for | |
511 | * a subsequent call to this function. | |
512 | * @param destCapacity The size of the output buffer, On return, this | |
513 | * parameter will be updated to reflect the space remaining | |
514 | * unused in the output buffer. | |
515 | * @param status A reference to a UErrorCode to receive any errors. | |
516 | * @return The length of the result string. In the event that | |
517 | * destCapacity is inadequate, the full length of the | |
518 | * untruncated output string is returned. | |
519 | * | |
73c04bcf | 520 | * @stable ICU 3.0 |
374ca955 A |
521 | * |
522 | */ | |
73c04bcf | 523 | U_STABLE int32_t U_EXPORT2 |
374ca955 | 524 | uregex_appendReplacement(URegularExpression *regexp, |
73c04bcf | 525 | const UChar *replacementText, |
374ca955 A |
526 | int32_t replacementLength, |
527 | UChar **destBuf, | |
528 | int32_t *destCapacity, | |
529 | UErrorCode *status); | |
530 | ||
531 | ||
532 | /** | |
533 | * As the final step in a find-and-replace operation, append the remainder | |
534 | * of the input string, starting at the position following the last match, | |
535 | * to the destination string. <code>uregex_appendTail()</code> is intended | |
536 | * to be invoked after one or more invocations of the | |
537 | * <code>uregex_appendReplacement()</code> function. | |
538 | * | |
539 | * @param regexp The regular expression object. This is needed to | |
540 | * obtain the input string along with the position | |
541 | * of the last match within it. | |
542 | * @param destBuf The buffer in which the results of the | |
543 | * find-and-replace are placed. On return, the pointer | |
544 | * will be updated to refer to the beginning of the | |
545 | * unused portion of buffer. | |
546 | * @param destCapacity The size of the output buffer, On return, this | |
547 | * value will be updated to reflect the space remaining | |
548 | * unused in the output buffer. | |
549 | * @param status A reference to a UErrorCode to receive any errors. | |
550 | * @return The length of the result string. In the event that | |
551 | * destCapacity is inadequate, the full length of the | |
552 | * untruncated output string is returned. | |
553 | * | |
73c04bcf | 554 | * @stable ICU 3.0 |
374ca955 | 555 | */ |
73c04bcf | 556 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
557 | uregex_appendTail(URegularExpression *regexp, |
558 | UChar **destBuf, | |
559 | int32_t *destCapacity, | |
560 | UErrorCode *status); | |
561 | ||
562 | ||
563 | ||
564 | ||
565 | /** | |
566 | * Split a string into fields. Somewhat like split() from Perl. | |
567 | * The pattern matches identify delimiters that separate the input | |
568 | * into fields. The input data between the matches becomes the | |
569 | * fields themselves. | |
570 | * <p> | |
571 | * Each of the fields is copied from the input string to the destination | |
572 | * buffer, and then NUL terminated. The position of each field within | |
573 | * the destination buffer is returned in the destFields array. | |
73c04bcf A |
574 | * |
575 | * Note: another choice for the design of this function would be to not | |
576 | * copy the resulting fields at all, but to return indexes and | |
577 | * lengths within the source text. | |
578 | * Advantages would be | |
579 | * o Faster. No Copying. | |
580 | * o Nothing extra needed when field data may contain embedded NUL chars. | |
581 | * o Less memory needed if working on large data. | |
582 | * Disadvantages | |
583 | * o Less consistent with C++ split, which copies into an | |
584 | * array of UnicodeStrings. | |
585 | * o No NUL termination, extracted fields would be less convenient | |
586 | * to use in most cases. | |
587 | * o Possible problems in the future, when support for Unicode Normalization | |
588 | * could cause the fields to not correspond exactly to | |
589 | * a range of the source text. | |
374ca955 A |
590 | * |
591 | * @param regexp The compiled regular expression. | |
592 | * @param destBuf A (UChar *) buffer to receive the fields that | |
593 | * are extracted from the input string. These | |
594 | * field pointers will refer to positions within the | |
595 | * destination buffer supplied by the caller. Any | |
596 | * extra positions within the destFields array will be | |
597 | * set to NULL. | |
598 | * @param destCapacity The capacity of the destBuf. | |
599 | * @param requiredCapacity The actual capacity required of the destBuf. | |
600 | * If destCapacity is too small, requiredCapacity will return | |
601 | * the total capacity required to hold all of the output, and | |
602 | * a U_BUFFER_OVERFLOW_ERROR will be returned. | |
603 | * @param destFields An array to be filled with the position of each | |
604 | * of the extracted fields within destBuf. | |
605 | * @param destFieldsCapacity The number of elements in the destFields array. | |
606 | * If the number of fields found is less than destFieldsCapacity, | |
607 | * the extra destFields elements are set to zero. | |
608 | * If destFieldsCapacity is too small, the trailing part of the | |
609 | * input, including any field delimiters, is treated as if it | |
610 | * were the last field - it is copied to the destBuf, and | |
611 | * its position in the destBuf is stored in the last element | |
612 | * of destFields. This behavior mimics that of Perl. It is not | |
613 | * an error condition, and no error status is returned when all destField | |
614 | * positions are used. | |
615 | * @param status A reference to a UErrorCode to receive any errors. | |
616 | * @return The number of fields into which the input string was split. | |
73c04bcf | 617 | * @stable ICU 3.0 |
374ca955 | 618 | */ |
73c04bcf | 619 | U_STABLE int32_t U_EXPORT2 |
374ca955 A |
620 | uregex_split( URegularExpression *regexp, |
621 | UChar *destBuf, | |
622 | int32_t destCapacity, | |
623 | int32_t *requiredCapacity, | |
624 | UChar *destFields[], | |
625 | int32_t destFieldsCapacity, | |
626 | UErrorCode *status); | |
627 | ||
628 | ||
629 | ||
630 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
631 | #endif /* UREGEX_H */ |