]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
73c04bcf | 3 | * Copyright (C) 2004-2005, International Business Machines |
374ca955 A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * file name: regex.cpp | |
7 | */ | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
12 | ||
13 | #include "unicode/regex.h" | |
14 | #include "unicode/uregex.h" | |
15 | #include "unicode/unistr.h" | |
16 | #include "unicode/ustring.h" | |
17 | #include "unicode/uchar.h" | |
18 | #include "unicode/uobject.h" | |
19 | #include "umutex.h" | |
20 | #include "uassert.h" | |
21 | #include "cmemory.h" | |
22 | ||
23 | struct URegularExpression: public UMemory { | |
24 | public: | |
25 | URegularExpression(); | |
26 | ~URegularExpression(); | |
27 | int32_t fMagic; | |
28 | RegexPattern *fPat; | |
29 | int32_t *fPatRefCount; | |
30 | UChar *fPatString; | |
31 | int32_t fPatStringLen; | |
32 | RegexMatcher *fMatcher; | |
33 | const UChar *fText; // Text from setText() | |
34 | int32_t fTextLength; // Length provided by user with setText(), which | |
35 | // may be -1. | |
36 | ||
37 | UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString. | |
38 | // TODO: regexp engine should not depend on UnicodeString. | |
39 | }; | |
40 | ||
41 | static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII | |
42 | ||
43 | U_NAMESPACE_USE | |
44 | ||
45 | URegularExpression::URegularExpression() { | |
46 | fMagic = REXP_MAGIC; | |
47 | fPat = NULL; | |
48 | fPatRefCount = NULL; | |
49 | fPatString = NULL; | |
50 | fPatStringLen = 0; | |
51 | fMatcher = NULL; | |
52 | fText = NULL; | |
53 | fTextLength = 0; | |
54 | } | |
55 | ||
56 | URegularExpression::~URegularExpression() { | |
57 | delete fMatcher; | |
58 | fMatcher = NULL; | |
59 | if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { | |
60 | delete fPat; | |
61 | uprv_free(fPatString); | |
62 | uprv_free(fPatRefCount); | |
63 | } | |
64 | fMagic = 0; | |
65 | } | |
66 | ||
67 | //---------------------------------------------------------------------------------------- | |
68 | // | |
69 | // validateRE Do boilerplate style checks on API function parameters. | |
70 | // Return TRUE if they look OK. | |
71 | //---------------------------------------------------------------------------------------- | |
72 | static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { | |
73 | if (U_FAILURE(*status)) { | |
74 | return FALSE; | |
75 | } | |
76 | if (re == NULL || re->fMagic != REXP_MAGIC) { | |
77 | // U_ASSERT(FALSE); | |
78 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
79 | return FALSE; | |
80 | } | |
81 | if (requiresText && re->fText == NULL) { | |
82 | *status = U_REGEX_INVALID_STATE; | |
83 | return FALSE; | |
84 | } | |
85 | return TRUE; | |
86 | } | |
87 | ||
88 | //---------------------------------------------------------------------------------------- | |
89 | // | |
90 | // uregex_open | |
91 | // | |
92 | //---------------------------------------------------------------------------------------- | |
93 | U_CAPI URegularExpression * U_EXPORT2 | |
94 | uregex_open( const UChar *pattern, | |
95 | int32_t patternLength, | |
96 | uint32_t flags, | |
97 | UParseError *pe, | |
98 | UErrorCode *status) { | |
99 | ||
100 | if (U_FAILURE(*status)) { | |
101 | return NULL; | |
102 | } | |
103 | if (pattern == NULL || patternLength < -1 || patternLength == 0) { | |
104 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
105 | return NULL; | |
106 | } | |
107 | int32_t actualPatLen = patternLength; | |
108 | if (actualPatLen == -1) { | |
109 | actualPatLen = u_strlen(pattern); | |
110 | } | |
111 | ||
112 | URegularExpression *re = new URegularExpression; | |
113 | int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); | |
114 | UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); | |
115 | if (re == NULL || refC == NULL || patBuf == NULL) { | |
116 | *status = U_MEMORY_ALLOCATION_ERROR; | |
117 | delete re; | |
118 | uprv_free(refC); | |
119 | uprv_free(patBuf); | |
120 | return NULL; | |
121 | } | |
122 | re->fPatRefCount = refC; | |
123 | *re->fPatRefCount = 1; | |
124 | ||
125 | // | |
126 | // Make a copy of the pattern string, so we can return it later if asked. | |
127 | // For compiling the pattern, we will use a read-only-aliased UnicodeString | |
128 | // of this local copy, to avoid making even more copies. | |
129 | // | |
130 | re->fPatString = patBuf; | |
131 | re->fPatStringLen = patternLength; | |
132 | u_memcpy(patBuf, pattern, actualPatLen); | |
133 | patBuf[actualPatLen] = 0; | |
134 | UnicodeString patString(patternLength==-1, patBuf, patternLength); | |
135 | ||
136 | // | |
137 | // Compile the pattern | |
138 | // | |
139 | if (pe != NULL) { | |
140 | re->fPat = RegexPattern::compile(patString, flags, *pe, *status); | |
141 | } else { | |
142 | re->fPat = RegexPattern::compile(patString, flags, *status); | |
143 | } | |
144 | if (U_FAILURE(*status)) { | |
145 | goto ErrorExit; | |
146 | } | |
147 | ||
148 | // | |
149 | // Create the matcher object | |
150 | // | |
151 | re->fMatcher = re->fPat->matcher(*status); | |
152 | if (U_SUCCESS(*status)) { | |
153 | return re; | |
154 | } | |
155 | ||
156 | ErrorExit: | |
157 | delete re; | |
158 | return NULL; | |
159 | ||
160 | } | |
161 | ||
374ca955 A |
162 | //---------------------------------------------------------------------------------------- |
163 | // | |
164 | // uregex_close | |
165 | // | |
166 | //---------------------------------------------------------------------------------------- | |
167 | U_CAPI void U_EXPORT2 | |
168 | uregex_close(URegularExpression *re) { | |
169 | UErrorCode status = U_ZERO_ERROR; | |
170 | if (validateRE(re, &status, FALSE) == FALSE) { | |
171 | return; | |
172 | } | |
173 | delete re; | |
174 | } | |
175 | ||
176 | ||
177 | //---------------------------------------------------------------------------------------- | |
178 | // | |
179 | // uregex_clone | |
180 | // | |
181 | //---------------------------------------------------------------------------------------- | |
182 | U_CAPI URegularExpression * U_EXPORT2 | |
183 | uregex_clone(const URegularExpression *source, UErrorCode *status) { | |
184 | if (validateRE(source, status, FALSE) == FALSE) { | |
185 | return NULL; | |
186 | } | |
187 | ||
188 | URegularExpression *clone = new URegularExpression; | |
189 | if (clone == NULL) { | |
190 | *status = U_MEMORY_ALLOCATION_ERROR; | |
191 | return NULL; | |
192 | } | |
193 | ||
194 | clone->fMatcher = source->fPat->matcher(*status); | |
195 | if (U_FAILURE(*status)) { | |
196 | delete clone; | |
197 | return NULL; | |
198 | } | |
199 | if (clone == NULL) { | |
200 | *status = U_MEMORY_ALLOCATION_ERROR; | |
201 | return NULL; | |
202 | } | |
203 | ||
204 | clone->fPat = source->fPat; | |
205 | clone->fPatRefCount = source->fPatRefCount; | |
206 | clone->fPatString = source->fPatString; | |
207 | clone->fPatStringLen = source->fPatStringLen; | |
208 | umtx_atomic_inc(source->fPatRefCount); | |
209 | // Note: fText is not cloned. | |
210 | ||
211 | return clone; | |
73c04bcf | 212 | } |
374ca955 A |
213 | |
214 | ||
215 | ||
216 | ||
73c04bcf | 217 | //------------------------------------------------------------------------------ |
374ca955 A |
218 | // |
219 | // uregex_pattern | |
220 | // | |
73c04bcf | 221 | //------------------------------------------------------------------------------ |
374ca955 A |
222 | U_CAPI const UChar * U_EXPORT2 |
223 | uregex_pattern(const URegularExpression *regexp, | |
224 | int32_t *patLength, | |
225 | UErrorCode *status) { | |
226 | ||
227 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
228 | return NULL; | |
229 | } | |
230 | if (patLength != NULL) { | |
231 | *patLength = regexp->fPatStringLen; | |
232 | } | |
233 | return regexp->fPatString; | |
73c04bcf | 234 | } |
374ca955 A |
235 | |
236 | ||
73c04bcf | 237 | //------------------------------------------------------------------------------ |
374ca955 A |
238 | // |
239 | // uregex_flags | |
240 | // | |
73c04bcf | 241 | //------------------------------------------------------------------------------ |
374ca955 A |
242 | U_CAPI int32_t U_EXPORT2 |
243 | uregex_flags(const URegularExpression *regexp, UErrorCode *status) { | |
244 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
245 | return 0; | |
246 | } | |
247 | int32_t flags = regexp->fPat->flags(); | |
248 | return flags; | |
73c04bcf | 249 | } |
374ca955 A |
250 | |
251 | ||
73c04bcf | 252 | //------------------------------------------------------------------------------ |
374ca955 A |
253 | // |
254 | // uregex_setText | |
255 | // | |
73c04bcf | 256 | //------------------------------------------------------------------------------ |
374ca955 A |
257 | U_CAPI void U_EXPORT2 |
258 | uregex_setText(URegularExpression *regexp, | |
259 | const UChar *text, | |
260 | int32_t textLength, | |
261 | UErrorCode *status) { | |
262 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
263 | return; | |
264 | } | |
265 | if (text == NULL || textLength < -1) { | |
266 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
267 | return; | |
268 | } | |
269 | regexp->fText = text; | |
270 | regexp->fTextLength = textLength; | |
271 | UBool isTerminated = (textLength == -1); | |
272 | ||
273 | regexp->fTextString.setTo(isTerminated, text, textLength); | |
274 | regexp->fMatcher->reset(regexp->fTextString); | |
73c04bcf | 275 | } |
374ca955 A |
276 | |
277 | ||
278 | ||
73c04bcf | 279 | //------------------------------------------------------------------------------ |
374ca955 A |
280 | // |
281 | // uregex_getText | |
282 | // | |
73c04bcf | 283 | //------------------------------------------------------------------------------ |
374ca955 A |
284 | U_CAPI const UChar * U_EXPORT2 |
285 | uregex_getText(URegularExpression *regexp, | |
286 | int32_t *textLength, | |
287 | UErrorCode *status) { | |
288 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
289 | return NULL; | |
290 | } | |
291 | if (textLength != NULL) { | |
292 | *textLength = regexp->fTextLength; | |
293 | } | |
294 | return regexp->fText; | |
73c04bcf | 295 | } |
374ca955 A |
296 | |
297 | ||
73c04bcf | 298 | //------------------------------------------------------------------------------ |
374ca955 A |
299 | // |
300 | // uregex_matches | |
301 | // | |
73c04bcf | 302 | //------------------------------------------------------------------------------ |
374ca955 A |
303 | U_CAPI UBool U_EXPORT2 |
304 | uregex_matches(URegularExpression *regexp, | |
305 | int32_t startIndex, | |
306 | UErrorCode *status) { | |
307 | if (validateRE(regexp, status) == FALSE) { | |
308 | return FALSE; | |
309 | } | |
310 | UBool result = regexp->fMatcher->matches(startIndex, *status); | |
311 | return result; | |
73c04bcf | 312 | } |
374ca955 A |
313 | |
314 | ||
315 | ||
73c04bcf | 316 | //------------------------------------------------------------------------------ |
374ca955 A |
317 | // |
318 | // uregex_lookingAt | |
319 | // | |
73c04bcf | 320 | //------------------------------------------------------------------------------ |
374ca955 A |
321 | U_CAPI UBool U_EXPORT2 |
322 | uregex_lookingAt(URegularExpression *regexp, | |
323 | int32_t startIndex, | |
324 | UErrorCode *status) { | |
325 | if (validateRE(regexp, status) == FALSE) { | |
326 | return FALSE; | |
327 | } | |
328 | UBool result = regexp->fMatcher->lookingAt(startIndex, *status); | |
329 | return result; | |
73c04bcf | 330 | } |
374ca955 A |
331 | |
332 | ||
333 | ||
73c04bcf | 334 | //------------------------------------------------------------------------------ |
374ca955 A |
335 | // |
336 | // uregex_find | |
337 | // | |
73c04bcf | 338 | //------------------------------------------------------------------------------ |
374ca955 A |
339 | U_CAPI UBool U_EXPORT2 |
340 | uregex_find(URegularExpression *regexp, | |
341 | int32_t startIndex, | |
342 | UErrorCode *status) { | |
343 | if (validateRE(regexp, status) == FALSE) { | |
344 | return FALSE; | |
345 | } | |
346 | UBool result = regexp->fMatcher->find(startIndex, *status); | |
347 | return result; | |
73c04bcf | 348 | } |
374ca955 | 349 | |
73c04bcf | 350 | //------------------------------------------------------------------------------ |
374ca955 A |
351 | // |
352 | // uregex_findNext | |
353 | // | |
73c04bcf | 354 | //------------------------------------------------------------------------------ |
374ca955 A |
355 | U_CAPI UBool U_EXPORT2 |
356 | uregex_findNext(URegularExpression *regexp, | |
357 | UErrorCode *status) { | |
358 | if (validateRE(regexp, status) == FALSE) { | |
359 | return FALSE; | |
360 | } | |
361 | UBool result = regexp->fMatcher->find(); | |
362 | return result; | |
73c04bcf | 363 | } |
374ca955 | 364 | |
73c04bcf | 365 | //------------------------------------------------------------------------------ |
374ca955 A |
366 | // |
367 | // uregex_groupCount | |
368 | // | |
73c04bcf | 369 | //------------------------------------------------------------------------------ |
374ca955 A |
370 | U_CAPI int32_t U_EXPORT2 |
371 | uregex_groupCount(URegularExpression *regexp, | |
372 | UErrorCode *status) { | |
373 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
374 | return 0; | |
375 | } | |
376 | int32_t result = regexp->fMatcher->groupCount(); | |
377 | return result; | |
73c04bcf | 378 | } |
374ca955 A |
379 | |
380 | ||
73c04bcf | 381 | //------------------------------------------------------------------------------ |
374ca955 A |
382 | // |
383 | // uregex_group | |
384 | // | |
73c04bcf | 385 | //------------------------------------------------------------------------------ |
374ca955 A |
386 | U_CAPI int32_t U_EXPORT2 |
387 | uregex_group(URegularExpression *regexp, | |
388 | int32_t groupNum, | |
389 | UChar *dest, | |
390 | int32_t destCapacity, | |
391 | UErrorCode *status) { | |
392 | if (validateRE(regexp, status) == FALSE) { | |
393 | return 0; | |
394 | } | |
395 | if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
396 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
397 | return 0; | |
398 | } | |
399 | ||
400 | // | |
401 | // Pick up the range of characters from the matcher | |
402 | // | |
403 | int32_t startIx = regexp->fMatcher->start(groupNum, *status); | |
404 | int32_t endIx = regexp->fMatcher->end (groupNum, *status); | |
405 | if (U_FAILURE(*status)) { | |
406 | return 0; | |
407 | } | |
408 | ||
409 | // | |
410 | // Trim length based on buffer capacity | |
411 | // | |
412 | int32_t fullLength = endIx - startIx; | |
413 | int32_t copyLength = fullLength; | |
414 | if (copyLength < destCapacity) { | |
415 | dest[copyLength] = 0; | |
416 | } else if (copyLength == destCapacity) { | |
417 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
418 | } else { | |
419 | copyLength = destCapacity; | |
420 | *status = U_BUFFER_OVERFLOW_ERROR; | |
421 | } | |
422 | ||
423 | // | |
424 | // Copy capture group to user's buffer | |
425 | // | |
426 | if (copyLength > 0) { | |
427 | u_memcpy(dest, ®exp->fText[startIx], copyLength); | |
428 | } | |
429 | return fullLength; | |
73c04bcf | 430 | } |
374ca955 A |
431 | |
432 | ||
73c04bcf | 433 | //------------------------------------------------------------------------------ |
374ca955 A |
434 | // |
435 | // uregex_start | |
436 | // | |
73c04bcf | 437 | //------------------------------------------------------------------------------ |
374ca955 A |
438 | U_CAPI int32_t U_EXPORT2 |
439 | uregex_start(URegularExpression *regexp, | |
440 | int32_t groupNum, | |
441 | UErrorCode *status) { | |
442 | if (validateRE(regexp, status) == FALSE) { | |
443 | return 0; | |
444 | } | |
445 | int32_t result = regexp->fMatcher->start(groupNum, *status); | |
446 | return result; | |
73c04bcf | 447 | } |
374ca955 A |
448 | |
449 | ||
73c04bcf | 450 | //------------------------------------------------------------------------------ |
374ca955 A |
451 | // |
452 | // uregex_end | |
453 | // | |
73c04bcf | 454 | //------------------------------------------------------------------------------ |
374ca955 A |
455 | U_CAPI int32_t U_EXPORT2 |
456 | uregex_end(URegularExpression *regexp, | |
457 | int32_t groupNum, | |
458 | UErrorCode *status) { | |
459 | if (validateRE(regexp, status) == FALSE) { | |
460 | return 0; | |
461 | } | |
462 | int32_t result = regexp->fMatcher->end(groupNum, *status); | |
463 | return result; | |
73c04bcf | 464 | } |
374ca955 | 465 | |
73c04bcf | 466 | //------------------------------------------------------------------------------ |
374ca955 A |
467 | // |
468 | // uregex_reset | |
469 | // | |
73c04bcf | 470 | //------------------------------------------------------------------------------ |
374ca955 A |
471 | U_CAPI void U_EXPORT2 |
472 | uregex_reset(URegularExpression *regexp, | |
473 | int32_t index, | |
474 | UErrorCode *status) { | |
475 | if (validateRE(regexp, status) == FALSE) { | |
476 | return; | |
477 | } | |
478 | regexp->fMatcher->reset(index, *status); | |
73c04bcf | 479 | } |
374ca955 A |
480 | |
481 | ||
73c04bcf | 482 | //------------------------------------------------------------------------------ |
374ca955 A |
483 | // |
484 | // uregex_replaceAll | |
485 | // | |
73c04bcf | 486 | //------------------------------------------------------------------------------ |
374ca955 A |
487 | U_CAPI int32_t U_EXPORT2 |
488 | uregex_replaceAll(URegularExpression *regexp, | |
73c04bcf | 489 | const UChar *replacementText, |
374ca955 A |
490 | int32_t replacementLength, |
491 | UChar *destBuf, | |
492 | int32_t destCapacity, | |
493 | UErrorCode *status) { | |
494 | if (validateRE(regexp, status) == FALSE) { | |
495 | return 0; | |
496 | } | |
497 | if (replacementText == NULL || replacementLength < -1 || | |
498 | destBuf == NULL && destCapacity > 0 || | |
499 | destCapacity < 0) { | |
500 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
501 | return 0; | |
502 | } | |
503 | ||
504 | int32_t len = 0; | |
505 | uregex_reset(regexp, 0, status); | |
506 | while (uregex_findNext(regexp, status)) { | |
507 | len += uregex_appendReplacement(regexp, replacementText, replacementLength, | |
508 | &destBuf, &destCapacity, status); | |
509 | } | |
510 | len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); | |
511 | ||
512 | return len; | |
73c04bcf | 513 | } |
374ca955 A |
514 | |
515 | ||
73c04bcf | 516 | //------------------------------------------------------------------------------ |
374ca955 A |
517 | // |
518 | // uregex_replaceFirst | |
519 | // | |
73c04bcf | 520 | //------------------------------------------------------------------------------ |
374ca955 A |
521 | U_CAPI int32_t U_EXPORT2 |
522 | uregex_replaceFirst(URegularExpression *regexp, | |
73c04bcf | 523 | const UChar *replacementText, |
374ca955 A |
524 | int32_t replacementLength, |
525 | UChar *destBuf, | |
526 | int32_t destCapacity, | |
527 | UErrorCode *status) { | |
528 | if (validateRE(regexp, status) == FALSE) { | |
529 | return 0; | |
530 | } | |
531 | if (replacementText == NULL || replacementLength < -1 || | |
532 | destBuf == NULL && destCapacity > 0 || | |
533 | destCapacity < 0) { | |
534 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
535 | return 0; | |
536 | } | |
537 | ||
538 | int32_t len = 0; | |
539 | UBool findSucceeded; | |
540 | uregex_reset(regexp, 0, status); | |
541 | findSucceeded = uregex_find(regexp, 0, status); | |
542 | if (findSucceeded) { | |
543 | len = uregex_appendReplacement(regexp, replacementText, replacementLength, | |
544 | &destBuf, &destCapacity, status); | |
545 | } | |
546 | len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); | |
547 | ||
548 | return len; | |
73c04bcf | 549 | } |
374ca955 A |
550 | |
551 | ||
73c04bcf | 552 | //------------------------------------------------------------------------------ |
374ca955 A |
553 | // |
554 | // uregex_appendReplacement | |
555 | // | |
73c04bcf | 556 | //------------------------------------------------------------------------------ |
374ca955 A |
557 | |
558 | ||
559 | // | |
560 | // Dummy class, because these functions need to be friends of class RegexMatcher, | |
561 | // and stand-alone C functions don't work as friends | |
562 | // | |
563 | U_NAMESPACE_BEGIN | |
564 | class RegexCImpl { | |
565 | public: | |
566 | inline static int32_t appendReplacement(URegularExpression *regexp, | |
73c04bcf | 567 | const UChar *replacementText, |
374ca955 A |
568 | int32_t replacementLength, |
569 | UChar **destBuf, | |
570 | int32_t *destCapacity, | |
571 | UErrorCode *status); | |
572 | ||
573 | inline static int32_t appendTail(URegularExpression *regexp, | |
574 | UChar **destBuf, | |
575 | int32_t *destCapacity, | |
576 | UErrorCode *status); | |
577 | }; | |
578 | U_NAMESPACE_END | |
579 | ||
580 | ||
581 | // | |
582 | // Call-back function for u_unescapeAt(), used when we encounter | |
583 | // \uxxxx or \Uxxxxxxxxx escapes in the replacement text. | |
584 | // | |
585 | U_CDECL_BEGIN | |
586 | static UChar U_CALLCONV | |
587 | unescape_charAt(int32_t offset, void *context) { | |
588 | UChar c16 = ((UChar *)context)[offset]; | |
589 | return c16; | |
590 | } | |
591 | U_CDECL_END | |
592 | ||
593 | ||
594 | static const UChar BACKSLASH = 0x5c; | |
595 | static const UChar DOLLARSIGN = 0x24; | |
596 | ||
597 | // | |
598 | // Move a character to an output buffer, with bounds checking on the index. | |
599 | // Index advances even if capacity is exceeded, for preflight size computations. | |
600 | // This little sequence is used a LOT. | |
601 | // | |
602 | static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { | |
603 | if (*idx < bufCapacity) { | |
604 | buf[*idx] = c; | |
605 | } | |
606 | (*idx)++; | |
607 | } | |
608 | ||
609 | ||
610 | // | |
611 | // appendReplacement, the actual implementation. | |
612 | // | |
613 | int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, | |
73c04bcf | 614 | const UChar *replacementText, |
374ca955 A |
615 | int32_t replacementLength, |
616 | UChar **destBuf, | |
617 | int32_t *destCapacity, | |
618 | UErrorCode *status) { | |
619 | ||
620 | // If we come in with a buffer overflow error, don't suppress the operation. | |
621 | // A series of appendReplacements, appendTail need to correctly preflight | |
622 | // the buffer size when an overflow happens somewhere in the middle. | |
623 | UBool pendingBufferOverflow = FALSE; | |
624 | if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) { | |
625 | pendingBufferOverflow = TRUE; | |
626 | *status = U_ZERO_ERROR; | |
627 | } | |
628 | ||
629 | // | |
630 | // Validate all paramters | |
631 | // | |
632 | if (validateRE(regexp, status) == FALSE) { | |
633 | return 0; | |
634 | } | |
635 | if (replacementText == NULL || replacementLength < -1 || | |
636 | destCapacity == NULL || destBuf == NULL || | |
637 | *destBuf == NULL && *destCapacity > 0 || | |
638 | *destCapacity < 0) { | |
639 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
640 | return 0; | |
641 | } | |
642 | ||
643 | RegexMatcher *m = regexp->fMatcher; | |
644 | if (m->fMatch == FALSE) { | |
645 | *status = U_REGEX_INVALID_STATE; | |
646 | return 0; | |
647 | } | |
648 | ||
649 | UChar *dest = *destBuf; | |
650 | int32_t capacity = *destCapacity; | |
651 | int32_t destIdx = 0; | |
652 | int32_t i; | |
653 | ||
654 | // If it wasn't supplied by the caller, get the length of the replacement text. | |
655 | // TODO: slightly smarter logic in the copy loop could watch for the NUL on | |
656 | // the fly and avoid this step. | |
657 | if (replacementLength == -1) { | |
658 | replacementLength = u_strlen(replacementText); | |
659 | } | |
660 | ||
661 | // Copy input string from the end of previous match to start of current match | |
662 | for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) { | |
663 | appendToBuf(regexp->fText[i], &destIdx, dest, capacity); | |
664 | } | |
665 | ||
666 | ||
667 | ||
668 | // scan the replacement text, looking for substitutions ($n) and \escapes. | |
669 | int32_t replIdx = 0; | |
670 | while (replIdx < replacementLength) { | |
671 | UChar c = replacementText[replIdx]; | |
672 | replIdx++; | |
673 | if (c != DOLLARSIGN && c != BACKSLASH) { | |
674 | // Common case, no substitution, no escaping, | |
675 | // just copy the char to the dest buf. | |
676 | appendToBuf(c, &destIdx, dest, capacity); | |
677 | continue; | |
678 | } | |
679 | ||
680 | if (c == BACKSLASH) { | |
681 | // Backslash Escape. Copy the following char out without further checks. | |
682 | // Note: Surrogate pairs don't need any special handling | |
683 | // The second half wont be a '$' or a '\', and | |
684 | // will move to the dest normally on the next | |
685 | // loop iteration. | |
686 | if (replIdx >= replacementLength) { | |
687 | break; | |
688 | } | |
689 | c = replacementText[replIdx]; | |
690 | ||
691 | if (c==0x55/*U*/ || c==0x75/*u*/) { | |
692 | // We have a \udddd or \Udddddddd escape sequence. | |
693 | UChar32 escapedChar = | |
694 | u_unescapeAt(unescape_charAt, | |
695 | &replIdx, // Index is updated by unescapeAt | |
696 | replacementLength, // Length of replacement text | |
73c04bcf | 697 | (void *)replacementText); |
374ca955 A |
698 | |
699 | if (escapedChar != (UChar32)0xFFFFFFFF) { | |
700 | if (escapedChar <= 0xffff) { | |
701 | appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); | |
702 | } else { | |
703 | appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); | |
704 | appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); | |
705 | } | |
706 | continue; | |
707 | } | |
708 | // Note: if the \u escape was invalid, just fall through and | |
709 | // treat it as a plain \<anything> escape. | |
710 | } | |
711 | ||
712 | // Plain backslash escape. Just put out the escaped character. | |
713 | appendToBuf(c, &destIdx, dest, capacity); | |
714 | ||
715 | replIdx++; | |
716 | continue; | |
717 | } | |
718 | ||
719 | ||
720 | ||
721 | // We've got a $. Pick up a capture group number if one follows. | |
722 | // Consume at most the number of digits necessary for the largest capture | |
723 | // number that is valid for this pattern. | |
724 | ||
725 | int32_t numDigits = 0; | |
726 | int32_t groupNum = 0; | |
727 | UChar32 digitC; | |
728 | for (;;) { | |
729 | if (replIdx >= replacementLength) { | |
730 | break; | |
731 | } | |
732 | U16_GET(replacementText, 0, replIdx, replacementLength, digitC); | |
733 | if (u_isdigit(digitC) == FALSE) { | |
734 | break; | |
735 | } | |
736 | ||
737 | U16_FWD_1(replacementText, replIdx, replacementLength); | |
738 | groupNum=groupNum*10 + u_charDigitValue(digitC); | |
739 | numDigits++; | |
740 | if (numDigits >= m->fPattern->fMaxCaptureDigits) { | |
741 | break; | |
742 | } | |
743 | } | |
744 | ||
745 | ||
746 | if (numDigits == 0) { | |
747 | // The $ didn't introduce a group number at all. | |
748 | // Treat it as just part of the substitution text. | |
749 | appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); | |
750 | continue; | |
751 | } | |
752 | ||
753 | // Finally, append the capture group data to the destination. | |
754 | int32_t capacityRemaining = capacity - destIdx; | |
755 | if (capacityRemaining < 0) { | |
756 | capacityRemaining = 0; | |
757 | } | |
758 | destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status); | |
759 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
760 | // Ignore buffer overflow when extracting the group. We need to | |
761 | // continue on to get full size of the untruncated result. We will | |
762 | // raise our own buffer overflow error at the end. | |
763 | *status = U_ZERO_ERROR; | |
764 | } | |
765 | ||
766 | if (U_FAILURE(*status)) { | |
767 | // Can fail if group number is out of range. | |
768 | break; | |
769 | } | |
770 | ||
771 | } | |
772 | ||
773 | // | |
774 | // Nul Terminate the dest buffer if possible. | |
775 | // Set the appropriate buffer overflow or not terminated error, if needed. | |
776 | // | |
777 | if (destIdx < capacity) { | |
778 | dest[destIdx] = 0; | |
779 | } else if (destIdx == *destCapacity) { | |
780 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
781 | } else { | |
782 | *status = U_BUFFER_OVERFLOW_ERROR; | |
783 | } | |
784 | ||
785 | // | |
786 | // Return an updated dest buffer and capacity to the caller. | |
787 | // | |
788 | if (destIdx > 0 && *destCapacity > 0) { | |
789 | if (destIdx < capacity) { | |
790 | *destBuf += destIdx; | |
791 | *destCapacity -= destIdx; | |
792 | } else { | |
793 | *destBuf += capacity; | |
794 | *destCapacity = 0; | |
795 | } | |
796 | } | |
797 | ||
798 | // If we came in with a buffer overflow, make sure we go out with one also. | |
799 | // (A zero length match right at the end of the previous match could | |
800 | // make this function succeed even though a previous call had overflowed the buf) | |
801 | if (pendingBufferOverflow && U_SUCCESS(*status)) { | |
802 | *status = U_BUFFER_OVERFLOW_ERROR; | |
803 | } | |
804 | ||
805 | return destIdx; | |
806 | } | |
807 | ||
808 | // | |
809 | // appendReplacement the acutal API function, | |
810 | // | |
811 | U_CAPI int32_t U_EXPORT2 | |
812 | uregex_appendReplacement(URegularExpression *regexp, | |
73c04bcf | 813 | const UChar *replacementText, |
374ca955 A |
814 | int32_t replacementLength, |
815 | UChar **destBuf, | |
816 | int32_t *destCapacity, | |
817 | UErrorCode *status) { | |
818 | return RegexCImpl::appendReplacement( | |
819 | regexp, replacementText, replacementLength,destBuf, destCapacity, status); | |
820 | } | |
821 | ||
822 | ||
73c04bcf | 823 | //------------------------------------------------------------------------------ |
374ca955 A |
824 | // |
825 | // uregex_appendTail | |
826 | // | |
73c04bcf | 827 | //------------------------------------------------------------------------------ |
374ca955 A |
828 | int32_t RegexCImpl::appendTail(URegularExpression *regexp, |
829 | UChar **destBuf, | |
830 | int32_t *destCapacity, | |
831 | UErrorCode *status) { | |
832 | ||
833 | // If we come in with a buffer overflow error, don't suppress the operation. | |
834 | // A series of appendReplacements, appendTail need to correctly preflight | |
835 | // the buffer size when an overflow happens somewhere in the middle. | |
836 | UBool pendingBufferOverflow = FALSE; | |
837 | if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) { | |
838 | pendingBufferOverflow = TRUE; | |
839 | *status = U_ZERO_ERROR; | |
840 | } | |
841 | ||
842 | if (validateRE(regexp, status) == FALSE) { | |
843 | return 0; | |
844 | } | |
845 | if (destCapacity == NULL || destBuf == NULL || | |
846 | *destBuf == NULL && *destCapacity > 0 || | |
847 | *destCapacity < 0) { | |
848 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
849 | return 0; | |
850 | } | |
851 | ||
852 | RegexMatcher *m = regexp->fMatcher; | |
853 | ||
854 | int32_t srcIdx; | |
855 | if (m->fMatch) { | |
856 | // The most recent call to find() succeeded. | |
857 | srcIdx = m->fMatchEnd; | |
858 | } else { | |
859 | // The last call to find() on this matcher failed(). | |
860 | // Look back to the end of the last find() that succeeded for src index. | |
861 | srcIdx = m->fLastMatchEnd; | |
862 | if (srcIdx == -1) { | |
863 | // There has been no successful match with this matcher. | |
864 | // We want to copy the whole string. | |
865 | srcIdx = 0; | |
866 | } | |
867 | } | |
868 | ||
869 | int32_t destIdx = 0; | |
870 | int32_t destCap = *destCapacity; | |
871 | UChar *dest = *destBuf; | |
872 | ||
873 | for (;;) { | |
874 | if (srcIdx == regexp->fTextLength) { | |
875 | break; | |
876 | } | |
877 | UChar c = regexp->fText[srcIdx]; | |
878 | if (c == 0 && regexp->fTextLength == -1) { | |
879 | break; | |
880 | } | |
881 | if (destIdx < destCap) { | |
882 | dest[destIdx] = c; | |
883 | } else { | |
884 | // We've overflowed the dest buffer. | |
885 | // If the total input string length is known, we can | |
886 | // compute the total buffer size needed without scanning through the string. | |
887 | if (regexp->fTextLength > 0) { | |
888 | destIdx += (regexp->fTextLength - srcIdx); | |
889 | break; | |
890 | } | |
891 | } | |
892 | srcIdx++; | |
893 | destIdx++; | |
894 | } | |
895 | ||
896 | // | |
897 | // NUL terminate the output string, if possible, otherwise issue the | |
898 | // appropriate error or warning. | |
899 | // | |
900 | if (destIdx < destCap) { | |
901 | dest[destIdx] = 0; | |
902 | } else if (destIdx == destCap) { | |
903 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
904 | } else { | |
905 | *status = U_BUFFER_OVERFLOW_ERROR; | |
906 | } | |
907 | ||
908 | // | |
909 | // Update the user's buffer ptr and capacity vars to reflect the | |
910 | // amount used. | |
911 | // | |
912 | if (destIdx < destCap) { | |
913 | *destBuf += destIdx; | |
914 | *destCapacity -= destIdx; | |
915 | } else { | |
916 | *destBuf += destCap; | |
917 | *destCapacity = 0; | |
918 | } | |
919 | ||
920 | if (pendingBufferOverflow && U_SUCCESS(*status)) { | |
921 | *status = U_BUFFER_OVERFLOW_ERROR; | |
922 | } | |
923 | ||
924 | return destIdx; | |
73c04bcf | 925 | } |
374ca955 A |
926 | |
927 | ||
928 | U_CAPI int32_t U_EXPORT2 | |
929 | uregex_appendTail(URegularExpression *regexp, | |
930 | UChar **destBuf, | |
931 | int32_t *destCapacity, | |
932 | UErrorCode *status) { | |
933 | return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); | |
934 | } | |
935 | ||
936 | ||
73c04bcf | 937 | //------------------------------------------------------------------------------ |
374ca955 A |
938 | // |
939 | // copyString Internal utility to copy a string to an output buffer, | |
940 | // while managing buffer overflow and preflight size | |
941 | // computation. NUL termination is added to destination, | |
942 | // and the NUL is counted in the output size. | |
943 | // | |
73c04bcf | 944 | //------------------------------------------------------------------------------ |
374ca955 A |
945 | static void copyString(UChar *destBuffer, // Destination buffer. |
946 | int32_t destCapacity, // Total capacity of dest buffer | |
947 | int32_t *destIndex, // Index into dest buffer. Updated on return. | |
948 | // Update not clipped to destCapacity. | |
949 | const UChar *srcPtr, // Pointer to source string | |
950 | int32_t srcLen) // Source string len. | |
951 | { | |
952 | int32_t si; | |
953 | int32_t di = *destIndex; | |
954 | UChar c; | |
955 | ||
956 | for (si=0; si<srcLen; si++) { | |
957 | c = srcPtr[si]; | |
958 | if (di < destCapacity) { | |
959 | destBuffer[di] = c; | |
960 | di++; | |
961 | } else { | |
962 | di += srcLen - si; | |
963 | break; | |
964 | } | |
965 | } | |
73c04bcf A |
966 | if (di<destCapacity) { |
967 | destBuffer[di] = 0; | |
968 | } | |
969 | di++; | |
374ca955 A |
970 | *destIndex = di; |
971 | } | |
972 | ||
973 | ||
73c04bcf | 974 | //------------------------------------------------------------------------------ |
374ca955 A |
975 | // |
976 | // uregex_split | |
977 | // | |
73c04bcf | 978 | //------------------------------------------------------------------------------ |
374ca955 A |
979 | U_CAPI int32_t U_EXPORT2 |
980 | uregex_split( URegularExpression *regexp, | |
981 | UChar *destBuf, | |
982 | int32_t destCapacity, | |
983 | int32_t *requiredCapacity, | |
984 | UChar *destFields[], | |
985 | int32_t destFieldsCapacity, | |
986 | UErrorCode *status) { | |
987 | if (validateRE(regexp, status) == FALSE) { | |
988 | return 0; | |
989 | } | |
990 | if (destBuf == NULL && destCapacity > 0 || | |
991 | destCapacity < 0 || | |
992 | destFields == NULL || | |
993 | destFieldsCapacity < 1 ) { | |
994 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
995 | return 0; | |
996 | } | |
997 | ||
998 | // | |
999 | // Reset for the input text | |
1000 | // | |
1001 | regexp->fMatcher->reset(); | |
1002 | int32_t inputLen = regexp->fTextString.length(); | |
1003 | int32_t nextOutputStringStart = 0; | |
1004 | if (inputLen == 0) { | |
1005 | return 0; | |
1006 | } | |
1007 | ||
1008 | ||
1009 | // | |
1010 | // Loop through the input text, searching for the delimiter pattern | |
1011 | // | |
1012 | int32_t i; // Index of the field being processed. | |
1013 | int32_t destIdx = 0; // Next available position in destBuf; | |
1014 | int32_t numCaptureGroups = regexp->fMatcher->groupCount(); | |
1015 | for (i=0; ; i++) { | |
1016 | if (i>=destFieldsCapacity-1) { | |
1017 | // There are one or zero output string left. | |
1018 | // Fill the last output string with whatever is left from the input, then exit the loop. | |
1019 | // ( i will be == destFieldsCapacity if we filled the output array while processing | |
1020 | // capture groups of the delimiter expression, in which case we will discard the | |
1021 | // last capture group saved in favor of the unprocessed remainder of the | |
1022 | // input string.) | |
1023 | int32_t remainingLength = inputLen-nextOutputStringStart; | |
1024 | if (remainingLength > 0) { | |
1025 | } | |
1026 | if (i >= destFieldsCapacity) { | |
1027 | // No fields are left. Recycle the last one for holding the trailing part of | |
1028 | // the input string. | |
1029 | i = destFieldsCapacity-1; | |
1030 | destIdx = (int32_t)(destFields[i] - destFields[0]); | |
1031 | } | |
1032 | ||
1033 | destFields[i] = &destBuf[destIdx]; | |
1034 | copyString(destBuf, destCapacity, &destIdx, | |
1035 | ®exp->fText[nextOutputStringStart], remainingLength); | |
1036 | break; | |
1037 | } | |
1038 | ||
1039 | if (regexp->fMatcher->find()) { | |
1040 | // We found another delimiter. Move everything from where we started looking | |
1041 | // up until the start of the delimiter into the next output string. | |
1042 | int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart; | |
1043 | destFields[i] = &destBuf[destIdx]; | |
1044 | copyString(destBuf, destCapacity, &destIdx, | |
1045 | ®exp->fText[nextOutputStringStart], fieldLen); | |
1046 | nextOutputStringStart = regexp->fMatcher->end(*status); | |
1047 | ||
1048 | // If the delimiter pattern has capturing parentheses, the captured | |
1049 | // text goes out into the next n destination strings. | |
1050 | int32_t groupNum; | |
1051 | for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { | |
1052 | // If we've run out of output string slots, bail out. | |
1053 | if (i==destFieldsCapacity-1) { | |
1054 | break; | |
1055 | } | |
1056 | i++; | |
1057 | ||
1058 | // Set up to extract the capture group contents into the dest buffer. | |
1059 | UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow | |
1060 | // error while extracting this group. | |
1061 | int32_t remainingCapacity = destCapacity - destIdx; | |
1062 | if (remainingCapacity < 0) { | |
1063 | remainingCapacity = 0; | |
1064 | } | |
1065 | destFields[i] = &destBuf[destIdx]; | |
1066 | int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus); | |
1067 | destIdx += t + 1; // Record the space used in the output string buffer. | |
1068 | // +1 for the NUL that terminates the string. | |
1069 | } | |
1070 | ||
1071 | if (nextOutputStringStart == inputLen) { | |
1072 | // The delimiter was at the end of the string. We're done. | |
1073 | break; | |
1074 | } | |
1075 | ||
1076 | } | |
1077 | else | |
1078 | { | |
1079 | // We ran off the end of the input while looking for the next delimiter. | |
1080 | // All the remaining text goes into the current output string. | |
1081 | destFields[i] = &destBuf[destIdx]; | |
1082 | copyString(destBuf, destCapacity, &destIdx, | |
1083 | ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart); | |
1084 | break; | |
1085 | } | |
1086 | } | |
1087 | ||
1088 | // Zero out any unused portion of the destFields array | |
1089 | int j; | |
1090 | for (j=i+1; j<destFieldsCapacity; j++) { | |
1091 | destFields[j] = NULL; | |
1092 | } | |
1093 | ||
1094 | if (requiredCapacity != NULL) { | |
1095 | *requiredCapacity = destIdx; | |
1096 | } | |
73c04bcf | 1097 | if (destIdx > destCapacity) { |
374ca955 A |
1098 | *status = U_BUFFER_OVERFLOW_ERROR; |
1099 | } | |
1100 | return i+1; | |
1101 | } | |
1102 | ||
1103 | ||
374ca955 | 1104 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
73c04bcf | 1105 |