]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/uregex.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / uregex.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: uregex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 #include "regextxt.h"
24
25 #include <stdio.h>
26
27 U_NAMESPACE_BEGIN
28
29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
30
31 struct RegularExpression: public UMemory {
32 public:
33 RegularExpression();
34 ~RegularExpression();
35 int32_t fMagic;
36 RegexPattern *fPat;
37 int32_t *fPatRefCount;
38 UChar *fPatString;
39 int32_t fPatStringLen;
40 RegexMatcher *fMatcher;
41 const UChar *fText; // Text from setText()
42 int32_t fTextLength; // Length provided by user with setText(), which
43 // may be -1.
44 UBool fOwnsText;
45 };
46
47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
48
49 RegularExpression::RegularExpression() {
50 fMagic = REXP_MAGIC;
51 fPat = NULL;
52 fPatRefCount = NULL;
53 fPatString = NULL;
54 fPatStringLen = 0;
55 fMatcher = NULL;
56 fText = NULL;
57 fTextLength = 0;
58 fOwnsText = FALSE;
59 }
60
61 RegularExpression::~RegularExpression() {
62 delete fMatcher;
63 fMatcher = NULL;
64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
65 delete fPat;
66 uprv_free(fPatString);
67 uprv_free(fPatRefCount);
68 }
69 if (fOwnsText && fText!=NULL) {
70 uprv_free((void *)fText);
71 }
72 fMagic = 0;
73 }
74
75 U_NAMESPACE_END
76
77 U_NAMESPACE_USE
78
79 //----------------------------------------------------------------------------------------
80 //
81 // validateRE Do boilerplate style checks on API function parameters.
82 // Return TRUE if they look OK.
83 //----------------------------------------------------------------------------------------
84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
85 if (U_FAILURE(*status)) {
86 return FALSE;
87 }
88 if (re == NULL || re->fMagic != REXP_MAGIC) {
89 *status = U_ILLEGAL_ARGUMENT_ERROR;
90 return FALSE;
91 }
92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
93 if (requiresText && re->fText == NULL && !re->fOwnsText) {
94 *status = U_REGEX_INVALID_STATE;
95 return FALSE;
96 }
97 return TRUE;
98 }
99
100 //----------------------------------------------------------------------------------------
101 //
102 // uregex_open
103 //
104 //----------------------------------------------------------------------------------------
105 U_CAPI URegularExpression * U_EXPORT2
106 uregex_open( const UChar *pattern,
107 int32_t patternLength,
108 uint32_t flags,
109 UParseError *pe,
110 UErrorCode *status) {
111
112 if (U_FAILURE(*status)) {
113 return NULL;
114 }
115 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
116 *status = U_ILLEGAL_ARGUMENT_ERROR;
117 return NULL;
118 }
119 int32_t actualPatLen = patternLength;
120 if (actualPatLen == -1) {
121 actualPatLen = u_strlen(pattern);
122 }
123
124 RegularExpression *re = new RegularExpression;
125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
127 if (re == NULL || refC == NULL || patBuf == NULL) {
128 *status = U_MEMORY_ALLOCATION_ERROR;
129 delete re;
130 uprv_free(refC);
131 uprv_free(patBuf);
132 return NULL;
133 }
134 re->fPatRefCount = refC;
135 *re->fPatRefCount = 1;
136
137 //
138 // Make a copy of the pattern string, so we can return it later if asked.
139 // For compiling the pattern, we will use a UText wrapper around
140 // this local copy, to avoid making even more copies.
141 //
142 re->fPatString = patBuf;
143 re->fPatStringLen = patternLength;
144 u_memcpy(patBuf, pattern, actualPatLen);
145 patBuf[actualPatLen] = 0;
146
147 UText patText = UTEXT_INITIALIZER;
148 utext_openUChars(&patText, patBuf, patternLength, status);
149
150 //
151 // Compile the pattern
152 //
153 if (pe != NULL) {
154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
155 } else {
156 re->fPat = RegexPattern::compile(&patText, flags, *status);
157 }
158 utext_close(&patText);
159
160 if (U_FAILURE(*status)) {
161 goto ErrorExit;
162 }
163
164 //
165 // Create the matcher object
166 //
167 re->fMatcher = re->fPat->matcher(*status);
168 if (U_SUCCESS(*status)) {
169 return (URegularExpression*)re;
170 }
171
172 ErrorExit:
173 delete re;
174 return NULL;
175
176 }
177
178 //----------------------------------------------------------------------------------------
179 //
180 // uregex_openUText
181 //
182 //----------------------------------------------------------------------------------------
183 U_CAPI URegularExpression * U_EXPORT2
184 uregex_openUText(UText *pattern,
185 uint32_t flags,
186 UParseError *pe,
187 UErrorCode *status) {
188
189 if (U_FAILURE(*status)) {
190 return NULL;
191 }
192 if (pattern == NULL) {
193 *status = U_ILLEGAL_ARGUMENT_ERROR;
194 return NULL;
195 }
196
197 int64_t patternNativeLength = utext_nativeLength(pattern);
198
199 if (patternNativeLength == 0) {
200 *status = U_ILLEGAL_ARGUMENT_ERROR;
201 return NULL;
202 }
203
204 RegularExpression *re = new RegularExpression;
205
206 UErrorCode lengthStatus = U_ZERO_ERROR;
207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
208
209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
211 if (re == NULL || refC == NULL || patBuf == NULL) {
212 *status = U_MEMORY_ALLOCATION_ERROR;
213 delete re;
214 uprv_free(refC);
215 uprv_free(patBuf);
216 return NULL;
217 }
218 re->fPatRefCount = refC;
219 *re->fPatRefCount = 1;
220
221 //
222 // Make a copy of the pattern string, so we can return it later if asked.
223 // For compiling the pattern, we will use a read-only UText wrapper
224 // around this local copy, to avoid making even more copies.
225 //
226 re->fPatString = patBuf;
227 re->fPatStringLen = pattern16Length;
228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
229
230 UText patText = UTEXT_INITIALIZER;
231 utext_openUChars(&patText, patBuf, pattern16Length, status);
232
233 //
234 // Compile the pattern
235 //
236 if (pe != NULL) {
237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
238 } else {
239 re->fPat = RegexPattern::compile(&patText, flags, *status);
240 }
241 utext_close(&patText);
242
243 if (U_FAILURE(*status)) {
244 goto ErrorExit;
245 }
246
247 //
248 // Create the matcher object
249 //
250 re->fMatcher = re->fPat->matcher(*status);
251 if (U_SUCCESS(*status)) {
252 return (URegularExpression*)re;
253 }
254
255 ErrorExit:
256 delete re;
257 return NULL;
258
259 }
260
261 //----------------------------------------------------------------------------------------
262 //
263 // uregex_close
264 //
265 //----------------------------------------------------------------------------------------
266 U_CAPI void U_EXPORT2
267 uregex_close(URegularExpression *re2) {
268 RegularExpression *re = (RegularExpression*)re2;
269 UErrorCode status = U_ZERO_ERROR;
270 if (validateRE(re, &status, FALSE) == FALSE) {
271 return;
272 }
273 delete re;
274 }
275
276
277 //----------------------------------------------------------------------------------------
278 //
279 // uregex_clone
280 //
281 //----------------------------------------------------------------------------------------
282 U_CAPI URegularExpression * U_EXPORT2
283 uregex_clone(const URegularExpression *source2, UErrorCode *status) {
284 RegularExpression *source = (RegularExpression*)source2;
285 if (validateRE(source, status, FALSE) == FALSE) {
286 return NULL;
287 }
288
289 RegularExpression *clone = new RegularExpression;
290 if (clone == NULL) {
291 *status = U_MEMORY_ALLOCATION_ERROR;
292 return NULL;
293 }
294
295 clone->fMatcher = source->fPat->matcher(*status);
296 if (U_FAILURE(*status)) {
297 delete clone;
298 return NULL;
299 }
300
301 clone->fPat = source->fPat;
302 clone->fPatRefCount = source->fPatRefCount;
303 clone->fPatString = source->fPatString;
304 clone->fPatStringLen = source->fPatStringLen;
305 umtx_atomic_inc(source->fPatRefCount);
306 // Note: fText is not cloned.
307
308 return (URegularExpression*)clone;
309 }
310
311
312
313
314 //------------------------------------------------------------------------------
315 //
316 // uregex_pattern
317 //
318 //------------------------------------------------------------------------------
319 U_CAPI const UChar * U_EXPORT2
320 uregex_pattern(const URegularExpression *regexp2,
321 int32_t *patLength,
322 UErrorCode *status) {
323 RegularExpression *regexp = (RegularExpression*)regexp2;
324
325 if (validateRE(regexp, status, FALSE) == FALSE) {
326 return NULL;
327 }
328 if (patLength != NULL) {
329 *patLength = regexp->fPatStringLen;
330 }
331 return regexp->fPatString;
332 }
333
334
335 //------------------------------------------------------------------------------
336 //
337 // uregex_patternUText
338 //
339 //------------------------------------------------------------------------------
340 U_CAPI UText * U_EXPORT2
341 uregex_patternUText(const URegularExpression *regexp2,
342 UErrorCode *status) {
343 RegularExpression *regexp = (RegularExpression*)regexp2;
344 return regexp->fPat->patternText(*status);
345 }
346
347
348 //------------------------------------------------------------------------------
349 //
350 // uregex_flags
351 //
352 //------------------------------------------------------------------------------
353 U_CAPI int32_t U_EXPORT2
354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
355 RegularExpression *regexp = (RegularExpression*)regexp2;
356 if (validateRE(regexp, status, FALSE) == FALSE) {
357 return 0;
358 }
359 int32_t flags = regexp->fPat->flags();
360 return flags;
361 }
362
363
364 //------------------------------------------------------------------------------
365 //
366 // uregex_setText
367 //
368 //------------------------------------------------------------------------------
369 U_CAPI void U_EXPORT2
370 uregex_setText(URegularExpression *regexp2,
371 const UChar *text,
372 int32_t textLength,
373 UErrorCode *status) {
374 RegularExpression *regexp = (RegularExpression*)regexp2;
375 if (validateRE(regexp, status, FALSE) == FALSE) {
376 return;
377 }
378 if (text == NULL || textLength < -1) {
379 *status = U_ILLEGAL_ARGUMENT_ERROR;
380 return;
381 }
382
383 if (regexp->fOwnsText && regexp->fText != NULL) {
384 uprv_free((void *)regexp->fText);
385 }
386
387 regexp->fText = text;
388 regexp->fTextLength = textLength;
389 regexp->fOwnsText = FALSE;
390
391 UText input = UTEXT_INITIALIZER;
392 utext_openUChars(&input, text, textLength, status);
393 regexp->fMatcher->reset(&input);
394 utext_close(&input); // reset() made a shallow clone, so we don't need this copy
395 }
396
397
398 //------------------------------------------------------------------------------
399 //
400 // uregex_setUText
401 //
402 //------------------------------------------------------------------------------
403 U_CAPI void U_EXPORT2
404 uregex_setUText(URegularExpression *regexp2,
405 UText *text,
406 UErrorCode *status) {
407 RegularExpression *regexp = (RegularExpression*)regexp2;
408 if (validateRE(regexp, status, FALSE) == FALSE) {
409 return;
410 }
411 if (text == NULL) {
412 *status = U_ILLEGAL_ARGUMENT_ERROR;
413 return;
414 }
415
416 if (regexp->fOwnsText && regexp->fText != NULL) {
417 uprv_free((void *)regexp->fText);
418 }
419
420 regexp->fText = NULL; // only fill it in on request
421 regexp->fTextLength = -1;
422 regexp->fOwnsText = TRUE;
423 regexp->fMatcher->reset(text);
424 }
425
426
427
428 //------------------------------------------------------------------------------
429 //
430 // uregex_getText
431 //
432 //------------------------------------------------------------------------------
433 U_CAPI const UChar * U_EXPORT2
434 uregex_getText(URegularExpression *regexp2,
435 int32_t *textLength,
436 UErrorCode *status) {
437 RegularExpression *regexp = (RegularExpression*)regexp2;
438 if (validateRE(regexp, status, FALSE) == FALSE) {
439 return NULL;
440 }
441
442 if (regexp->fText == NULL) {
443 // need to fill in the text
444 UText *inputText = regexp->fMatcher->inputText();
445 int64_t inputNativeLength = utext_nativeLength(inputText);
446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
447 regexp->fText = inputText->chunkContents;
448 regexp->fTextLength = (int32_t)inputNativeLength;
449 regexp->fOwnsText = FALSE; // because the UText owns it
450 } else {
451 UErrorCode lengthStatus = U_ZERO_ERROR;
452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
454
455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
456 regexp->fText = inputChars;
457 regexp->fOwnsText = TRUE; // should already be set but just in case
458 }
459 }
460
461 if (textLength != NULL) {
462 *textLength = regexp->fTextLength;
463 }
464 return regexp->fText;
465 }
466
467
468 //------------------------------------------------------------------------------
469 //
470 // uregex_getUText
471 //
472 //------------------------------------------------------------------------------
473 U_CAPI UText * U_EXPORT2
474 uregex_getUText(URegularExpression *regexp2,
475 UText *dest,
476 UErrorCode *status) {
477 RegularExpression *regexp = (RegularExpression*)regexp2;
478 if (validateRE(regexp, status, FALSE) == FALSE) {
479 return dest;
480 }
481 return regexp->fMatcher->getInput(dest, *status);
482 }
483
484
485 //------------------------------------------------------------------------------
486 //
487 // uregex_matches
488 //
489 //------------------------------------------------------------------------------
490 U_CAPI UBool U_EXPORT2
491 uregex_matches(URegularExpression *regexp2,
492 int32_t startIndex,
493 UErrorCode *status) {
494 return uregex_matches64( regexp2, (int64_t)startIndex, status);
495 }
496
497 U_CAPI UBool U_EXPORT2
498 uregex_matches64(URegularExpression *regexp2,
499 int64_t startIndex,
500 UErrorCode *status) {
501 RegularExpression *regexp = (RegularExpression*)regexp2;
502 UBool result = FALSE;
503 if (validateRE(regexp, status) == FALSE) {
504 return result;
505 }
506 if (startIndex == -1) {
507 result = regexp->fMatcher->matches(*status);
508 } else {
509 result = regexp->fMatcher->matches(startIndex, *status);
510 }
511 return result;
512 }
513
514
515 //------------------------------------------------------------------------------
516 //
517 // uregex_lookingAt
518 //
519 //------------------------------------------------------------------------------
520 U_CAPI UBool U_EXPORT2
521 uregex_lookingAt(URegularExpression *regexp2,
522 int32_t startIndex,
523 UErrorCode *status) {
524 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
525 }
526
527 U_CAPI UBool U_EXPORT2
528 uregex_lookingAt64(URegularExpression *regexp2,
529 int64_t startIndex,
530 UErrorCode *status) {
531 RegularExpression *regexp = (RegularExpression*)regexp2;
532 UBool result = FALSE;
533 if (validateRE(regexp, status) == FALSE) {
534 return result;
535 }
536 if (startIndex == -1) {
537 result = regexp->fMatcher->lookingAt(*status);
538 } else {
539 result = regexp->fMatcher->lookingAt(startIndex, *status);
540 }
541 return result;
542 }
543
544
545
546 //------------------------------------------------------------------------------
547 //
548 // uregex_find
549 //
550 //------------------------------------------------------------------------------
551 U_CAPI UBool U_EXPORT2
552 uregex_find(URegularExpression *regexp2,
553 int32_t startIndex,
554 UErrorCode *status) {
555 return uregex_find64( regexp2, (int64_t)startIndex, status);
556 }
557
558 U_CAPI UBool U_EXPORT2
559 uregex_find64(URegularExpression *regexp2,
560 int64_t startIndex,
561 UErrorCode *status) {
562 RegularExpression *regexp = (RegularExpression*)regexp2;
563 UBool result = FALSE;
564 if (validateRE(regexp, status) == FALSE) {
565 return result;
566 }
567 if (startIndex == -1) {
568 regexp->fMatcher->resetPreserveRegion();
569 result = regexp->fMatcher->find();
570 } else {
571 result = regexp->fMatcher->find(startIndex, *status);
572 }
573 return result;
574 }
575
576
577 //------------------------------------------------------------------------------
578 //
579 // uregex_findNext
580 //
581 //------------------------------------------------------------------------------
582 U_CAPI UBool U_EXPORT2
583 uregex_findNext(URegularExpression *regexp2,
584 UErrorCode *status) {
585 RegularExpression *regexp = (RegularExpression*)regexp2;
586 if (validateRE(regexp, status) == FALSE) {
587 return FALSE;
588 }
589 UBool result = regexp->fMatcher->find();
590 return result;
591 }
592
593 //------------------------------------------------------------------------------
594 //
595 // uregex_groupCount
596 //
597 //------------------------------------------------------------------------------
598 U_CAPI int32_t U_EXPORT2
599 uregex_groupCount(URegularExpression *regexp2,
600 UErrorCode *status) {
601 RegularExpression *regexp = (RegularExpression*)regexp2;
602 if (validateRE(regexp, status, FALSE) == FALSE) {
603 return 0;
604 }
605 int32_t result = regexp->fMatcher->groupCount();
606 return result;
607 }
608
609
610 //------------------------------------------------------------------------------
611 //
612 // uregex_group
613 //
614 //------------------------------------------------------------------------------
615 U_CAPI int32_t U_EXPORT2
616 uregex_group(URegularExpression *regexp2,
617 int32_t groupNum,
618 UChar *dest,
619 int32_t destCapacity,
620 UErrorCode *status) {
621 RegularExpression *regexp = (RegularExpression*)regexp2;
622 if (validateRE(regexp, status) == FALSE) {
623 return 0;
624 }
625 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
626 *status = U_ILLEGAL_ARGUMENT_ERROR;
627 return 0;
628 }
629
630 if (destCapacity == 0 || regexp->fText != NULL) {
631 // If preflighting or if we already have the text as UChars,
632 // this is a little cheaper than going through uregex_groupUTextDeep()
633
634 //
635 // Pick up the range of characters from the matcher
636 //
637 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
638 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
639 if (U_FAILURE(*status)) {
640 return 0;
641 }
642
643 //
644 // Trim length based on buffer capacity
645 //
646 int32_t fullLength = endIx - startIx;
647 int32_t copyLength = fullLength;
648 if (copyLength < destCapacity) {
649 dest[copyLength] = 0;
650 } else if (copyLength == destCapacity) {
651 *status = U_STRING_NOT_TERMINATED_WARNING;
652 } else {
653 copyLength = destCapacity;
654 *status = U_BUFFER_OVERFLOW_ERROR;
655 }
656
657 //
658 // Copy capture group to user's buffer
659 //
660 if (copyLength > 0) {
661 u_memcpy(dest, &regexp->fText[startIx], copyLength);
662 }
663 return fullLength;
664 } else {
665 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
666 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
667 utext_close(groupText);
668 return result;
669 }
670 }
671
672
673 //------------------------------------------------------------------------------
674 //
675 // uregex_groupUText
676 //
677 //------------------------------------------------------------------------------
678 U_CAPI UText * U_EXPORT2
679 uregex_groupUText(URegularExpression *regexp2,
680 int32_t groupNum,
681 UText *dest,
682 int64_t *groupLength,
683 UErrorCode *status) {
684 RegularExpression *regexp = (RegularExpression*)regexp2;
685 if (validateRE(regexp, status) == FALSE) {
686 UErrorCode emptyTextStatus = U_ZERO_ERROR;
687 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
688 }
689
690 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
691 }
692
693 //------------------------------------------------------------------------------
694 //
695 // uregex_groupUTextDeep
696 //
697 //------------------------------------------------------------------------------
698 U_CAPI UText * U_EXPORT2
699 uregex_groupUTextDeep(URegularExpression *regexp2,
700 int32_t groupNum,
701 UText *dest,
702 UErrorCode *status) {
703 RegularExpression *regexp = (RegularExpression*)regexp2;
704 if (validateRE(regexp, status) == FALSE) {
705 UErrorCode emptyTextStatus = U_ZERO_ERROR;
706 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
707 }
708
709 if (regexp->fText != NULL) {
710 //
711 // Pick up the range of characters from the matcher
712 // and use our already-extracted characters
713 //
714 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
715 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
716 if (U_FAILURE(*status)) {
717 UErrorCode emptyTextStatus = U_ZERO_ERROR;
718 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
719 }
720
721 if (dest) {
722 utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
723 } else {
724 UText groupText = UTEXT_INITIALIZER;
725 utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
726 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
727 utext_close(&groupText);
728 }
729
730 return dest;
731 } else {
732 return regexp->fMatcher->group(groupNum, dest, *status);
733 }
734 }
735
736 //------------------------------------------------------------------------------
737 //
738 // uregex_start
739 //
740 //------------------------------------------------------------------------------
741 U_CAPI int32_t U_EXPORT2
742 uregex_start(URegularExpression *regexp2,
743 int32_t groupNum,
744 UErrorCode *status) {
745 return (int32_t)uregex_start64( regexp2, groupNum, status);
746 }
747
748 U_CAPI int64_t U_EXPORT2
749 uregex_start64(URegularExpression *regexp2,
750 int32_t groupNum,
751 UErrorCode *status) {
752 RegularExpression *regexp = (RegularExpression*)regexp2;
753 if (validateRE(regexp, status) == FALSE) {
754 return 0;
755 }
756 int32_t result = regexp->fMatcher->start(groupNum, *status);
757 return result;
758 }
759
760 //------------------------------------------------------------------------------
761 //
762 // uregex_end
763 //
764 //------------------------------------------------------------------------------
765 U_CAPI int32_t U_EXPORT2
766 uregex_end(URegularExpression *regexp2,
767 int32_t groupNum,
768 UErrorCode *status) {
769 return (int32_t)uregex_end64( regexp2, groupNum, status);
770 }
771
772 U_CAPI int64_t U_EXPORT2
773 uregex_end64(URegularExpression *regexp2,
774 int32_t groupNum,
775 UErrorCode *status) {
776 RegularExpression *regexp = (RegularExpression*)regexp2;
777 if (validateRE(regexp, status) == FALSE) {
778 return 0;
779 }
780 int32_t result = regexp->fMatcher->end(groupNum, *status);
781 return result;
782 }
783
784 //------------------------------------------------------------------------------
785 //
786 // uregex_reset
787 //
788 //------------------------------------------------------------------------------
789 U_CAPI void U_EXPORT2
790 uregex_reset(URegularExpression *regexp2,
791 int32_t index,
792 UErrorCode *status) {
793 uregex_reset64( regexp2, (int64_t)index, status);
794 }
795
796 U_CAPI void U_EXPORT2
797 uregex_reset64(URegularExpression *regexp2,
798 int64_t index,
799 UErrorCode *status) {
800 RegularExpression *regexp = (RegularExpression*)regexp2;
801 if (validateRE(regexp, status) == FALSE) {
802 return;
803 }
804 regexp->fMatcher->reset(index, *status);
805 }
806
807
808 //------------------------------------------------------------------------------
809 //
810 // uregex_setRegion
811 //
812 //------------------------------------------------------------------------------
813 U_CAPI void U_EXPORT2
814 uregex_setRegion(URegularExpression *regexp2,
815 int32_t regionStart,
816 int32_t regionLimit,
817 UErrorCode *status) {
818 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
819 }
820
821 U_CAPI void U_EXPORT2
822 uregex_setRegion64(URegularExpression *regexp2,
823 int64_t regionStart,
824 int64_t regionLimit,
825 UErrorCode *status) {
826 RegularExpression *regexp = (RegularExpression*)regexp2;
827 if (validateRE(regexp, status) == FALSE) {
828 return;
829 }
830 regexp->fMatcher->region(regionStart, regionLimit, *status);
831 }
832
833
834 //------------------------------------------------------------------------------
835 //
836 // uregex_setRegionAndStart
837 //
838 //------------------------------------------------------------------------------
839 U_DRAFT void U_EXPORT2
840 uregex_setRegionAndStart(URegularExpression *regexp2,
841 int64_t regionStart,
842 int64_t regionLimit,
843 int64_t startIndex,
844 UErrorCode *status) {
845 RegularExpression *regexp = (RegularExpression*)regexp2;
846 if (validateRE(regexp, status) == FALSE) {
847 return;
848 }
849 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
850 }
851
852 //------------------------------------------------------------------------------
853 //
854 // uregex_regionStart
855 //
856 //------------------------------------------------------------------------------
857 U_CAPI int32_t U_EXPORT2
858 uregex_regionStart(const URegularExpression *regexp2,
859 UErrorCode *status) {
860 return (int32_t)uregex_regionStart64(regexp2, status);
861 }
862
863 U_CAPI int64_t U_EXPORT2
864 uregex_regionStart64(const URegularExpression *regexp2,
865 UErrorCode *status) {
866 RegularExpression *regexp = (RegularExpression*)regexp2;
867 if (validateRE(regexp, status) == FALSE) {
868 return 0;
869 }
870 return regexp->fMatcher->regionStart();
871 }
872
873
874 //------------------------------------------------------------------------------
875 //
876 // uregex_regionEnd
877 //
878 //------------------------------------------------------------------------------
879 U_CAPI int32_t U_EXPORT2
880 uregex_regionEnd(const URegularExpression *regexp2,
881 UErrorCode *status) {
882 return (int32_t)uregex_regionEnd64(regexp2, status);
883 }
884
885 U_CAPI int64_t U_EXPORT2
886 uregex_regionEnd64(const URegularExpression *regexp2,
887 UErrorCode *status) {
888 RegularExpression *regexp = (RegularExpression*)regexp2;
889 if (validateRE(regexp, status) == FALSE) {
890 return 0;
891 }
892 return regexp->fMatcher->regionEnd();
893 }
894
895
896 //------------------------------------------------------------------------------
897 //
898 // uregex_hasTransparentBounds
899 //
900 //------------------------------------------------------------------------------
901 U_CAPI UBool U_EXPORT2
902 uregex_hasTransparentBounds(const URegularExpression *regexp2,
903 UErrorCode *status) {
904 RegularExpression *regexp = (RegularExpression*)regexp2;
905 if (validateRE(regexp, status) == FALSE) {
906 return FALSE;
907 }
908 return regexp->fMatcher->hasTransparentBounds();
909 }
910
911
912 //------------------------------------------------------------------------------
913 //
914 // uregex_useTransparentBounds
915 //
916 //------------------------------------------------------------------------------
917 U_CAPI void U_EXPORT2
918 uregex_useTransparentBounds(URegularExpression *regexp2,
919 UBool b,
920 UErrorCode *status) {
921 RegularExpression *regexp = (RegularExpression*)regexp2;
922 if (validateRE(regexp, status) == FALSE) {
923 return;
924 }
925 regexp->fMatcher->useTransparentBounds(b);
926 }
927
928
929 //------------------------------------------------------------------------------
930 //
931 // uregex_hasAnchoringBounds
932 //
933 //------------------------------------------------------------------------------
934 U_CAPI UBool U_EXPORT2
935 uregex_hasAnchoringBounds(const URegularExpression *regexp2,
936 UErrorCode *status) {
937 RegularExpression *regexp = (RegularExpression*)regexp2;
938 if (validateRE(regexp, status) == FALSE) {
939 return FALSE;
940 }
941 return regexp->fMatcher->hasAnchoringBounds();
942 }
943
944
945 //------------------------------------------------------------------------------
946 //
947 // uregex_useAnchoringBounds
948 //
949 //------------------------------------------------------------------------------
950 U_CAPI void U_EXPORT2
951 uregex_useAnchoringBounds(URegularExpression *regexp2,
952 UBool b,
953 UErrorCode *status) {
954 RegularExpression *regexp = (RegularExpression*)regexp2;
955 if (validateRE(regexp, status) == FALSE) {
956 return;
957 }
958 regexp->fMatcher->useAnchoringBounds(b);
959 }
960
961
962 //------------------------------------------------------------------------------
963 //
964 // uregex_hitEnd
965 //
966 //------------------------------------------------------------------------------
967 U_CAPI UBool U_EXPORT2
968 uregex_hitEnd(const URegularExpression *regexp2,
969 UErrorCode *status) {
970 RegularExpression *regexp = (RegularExpression*)regexp2;
971 if (validateRE(regexp, status) == FALSE) {
972 return FALSE;
973 }
974 return regexp->fMatcher->hitEnd();
975 }
976
977
978 //------------------------------------------------------------------------------
979 //
980 // uregex_requireEnd
981 //
982 //------------------------------------------------------------------------------
983 U_CAPI UBool U_EXPORT2
984 uregex_requireEnd(const URegularExpression *regexp2,
985 UErrorCode *status) {
986 RegularExpression *regexp = (RegularExpression*)regexp2;
987 if (validateRE(regexp, status) == FALSE) {
988 return FALSE;
989 }
990 return regexp->fMatcher->requireEnd();
991 }
992
993
994 //------------------------------------------------------------------------------
995 //
996 // uregex_setTimeLimit
997 //
998 //------------------------------------------------------------------------------
999 U_CAPI void U_EXPORT2
1000 uregex_setTimeLimit(URegularExpression *regexp2,
1001 int32_t limit,
1002 UErrorCode *status) {
1003 RegularExpression *regexp = (RegularExpression*)regexp2;
1004 if (validateRE(regexp, status)) {
1005 regexp->fMatcher->setTimeLimit(limit, *status);
1006 }
1007 }
1008
1009
1010
1011 //------------------------------------------------------------------------------
1012 //
1013 // uregex_getTimeLimit
1014 //
1015 //------------------------------------------------------------------------------
1016 U_CAPI int32_t U_EXPORT2
1017 uregex_getTimeLimit(const URegularExpression *regexp2,
1018 UErrorCode *status) {
1019 int32_t retVal = 0;
1020 RegularExpression *regexp = (RegularExpression*)regexp2;
1021 if (validateRE(regexp, status)) {
1022 retVal = regexp->fMatcher->getTimeLimit();
1023 }
1024 return retVal;
1025 }
1026
1027
1028
1029 //------------------------------------------------------------------------------
1030 //
1031 // uregex_setStackLimit
1032 //
1033 //------------------------------------------------------------------------------
1034 U_CAPI void U_EXPORT2
1035 uregex_setStackLimit(URegularExpression *regexp2,
1036 int32_t limit,
1037 UErrorCode *status) {
1038 RegularExpression *regexp = (RegularExpression*)regexp2;
1039 if (validateRE(regexp, status)) {
1040 regexp->fMatcher->setStackLimit(limit, *status);
1041 }
1042 }
1043
1044
1045
1046 //------------------------------------------------------------------------------
1047 //
1048 // uregex_getStackLimit
1049 //
1050 //------------------------------------------------------------------------------
1051 U_CAPI int32_t U_EXPORT2
1052 uregex_getStackLimit(const URegularExpression *regexp2,
1053 UErrorCode *status) {
1054 int32_t retVal = 0;
1055 RegularExpression *regexp = (RegularExpression*)regexp2;
1056 if (validateRE(regexp, status)) {
1057 retVal = regexp->fMatcher->getStackLimit();
1058 }
1059 return retVal;
1060 }
1061
1062
1063 //------------------------------------------------------------------------------
1064 //
1065 // uregex_setMatchCallback
1066 //
1067 //------------------------------------------------------------------------------
1068 U_CAPI void U_EXPORT2
1069 uregex_setMatchCallback(URegularExpression *regexp2,
1070 URegexMatchCallback *callback,
1071 const void *context,
1072 UErrorCode *status) {
1073 RegularExpression *regexp = (RegularExpression*)regexp2;
1074 if (validateRE(regexp, status)) {
1075 regexp->fMatcher->setMatchCallback(callback, context, *status);
1076 }
1077 }
1078
1079
1080 //------------------------------------------------------------------------------
1081 //
1082 // uregex_getMatchCallback
1083 //
1084 //------------------------------------------------------------------------------
1085 U_CAPI void U_EXPORT2
1086 uregex_getMatchCallback(const URegularExpression *regexp2,
1087 URegexMatchCallback **callback,
1088 const void **context,
1089 UErrorCode *status) {
1090 RegularExpression *regexp = (RegularExpression*)regexp2;
1091 if (validateRE(regexp, status)) {
1092 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1093 }
1094 }
1095
1096
1097 //------------------------------------------------------------------------------
1098 //
1099 // uregex_setMatchProgressCallback
1100 //
1101 //------------------------------------------------------------------------------
1102 U_CAPI void U_EXPORT2
1103 uregex_setFindProgressCallback(URegularExpression *regexp2,
1104 URegexFindProgressCallback *callback,
1105 const void *context,
1106 UErrorCode *status) {
1107 RegularExpression *regexp = (RegularExpression*)regexp2;
1108 if (validateRE(regexp, status)) {
1109 regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1110 }
1111 }
1112
1113
1114 //------------------------------------------------------------------------------
1115 //
1116 // uregex_getMatchCallback
1117 //
1118 //------------------------------------------------------------------------------
1119 U_CAPI void U_EXPORT2
1120 uregex_getFindProgressCallback(const URegularExpression *regexp2,
1121 URegexFindProgressCallback **callback,
1122 const void **context,
1123 UErrorCode *status) {
1124 RegularExpression *regexp = (RegularExpression*)regexp2;
1125 if (validateRE(regexp, status)) {
1126 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1127 }
1128 }
1129
1130
1131 //------------------------------------------------------------------------------
1132 //
1133 // uregex_replaceAll
1134 //
1135 //------------------------------------------------------------------------------
1136 U_CAPI int32_t U_EXPORT2
1137 uregex_replaceAll(URegularExpression *regexp2,
1138 const UChar *replacementText,
1139 int32_t replacementLength,
1140 UChar *destBuf,
1141 int32_t destCapacity,
1142 UErrorCode *status) {
1143 RegularExpression *regexp = (RegularExpression*)regexp2;
1144 if (validateRE(regexp, status) == FALSE) {
1145 return 0;
1146 }
1147 if (replacementText == NULL || replacementLength < -1 ||
1148 (destBuf == NULL && destCapacity > 0) ||
1149 destCapacity < 0) {
1150 *status = U_ILLEGAL_ARGUMENT_ERROR;
1151 return 0;
1152 }
1153
1154 int32_t len = 0;
1155
1156 uregex_reset(regexp2, 0, status);
1157
1158 // Note: Seperate error code variables for findNext() and appendReplacement()
1159 // are used so that destination buffer overflow errors
1160 // in appendReplacement won't stop findNext() from working.
1161 // appendReplacement() and appendTail() special case incoming buffer
1162 // overflow errors, continuing to return the correct length.
1163 UErrorCode findStatus = *status;
1164 while (uregex_findNext(regexp2, &findStatus)) {
1165 len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1166 &destBuf, &destCapacity, status);
1167 }
1168 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1169
1170 if (U_FAILURE(findStatus)) {
1171 // If anything went wrong with the findNext(), make that error trump
1172 // whatever may have happened with the append() operations.
1173 // Errors in findNext() are not expected.
1174 *status = findStatus;
1175 }
1176
1177 return len;
1178 }
1179
1180
1181 //------------------------------------------------------------------------------
1182 //
1183 // uregex_replaceAllUText
1184 //
1185 //------------------------------------------------------------------------------
1186 U_CAPI UText * U_EXPORT2
1187 uregex_replaceAllUText(URegularExpression *regexp2,
1188 UText *replacementText,
1189 UText *dest,
1190 UErrorCode *status) {
1191 RegularExpression *regexp = (RegularExpression*)regexp2;
1192 if (validateRE(regexp, status) == FALSE) {
1193 return 0;
1194 }
1195 if (replacementText == NULL) {
1196 *status = U_ILLEGAL_ARGUMENT_ERROR;
1197 return 0;
1198 }
1199
1200 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1201 return dest;
1202 }
1203
1204
1205 //------------------------------------------------------------------------------
1206 //
1207 // uregex_replaceFirst
1208 //
1209 //------------------------------------------------------------------------------
1210 U_CAPI int32_t U_EXPORT2
1211 uregex_replaceFirst(URegularExpression *regexp2,
1212 const UChar *replacementText,
1213 int32_t replacementLength,
1214 UChar *destBuf,
1215 int32_t destCapacity,
1216 UErrorCode *status) {
1217 RegularExpression *regexp = (RegularExpression*)regexp2;
1218 if (validateRE(regexp, status) == FALSE) {
1219 return 0;
1220 }
1221 if (replacementText == NULL || replacementLength < -1 ||
1222 (destBuf == NULL && destCapacity > 0) ||
1223 destCapacity < 0) {
1224 *status = U_ILLEGAL_ARGUMENT_ERROR;
1225 return 0;
1226 }
1227
1228 int32_t len = 0;
1229 UBool findSucceeded;
1230 uregex_reset(regexp2, 0, status);
1231 findSucceeded = uregex_find(regexp2, 0, status);
1232 if (findSucceeded) {
1233 len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1234 &destBuf, &destCapacity, status);
1235 }
1236 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1237
1238 return len;
1239 }
1240
1241
1242 //------------------------------------------------------------------------------
1243 //
1244 // uregex_replaceFirstUText
1245 //
1246 //------------------------------------------------------------------------------
1247 U_CAPI UText * U_EXPORT2
1248 uregex_replaceFirstUText(URegularExpression *regexp2,
1249 UText *replacementText,
1250 UText *dest,
1251 UErrorCode *status) {
1252 RegularExpression *regexp = (RegularExpression*)regexp2;
1253 if (validateRE(regexp, status) == FALSE) {
1254 return 0;
1255 }
1256 if (replacementText == NULL) {
1257 *status = U_ILLEGAL_ARGUMENT_ERROR;
1258 return 0;
1259 }
1260
1261 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1262 return dest;
1263 }
1264
1265
1266 //------------------------------------------------------------------------------
1267 //
1268 // uregex_appendReplacement
1269 //
1270 //------------------------------------------------------------------------------
1271
1272 U_NAMESPACE_BEGIN
1273 //
1274 // Dummy class, because these functions need to be friends of class RegexMatcher,
1275 // and stand-alone C functions don't work as friends
1276 //
1277 class RegexCImpl {
1278 public:
1279 inline static int32_t appendReplacement(RegularExpression *regexp,
1280 const UChar *replacementText,
1281 int32_t replacementLength,
1282 UChar **destBuf,
1283 int32_t *destCapacity,
1284 UErrorCode *status);
1285
1286 inline static int32_t appendTail(RegularExpression *regexp,
1287 UChar **destBuf,
1288 int32_t *destCapacity,
1289 UErrorCode *status);
1290
1291 inline static int32_t split(RegularExpression *regexp,
1292 UChar *destBuf,
1293 int32_t destCapacity,
1294 int32_t *requiredCapacity,
1295 UChar *destFields[],
1296 int32_t destFieldsCapacity,
1297 UErrorCode *status);
1298 };
1299
1300 U_NAMESPACE_END
1301
1302
1303
1304 static const UChar BACKSLASH = 0x5c;
1305 static const UChar DOLLARSIGN = 0x24;
1306
1307 //
1308 // Move a character to an output buffer, with bounds checking on the index.
1309 // Index advances even if capacity is exceeded, for preflight size computations.
1310 // This little sequence is used a LOT.
1311 //
1312 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1313 if (*idx < bufCapacity) {
1314 buf[*idx] = c;
1315 }
1316 (*idx)++;
1317 }
1318
1319
1320 //
1321 // appendReplacement, the actual implementation.
1322 //
1323 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
1324 const UChar *replacementText,
1325 int32_t replacementLength,
1326 UChar **destBuf,
1327 int32_t *destCapacity,
1328 UErrorCode *status) {
1329
1330 // If we come in with a buffer overflow error, don't suppress the operation.
1331 // A series of appendReplacements, appendTail need to correctly preflight
1332 // the buffer size when an overflow happens somewhere in the middle.
1333 UBool pendingBufferOverflow = FALSE;
1334 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1335 pendingBufferOverflow = TRUE;
1336 *status = U_ZERO_ERROR;
1337 }
1338
1339 //
1340 // Validate all paramters
1341 //
1342 if (validateRE(regexp, status) == FALSE) {
1343 return 0;
1344 }
1345 if (replacementText == NULL || replacementLength < -1 ||
1346 destCapacity == NULL || destBuf == NULL ||
1347 (*destBuf == NULL && *destCapacity > 0) ||
1348 *destCapacity < 0) {
1349 *status = U_ILLEGAL_ARGUMENT_ERROR;
1350 return 0;
1351 }
1352
1353 RegexMatcher *m = regexp->fMatcher;
1354 if (m->fMatch == FALSE) {
1355 *status = U_REGEX_INVALID_STATE;
1356 return 0;
1357 }
1358
1359 UChar *dest = *destBuf;
1360 int32_t capacity = *destCapacity;
1361 int32_t destIdx = 0;
1362 int32_t i;
1363
1364 // If it wasn't supplied by the caller, get the length of the replacement text.
1365 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1366 // the fly and avoid this step.
1367 if (replacementLength == -1) {
1368 replacementLength = u_strlen(replacementText);
1369 }
1370
1371 // Copy input string from the end of previous match to start of current match
1372 if (regexp->fText != NULL) {
1373 int32_t matchStart;
1374 int32_t lastMatchEnd;
1375 if (UTEXT_USES_U16(m->fInputText)) {
1376 lastMatchEnd = (int32_t)m->fLastMatchEnd;
1377 matchStart = (int32_t)m->fMatchStart;
1378 } else {
1379 // !!!: Would like a better way to do this!
1380 UErrorCode status = U_ZERO_ERROR;
1381 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
1382 status = U_ZERO_ERROR;
1383 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
1384 }
1385 for (i=lastMatchEnd; i<matchStart; i++) {
1386 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1387 }
1388 } else {
1389 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1390 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1391 &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
1392 }
1393
1394
1395 // scan the replacement text, looking for substitutions ($n) and \escapes.
1396 int32_t replIdx = 0;
1397 while (replIdx < replacementLength) {
1398 UChar c = replacementText[replIdx];
1399 replIdx++;
1400 if (c != DOLLARSIGN && c != BACKSLASH) {
1401 // Common case, no substitution, no escaping,
1402 // just copy the char to the dest buf.
1403 appendToBuf(c, &destIdx, dest, capacity);
1404 continue;
1405 }
1406
1407 if (c == BACKSLASH) {
1408 // Backslash Escape. Copy the following char out without further checks.
1409 // Note: Surrogate pairs don't need any special handling
1410 // The second half wont be a '$' or a '\', and
1411 // will move to the dest normally on the next
1412 // loop iteration.
1413 if (replIdx >= replacementLength) {
1414 break;
1415 }
1416 c = replacementText[replIdx];
1417
1418 if (c==0x55/*U*/ || c==0x75/*u*/) {
1419 // We have a \udddd or \Udddddddd escape sequence.
1420 UChar32 escapedChar =
1421 u_unescapeAt(uregex_ucstr_unescape_charAt,
1422 &replIdx, // Index is updated by unescapeAt
1423 replacementLength, // Length of replacement text
1424 (void *)replacementText);
1425
1426 if (escapedChar != (UChar32)0xFFFFFFFF) {
1427 if (escapedChar <= 0xffff) {
1428 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1429 } else {
1430 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1431 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1432 }
1433 continue;
1434 }
1435 // Note: if the \u escape was invalid, just fall through and
1436 // treat it as a plain \<anything> escape.
1437 }
1438
1439 // Plain backslash escape. Just put out the escaped character.
1440 appendToBuf(c, &destIdx, dest, capacity);
1441
1442 replIdx++;
1443 continue;
1444 }
1445
1446
1447
1448 // We've got a $. Pick up a capture group number if one follows.
1449 // Consume at most the number of digits necessary for the largest capture
1450 // number that is valid for this pattern.
1451
1452 int32_t numDigits = 0;
1453 int32_t groupNum = 0;
1454 UChar32 digitC;
1455 for (;;) {
1456 if (replIdx >= replacementLength) {
1457 break;
1458 }
1459 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
1460 if (u_isdigit(digitC) == FALSE) {
1461 break;
1462 }
1463
1464 U16_FWD_1(replacementText, replIdx, replacementLength);
1465 groupNum=groupNum*10 + u_charDigitValue(digitC);
1466 numDigits++;
1467 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
1468 break;
1469 }
1470 }
1471
1472
1473 if (numDigits == 0) {
1474 // The $ didn't introduce a group number at all.
1475 // Treat it as just part of the substitution text.
1476 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
1477 continue;
1478 }
1479
1480 // Finally, append the capture group data to the destination.
1481 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1482 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1483 // Ignore buffer overflow when extracting the group. We need to
1484 // continue on to get full size of the untruncated result. We will
1485 // raise our own buffer overflow error at the end.
1486 *status = U_ZERO_ERROR;
1487 }
1488
1489 if (U_FAILURE(*status)) {
1490 // Can fail if group number is out of range.
1491 break;
1492 }
1493
1494 }
1495
1496 //
1497 // Nul Terminate the dest buffer if possible.
1498 // Set the appropriate buffer overflow or not terminated error, if needed.
1499 //
1500 if (destIdx < capacity) {
1501 dest[destIdx] = 0;
1502 } else if (destIdx == *destCapacity) {
1503 *status = U_STRING_NOT_TERMINATED_WARNING;
1504 } else {
1505 *status = U_BUFFER_OVERFLOW_ERROR;
1506 }
1507
1508 //
1509 // Return an updated dest buffer and capacity to the caller.
1510 //
1511 if (destIdx > 0 && *destCapacity > 0) {
1512 if (destIdx < capacity) {
1513 *destBuf += destIdx;
1514 *destCapacity -= destIdx;
1515 } else {
1516 *destBuf += capacity;
1517 *destCapacity = 0;
1518 }
1519 }
1520
1521 // If we came in with a buffer overflow, make sure we go out with one also.
1522 // (A zero length match right at the end of the previous match could
1523 // make this function succeed even though a previous call had overflowed the buf)
1524 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1525 *status = U_BUFFER_OVERFLOW_ERROR;
1526 }
1527
1528 return destIdx;
1529 }
1530
1531 //
1532 // appendReplacement the actual API function,
1533 //
1534 U_CAPI int32_t U_EXPORT2
1535 uregex_appendReplacement(URegularExpression *regexp2,
1536 const UChar *replacementText,
1537 int32_t replacementLength,
1538 UChar **destBuf,
1539 int32_t *destCapacity,
1540 UErrorCode *status) {
1541
1542 RegularExpression *regexp = (RegularExpression*)regexp2;
1543 return RegexCImpl::appendReplacement(
1544 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1545 }
1546
1547 //
1548 // uregex_appendReplacementUText...can just use the normal C++ method
1549 //
1550 U_CAPI void U_EXPORT2
1551 uregex_appendReplacementUText(URegularExpression *regexp2,
1552 UText *replText,
1553 UText *dest,
1554 UErrorCode *status) {
1555 RegularExpression *regexp = (RegularExpression*)regexp2;
1556 regexp->fMatcher->appendReplacement(dest, replText, *status);
1557 }
1558
1559
1560 //------------------------------------------------------------------------------
1561 //
1562 // uregex_appendTail
1563 //
1564 //------------------------------------------------------------------------------
1565 int32_t RegexCImpl::appendTail(RegularExpression *regexp,
1566 UChar **destBuf,
1567 int32_t *destCapacity,
1568 UErrorCode *status)
1569 {
1570
1571 // If we come in with a buffer overflow error, don't suppress the operation.
1572 // A series of appendReplacements, appendTail need to correctly preflight
1573 // the buffer size when an overflow happens somewhere in the middle.
1574 UBool pendingBufferOverflow = FALSE;
1575 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1576 pendingBufferOverflow = TRUE;
1577 *status = U_ZERO_ERROR;
1578 }
1579
1580 if (validateRE(regexp, status) == FALSE) {
1581 return 0;
1582 }
1583
1584 if (destCapacity == NULL || destBuf == NULL ||
1585 (*destBuf == NULL && *destCapacity > 0) ||
1586 *destCapacity < 0)
1587 {
1588 *status = U_ILLEGAL_ARGUMENT_ERROR;
1589 return 0;
1590 }
1591
1592 RegexMatcher *m = regexp->fMatcher;
1593
1594 int32_t destIdx = 0;
1595 int32_t destCap = *destCapacity;
1596 UChar *dest = *destBuf;
1597
1598 if (regexp->fText != NULL) {
1599 int32_t srcIdx;
1600 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1601 if (nativeIdx == -1) {
1602 srcIdx = 0;
1603 } else if (UTEXT_USES_U16(m->fInputText)) {
1604 srcIdx = (int32_t)nativeIdx;
1605 } else {
1606 UErrorCode status = U_ZERO_ERROR;
1607 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1608 }
1609
1610 for (;;) {
1611 if (srcIdx == regexp->fTextLength) {
1612 break;
1613 }
1614 UChar c = regexp->fText[srcIdx];
1615 if (c == 0 && regexp->fTextLength == -1) {
1616 regexp->fTextLength = srcIdx;
1617 break;
1618 }
1619 if (destIdx < destCap) {
1620 dest[destIdx] = c;
1621 } else {
1622 // We've overflowed the dest buffer.
1623 // If the total input string length is known, we can
1624 // compute the total buffer size needed without scanning through the string.
1625 if (regexp->fTextLength > 0) {
1626 destIdx += (regexp->fTextLength - srcIdx);
1627 break;
1628 }
1629 }
1630 srcIdx++;
1631 destIdx++;
1632 }
1633 } else {
1634 int64_t srcIdx;
1635 if (m->fMatch) {
1636 // The most recent call to find() succeeded.
1637 srcIdx = m->fMatchEnd;
1638 } else {
1639 // The last call to find() on this matcher failed().
1640 // Look back to the end of the last find() that succeeded for src index.
1641 srcIdx = m->fLastMatchEnd;
1642 if (srcIdx == -1) {
1643 // There has been no successful match with this matcher.
1644 // We want to copy the whole string.
1645 srcIdx = 0;
1646 }
1647 }
1648
1649 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1650 }
1651
1652 //
1653 // NUL terminate the output string, if possible, otherwise issue the
1654 // appropriate error or warning.
1655 //
1656 if (destIdx < destCap) {
1657 dest[destIdx] = 0;
1658 } else if (destIdx == destCap) {
1659 *status = U_STRING_NOT_TERMINATED_WARNING;
1660 } else {
1661 *status = U_BUFFER_OVERFLOW_ERROR;
1662 }
1663
1664 //
1665 // Update the user's buffer ptr and capacity vars to reflect the
1666 // amount used.
1667 //
1668 if (destIdx < destCap) {
1669 *destBuf += destIdx;
1670 *destCapacity -= destIdx;
1671 } else {
1672 *destBuf += destCap;
1673 *destCapacity = 0;
1674 }
1675
1676 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1677 *status = U_BUFFER_OVERFLOW_ERROR;
1678 }
1679
1680 return destIdx;
1681 }
1682
1683
1684 //
1685 // appendTail the actual API function
1686 //
1687 U_CAPI int32_t U_EXPORT2
1688 uregex_appendTail(URegularExpression *regexp2,
1689 UChar **destBuf,
1690 int32_t *destCapacity,
1691 UErrorCode *status) {
1692 RegularExpression *regexp = (RegularExpression*)regexp2;
1693 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1694 }
1695
1696
1697 //
1698 // uregex_appendTailUText...can just use the normal C++ method
1699 //
1700 U_CAPI UText * U_EXPORT2
1701 uregex_appendTailUText(URegularExpression *regexp2,
1702 UText *dest,
1703 UErrorCode *status) {
1704 RegularExpression *regexp = (RegularExpression*)regexp2;
1705 return regexp->fMatcher->appendTail(dest, *status);
1706 }
1707
1708
1709 //------------------------------------------------------------------------------
1710 //
1711 // copyString Internal utility to copy a string to an output buffer,
1712 // while managing buffer overflow and preflight size
1713 // computation. NUL termination is added to destination,
1714 // and the NUL is counted in the output size.
1715 //
1716 //------------------------------------------------------------------------------
#if 0
// Currently compiled out.
//
//   destBuffer    Destination buffer.
//   destCapacity  Total capacity of the destination buffer.
//   destIndex     In/out index into the destination buffer.  On return it has
//                 been advanced by srcLen plus one for the NUL; the updated
//                 value is NOT clipped to destCapacity.
//   srcPtr        Pointer to the source string.
//   srcLen        Source string length.
static void copyString(UChar        *destBuffer,
                       int32_t       destCapacity,
                       int32_t      *destIndex,
                       const UChar  *srcPtr,
                       int32_t       srcLen)
{
    int32_t outPos = *destIndex;
    int32_t srcPos = 0;

    // Copy code units for as long as there is both input and room.
    while (srcPos < srcLen && outPos < destCapacity) {
        destBuffer[outPos] = srcPtr[srcPos];
        outPos++;
        srcPos++;
    }

    // Out of room with input left over: just count what could not be copied.
    if (srcPos < srcLen) {
        outPos += srcLen - srcPos;
    }

    // NUL terminate if there is space; the NUL is counted either way.
    if (outPos < destCapacity) {
        destBuffer[outPos] = 0;
    }
    outPos++;

    *destIndex = outPos;
}
#endif
1746
1747 //------------------------------------------------------------------------------
1748 //
1749 // uregex_split
1750 //
1751 //------------------------------------------------------------------------------
1752 int32_t RegexCImpl::split(RegularExpression *regexp,
1753 UChar *destBuf,
1754 int32_t destCapacity,
1755 int32_t *requiredCapacity,
1756 UChar *destFields[],
1757 int32_t destFieldsCapacity,
1758 UErrorCode *status) {
1759 //
1760 // Reset for the input text
1761 //
1762 regexp->fMatcher->reset();
1763 UText *inputText = regexp->fMatcher->fInputText;
1764 int64_t nextOutputStringStart = 0;
1765 int64_t inputLen = regexp->fMatcher->fInputLength;
1766 if (inputLen == 0) {
1767 return 0;
1768 }
1769
1770 //
1771 // Loop through the input text, searching for the delimiter pattern
1772 //
1773 int32_t i; // Index of the field being processed.
1774 int32_t destIdx = 0; // Next available position in destBuf;
1775 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1776 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
1777 for (i=0; ; i++) {
1778 if (i>=destFieldsCapacity-1) {
1779 // There are one or zero output strings left.
1780 // Fill the last output string with whatever is left from the input, then exit the loop.
1781 // ( i will be == destFieldsCapacity if we filled the output array while processing
1782 // capture groups of the delimiter expression, in which case we will discard the
1783 // last capture group saved in favor of the unprocessed remainder of the
1784 // input string.)
1785 if (inputLen > nextOutputStringStart) {
1786 if (i != destFieldsCapacity-1) {
1787 // No fields are left. Recycle the last one for holding the trailing part of
1788 // the input string.
1789 i = destFieldsCapacity-1;
1790 destIdx = (int32_t)(destFields[i] - destFields[0]);
1791 }
1792
1793 destFields[i] = &destBuf[destIdx];
1794 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1795 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1796 }
1797 break;
1798 }
1799
1800 if (regexp->fMatcher->find()) {
1801 // We found another delimiter. Move everything from where we started looking
1802 // up until the start of the delimiter into the next output string.
1803 destFields[i] = &destBuf[destIdx];
1804
1805 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1806 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1807 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1808 tStatus = U_ZERO_ERROR;
1809 } else {
1810 *status = tStatus;
1811 }
1812 nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1813
1814 // If the delimiter pattern has capturing parentheses, the captured
1815 // text goes out into the next n destination strings.
1816 int32_t groupNum;
1817 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1818 // If we've run out of output string slots, bail out.
1819 if (i==destFieldsCapacity-1) {
1820 break;
1821 }
1822 i++;
1823
1824 // Set up to extract the capture group contents into the dest buffer.
1825 destFields[i] = &destBuf[destIdx];
1826 tStatus = U_ZERO_ERROR;
1827 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1828 destIdx += t + 1; // Record the space used in the output string buffer.
1829 // +1 for the NUL that terminates the string.
1830 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1831 tStatus = U_ZERO_ERROR;
1832 } else {
1833 *status = tStatus;
1834 }
1835 }
1836
1837 if (nextOutputStringStart == inputLen) {
1838 // The delimiter was at the end of the string. We're done.
1839 break;
1840 }
1841
1842 }
1843 else
1844 {
1845 // We ran off the end of the input while looking for the next delimiter.
1846 // All the remaining text goes into the current output string.
1847 destFields[i] = &destBuf[destIdx];
1848 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1849 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1850 break;
1851 }
1852 }
1853
1854 // Zero out any unused portion of the destFields array
1855 int j;
1856 for (j=i+1; j<destFieldsCapacity; j++) {
1857 destFields[j] = NULL;
1858 }
1859
1860 if (requiredCapacity != NULL) {
1861 *requiredCapacity = destIdx;
1862 }
1863 if (destIdx > destCapacity) {
1864 *status = U_BUFFER_OVERFLOW_ERROR;
1865 }
1866 return i+1;
1867 }
1868
1869 //
1870 // uregex_split The actual API function
1871 //
1872 U_CAPI int32_t U_EXPORT2
1873 uregex_split(URegularExpression *regexp2,
1874 UChar *destBuf,
1875 int32_t destCapacity,
1876 int32_t *requiredCapacity,
1877 UChar *destFields[],
1878 int32_t destFieldsCapacity,
1879 UErrorCode *status) {
1880 RegularExpression *regexp = (RegularExpression*)regexp2;
1881 if (validateRE(regexp, status) == FALSE) {
1882 return 0;
1883 }
1884 if ((destBuf == NULL && destCapacity > 0) ||
1885 destCapacity < 0 ||
1886 destFields == NULL ||
1887 destFieldsCapacity < 1 ) {
1888 *status = U_ILLEGAL_ARGUMENT_ERROR;
1889 return 0;
1890 }
1891
1892 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1893 }
1894
1895
1896 //
1897 // uregex_splitUText...can just use the normal C++ method
1898 //
1899 U_CAPI int32_t U_EXPORT2
1900 uregex_splitUText(URegularExpression *regexp2,
1901 UText *destFields[],
1902 int32_t destFieldsCapacity,
1903 UErrorCode *status) {
1904 RegularExpression *regexp = (RegularExpression*)regexp2;
1905 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1906 }
1907
1908
1909 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1910