2 *******************************************************************************
3 * Copyright (C) 2004-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
// State behind the opaque URegularExpression handle used by the uregex_* C API
// functions in this file.
25 struct URegularExpression
: public UMemory
{
28 ~URegularExpression();
// Reference count for the pattern data. uregex_clone() copies this pointer and
// increments the count; the destructor decrements it and frees the pattern
// string and the counter when it reaches zero.
31 int32_t *fPatRefCount
;
// Pattern length exactly as the caller passed it to uregex_open() (so it can be
// -1 for a NUL-terminated pattern); reported back by uregex_pattern().
33 int32_t fPatStringLen
;
// Matcher that the matching/query functions of the C API forward to.
34 RegexMatcher
*fMatcher
;
35 const UChar
*fText
; // Text from setText()
36 int32_t fTextLength
; // Length provided by user with setText(), which
39 UnicodeString fTextString
; // The setText(text) is wrapped into a UnicodeString.
40 // TODO: regexp engine should not depend on UnicodeString.
// Expected value of URegularExpression::fMagic. validateRE() reports
// U_ILLEGAL_ARGUMENT_ERROR for any handle whose fMagic differs from this.
43 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
// Default constructor.
// NOTE(review): validateRE() requires fMagic == REXP_MAGIC and the destructor
// tests fPatRefCount against NULL, so this presumably sets fMagic and clears the
// pointer members -- confirm against the constructor body.
45 URegularExpression::URegularExpression() {
// Destructor. Drops this handle's reference on the pattern data; the handle
// that takes the count to zero frees the pattern string and the counter.
56 URegularExpression::~URegularExpression() {
// fPatRefCount may be NULL (it is tested here before use); the atomic decrement
// matches the atomic increment performed in uregex_clone().
59 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
61 uprv_free(fPatString
);
62 uprv_free(fPatRefCount
);
67 //----------------------------------------------------------------------------------------
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
// Parameters:
//   re            handle to check; NULL or a wrong fMagic yields
//                 U_ILLEGAL_ARGUMENT_ERROR.
//   status        in/out error code. Must be non-NULL: it is dereferenced
//                 unconditionally. An incoming failure is detected first.
//   requiresText  when TRUE (the default), a handle whose fText is NULL yields
//                 U_REGEX_INVALID_STATE.
72 static UBool
validateRE(const URegularExpression
*re
, UErrorCode
*status
, UBool requiresText
= TRUE
) {
73 if (U_FAILURE(*status
)) {
// Reject NULL handles and anything that does not carry the magic marker.
76 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
77 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Functions that operate on the input text need fText to have been set.
80 if (requiresText
&& re
->fText
== NULL
) {
81 *status
= U_REGEX_INVALID_STATE
;
87 //----------------------------------------------------------------------------------------
91 //----------------------------------------------------------------------------------------
// Creates a URegularExpression from a UTF-16 pattern: copies the pattern into a
// private buffer, compiles it, and creates a matcher for it.
//   pattern        must be non-NULL.
//   patternLength  length in UChars, or -1 if NUL-terminated. Values below -1
//                  and the value 0 are rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   status         in/out error code; dereferenced unconditionally.
92 U_CAPI URegularExpression
* U_EXPORT2
93 uregex_open(  const  UChar
*pattern
,
94 int32_t patternLength
,
99 if (U_FAILURE(*status
)) {
102 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
103 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Resolve the real length when the caller passed -1.
106 int32_t actualPatLen
= patternLength
;
107 if (actualPatLen
== -1) {
108 actualPatLen
= u_strlen(pattern
);
// Three allocations (handle, reference counter, pattern copy with room for a
// terminating NUL) are made first and then checked together.
111 URegularExpression
*re
= new URegularExpression
;
112 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
113 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
114 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
115 *status
= U_MEMORY_ALLOCATION_ERROR
;
// The new handle starts as the sole owner of the pattern data.
121 re
->fPatRefCount
= refC
;
122 *re
->fPatRefCount
= 1;
125 // Make a copy of the pattern string, so we can return it later if asked.
126 // For compiling the pattern, we will use a read-only-aliased UnicodeString
127 // of this local copy, to avoid making even more copies.
129 re
->fPatString
= patBuf
;
// Note: the caller's patternLength (possibly -1) is stored, not actualPatLen,
// so uregex_pattern() reports -1 for patterns opened as NUL-terminated.
130 re
->fPatStringLen
= patternLength
;
131 u_memcpy(patBuf
, pattern
, actualPatLen
);
132 patBuf
[actualPatLen
] = 0;
133 UnicodeString
patString(patternLength
==-1, patBuf
, patternLength
);
136 // Compile the pattern
// NOTE(review): two compile calls follow, one taking *pe and one not; presumably
// they are the two arms of a branch on whether a parse-error struct was
// supplied -- confirm.
139 re
->fPat
= RegexPattern::compile(patString
, flags
, *pe
, *status
);
141 re
->fPat
= RegexPattern::compile(patString
, flags
, *status
);
143 if (U_FAILURE(*status
)) {
148 // Create the matcher object
150 re
->fMatcher
= re
->fPat
->matcher(*status
);
151 if (U_SUCCESS(*status
)) {
161 //----------------------------------------------------------------------------------------
165 //----------------------------------------------------------------------------------------
// Closes a handle. The API has no status parameter, so a local UErrorCode is
// used for validation; text is not required to have been set.
166 U_CAPI
void U_EXPORT2
167 uregex_close(URegularExpression
*re
) {
168 UErrorCode status
= U_ZERO_ERROR
;
169 if (validateRE(re
, &status
, FALSE
) == FALSE
) {
176 //----------------------------------------------------------------------------------------
180 //----------------------------------------------------------------------------------------
// Creates a second handle that shares the source's compiled pattern and pattern
// string but has its own matcher. The source does not need text set.
181 U_CAPI URegularExpression
* U_EXPORT2
182 uregex_clone(const URegularExpression
*source
, UErrorCode
*status
) {
183 if (validateRE(source
, status
, FALSE
) == FALSE
) {
187 URegularExpression
*clone
= new URegularExpression
;
189 *status
= U_MEMORY_ALLOCATION_ERROR
;
// A fresh matcher is created from the shared pattern before any sharing is
// recorded on the clone.
193 clone
->fMatcher
= source
->fPat
->matcher(*status
);
194 if (U_FAILURE(*status
)) {
// Share the pattern data and bump the shared reference count so that the
// destructor of either handle frees it only when the last one goes away.
199 clone
->fPat
= source
->fPat
;
200 clone
->fPatRefCount
= source
->fPatRefCount
;
201 clone
->fPatString
= source
->fPatString
;
202 clone
->fPatStringLen
= source
->fPatStringLen
;
203 umtx_atomic_inc(source
->fPatRefCount
);
204 // Note: fText is not cloned.
212 //------------------------------------------------------------------------------
216 //------------------------------------------------------------------------------
// Returns the stored copy of the pattern string. If patLength is non-NULL it
// receives fPatStringLen, which is the length value originally passed to
// uregex_open() (so it can be -1). Text does not need to be set.
217 U_CAPI
const UChar
* U_EXPORT2
218 uregex_pattern(const URegularExpression
*regexp
,
220 UErrorCode
*status
) {
222 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
225 if (patLength
!= NULL
) {
226 *patLength
= regexp
->fPatStringLen
;
228 return regexp
->fPatString
;
232 //------------------------------------------------------------------------------
236 //------------------------------------------------------------------------------
// Reads the flags from the compiled pattern (fPat->flags()). Text does not need
// to be set.
237 U_CAPI
int32_t U_EXPORT2
238 uregex_flags(const URegularExpression
*regexp
, UErrorCode
*status
) {
239 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
242 int32_t flags
= regexp
->fPat
->flags();
247 //------------------------------------------------------------------------------
251 //------------------------------------------------------------------------------
// Sets the subject text for subsequent matching and resets the matcher onto it.
//   text        must be non-NULL.
//   textLength  length in UChars, or -1 if NUL-terminated; values below -1 are
//               rejected with U_ILLEGAL_ARGUMENT_ERROR.
252 U_CAPI
void U_EXPORT2
253 uregex_setText(URegularExpression
*regexp
,
256 UErrorCode
*status
) {
257 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
260 if (text
== NULL
|| textLength
< -1) {
261 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// The caller's pointer and length are stored as given (not copied here).
264 regexp
->fText
= text
;
265 regexp
->fTextLength
= textLength
;
266 UBool isTerminated
= (textLength
== -1);
// NOTE(review): this setTo(isTerminated, text, textLength) form is presumably a
// read-only alias of the caller's buffer rather than a copy, in which case the
// buffer must outlive its use by the matcher -- confirm.
268 regexp
->fTextString
.setTo(isTerminated
, text
, textLength
);
269 regexp
->fMatcher
->reset(regexp
->fTextString
);
274 //------------------------------------------------------------------------------
278 //------------------------------------------------------------------------------
// Returns the text pointer stored by uregex_setText(). If textLength is non-NULL
// it receives the stored length value (as given to setText). Validation does
// not require text to be set, so the returned pointer may be NULL.
279 U_CAPI
const UChar
* U_EXPORT2
280 uregex_getText(URegularExpression
*regexp
,
282 UErrorCode
*status
) {
283 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
286 if (textLength
!= NULL
) {
287 *textLength
= regexp
->fTextLength
;
289 return regexp
->fText
;
293 //------------------------------------------------------------------------------
297 //------------------------------------------------------------------------------
// Forwards to RegexMatcher::matches(). A startIndex of -1 selects the overload
// without a start position; any other value is passed through as the start
// index. Requires text to have been set (validateRE default).
298 U_CAPI UBool U_EXPORT2
299 uregex_matches(URegularExpression
*regexp
,
301 UErrorCode
*status
) {
302 UBool result
= FALSE
;
303 if (validateRE(regexp
, status
) == FALSE
) {
306 if (startIndex
== -1) {
307 result
= regexp
->fMatcher
->matches(*status
);
309 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
316 //------------------------------------------------------------------------------
320 //------------------------------------------------------------------------------
// Forwards to RegexMatcher::lookingAt(). A startIndex of -1 selects the overload
// without a start position; any other value is passed through as the start
// index. Requires text to have been set (validateRE default).
321 U_CAPI UBool U_EXPORT2
322 uregex_lookingAt(URegularExpression
*regexp
,
324 UErrorCode
*status
) {
325 UBool result
= FALSE
;
326 if (validateRE(regexp
, status
) == FALSE
) {
329 if (startIndex
== -1) {
330 result
= regexp
->fMatcher
->lookingAt(*status
);
332 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
339 //------------------------------------------------------------------------------
343 //------------------------------------------------------------------------------
// Finds a match. With startIndex == -1 the matcher is reset with
// resetPreserveRegion() and find() is called; otherwise the search is delegated
// to find(startIndex, status). Requires text to have been set.
344 U_CAPI UBool U_EXPORT2
345 uregex_find(URegularExpression
*regexp
,
347 UErrorCode
*status
) {
348 UBool result
= FALSE
;
349 if (validateRE(regexp
, status
) == FALSE
) {
352 if (startIndex
== -1) {
353 regexp
->fMatcher
->resetPreserveRegion();
354 result
= regexp
->fMatcher
->find();
356 result
= regexp
->fMatcher
->find(startIndex
, *status
);
361 //------------------------------------------------------------------------------
365 //------------------------------------------------------------------------------
// Continues searching from the matcher's current position by calling find()
// with no reset. Requires text to have been set.
366 U_CAPI UBool U_EXPORT2
367 uregex_findNext(URegularExpression
*regexp
,
368 UErrorCode
*status
) {
369 if (validateRE(regexp
, status
) == FALSE
) {
372 UBool result
= regexp
->fMatcher
->find();
376 //------------------------------------------------------------------------------
380 //------------------------------------------------------------------------------
// Reads the capture group count from the matcher. Text does not need to be set
// (validateRE is called with requiresText == FALSE).
381 U_CAPI
int32_t U_EXPORT2
382 uregex_groupCount(URegularExpression
*regexp
,
383 UErrorCode
*status
) {
384 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
387 int32_t result
= regexp
->fMatcher
->groupCount();
392 //------------------------------------------------------------------------------
396 //------------------------------------------------------------------------------
// Copies the text of capture group groupNum from the current match into dest.
//   destCapacity  must be >= 0; dest may be NULL only when destCapacity is 0.
//   status        set to U_STRING_NOT_TERMINATED_WARNING when the group exactly
//                 fills dest, or U_BUFFER_OVERFLOW_ERROR when it does not fit.
// Requires text to have been set.
397 U_CAPI
int32_t U_EXPORT2
398 uregex_group(URegularExpression
*regexp
,
401 int32_t destCapacity
,
402 UErrorCode
*status
) {
403 if (validateRE(regexp
, status
) == FALSE
) {
406 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
407 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
412 // Pick up the range of characters from the matcher
414 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
415 int32_t endIx
= regexp
->fMatcher
->end  (groupNum
, *status
);
416 if (U_FAILURE(*status
)) {
421 // Trim length based on buffer capacity
// fullLength keeps the untruncated group length; copyLength is what is
// actually written to dest.
423 int32_t fullLength
= endIx
- startIx
;
424 int32_t copyLength
= fullLength
;
425 if (copyLength
< destCapacity
) {
// Room for the text plus a terminating NUL.
426 dest
[copyLength
] = 0;
427 } else if (copyLength
== destCapacity
) {
// The text fits exactly; there is no room for the NUL.
428 *status
= U_STRING_NOT_TERMINATED_WARNING
;
// Overflow: only the part that fits is copied.
430 copyLength
= destCapacity
;
431 *status
= U_BUFFER_OVERFLOW_ERROR
;
435 // Copy capture group to user's buffer
437 if (copyLength
> 0) {
// NOTE(review): "®exp" below looks like a mis-encoded "&regexp" (address of
// regexp->fText[startIx]) -- restore before compiling.
438 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
444 //------------------------------------------------------------------------------
448 //------------------------------------------------------------------------------
// Forwards to RegexMatcher::start(groupNum, status) for the current match.
// Requires text to have been set.
449 U_CAPI
int32_t U_EXPORT2
450 uregex_start(URegularExpression
*regexp
,
452 UErrorCode
*status
) {
453 if (validateRE(regexp
, status
) == FALSE
) {
456 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
461 //------------------------------------------------------------------------------
465 //------------------------------------------------------------------------------
// Forwards to RegexMatcher::end(groupNum, status) for the current match.
// Requires text to have been set.
466 U_CAPI
int32_t U_EXPORT2
467 uregex_end(URegularExpression
*regexp
,
469 UErrorCode
*status
) {
470 if (validateRE(regexp
, status
) == FALSE
) {
473 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
477 //------------------------------------------------------------------------------
481 //------------------------------------------------------------------------------
// Resets the matcher to the given index by forwarding to
// RegexMatcher::reset(index, status). Requires text to have been set.
482 U_CAPI
void U_EXPORT2
483 uregex_reset(URegularExpression
*regexp
,
485 UErrorCode
*status
) {
486 if (validateRE(regexp
, status
) == FALSE
) {
489 regexp
->fMatcher
->reset(index
, *status
);
493 //------------------------------------------------------------------------------
497 //------------------------------------------------------------------------------
// Sets the matching region by forwarding regionStart/regionLimit to
// RegexMatcher::region(). Requires text to have been set.
498 U_CAPI
void U_EXPORT2
499 uregex_setRegion(URegularExpression
*regexp
,
502 UErrorCode
*status
) {
503 if (validateRE(regexp
, status
) == FALSE
) {
506 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
510 //------------------------------------------------------------------------------
512 // uregex_regionStart
514 //------------------------------------------------------------------------------
// Returns the matcher's regionStart(). Requires text to have been set.
515 U_CAPI
int32_t U_EXPORT2
516 uregex_regionStart(const URegularExpression
*regexp
,
517 UErrorCode
*status
) {
518 if (validateRE(regexp
, status
) == FALSE
) {
521 return regexp
->fMatcher
->regionStart();
525 //------------------------------------------------------------------------------
529 //------------------------------------------------------------------------------
// Returns the matcher's regionEnd(). Requires text to have been set.
530 U_CAPI
int32_t U_EXPORT2
531 uregex_regionEnd(const URegularExpression
*regexp
,
532 UErrorCode
*status
) {
533 if (validateRE(regexp
, status
) == FALSE
) {
536 return regexp
->fMatcher
->regionEnd();
540 //------------------------------------------------------------------------------
542 // uregex_hasTransparentBounds
544 //------------------------------------------------------------------------------
// Returns the matcher's hasTransparentBounds(). Requires text to have been set.
545 U_CAPI UBool U_EXPORT2
546 uregex_hasTransparentBounds(const URegularExpression
*regexp
,
547 UErrorCode
*status
) {
548 if (validateRE(regexp
, status
) == FALSE
) {
551 return regexp
->fMatcher
->hasTransparentBounds();
555 //------------------------------------------------------------------------------
557 // uregex_useTransparentBounds
559 //------------------------------------------------------------------------------
// Passes b to the matcher's useTransparentBounds(). Requires text to have been
// set.
560 U_CAPI
void U_EXPORT2
561 uregex_useTransparentBounds(URegularExpression
*regexp
,
563 UErrorCode
*status
) {
564 if (validateRE(regexp
, status
) == FALSE
) {
567 regexp
->fMatcher
->useTransparentBounds(b
);
571 //------------------------------------------------------------------------------
573 // uregex_hasAnchoringBounds
575 //------------------------------------------------------------------------------
// Returns the matcher's hasAnchoringBounds(). Requires text to have been set.
576 U_CAPI UBool U_EXPORT2
577 uregex_hasAnchoringBounds(const URegularExpression
*regexp
,
578 UErrorCode
*status
) {
579 if (validateRE(regexp
, status
) == FALSE
) {
582 return regexp
->fMatcher
->hasAnchoringBounds();
586 //------------------------------------------------------------------------------
588 // uregex_useAnchoringBounds
590 //------------------------------------------------------------------------------
// Passes b to the matcher's useAnchoringBounds(). Requires text to have been
// set.
591 U_CAPI
void U_EXPORT2
592 uregex_useAnchoringBounds(URegularExpression
*regexp
,
594 UErrorCode
*status
) {
595 if (validateRE(regexp
, status
) == FALSE
) {
598 regexp
->fMatcher
->useAnchoringBounds(b
);
602 //------------------------------------------------------------------------------
606 //------------------------------------------------------------------------------
// Returns the matcher's hitEnd(). Requires text to have been set.
607 U_CAPI UBool U_EXPORT2
608 uregex_hitEnd(const URegularExpression
*regexp
,
609 UErrorCode
*status
) {
610 if (validateRE(regexp
, status
) == FALSE
) {
613 return regexp
->fMatcher
->hitEnd();
617 //------------------------------------------------------------------------------
621 //------------------------------------------------------------------------------
// Returns the matcher's requireEnd(). Requires text to have been set.
622 U_CAPI UBool U_EXPORT2
623 uregex_requireEnd(const URegularExpression
*regexp
,
624 UErrorCode
*status
) {
625 if (validateRE(regexp
, status
) == FALSE
) {
628 return regexp
->fMatcher
->requireEnd();
632 //------------------------------------------------------------------------------
634 // uregex_setTimeLimit
636 //------------------------------------------------------------------------------
// Forwards limit to the matcher's setTimeLimit() when the handle validates.
// validateRE is called with its default, so text must have been set.
637 U_CAPI
void U_EXPORT2
638 uregex_setTimeLimit(URegularExpression
*regexp
,
640 UErrorCode
*status
) {
641 if (validateRE(regexp
, status
)) {
642 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
648 //------------------------------------------------------------------------------
650 // uregex_getTimeLimit
652 //------------------------------------------------------------------------------
// Reads the matcher's getTimeLimit() into retVal when the handle validates.
// validateRE is called with its default, so text must have been set.
653 U_CAPI
int32_t U_EXPORT2
654 uregex_getTimeLimit(const URegularExpression
*regexp
,
655 UErrorCode
*status
) {
657 if (validateRE(regexp
, status
)) {
658 retVal
= regexp
->fMatcher
->getTimeLimit();
665 //------------------------------------------------------------------------------
667 // uregex_setStackLimit
669 //------------------------------------------------------------------------------
// Forwards limit to the matcher's setStackLimit() when the handle validates.
// validateRE is called with its default, so text must have been set.
670 U_CAPI
void U_EXPORT2
671 uregex_setStackLimit(URegularExpression
*regexp
,
673 UErrorCode
*status
) {
674 if (validateRE(regexp
, status
)) {
675 regexp
->fMatcher
->setStackLimit(limit
, *status
);
681 //------------------------------------------------------------------------------
683 // uregex_getStackLimit
685 //------------------------------------------------------------------------------
// Reads the matcher's getStackLimit() into retVal when the handle validates.
// validateRE is called with its default, so text must have been set.
686 U_CAPI
int32_t U_EXPORT2
687 uregex_getStackLimit(const URegularExpression
*regexp
,
688 UErrorCode
*status
) {
690 if (validateRE(regexp
, status
)) {
691 retVal
= regexp
->fMatcher
->getStackLimit();
697 //------------------------------------------------------------------------------
699 // uregex_setMatchCallback
701 //------------------------------------------------------------------------------
// Installs a match callback and its context on the matcher when the handle
// validates. validateRE is called with its default, so text must have been set.
702 U_CAPI
void U_EXPORT2
703 uregex_setMatchCallback(URegularExpression
*regexp
,
704 URegexMatchCallback
*callback
,
706 UErrorCode
*status
) {
707 if (validateRE(regexp
, status
)) {
708 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
713 //------------------------------------------------------------------------------
715 // uregex_getMatchCallback
717 //------------------------------------------------------------------------------
// Retrieves the matcher's callback and context through the two out-parameters
// when the handle validates (text must have been set).
// NOTE(review): callback and context are dereferenced without a NULL check, so
// both must be valid pointers.
718 U_CAPI
void U_EXPORT2
719 uregex_getMatchCallback(const URegularExpression
*regexp
,
720 URegexMatchCallback
**callback
,
721 const void **context
,
722 UErrorCode
*status
) {
723 if (validateRE(regexp
, status
)) {
724 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
729 //------------------------------------------------------------------------------
733 //------------------------------------------------------------------------------
// Replaces every match in the input text, writing the result to destBuf.
// Implemented as reset, then appendReplacement for each findNext hit, then
// appendTail; the return values of those calls are accumulated in len.
//   replacementText    must be non-NULL.
//   replacementLength  must be >= -1.
//   destBuf            may be NULL only when destCapacity is not positive.
// Requires text to have been set.
734 U_CAPI
int32_t U_EXPORT2
735 uregex_replaceAll(URegularExpression
*regexp
,
736 const UChar
*replacementText
,
737 int32_t replacementLength
,
739 int32_t destCapacity
,
740 UErrorCode
*status
) {
741 if (validateRE(regexp
, status
) == FALSE
) {
744 if (replacementText
== NULL
|| replacementLength
< -1 ||
// "&&" binds tighter than "||": this term is (destBuf == NULL && destCapacity > 0).
745 destBuf
== NULL
&& destCapacity
> 0 ||
747 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Start from the beginning of the input.
752 uregex_reset(regexp
, 0, status
);
753 while (uregex_findNext(regexp
, status
)) {
// destBuf and destCapacity are passed by address, so each call can move them
// past what it wrote.
754 len
+= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
755 &destBuf
, &destCapacity
, status
);
757 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
763 //------------------------------------------------------------------------------
765 // uregex_replaceFirst
767 //------------------------------------------------------------------------------
// Replaces the first match in the input text, writing the result to destBuf.
// Implemented as reset, a find from index 0, one appendReplacement, then
// appendTail; the lengths returned by the append calls are combined in len.
//   replacementText    must be non-NULL.
//   replacementLength  must be >= -1.
//   destBuf            may be NULL only when destCapacity is not positive.
// Requires text to have been set.
768 U_CAPI
int32_t U_EXPORT2
769 uregex_replaceFirst(URegularExpression
*regexp
,
770 const UChar
*replacementText
,
771 int32_t replacementLength
,
773 int32_t destCapacity
,
774 UErrorCode
*status
) {
775 if (validateRE(regexp
, status
) == FALSE
) {
778 if (replacementText
== NULL
|| replacementLength
< -1 ||
// "&&" binds tighter than "||": this term is (destBuf == NULL && destCapacity > 0).
779 destBuf
== NULL
&& destCapacity
> 0 ||
781 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
787 uregex_reset(regexp
, 0, status
);
788 findSucceeded
= uregex_find(regexp
, 0, status
);
790 len
= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
791 &destBuf
, &destCapacity
, status
);
793 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
799 //------------------------------------------------------------------------------
801 // uregex_appendReplacement
803 //------------------------------------------------------------------------------
807 // Dummy class, because these functions need to be friends of class RegexMatcher,
808 // and stand-alone C functions don't work as friends
// The two static members declared below are defined later in this file as
// RegexCImpl::appendReplacement and RegexCImpl::appendTail, and are what
// uregex_appendReplacement() and uregex_appendTail() call.
813 inline static int32_t appendReplacement(URegularExpression
*regexp
,
814 const UChar
*replacementText
,
815 int32_t replacementLength
,
817 int32_t *destCapacity
,
820 inline static int32_t appendTail(URegularExpression
*regexp
,
822 int32_t *destCapacity
,
829 // Call-back function for u_unescapeAt(), used when we encounter
830 // \uxxxx or \Uxxxxxxxx escapes in the replacement text.
// context is the UChar buffer being unescaped; the function reads the code unit
// at position offset. No bounds check is done on offset here.
833 static UChar U_CALLCONV
834 unescape_charAt(int32_t offset
, void *context
) {
835 UChar c16
= ((UChar
*)context
)[offset
];
// The two characters that appendReplacement treats specially in replacement
// text: 0x5c is '\' (escape) and 0x24 is '$' (capture group reference).
841 static const UChar BACKSLASH
= 0x5c;
842 static const UChar DOLLARSIGN
= 0x24;
845 // Move a character to an output buffer, with bounds checking on the index.
846 // Index advances even if capacity is exceeded, for preflight size computations.
847 // This little sequence is used a LOT.
//   c            code unit to append.
//   idx          in/out position in buf.
//   buf          destination buffer.
//   bufCapacity  capacity of buf; positions at or beyond it are not written.
849 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
850 if (*idx
< bufCapacity
) {
858 // appendReplacement, the actual implementation.
// Appends to *destBuf the input text between the previous match and the current
// match, followed by the replacement text with $n group references and
// backslash escapes expanded.
//   replacementText    must be non-NULL.
//   replacementLength  length in UChars, or -1 if NUL-terminated.
//   destBuf            in/out pointer to the output position; advanced past the
//                      text written (see the end of the function).
//   destCapacity       in/out remaining capacity of *destBuf.
//   status             in/out error code. A match must be current, otherwise
//                      U_REGEX_INVALID_STATE is set.
860 int32_t RegexCImpl::appendReplacement(URegularExpression
*regexp
,
861 const UChar
*replacementText
,
862 int32_t replacementLength
,
864 int32_t *destCapacity
,
865 UErrorCode
*status
) {
867 // If we come in with a buffer overflow error, don't suppress the operation.
868 // A series of appendReplacements, appendTail need to correctly preflight
869 // the buffer size when an overflow happens somewhere in the middle.
870 UBool pendingBufferOverflow
= FALSE
;
// NOTE(review): destCapacity is an int32_t* here, so "destCapacity == 0" tests
// the pointer for NULL rather than the remaining capacity. appendTail performs
// the equivalent check as "*destCapacity == 0"; this one probably should be
// "destCapacity != NULL && *destCapacity == 0" -- confirm and fix.
871 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
== 0) {
872 pendingBufferOverflow
= TRUE
;
873 *status
= U_ZERO_ERROR
;
877 // Validate all parameters
879 if (validateRE(regexp
, status
) == FALSE
) {
882 if (replacementText
== NULL
|| replacementLength
< -1 ||
883 destCapacity
== NULL
|| destBuf
== NULL
||
// "&&" binds tighter than "||": this term is (*destBuf == NULL && *destCapacity > 0).
884 *destBuf
== NULL
&& *destCapacity
> 0 ||
886 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// A replacement can only be appended while a match is current.
890 RegexMatcher
*m
= regexp
->fMatcher
;
891 if (m
->fMatch
== FALSE
) {
892 *status
= U_REGEX_INVALID_STATE
;
// Work on local copies; the caller's variables are updated at the end.
896 UChar
*dest
= *destBuf
;
897 int32_t capacity
= *destCapacity
;
901 // If it wasn't supplied by the caller, get the length of the replacement text.
902 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
903 // the fly and avoid this step.
904 if (replacementLength
== -1) {
905 replacementLength
= u_strlen(replacementText
);
908 // Copy input string from the end of previous match to start of current match
909 for (i
=m
->fLastMatchEnd
; i
<m
->fMatchStart
; i
++) {
910 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
915 // scan the replacement text, looking for substitutions ($n) and \escapes.
917 while (replIdx
< replacementLength
) {
918 UChar c
= replacementText
[replIdx
];
920 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
921 // Common case, no substitution, no escaping,
922 // just copy the char to the dest buf.
923 appendToBuf(c
, &destIdx
, dest
, capacity
);
927 if (c
== BACKSLASH
) {
928 // Backslash Escape. Copy the following char out without further checks.
929 // Note: Surrogate pairs don't need any special handling
930 // The second half won't be a '$' or a '\', and
931 // will move to the dest normally on the next
// A backslash that is the last character of the replacement has nothing to
// escape.
933 if (replIdx
>= replacementLength
) {
936 c
= replacementText
[replIdx
];
938 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
939 // We have a \udddd or \Udddddddd escape sequence.
940 UChar32 escapedChar
=
941 u_unescapeAt(unescape_charAt
,
942 &replIdx
, // Index is updated by unescapeAt
943 replacementLength
, // Length of replacement text
944 (void *)replacementText
);
// 0xFFFFFFFF is the value tested for an unsuccessful unescape; BMP results are
// written as one code unit, others as a lead/trail surrogate pair.
946 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
947 if (escapedChar
<= 0xffff) {
948 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
950 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
951 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
955 // Note: if the \u escape was invalid, just fall through and
956 // treat it as a plain \<anything> escape.
959 // Plain backslash escape. Just put out the escaped character.
960 appendToBuf(c
, &destIdx
, dest
, capacity
);
968 // We've got a $. Pick up a capture group number if one follows.
969 // Consume at most the number of digits necessary for the largest capture
970 // number that is valid for this pattern.
972 int32_t numDigits
= 0;
973 int32_t groupNum
= 0;
976 if (replIdx
>= replacementLength
) {
// Read the next code point without advancing; advance only if it is a digit.
979 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
980 if (u_isdigit(digitC
) == FALSE
) {
984 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
985 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
987 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
993 if (numDigits
== 0) {
994 // The $ didn't introduce a group number at all.
995 // Treat it as just part of the substitution text.
996 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
1000 // Finally, append the capture group data to the destination.
// destIdx may already exceed capacity while preflighting, so clamp the
// capacity handed to uregex_group at zero.
1001 int32_t capacityRemaining
= capacity
- destIdx
;
1002 if (capacityRemaining
< 0) {
1003 capacityRemaining
= 0;
1005 destIdx
+= uregex_group(regexp
, groupNum
, dest
+destIdx
, capacityRemaining
, status
);
1006 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1007 // Ignore buffer overflow when extracting the group. We need to
1008 // continue on to get full size of the untruncated result. We will
1009 // raise our own buffer overflow error at the end.
1010 *status
= U_ZERO_ERROR
;
1013 if (U_FAILURE(*status
)) {
1014 // Can fail if group number is out of range.
1021 // Nul Terminate the dest buffer if possible.
1022 // Set the appropriate buffer overflow or not terminated error, if needed.
1024 if (destIdx
< capacity
) {
// *destCapacity has not been modified yet at this point, so it still equals
// the local "capacity".
1026 } else if (destIdx
== *destCapacity
) {
1027 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1029 *status
= U_BUFFER_OVERFLOW_ERROR
;
1033 // Return an updated dest buffer and capacity to the caller.
1035 if (destIdx
> 0 && *destCapacity
> 0) {
1036 if (destIdx
< capacity
) {
1037 *destBuf
+= destIdx
;
1038 *destCapacity
-= destIdx
;
// Otherwise the buffer was filled: move the pointer to its end.
1040 *destBuf
+= capacity
;
1045 // If we came in with a buffer overflow, make sure we go out with one also.
1046 // (A zero length match right at the end of the previous match could
1047 // make this function succeed even though a previous call had overflowed the buf)
1048 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1049 *status
= U_BUFFER_OVERFLOW_ERROR
;
1056 // appendReplacement the actual API function,
// Public C entry point; forwards all arguments unchanged to
// RegexCImpl::appendReplacement and returns its result.
1058 U_CAPI
int32_t U_EXPORT2
1059 uregex_appendReplacement(URegularExpression
*regexp
,
1060 const UChar
*replacementText
,
1061 int32_t replacementLength
,
1063 int32_t *destCapacity
,
1064 UErrorCode
*status
) {
1065 return RegexCImpl::appendReplacement(
1066 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1070 //------------------------------------------------------------------------------
1072 // uregex_appendTail
1074 //------------------------------------------------------------------------------
// Appends the input text that follows the last match to *destBuf.
//   destBuf       in/out pointer to the output position; must be non-NULL, and
//                 *destBuf may be NULL only when *destCapacity is not positive.
//   destCapacity  in/out remaining capacity; must be non-NULL.
//   status        in/out error code; set to U_STRING_NOT_TERMINATED_WARNING or
//                 U_BUFFER_OVERFLOW_ERROR when the output does not fit with its
//                 terminating NUL.
1075 int32_t RegexCImpl::appendTail(URegularExpression
*regexp
,
1077 int32_t *destCapacity
,
// The pointer arguments are checked before *destCapacity is read below.
1081 if (destCapacity
== NULL
|| destBuf
== NULL
||
1082 *destBuf
== NULL
&& *destCapacity
> 0 ||
1085 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1089 // If we come in with a buffer overflow error, don't suppress the operation.
1090 // A series of appendReplacements, appendTail need to correctly preflight
1091 // the buffer size when an overflow happens somewhere in the middle.
1092 UBool pendingBufferOverflow
= FALSE
;
1093 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& *destCapacity
== 0) {
1094 pendingBufferOverflow
= TRUE
;
1095 *status
= U_ZERO_ERROR
;
1098 if (validateRE(regexp
, status
) == FALSE
) {
1101 RegexMatcher
*m
= regexp
->fMatcher
;
1105 // The most recent call to find() succeeded.
1106 srcIdx
= m
->fMatchEnd
;
1108 // The last call to find() on this matcher failed.
1109 // Look back to the end of the last find() that succeeded for src index.
1110 srcIdx
= m
->fLastMatchEnd
;
1112 // There has been no successful match with this matcher.
1113 // We want to copy the whole string.
// Work on local copies; the caller's variables are updated at the end.
1118 int32_t destIdx
= 0;
1119 int32_t destCap
= *destCapacity
;
1120 UChar
*dest
= *destBuf
;
// The copy ends at fTextLength when the length is known, or at the NUL when
// the text was supplied with length -1.
1123 if (srcIdx
== regexp
->fTextLength
) {
1126 UChar c
= regexp
->fText
[srcIdx
];
1127 if (c
== 0 && regexp
->fTextLength
== -1) {
1130 if (destIdx
< destCap
) {
1133 // We've overflowed the dest buffer.
1134 // If the total input string length is known, we can
1135 // compute the total buffer size needed without scanning through the string.
1136 if (regexp
->fTextLength
> 0) {
1137 destIdx
+= (regexp
->fTextLength
- srcIdx
);
1146 // NUL terminate the output string, if possible, otherwise issue the
1147 // appropriate error or warning.
1149 if (destIdx
< destCap
) {
1151 } else if (destIdx
== destCap
) {
1152 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1154 *status
= U_BUFFER_OVERFLOW_ERROR
;
1158 // Update the user's buffer ptr and capacity vars to reflect the
1161 if (destIdx
< destCap
) {
1162 *destBuf
+= destIdx
;
1163 *destCapacity
-= destIdx
;
// Otherwise the buffer was filled: move the pointer to its end.
1165 *destBuf
+= destCap
;
// An overflow that was pending on entry is re-raised if this call succeeded.
1169 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1170 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Public C entry point; forwards all arguments unchanged to
// RegexCImpl::appendTail and returns its result.
1177 U_CAPI
int32_t U_EXPORT2
1178 uregex_appendTail(URegularExpression
*regexp
,
1180 int32_t *destCapacity
,
1181 UErrorCode
*status
) {
1182 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1186 //------------------------------------------------------------------------------
1188 // copyString Internal utility to copy a string to an output buffer,
1189 // while managing buffer overflow and preflight size
1190 // computation. NUL termination is added to destination,
1191 // and the NUL is counted in the output size.
1193 //------------------------------------------------------------------------------
// Used by uregex_split() to emit each output field.
1194 static void copyString(UChar
*destBuffer
, // Destination buffer.
1195 int32_t destCapacity
, // Total capacity of dest buffer
1196 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1197 // Update not clipped to destCapacity.
1198 const UChar
*srcPtr
, // Pointer to source string
1199 int32_t srcLen
) // Source string len.
// di is the running destination index, starting from the caller's *destIndex.
1202 int32_t di
= *destIndex
;
1205 for (si
=0; si
<srcLen
; si
++) {
// Bounds check for each source character.
1207 if (di
< destCapacity
) {
// Bounds check again after the loop (for the terminating NUL, per the header
// comment of this function).
1215 if (di
<destCapacity
) {
1223 //------------------------------------------------------------------------------
1227 //------------------------------------------------------------------------------
// Splits the input text around matches of the pattern. Each field is copied
// into destBuf and destFields[] receives a pointer to the start of each field;
// capture groups of the delimiter pattern are emitted as additional fields.
//   destBuf             output buffer; may be NULL only when destCapacity is
//                       not positive.
//   destCapacity        capacity of destBuf in UChars.
//   requiredCapacity    optional out: total destBuf space needed (destIdx).
//   destFields          must be non-NULL; unused trailing entries are set to
//                       NULL.
//   destFieldsCapacity  number of entries in destFields; must be >= 1.
//   status              U_BUFFER_OVERFLOW_ERROR if the required space exceeds
//                       destCapacity.
// Requires text to have been set.
1228 U_CAPI
int32_t U_EXPORT2
1229 uregex_split(   URegularExpression
*regexp
,
1231 int32_t destCapacity
,
1232 int32_t *requiredCapacity
,
1233 UChar
*destFields
[],
1234 int32_t destFieldsCapacity
,
1235 UErrorCode
*status
) {
1236 if (validateRE(regexp
, status
) == FALSE
) {
1239 if (destBuf
== NULL
&& destCapacity
> 0 ||
1241 destFields
== NULL
||
1242 destFieldsCapacity
< 1 ) {
1243 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1248 // Reset for the input text
1250 regexp
->fMatcher
->reset();
// The length is taken from the UnicodeString wrapper, not from fTextLength
// (which may be -1).
1251 int32_t inputLen
= regexp
->fTextString
.length();
1252 int32_t nextOutputStringStart
= 0;
1253 if (inputLen
== 0) {
1259 // Loop through the input text, searching for the delimiter pattern
1261 int32_t i
; // Index of the field being processed.
1262 int32_t destIdx
= 0; // Next available position in destBuf;
1263 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1265 if (i
>=destFieldsCapacity
-1) {
1266 // There is one output string left, or none.
1267 // Fill the last output string with whatever is left from the input, then exit the loop.
1268 // ( i will be == destFieldsCapacity if we filled the output array while processing
1269 // capture groups of the delimiter expression, in which case we will discard the
1270 // last capture group saved in favor of the unprocessed remainder of the
1272 int32_t remainingLength
= inputLen
-nextOutputStringStart
;
1273 if (remainingLength
> 0) {
1275 if (i
>= destFieldsCapacity
) {
1276 // No fields are left. Recycle the last one for holding the trailing part of
1277 // the input string.
1278 i
= destFieldsCapacity
-1;
// Rewind destIdx to the recycled field's offset, measured from destFields[0].
1279 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1282 destFields
[i
] = &destBuf
[destIdx
];
// NOTE(review): "®exp" (here and twice more below) looks like a mis-encoded
// "&regexp" -- restore before compiling.
1283 copyString(destBuf
, destCapacity
, &destIdx
,
1284 ®exp
->fText
[nextOutputStringStart
], remainingLength
);
1288 if (regexp
->fMatcher
->find()) {
1289 // We found another delimiter. Move everything from where we started looking
1290 // up until the start of the delimiter into the next output string.
1291 int32_t fieldLen
= regexp
->fMatcher
->start(*status
) - nextOutputStringStart
;
1292 destFields
[i
] = &destBuf
[destIdx
];
1293 copyString(destBuf
, destCapacity
, &destIdx
,
1294 ®exp
->fText
[nextOutputStringStart
], fieldLen
);
1295 nextOutputStringStart
= regexp
->fMatcher
->end(*status
);
1297 // If the delimiter pattern has capturing parentheses, the captured
1298 // text goes out into the next n destination strings.
1300 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1301 // If we've run out of output string slots, bail out.
1302 if (i
==destFieldsCapacity
-1) {
1307 // Set up to extract the capture group contents into the dest buffer.
1308 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow
1309 // error while extracting this group.
// destIdx may already exceed destCapacity, so clamp the capacity handed to
// uregex_group at zero.
1310 int32_t remainingCapacity
= destCapacity
- destIdx
;
1311 if (remainingCapacity
< 0) {
1312 remainingCapacity
= 0;
1314 destFields
[i
] = &destBuf
[destIdx
];
1315 int32_t t
= uregex_group(regexp
, groupNum
, destFields
[i
], remainingCapacity
, &tStatus
);
1316 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1317 // +1 for the NUL that terminates the string.
1320 if (nextOutputStringStart
== inputLen
) {
1321 // The delimiter was at the end of the string. We're done.
1328 // We ran off the end of the input while looking for the next delimiter.
1329 // All the remaining text goes into the current output string.
1330 destFields
[i
] = &destBuf
[destIdx
];
1331 copyString(destBuf
, destCapacity
, &destIdx
,
1332 ®exp
->fText
[nextOutputStringStart
], inputLen
-nextOutputStringStart
);
1337 // Zero out any unused portion of the destFields array
1339 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1340 destFields
[j
] = NULL
;
1343 if (requiredCapacity
!= NULL
) {
1344 *requiredCapacity
= destIdx
;
// destIdx is not clipped to destCapacity, so it can exceed it here.
1346 if (destIdx
> destCapacity
) {
1347 *status
= U_BUFFER_OVERFLOW_ERROR
;
1353 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS