2 *******************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
23 struct URegularExpression
: public UMemory
{
26 ~URegularExpression();
29 int32_t *fPatRefCount
;
31 int32_t fPatStringLen
;
32 RegexMatcher
*fMatcher
;
33 const UChar
*fText
; // Text from setText()
34 int32_t fTextLength
; // Length provided by user with setText(), which
37 UnicodeString fTextString
; // The setText(text) is wrapped into a UnicodeString.
38 // TODO: regexp engine should not depend on UnicodeString.
41 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
45 URegularExpression::URegularExpression() {
56 URegularExpression::~URegularExpression() {
59 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
61 uprv_free(fPatString
);
62 uprv_free(fPatRefCount
);
67 //----------------------------------------------------------------------------------------
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
72 static UBool
validateRE(const URegularExpression
*re
, UErrorCode
*status
, UBool requiresText
= TRUE
) {
73 if (U_FAILURE(*status
)) {
76 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
78 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
81 if (requiresText
&& re
->fText
== NULL
) {
82 *status
= U_REGEX_INVALID_STATE
;
88 //----------------------------------------------------------------------------------------
92 //----------------------------------------------------------------------------------------
93 U_CAPI URegularExpression
* U_EXPORT2
94 uregex_open( const UChar
*pattern
,
95 int32_t patternLength
,
100 if (U_FAILURE(*status
)) {
103 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
104 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
107 int32_t actualPatLen
= patternLength
;
108 if (actualPatLen
== -1) {
109 actualPatLen
= u_strlen(pattern
);
112 URegularExpression
*re
= new URegularExpression
;
113 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
114 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
115 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
116 *status
= U_MEMORY_ALLOCATION_ERROR
;
122 re
->fPatRefCount
= refC
;
123 *re
->fPatRefCount
= 1;
126 // Make a copy of the pattern string, so we can return it later if asked.
127 // For compiling the pattern, we will use a read-only-aliased UnicodeString
128 // of this local copy, to avoid making even more copies.
130 re
->fPatString
= patBuf
;
131 re
->fPatStringLen
= patternLength
;
132 u_memcpy(patBuf
, pattern
, actualPatLen
);
133 patBuf
[actualPatLen
] = 0;
134 UnicodeString
patString(patternLength
==-1, patBuf
, patternLength
);
137 // Compile the pattern
140 re
->fPat
= RegexPattern::compile(patString
, flags
, *pe
, *status
);
142 re
->fPat
= RegexPattern::compile(patString
, flags
, *status
);
144 if (U_FAILURE(*status
)) {
149 // Create the matcher object
151 re
->fMatcher
= re
->fPat
->matcher(*status
);
152 if (U_SUCCESS(*status
)) {
165 //----------------------------------------------------------------------------------------
169 //----------------------------------------------------------------------------------------
170 U_CAPI URegularExpression
* U_EXPORT2
171 uregex_openC( const char *pattern
,
174 UErrorCode
*status
) {
175 if (U_FAILURE(*status
)) {
178 if (pattern
== NULL
) {
179 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
183 UnicodeString
patString(pattern
);
184 URegularExpression
*re
= uregex_open(patString
.getBuffer(), patString
.length(), flags
, pe
, status
);
188 //----------------------------------------------------------------------------------------
192 //----------------------------------------------------------------------------------------
193 U_CAPI
void U_EXPORT2
194 uregex_close(URegularExpression
*re
) {
195 UErrorCode status
= U_ZERO_ERROR
;
196 if (validateRE(re
, &status
, FALSE
) == FALSE
) {
203 //----------------------------------------------------------------------------------------
207 //----------------------------------------------------------------------------------------
208 U_CAPI URegularExpression
* U_EXPORT2
209 uregex_clone(const URegularExpression
*source
, UErrorCode
*status
) {
210 if (validateRE(source
, status
, FALSE
) == FALSE
) {
214 URegularExpression
*clone
= new URegularExpression
;
216 *status
= U_MEMORY_ALLOCATION_ERROR
;
220 clone
->fMatcher
= source
->fPat
->matcher(*status
);
221 if (U_FAILURE(*status
)) {
226 *status
= U_MEMORY_ALLOCATION_ERROR
;
230 clone
->fPat
= source
->fPat
;
231 clone
->fPatRefCount
= source
->fPatRefCount
;
232 clone
->fPatString
= source
->fPatString
;
233 clone
->fPatStringLen
= source
->fPatStringLen
;
234 umtx_atomic_inc(source
->fPatRefCount
);
235 // Note: fText is not cloned.
243 //----------------------------------------------------------------------------------------
247 //----------------------------------------------------------------------------------------
248 U_CAPI
const UChar
* U_EXPORT2
249 uregex_pattern(const URegularExpression
*regexp
,
251 UErrorCode
*status
) {
253 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
256 if (patLength
!= NULL
) {
257 *patLength
= regexp
->fPatStringLen
;
259 return regexp
->fPatString
;
263 //----------------------------------------------------------------------------------------
267 //----------------------------------------------------------------------------------------
268 U_CAPI
int32_t U_EXPORT2
269 uregex_flags(const URegularExpression
*regexp
, UErrorCode
*status
) {
270 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
273 int32_t flags
= regexp
->fPat
->flags();
278 //----------------------------------------------------------------------------------------
282 //----------------------------------------------------------------------------------------
283 U_CAPI
void U_EXPORT2
284 uregex_setText(URegularExpression
*regexp
,
287 UErrorCode
*status
) {
288 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
291 if (text
== NULL
|| textLength
< -1) {
292 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
295 regexp
->fText
= text
;
296 regexp
->fTextLength
= textLength
;
297 UBool isTerminated
= (textLength
== -1);
299 regexp
->fTextString
.setTo(isTerminated
, text
, textLength
);
300 regexp
->fMatcher
->reset(regexp
->fTextString
);
305 //----------------------------------------------------------------------------------------
309 //----------------------------------------------------------------------------------------
310 U_CAPI
const UChar
* U_EXPORT2
311 uregex_getText(URegularExpression
*regexp
,
313 UErrorCode
*status
) {
314 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
317 if (textLength
!= NULL
) {
318 *textLength
= regexp
->fTextLength
;
320 return regexp
->fText
;
324 //----------------------------------------------------------------------------------------
328 //----------------------------------------------------------------------------------------
329 U_CAPI UBool U_EXPORT2
330 uregex_matches(URegularExpression
*regexp
,
332 UErrorCode
*status
) {
333 if (validateRE(regexp
, status
) == FALSE
) {
336 UBool result
= regexp
->fMatcher
->matches(startIndex
, *status
);
342 //----------------------------------------------------------------------------------------
346 //----------------------------------------------------------------------------------------
347 U_CAPI UBool U_EXPORT2
348 uregex_lookingAt(URegularExpression
*regexp
,
350 UErrorCode
*status
) {
351 if (validateRE(regexp
, status
) == FALSE
) {
354 UBool result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
360 //----------------------------------------------------------------------------------------
364 //----------------------------------------------------------------------------------------
365 U_CAPI UBool U_EXPORT2
366 uregex_find(URegularExpression
*regexp
,
368 UErrorCode
*status
) {
369 if (validateRE(regexp
, status
) == FALSE
) {
372 UBool result
= regexp
->fMatcher
->find(startIndex
, *status
);
376 //----------------------------------------------------------------------------------------
380 //----------------------------------------------------------------------------------------
381 U_CAPI UBool U_EXPORT2
382 uregex_findNext(URegularExpression
*regexp
,
383 UErrorCode
*status
) {
384 if (validateRE(regexp
, status
) == FALSE
) {
387 UBool result
= regexp
->fMatcher
->find();
391 //----------------------------------------------------------------------------------------
395 //----------------------------------------------------------------------------------------
396 U_CAPI
int32_t U_EXPORT2
397 uregex_groupCount(URegularExpression
*regexp
,
398 UErrorCode
*status
) {
399 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
402 int32_t result
= regexp
->fMatcher
->groupCount();
407 //----------------------------------------------------------------------------------------
411 //----------------------------------------------------------------------------------------
412 U_CAPI
int32_t U_EXPORT2
413 uregex_group(URegularExpression
*regexp
,
416 int32_t destCapacity
,
417 UErrorCode
*status
) {
418 if (validateRE(regexp
, status
) == FALSE
) {
421 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
422 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
427 // Pick up the range of characters from the matcher
429 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
430 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
431 if (U_FAILURE(*status
)) {
436 // Trim length based on buffer capacity
438 int32_t fullLength
= endIx
- startIx
;
439 int32_t copyLength
= fullLength
;
440 if (copyLength
< destCapacity
) {
441 dest
[copyLength
] = 0;
442 } else if (copyLength
== destCapacity
) {
443 *status
= U_STRING_NOT_TERMINATED_WARNING
;
445 copyLength
= destCapacity
;
446 *status
= U_BUFFER_OVERFLOW_ERROR
;
450 // Copy capture group to user's buffer
452 if (copyLength
> 0) {
453 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
459 //----------------------------------------------------------------------------------------
463 //----------------------------------------------------------------------------------------
464 U_CAPI
int32_t U_EXPORT2
465 uregex_start(URegularExpression
*regexp
,
467 UErrorCode
*status
) {
468 if (validateRE(regexp
, status
) == FALSE
) {
471 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
476 //----------------------------------------------------------------------------------------
480 //----------------------------------------------------------------------------------------
481 U_CAPI
int32_t U_EXPORT2
482 uregex_end(URegularExpression
*regexp
,
484 UErrorCode
*status
) {
485 if (validateRE(regexp
, status
) == FALSE
) {
488 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
492 //----------------------------------------------------------------------------------------
496 //----------------------------------------------------------------------------------------
497 U_CAPI
void U_EXPORT2
498 uregex_reset(URegularExpression
*regexp
,
500 UErrorCode
*status
) {
501 if (validateRE(regexp
, status
) == FALSE
) {
504 regexp
->fMatcher
->reset(index
, *status
);
508 //----------------------------------------------------------------------------------------
512 //----------------------------------------------------------------------------------------
513 U_CAPI
int32_t U_EXPORT2
514 uregex_replaceAll(URegularExpression
*regexp
,
515 UChar
*replacementText
,
516 int32_t replacementLength
,
518 int32_t destCapacity
,
519 UErrorCode
*status
) {
520 if (validateRE(regexp
, status
) == FALSE
) {
523 if (replacementText
== NULL
|| replacementLength
< -1 ||
524 destBuf
== NULL
&& destCapacity
> 0 ||
526 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
531 uregex_reset(regexp
, 0, status
);
532 while (uregex_findNext(regexp
, status
)) {
533 len
+= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
534 &destBuf
, &destCapacity
, status
);
536 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
542 //----------------------------------------------------------------------------------------
544 // uregex_replaceFirst
546 //----------------------------------------------------------------------------------------
547 U_CAPI
int32_t U_EXPORT2
548 uregex_replaceFirst(URegularExpression
*regexp
,
549 UChar
*replacementText
,
550 int32_t replacementLength
,
552 int32_t destCapacity
,
553 UErrorCode
*status
) {
554 if (validateRE(regexp
, status
) == FALSE
) {
557 if (replacementText
== NULL
|| replacementLength
< -1 ||
558 destBuf
== NULL
&& destCapacity
> 0 ||
560 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
566 uregex_reset(regexp
, 0, status
);
567 findSucceeded
= uregex_find(regexp
, 0, status
);
569 len
= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
570 &destBuf
, &destCapacity
, status
);
572 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
578 //----------------------------------------------------------------------------------------
580 // uregex_appendReplacement
582 //----------------------------------------------------------------------------------------
586 // Dummy class, because these functions need to be friends of class RegexMatcher,
587 // and stand-alone C functions don't work as friends
592 inline static int32_t appendReplacement(URegularExpression
*regexp
,
593 UChar
*replacementText
,
594 int32_t replacementLength
,
596 int32_t *destCapacity
,
599 inline static int32_t appendTail(URegularExpression
*regexp
,
601 int32_t *destCapacity
,
608 // Call-back function for u_unescapeAt(), used when we encounter
609 // \uxxxx or \Uxxxxxxxx escapes in the replacement text.
612 static UChar U_CALLCONV
613 unescape_charAt(int32_t offset
, void *context
) {
614 UChar c16
= ((UChar
*)context
)[offset
];
620 static const UChar BACKSLASH
= 0x5c;
621 static const UChar DOLLARSIGN
= 0x24;
624 // Move a character to an output buffer, with bounds checking on the index.
625 // Index advances even if capacity is exceeded, for preflight size computations.
626 // This little sequence is used a LOT.
628 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
629 if (*idx
< bufCapacity
) {
637 // appendReplacement, the actual implementation.
639 int32_t RegexCImpl::appendReplacement(URegularExpression
*regexp
,
640 UChar
*replacementText
,
641 int32_t replacementLength
,
643 int32_t *destCapacity
,
644 UErrorCode
*status
) {
646 // If we come in with a buffer overflow error, don't suppress the operation.
647 // A series of appendReplacements, appendTail need to correctly preflight
648 // the buffer size when an overflow happens somewhere in the middle.
649 UBool pendingBufferOverflow
= FALSE
;
650 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
== 0) {
651 pendingBufferOverflow
= TRUE
;
652 *status
= U_ZERO_ERROR
;
656 // Validate all paramters
658 if (validateRE(regexp
, status
) == FALSE
) {
661 if (replacementText
== NULL
|| replacementLength
< -1 ||
662 destCapacity
== NULL
|| destBuf
== NULL
||
663 *destBuf
== NULL
&& *destCapacity
> 0 ||
665 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
669 RegexMatcher
*m
= regexp
->fMatcher
;
670 if (m
->fMatch
== FALSE
) {
671 *status
= U_REGEX_INVALID_STATE
;
675 UChar
*dest
= *destBuf
;
676 int32_t capacity
= *destCapacity
;
680 // If it wasn't supplied by the caller, get the length of the replacement text.
681 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
682 // the fly and avoid this step.
683 if (replacementLength
== -1) {
684 replacementLength
= u_strlen(replacementText
);
687 // Copy input string from the end of previous match to start of current match
688 for (i
=m
->fLastMatchEnd
; i
<m
->fMatchStart
; i
++) {
689 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
694 // scan the replacement text, looking for substitutions ($n) and \escapes.
696 while (replIdx
< replacementLength
) {
697 UChar c
= replacementText
[replIdx
];
699 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
700 // Common case, no substitution, no escaping,
701 // just copy the char to the dest buf.
702 appendToBuf(c
, &destIdx
, dest
, capacity
);
706 if (c
== BACKSLASH
) {
707 // Backslash Escape. Copy the following char out without further checks.
708 // Note: Surrogate pairs don't need any special handling
709 // The second half wont be a '$' or a '\', and
710 // will move to the dest normally on the next
712 if (replIdx
>= replacementLength
) {
715 c
= replacementText
[replIdx
];
717 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
718 // We have a \udddd or \Udddddddd escape sequence.
719 UChar32 escapedChar
=
720 u_unescapeAt(unescape_charAt
,
721 &replIdx
, // Index is updated by unescapeAt
722 replacementLength
, // Length of replacement text
725 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
726 if (escapedChar
<= 0xffff) {
727 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
729 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
730 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
734 // Note: if the \u escape was invalid, just fall through and
735 // treat it as a plain \<anything> escape.
738 // Plain backslash escape. Just put out the escaped character.
739 appendToBuf(c
, &destIdx
, dest
, capacity
);
747 // We've got a $. Pick up a capture group number if one follows.
748 // Consume at most the number of digits necessary for the largest capture
749 // number that is valid for this pattern.
751 int32_t numDigits
= 0;
752 int32_t groupNum
= 0;
755 if (replIdx
>= replacementLength
) {
758 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
759 if (u_isdigit(digitC
) == FALSE
) {
763 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
764 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
766 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
772 if (numDigits
== 0) {
773 // The $ didn't introduce a group number at all.
774 // Treat it as just part of the substitution text.
775 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
779 // Finally, append the capture group data to the destination.
780 int32_t capacityRemaining
= capacity
- destIdx
;
781 if (capacityRemaining
< 0) {
782 capacityRemaining
= 0;
784 destIdx
+= uregex_group(regexp
, groupNum
, dest
+destIdx
, capacityRemaining
, status
);
785 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
786 // Ignore buffer overflow when extracting the group. We need to
787 // continue on to get full size of the untruncated result. We will
788 // raise our own buffer overflow error at the end.
789 *status
= U_ZERO_ERROR
;
792 if (U_FAILURE(*status
)) {
793 // Can fail if group number is out of range.
800 // Nul Terminate the dest buffer if possible.
801 // Set the appropriate buffer overflow or not terminated error, if needed.
803 if (destIdx
< capacity
) {
805 } else if (destIdx
== *destCapacity
) {
806 *status
= U_STRING_NOT_TERMINATED_WARNING
;
808 *status
= U_BUFFER_OVERFLOW_ERROR
;
812 // Return an updated dest buffer and capacity to the caller.
814 if (destIdx
> 0 && *destCapacity
> 0) {
815 if (destIdx
< capacity
) {
817 *destCapacity
-= destIdx
;
819 *destBuf
+= capacity
;
824 // If we came in with a buffer overflow, make sure we go out with one also.
825 // (A zero length match right at the end of the previous match could
826 // make this function succeed even though a previous call had overflowed the buf)
827 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
828 *status
= U_BUFFER_OVERFLOW_ERROR
;
835 // appendReplacement the actual API function,
837 U_CAPI
int32_t U_EXPORT2
838 uregex_appendReplacement(URegularExpression
*regexp
,
839 UChar
*replacementText
,
840 int32_t replacementLength
,
842 int32_t *destCapacity
,
843 UErrorCode
*status
) {
844 return RegexCImpl::appendReplacement(
845 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
849 //----------------------------------------------------------------------------------------
853 //----------------------------------------------------------------------------------------
854 int32_t RegexCImpl::appendTail(URegularExpression
*regexp
,
856 int32_t *destCapacity
,
857 UErrorCode
*status
) {
859 // If we come in with a buffer overflow error, don't suppress the operation.
860 // A series of appendReplacements, appendTail need to correctly preflight
861 // the buffer size when an overflow happens somewhere in the middle.
862 UBool pendingBufferOverflow
= FALSE
;
863 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& *destCapacity
== 0) {
864 pendingBufferOverflow
= TRUE
;
865 *status
= U_ZERO_ERROR
;
868 if (validateRE(regexp
, status
) == FALSE
) {
871 if (destCapacity
== NULL
|| destBuf
== NULL
||
872 *destBuf
== NULL
&& *destCapacity
> 0 ||
874 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
878 RegexMatcher
*m
= regexp
->fMatcher
;
882 // The most recent call to find() succeeded.
883 srcIdx
= m
->fMatchEnd
;
885 // The last call to find() on this matcher failed().
886 // Look back to the end of the last find() that succeeded for src index.
887 srcIdx
= m
->fLastMatchEnd
;
889 // There has been no successful match with this matcher.
890 // We want to copy the whole string.
896 int32_t destCap
= *destCapacity
;
897 UChar
*dest
= *destBuf
;
900 if (srcIdx
== regexp
->fTextLength
) {
903 UChar c
= regexp
->fText
[srcIdx
];
904 if (c
== 0 && regexp
->fTextLength
== -1) {
907 if (destIdx
< destCap
) {
910 // We've overflowed the dest buffer.
911 // If the total input string length is known, we can
912 // compute the total buffer size needed without scanning through the string.
913 if (regexp
->fTextLength
> 0) {
914 destIdx
+= (regexp
->fTextLength
- srcIdx
);
923 // NUL terminate the output string, if possible, otherwise issue the
924 // appropriate error or warning.
926 if (destIdx
< destCap
) {
928 } else if (destIdx
== destCap
) {
929 *status
= U_STRING_NOT_TERMINATED_WARNING
;
931 *status
= U_BUFFER_OVERFLOW_ERROR
;
935 // Update the user's buffer ptr and capacity vars to reflect the
938 if (destIdx
< destCap
) {
940 *destCapacity
-= destIdx
;
946 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
947 *status
= U_BUFFER_OVERFLOW_ERROR
;
954 U_CAPI
int32_t U_EXPORT2
955 uregex_appendTail(URegularExpression
*regexp
,
957 int32_t *destCapacity
,
958 UErrorCode
*status
) {
959 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
963 //----------------------------------------------------------------------------------------
965 // copyString Internal utility to copy a string to an output buffer,
966 // while managing buffer overflow and preflight size
967 // computation. NUL termination is added to destination,
968 // and the NUL is counted in the output size.
970 //----------------------------------------------------------------------------------------
971 static void copyString(UChar
*destBuffer
, // Destination buffer.
972 int32_t destCapacity
, // Total capacity of dest buffer
973 int32_t *destIndex
, // Index into dest buffer. Updated on return.
974 // Update not clipped to destCapacity.
975 const UChar
*srcPtr
, // Pointer to source string
976 int32_t srcLen
) // Source string len.
979 int32_t di
= *destIndex
;
982 for (si
=0; si
<srcLen
; si
++) {
984 if (di
< destCapacity
) {
992 destBuffer
[di
++] = 0;
997 //----------------------------------------------------------------------------------------
1001 //----------------------------------------------------------------------------------------
1002 U_CAPI
int32_t U_EXPORT2
1003 uregex_split( URegularExpression
*regexp
,
1005 int32_t destCapacity
,
1006 int32_t *requiredCapacity
,
1007 UChar
*destFields
[],
1008 int32_t destFieldsCapacity
,
1009 UErrorCode
*status
) {
1010 if (validateRE(regexp
, status
) == FALSE
) {
1013 if (destBuf
== NULL
&& destCapacity
> 0 ||
1015 destFields
== NULL
||
1016 destFieldsCapacity
< 1 ) {
1017 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1022 // Reset for the input text
1024 regexp
->fMatcher
->reset();
1025 int32_t inputLen
= regexp
->fTextString
.length();
1026 int32_t nextOutputStringStart
= 0;
1027 if (inputLen
== 0) {
1033 // Loop through the input text, searching for the delimiter pattern
1035 int32_t i
; // Index of the field being processed.
1036 int32_t destIdx
= 0; // Next available position in destBuf;
1037 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1039 if (i
>=destFieldsCapacity
-1) {
1040 // There are one or zero output string left.
1041 // Fill the last output string with whatever is left from the input, then exit the loop.
1042 // ( i will be == destFieldsCapacity if we filled the output array while processing
1043 // capture groups of the delimiter expression, in which case we will discard the
1044 // last capture group saved in favor of the unprocessed remainder of the
1046 int32_t remainingLength
= inputLen
-nextOutputStringStart
;
1047 if (remainingLength
> 0) {
1049 if (i
>= destFieldsCapacity
) {
1050 // No fields are left. Recycle the last one for holding the trailing part of
1051 // the input string.
1052 i
= destFieldsCapacity
-1;
1053 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1056 destFields
[i
] = &destBuf
[destIdx
];
1057 copyString(destBuf
, destCapacity
, &destIdx
,
1058 ®exp
->fText
[nextOutputStringStart
], remainingLength
);
1062 if (regexp
->fMatcher
->find()) {
1063 // We found another delimiter. Move everything from where we started looking
1064 // up until the start of the delimiter into the next output string.
1065 int32_t fieldLen
= regexp
->fMatcher
->start(*status
) - nextOutputStringStart
;
1066 destFields
[i
] = &destBuf
[destIdx
];
1067 copyString(destBuf
, destCapacity
, &destIdx
,
1068 ®exp
->fText
[nextOutputStringStart
], fieldLen
);
1069 nextOutputStringStart
= regexp
->fMatcher
->end(*status
);
1071 // If the delimiter pattern has capturing parentheses, the captured
1072 // text goes out into the next n destination strings.
1074 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1075 // If we've run out of output string slots, bail out.
1076 if (i
==destFieldsCapacity
-1) {
1081 // Set up to extract the capture group contents into the dest buffer.
1082 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow
1083 // error while extracting this group.
1084 int32_t remainingCapacity
= destCapacity
- destIdx
;
1085 if (remainingCapacity
< 0) {
1086 remainingCapacity
= 0;
1088 destFields
[i
] = &destBuf
[destIdx
];
1089 int32_t t
= uregex_group(regexp
, groupNum
, destFields
[i
], remainingCapacity
, &tStatus
);
1090 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1091 // +1 for the NUL that terminates the string.
1094 if (nextOutputStringStart
== inputLen
) {
1095 // The delimiter was at the end of the string. We're done.
1102 // We ran off the end of the input while looking for the next delimiter.
1103 // All the remaining text goes into the current output string.
1104 destFields
[i
] = &destBuf
[destIdx
];
1105 copyString(destBuf
, destCapacity
, &destIdx
,
1106 ®exp
->fText
[nextOutputStringStart
], inputLen
-nextOutputStringStart
);
1111 // Zero out any unused portion of the destFields array
1113 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1114 destFields
[j
] = NULL
;
1117 if (requiredCapacity
!= NULL
) {
1118 *requiredCapacity
= destIdx
;
1120 if (*requiredCapacity
> destCapacity
) {
1121 *status
= U_BUFFER_OVERFLOW_ERROR
;
1131 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS