2 *******************************************************************************
3 * Copyright (C) 2004-2005, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
// State behind the URegularExpression handle used by the uregex_ C API functions.
// NOTE(review): this listing is missing some member lines; fMagic, fPat and
// fPatString are used by the functions in this file but their declarations
// are not visible here.
23 struct URegularExpression
: public UMemory
{
26 ~URegularExpression();
// Counter for the pattern data; uregex_clone() copies this pointer into the
// clone and increments it, and the destructor decrements it.
29 int32_t *fPatRefCount
;
// Set by uregex_open() to the caller's patternLength argument, so it is -1
// when the pattern was supplied as a NUL-terminated string.
31 int32_t fPatStringLen
;
32 RegexMatcher
*fMatcher
;
33 const UChar
*fText
; // Text from setText()
34 int32_t fTextLength
; // Length provided by user with setText(); -1 when the text is NUL-terminated.
37 UnicodeString fTextString
; // The setText(text) is wrapped into a UnicodeString.
38 // TODO: regexp engine should not depend on UnicodeString.
// Value that validateRE() expects to find in a handle's fMagic field; a handle
// whose fMagic differs is rejected with U_ILLEGAL_ARGUMENT_ERROR.
41 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
// Constructor.
// NOTE(review): the constructor body is not present in this listing, so the
// initial field values cannot be confirmed from here.
45 URegularExpression::URegularExpression() {
// Destructor. Decrements the pattern reference count (if there is one) and,
// when the decrement yields zero, frees the pattern string buffer and the
// counter itself with uprv_free().
// NOTE(review): the embedded numbering skips lines inside this body, so other
// cleanup steps may exist that are not visible here.
56 URegularExpression::~URegularExpression() {
59 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
61 uprv_free(fPatString
);
62 uprv_free(fPatRefCount
);
67 //----------------------------------------------------------------------------------------
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
// Shared parameter check for the API functions in this file.
//   re            handle to check; must be non-NULL with fMagic == REXP_MAGIC,
//                 otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR.
//   status        in/out error code; an incoming failure is tested first.
//   requiresText  defaults to TRUE; when set, a handle whose fText is NULL
//                 sets *status to U_REGEX_INVALID_STATE.
// NOTE(review): the return statements are missing from this listing; the file
// comment says the function returns TRUE when the parameters look OK.
72 static UBool
validateRE(const URegularExpression
*re
, UErrorCode
*status
, UBool requiresText
= TRUE
) {
73 if (U_FAILURE(*status
)) {
76 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
78 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
81 if (requiresText
&& re
->fText
== NULL
) {
82 *status
= U_REGEX_INVALID_STATE
;
88 //----------------------------------------------------------------------------------------
92 //----------------------------------------------------------------------------------------
// uregex_open: allocate a URegularExpression, keep a private NUL-terminated copy
// of the pattern, compile it, and create a matcher for it.
//   pattern        pattern text; NULL is rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   patternLength  length in UChars, or -1 for NUL-terminated; values below -1
//                  and 0 are rejected with U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): some parameter lines (flags, pe, status are used below) and all
// return / cleanup lines are missing from this listing.
93 U_CAPI URegularExpression
* U_EXPORT2
94 uregex_open( const UChar
*pattern
,
95 int32_t patternLength
,
100 if (U_FAILURE(*status
)) {
103 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
104 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Resolve the real length when the caller passed -1 (NUL-terminated pattern).
107 int32_t actualPatLen
= patternLength
;
108 if (actualPatLen
== -1) {
109 actualPatLen
= u_strlen(pattern
);
// Allocate the handle, the reference counter, and a buffer with room for the
// pattern plus a terminating NUL; any failure is reported as a memory error.
112 URegularExpression
*re
= new URegularExpression
;
113 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
114 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
115 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
116 *status
= U_MEMORY_ALLOCATION_ERROR
;
// A freshly opened handle holds the only reference to the pattern data.
122 re
->fPatRefCount
= refC
;
123 *re
->fPatRefCount
= 1;
126 // Make a copy of the pattern string, so we can return it later if asked.
127 // For compiling the pattern, we will use a read-only-aliased UnicodeString
128 // of this local copy, to avoid making even more copies.
130 re
->fPatString
= patBuf
;
// Note: the caller's patternLength (possibly -1) is stored, not actualPatLen.
131 re
->fPatStringLen
= patternLength
;
132 u_memcpy(patBuf
, pattern
, actualPatLen
);
133 patBuf
[actualPatLen
] = 0;
134 UnicodeString
patString(patternLength
==-1, patBuf
, patternLength
);
137 // Compile the pattern
// Two compile calls are visible, one passing *pe and one without it.
// NOTE(review): the condition that selects between them is not in this listing.
140 re
->fPat
= RegexPattern::compile(patString
, flags
, *pe
, *status
);
142 re
->fPat
= RegexPattern::compile(patString
, flags
, *status
);
144 if (U_FAILURE(*status
)) {
149 // Create the matcher object
151 re
->fMatcher
= re
->fPat
->matcher(*status
);
152 if (U_SUCCESS(*status
)) {
162 //----------------------------------------------------------------------------------------
166 //----------------------------------------------------------------------------------------
// uregex_close: takes a handle and returns nothing. The handle is validated with
// a local status variable and requiresText == FALSE, so a handle with no text
// set is accepted.
// NOTE(review): the lines that follow the validation (including the actual
// release of the handle) are missing from this listing.
167 U_CAPI
void U_EXPORT2
168 uregex_close(URegularExpression
*re
) {
169 UErrorCode status
= U_ZERO_ERROR
;
170 if (validateRE(re
, &status
, FALSE
) == FALSE
) {
177 //----------------------------------------------------------------------------------------
181 //----------------------------------------------------------------------------------------
// uregex_clone: create a new handle that shares the source handle's compiled
// pattern, pattern string and reference counter, but has its own matcher.
//   source  handle to clone; validated with requiresText == FALSE.
//   status  in/out error code.
// NOTE(review): the conditions guarding the two U_MEMORY_ALLOCATION_ERROR
// assignments, the cleanup paths and the return statements are missing from
// this listing.
182 U_CAPI URegularExpression
* U_EXPORT2
183 uregex_clone(const URegularExpression
*source
, UErrorCode
*status
) {
184 if (validateRE(source
, status
, FALSE
) == FALSE
) {
188 URegularExpression
*clone
= new URegularExpression
;
190 *status
= U_MEMORY_ALLOCATION_ERROR
;
// The clone gets its own matcher, created from the source's pattern.
194 clone
->fMatcher
= source
->fPat
->matcher(*status
);
195 if (U_FAILURE(*status
)) {
200 *status
= U_MEMORY_ALLOCATION_ERROR
;
// Share the pattern data with the source and record the extra reference.
204 clone
->fPat
= source
->fPat
;
205 clone
->fPatRefCount
= source
->fPatRefCount
;
206 clone
->fPatString
= source
->fPatString
;
207 clone
->fPatStringLen
= source
->fPatStringLen
;
208 umtx_atomic_inc(source
->fPatRefCount
);
209 // Note: fText is not cloned.
217 //------------------------------------------------------------------------------
221 //------------------------------------------------------------------------------
// uregex_pattern: return the pattern string stored in the handle.
//   regexp     handle; validated with requiresText == FALSE.
//   patLength  optional output (a parameter line is missing from this listing);
//              when non-NULL it receives fPatStringLen, which is the length
//              value originally passed to uregex_open() and may therefore be -1.
//   status     in/out error code.
// Returns the handle's fPatString pointer.
222 U_CAPI
const UChar
* U_EXPORT2
223 uregex_pattern(const URegularExpression
*regexp
,
225 UErrorCode
*status
) {
227 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
230 if (patLength
!= NULL
) {
231 *patLength
= regexp
->fPatStringLen
;
233 return regexp
->fPatString
;
237 //------------------------------------------------------------------------------
241 //------------------------------------------------------------------------------
// uregex_flags: read the flags of the handle's compiled pattern via
// RegexPattern::flags(). The handle is validated with requiresText == FALSE.
// NOTE(review): the return statements are missing from this listing.
242 U_CAPI
int32_t U_EXPORT2
243 uregex_flags(const URegularExpression
*regexp
, UErrorCode
*status
) {
244 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
247 int32_t flags
= regexp
->fPat
->flags();
252 //------------------------------------------------------------------------------
256 //------------------------------------------------------------------------------
// uregex_setText: set the subject text the handle's matcher will operate on.
//   regexp      handle; validated with requiresText == FALSE.
//   text        subject text (parameter line missing from this listing); NULL is
//               rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   textLength  length in UChars or -1 for NUL-terminated (parameter line
//               missing from this listing); values below -1 are rejected.
//   status      in/out error code.
// The pointer and length are stored as given; the text is also wrapped in the
// handle's fTextString via setTo(), and the matcher is reset onto that string.
257 U_CAPI
void U_EXPORT2
258 uregex_setText(URegularExpression
*regexp
,
261 UErrorCode
*status
) {
262 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
265 if (text
== NULL
|| textLength
< -1) {
266 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
269 regexp
->fText
= text
;
270 regexp
->fTextLength
= textLength
;
271 UBool isTerminated
= (textLength
== -1);
273 regexp
->fTextString
.setTo(isTerminated
, text
, textLength
);
274 regexp
->fMatcher
->reset(regexp
->fTextString
);
279 //------------------------------------------------------------------------------
283 //------------------------------------------------------------------------------
// uregex_getText: return the text pointer previously stored by uregex_setText().
//   regexp      handle; validated with requiresText == FALSE, so a handle with
//               no text set passes validation and its fText is returned as is.
//   textLength  optional output (parameter line missing from this listing);
//               when non-NULL it receives fTextLength, i.e. the length value the
//               caller passed to uregex_setText(), possibly -1.
//   status      in/out error code.
284 U_CAPI
const UChar
* U_EXPORT2
285 uregex_getText(URegularExpression
*regexp
,
287 UErrorCode
*status
) {
288 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
291 if (textLength
!= NULL
) {
292 *textLength
= regexp
->fTextLength
;
294 return regexp
->fText
;
298 //------------------------------------------------------------------------------
302 //------------------------------------------------------------------------------
// uregex_matches: validate the handle (text must have been set, because
// validateRE is called with its default requiresText) and forward to
// RegexMatcher::matches(startIndex, *status).
// NOTE(review): the startIndex parameter line and the return statements are
// missing from this listing.
303 U_CAPI UBool U_EXPORT2
304 uregex_matches(URegularExpression
*regexp
,
306 UErrorCode
*status
) {
307 if (validateRE(regexp
, status
) == FALSE
) {
310 UBool result
= regexp
->fMatcher
->matches(startIndex
, *status
);
316 //------------------------------------------------------------------------------
320 //------------------------------------------------------------------------------
// uregex_lookingAt: validate the handle (text required) and forward to
// RegexMatcher::lookingAt(startIndex, *status).
// NOTE(review): the startIndex parameter line and the return statements are
// missing from this listing.
321 U_CAPI UBool U_EXPORT2
322 uregex_lookingAt(URegularExpression
*regexp
,
324 UErrorCode
*status
) {
325 if (validateRE(regexp
, status
) == FALSE
) {
328 UBool result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
334 //------------------------------------------------------------------------------
338 //------------------------------------------------------------------------------
// uregex_find: validate the handle (text required) and forward to
// RegexMatcher::find(startIndex, *status).
// NOTE(review): the startIndex parameter line and the return statements are
// missing from this listing.
339 U_CAPI UBool U_EXPORT2
340 uregex_find(URegularExpression
*regexp
,
342 UErrorCode
*status
) {
343 if (validateRE(regexp
, status
) == FALSE
) {
346 UBool result
= regexp
->fMatcher
->find(startIndex
, *status
);
350 //------------------------------------------------------------------------------
354 //------------------------------------------------------------------------------
// uregex_findNext: validate the handle (text required) and forward to the
// argument-less RegexMatcher::find().
// NOTE(review): the return statements are missing from this listing.
355 U_CAPI UBool U_EXPORT2
356 uregex_findNext(URegularExpression
*regexp
,
357 UErrorCode
*status
) {
358 if (validateRE(regexp
, status
) == FALSE
) {
361 UBool result
= regexp
->fMatcher
->find();
365 //------------------------------------------------------------------------------
369 //------------------------------------------------------------------------------
// uregex_groupCount: validate the handle with requiresText == FALSE (no text
// needs to be set) and forward to RegexMatcher::groupCount().
// NOTE(review): the return statements are missing from this listing.
370 U_CAPI
int32_t U_EXPORT2
371 uregex_groupCount(URegularExpression
*regexp
,
372 UErrorCode
*status
) {
373 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
376 int32_t result
= regexp
->fMatcher
->groupCount();
381 //------------------------------------------------------------------------------
385 //------------------------------------------------------------------------------
// uregex_group: copy the text of one capture group of the current match into a
// caller-supplied buffer.
//   regexp        handle; text must have been set (default validateRE check).
//   groupNum      capture group to extract (parameter line missing from this listing).
//   dest          destination buffer (parameter line missing from this listing);
//                 may be NULL only when destCapacity is 0.
//   destCapacity  capacity of dest in UChars; negative values are rejected
//                 with U_ILLEGAL_ARGUMENT_ERROR.
//   status        in/out error code.
// Fix: the u_memcpy source argument had been mis-decoded as "®exp" (the HTML
// entity "&reg"); it is restored to "&regexp", the address of the first
// character of the group inside the handle's text.
// NOTE(review): the return statements and some closing / else lines are missing
// from this listing.
386 U_CAPI
int32_t U_EXPORT2
387 uregex_group(URegularExpression
*regexp
,
390 int32_t destCapacity
,
391 UErrorCode
*status
) {
392 if (validateRE(regexp
, status
) == FALSE
) {
395 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
396 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
401 // Pick up the range of characters from the matcher
403 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
404 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
405 if (U_FAILURE(*status
)) {
410 // Trim length based on buffer capacity
// Room for a terminator: write the NUL. Exact fit: warn that the result is
// not terminated. The remaining visible assignments clamp copyLength to the
// capacity and report U_BUFFER_OVERFLOW_ERROR.
412 int32_t fullLength
= endIx
- startIx
;
413 int32_t copyLength
= fullLength
;
414 if (copyLength
< destCapacity
) {
415 dest
[copyLength
] = 0;
416 } else if (copyLength
== destCapacity
) {
417 *status
= U_STRING_NOT_TERMINATED_WARNING
;
419 copyLength
= destCapacity
;
420 *status
= U_BUFFER_OVERFLOW_ERROR
;
424 // Copy capture group to user's buffer
426 if (copyLength
> 0) {
427 u_memcpy(dest
, &regexp
->fText
[startIx
], copyLength
);
433 //------------------------------------------------------------------------------
437 //------------------------------------------------------------------------------
// uregex_start: validate the handle (text required) and forward to
// RegexMatcher::start(groupNum, *status).
// NOTE(review): the groupNum parameter line and the return statements are
// missing from this listing.
438 U_CAPI
int32_t U_EXPORT2
439 uregex_start(URegularExpression
*regexp
,
441 UErrorCode
*status
) {
442 if (validateRE(regexp
, status
) == FALSE
) {
445 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
450 //------------------------------------------------------------------------------
454 //------------------------------------------------------------------------------
// uregex_end: validate the handle (text required) and forward to
// RegexMatcher::end(groupNum, *status).
// NOTE(review): the groupNum parameter line and the return statements are
// missing from this listing.
455 U_CAPI
int32_t U_EXPORT2
456 uregex_end(URegularExpression
*regexp
,
458 UErrorCode
*status
) {
459 if (validateRE(regexp
, status
) == FALSE
) {
462 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
466 //------------------------------------------------------------------------------
470 //------------------------------------------------------------------------------
// uregex_reset: validate the handle (text required) and forward to
// RegexMatcher::reset(index, *status).
// NOTE(review): the index parameter line is missing from this listing.
471 U_CAPI
void U_EXPORT2
472 uregex_reset(URegularExpression
*regexp
,
474 UErrorCode
*status
) {
475 if (validateRE(regexp
, status
) == FALSE
) {
478 regexp
->fMatcher
->reset(index
, *status
);
482 //------------------------------------------------------------------------------
486 //------------------------------------------------------------------------------
// uregex_replaceAll: replace every match in the handle's text, writing the
// result into the caller's buffer.
//   regexp             handle; text must have been set.
//   replacementText    replacement template; NULL is rejected.
//   replacementLength  length of the template, or -1; values below -1 are rejected.
//   destBuf            output buffer (parameter line missing from this listing);
//                      may be NULL only when destCapacity is not positive.
//   destCapacity       capacity of destBuf in UChars.
//   status             in/out error code.
// The matcher is reset to index 0; each uregex_findNext() hit is followed by an
// uregex_appendReplacement() call, and uregex_appendTail() runs afterwards. The
// values those calls return are accumulated in len.
// NOTE(review): the declaration of len, the last term of the argument check,
// and the return statements are missing from this listing.
487 U_CAPI
int32_t U_EXPORT2
488 uregex_replaceAll(URegularExpression
*regexp
,
489 const UChar
*replacementText
,
490 int32_t replacementLength
,
492 int32_t destCapacity
,
493 UErrorCode
*status
) {
494 if (validateRE(regexp
, status
) == FALSE
) {
497 if (replacementText
== NULL
|| replacementLength
< -1 ||
498 destBuf
== NULL
&& destCapacity
> 0 ||
500 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
505 uregex_reset(regexp
, 0, status
);
506 while (uregex_findNext(regexp
, status
)) {
507 len
+= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
508 &destBuf
, &destCapacity
, status
);
510 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
516 //------------------------------------------------------------------------------
518 // uregex_replaceFirst
520 //------------------------------------------------------------------------------
// uregex_replaceFirst: replacement driver that performs a single find from
// index 0 instead of looping over all matches.
//   regexp             handle; text must have been set.
//   replacementText    replacement template; NULL is rejected.
//   replacementLength  length of the template, or -1; values below -1 are rejected.
//   destBuf            output buffer (parameter line missing from this listing);
//                      may be NULL only when destCapacity is not positive.
//   destCapacity       capacity of destBuf in UChars.
//   status             in/out error code.
// The matcher is reset, uregex_find() is called once from index 0, and the
// results of uregex_appendReplacement() and uregex_appendTail() are added
// together in len.
// NOTE(review): the declarations of len and findSucceeded, the condition that
// guards the appendReplacement call, the last term of the argument check, and
// the return statements are missing from this listing.
521 U_CAPI
int32_t U_EXPORT2
522 uregex_replaceFirst(URegularExpression
*regexp
,
523 const UChar
*replacementText
,
524 int32_t replacementLength
,
526 int32_t destCapacity
,
527 UErrorCode
*status
) {
528 if (validateRE(regexp
, status
) == FALSE
) {
531 if (replacementText
== NULL
|| replacementLength
< -1 ||
532 destBuf
== NULL
&& destCapacity
> 0 ||
534 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
540 uregex_reset(regexp
, 0, status
);
541 findSucceeded
= uregex_find(regexp
, 0, status
);
543 len
= uregex_appendReplacement(regexp
, replacementText
, replacementLength
,
544 &destBuf
, &destCapacity
, status
);
546 len
+= uregex_appendTail(regexp
, &destBuf
, &destCapacity
, status
);
552 //------------------------------------------------------------------------------
554 // uregex_appendReplacement
556 //------------------------------------------------------------------------------
560 // Dummy class, because these functions need to be friends of class RegexMatcher,
561 // and stand-alone C functions don't work as friends
// Declarations of the two static helpers, appendReplacement and appendTail,
// whose definitions appear later in this file as RegexCImpl:: members; the
// public uregex_appendReplacement() / uregex_appendTail() functions call them.
// NOTE(review): the enclosing class header, the destBuf and status parameter
// lines, and the closing lines are missing from this listing.
566 inline static int32_t appendReplacement(URegularExpression
*regexp
,
567 const UChar
*replacementText
,
568 int32_t replacementLength
,
570 int32_t *destCapacity
,
573 inline static int32_t appendTail(URegularExpression
*regexp
,
575 int32_t *destCapacity
,
582 // Call-back function for u_unescapeAt(), used when we encounter
583 // \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
// Character-access callback handed to u_unescapeAt(): treats context as a
// UChar array and reads the code unit at the given offset.
// NOTE(review): the return statement is missing from this listing.
586 static UChar U_CALLCONV
587 unescape_charAt(int32_t offset
, void *context
) {
588 UChar c16
= ((UChar
*)context
)[offset
];
// The two characters that the replacement-text scanner in appendReplacement
// treats specially: 0x5c is '\' (escape) and 0x24 is '$' (group reference).
594 static const UChar BACKSLASH
= 0x5c;
595 static const UChar DOLLARSIGN
= 0x24;
598 // Move a character to an output buffer, with bounds checking on the index.
599 // Index advances even if capacity is exceeded, for preflight size computations.
600 // This little sequence is used a LOT.
// Append one UChar to an output buffer, guarded by a capacity check on *idx.
//   c            character to append.
//   idx          in/out index into buf.
//   buf          output buffer.
//   bufCapacity  capacity of buf in UChars.
// NOTE(review): the store and the index increment are missing from this
// listing; the file comment above says the index advances even when the
// capacity is exceeded.
602 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
603 if (*idx
< bufCapacity
) {
611 // appendReplacement, the actual implementation.
// appendReplacement implementation: append to the caller's buffer the input text
// between the previous match and the current match, followed by the replacement
// text with $n group references and backslash escapes expanded.
//   regexp             handle; text must be set and the matcher must hold a
//                      current match (otherwise U_REGEX_INVALID_STATE).
//   replacementText    replacement template; NULL is rejected.
//   replacementLength  length of the template, or -1 to measure it with u_strlen().
//   destBuf            in/out pointer to the output position (parameter line
//                      missing from this listing).
//   destCapacity       in/out remaining capacity at *destBuf.
//   status             in/out error code. An incoming U_BUFFER_OVERFLOW_ERROR
//                      with zero remaining capacity is treated as a pending
//                      overflow rather than a reason to stop.
// Fix: the pending-overflow test compared the pointer destCapacity itself with
// 0, which can never be true for a valid call; it now checks the pointer for
// NULL and then tests *destCapacity, matching the test in appendTail.
// NOTE(review): return statements, several local declarations (destIdx, i,
// replIdx, digitC), loop headers and closing lines are missing from this listing.
613 int32_t RegexCImpl::appendReplacement(URegularExpression
*regexp
,
614 const UChar
*replacementText
,
615 int32_t replacementLength
,
617 int32_t *destCapacity
,
618 UErrorCode
*status
) {
620 // If we come in with a buffer overflow error, don't suppress the operation.
621 // A series of appendReplacements, appendTail need to correctly preflight
622 // the buffer size when an overflow happens somewhere in the middle.
623 UBool pendingBufferOverflow
= FALSE
;
624 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
625 pendingBufferOverflow
= TRUE
;
626 *status
= U_ZERO_ERROR
;
630 // Validate all parameters
632 if (validateRE(regexp
, status
) == FALSE
) {
635 if (replacementText
== NULL
|| replacementLength
< -1 ||
636 destCapacity
== NULL
|| destBuf
== NULL
||
637 *destBuf
== NULL
&& *destCapacity
> 0 ||
639 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// A replacement can only be appended while the matcher holds a current match.
643 RegexMatcher
*m
= regexp
->fMatcher
;
644 if (m
->fMatch
== FALSE
) {
645 *status
= U_REGEX_INVALID_STATE
;
649 UChar
*dest
= *destBuf
;
650 int32_t capacity
= *destCapacity
;
654 // If it wasn't supplied by the caller, get the length of the replacement text.
655 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
656 // the fly and avoid this step.
657 if (replacementLength
== -1) {
658 replacementLength
= u_strlen(replacementText
);
661 // Copy input string from the end of previous match to start of current match
662 for (i
=m
->fLastMatchEnd
; i
<m
->fMatchStart
; i
++) {
663 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
668 // scan the replacement text, looking for substitutions ($n) and \escapes.
670 while (replIdx
< replacementLength
) {
671 UChar c
= replacementText
[replIdx
];
673 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
674 // Common case, no substitution, no escaping,
675 // just copy the char to the dest buf.
676 appendToBuf(c
, &destIdx
, dest
, capacity
);
680 if (c
== BACKSLASH
) {
681 // Backslash Escape. Copy the following char out without further checks.
682 // Note: Surrogate pairs don't need any special handling
683 // The second half won't be a '$' or a '\', and
684 // will move to the dest normally on the next
//        loop iteration.
686 if (replIdx
>= replacementLength
) {
689 c
= replacementText
[replIdx
];
691 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
692 // We have a \udddd or \Udddddddd escape sequence.
693 UChar32 escapedChar
=
694 u_unescapeAt(unescape_charAt
,
695 &replIdx
, // Index is updated by unescapeAt
696 replacementLength
, // Length of replacement text
697 (void *)replacementText
);
// 0xFFFFFFFF is the value u_unescapeAt() yields for a malformed escape.
// Values up to 0xffff fit in one UChar; larger ones are written as a
// lead/trail surrogate pair.
699 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
700 if (escapedChar
<= 0xffff) {
701 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
703 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
704 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
708 // Note: if the \u escape was invalid, just fall through and
709 // treat it as a plain \<anything> escape.
712 // Plain backslash escape. Just put out the escaped character.
713 appendToBuf(c
, &destIdx
, dest
, capacity
);
721 // We've got a $. Pick up a capture group number if one follows.
722 // Consume at most the number of digits necessary for the largest capture
723 // number that is valid for this pattern.
725 int32_t numDigits
= 0;
726 int32_t groupNum
= 0;
729 if (replIdx
>= replacementLength
) {
732 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
733 if (u_isdigit(digitC
) == FALSE
) {
737 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
738 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
740 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
746 if (numDigits
== 0) {
747 // The $ didn't introduce a group number at all.
748 // Treat it as just part of the substitution text.
749 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
753 // Finally, append the capture group data to the destination.
// destIdx may already exceed capacity while preflighting, so clamp the
// remaining capacity passed to uregex_group() at zero.
754 int32_t capacityRemaining
= capacity
- destIdx
;
755 if (capacityRemaining
< 0) {
756 capacityRemaining
= 0;
758 destIdx
+= uregex_group(regexp
, groupNum
, dest
+destIdx
, capacityRemaining
, status
);
759 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
760 // Ignore buffer overflow when extracting the group. We need to
761 // continue on to get full size of the untruncated result. We will
762 // raise our own buffer overflow error at the end.
763 *status
= U_ZERO_ERROR
;
766 if (U_FAILURE(*status
)) {
767 // Can fail if group number is out of range.
774 // Nul Terminate the dest buffer if possible.
775 // Set the appropriate buffer overflow or not terminated error, if needed.
777 if (destIdx
< capacity
) {
779 } else if (destIdx
== *destCapacity
) {
780 *status
= U_STRING_NOT_TERMINATED_WARNING
;
782 *status
= U_BUFFER_OVERFLOW_ERROR
;
786 // Return an updated dest buffer and capacity to the caller.
788 if (destIdx
> 0 && *destCapacity
> 0) {
789 if (destIdx
< capacity
) {
791 *destCapacity
-= destIdx
;
793 *destBuf
+= capacity
;
798 // If we came in with a buffer overflow, make sure we go out with one also.
799 // (A zero length match right at the end of the previous match could
800 // make this function succeed even though a previous call had overflowed the buf)
801 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
802 *status
= U_BUFFER_OVERFLOW_ERROR
;
809 // appendReplacement the actual API function,
// Public C entry point: passes all of its arguments unchanged to
// RegexCImpl::appendReplacement() and returns that function's result.
// NOTE(review): the destBuf parameter line is missing from this listing.
811 U_CAPI
int32_t U_EXPORT2
812 uregex_appendReplacement(URegularExpression
*regexp
,
813 const UChar
*replacementText
,
814 int32_t replacementLength
,
816 int32_t *destCapacity
,
817 UErrorCode
*status
) {
818 return RegexCImpl::appendReplacement(
819 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
823 //------------------------------------------------------------------------------
827 //------------------------------------------------------------------------------
// appendTail implementation: append to the caller's buffer the input text that
// follows the last match, and update the caller's buffer pointer and capacity.
//   regexp        handle; text must have been set.
//   destBuf       in/out pointer to the output position (parameter line
//                 missing from this listing).
//   destCapacity  in/out remaining capacity at *destBuf.
//   status        in/out error code. An incoming U_BUFFER_OVERFLOW_ERROR with
//                 zero remaining capacity is treated as a pending overflow.
// Fix: the pending-overflow test dereferenced destCapacity before the
// "destCapacity == NULL" argument check further down had run; the test now
// checks the pointer for NULL first.
// NOTE(review): return statements, local declarations (srcIdx, destIdx), the
// conditions choosing the start index, the copy-loop header and several
// closing / else lines are missing from this listing.
828 int32_t RegexCImpl::appendTail(URegularExpression
*regexp
,
830 int32_t *destCapacity
,
831 UErrorCode
*status
) {
833 // If we come in with a buffer overflow error, don't suppress the operation.
834 // A series of appendReplacements, appendTail need to correctly preflight
835 // the buffer size when an overflow happens somewhere in the middle.
836 UBool pendingBufferOverflow
= FALSE
;
837 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
838 pendingBufferOverflow
= TRUE
;
839 *status
= U_ZERO_ERROR
;
842 if (validateRE(regexp
, status
) == FALSE
) {
845 if (destCapacity
== NULL
|| destBuf
== NULL
||
846 *destBuf
== NULL
&& *destCapacity
> 0 ||
848 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
852 RegexMatcher
*m
= regexp
->fMatcher
;
856 // The most recent call to find() succeeded.
857 srcIdx
= m
->fMatchEnd
;
859 // The last call to find() on this matcher failed().
860 // Look back to the end of the last find() that succeeded for src index.
861 srcIdx
= m
->fLastMatchEnd
;
863 // There has been no successful match with this matcher.
864 // We want to copy the whole string.
870 int32_t destCap
= *destCapacity
;
871 UChar
*dest
= *destBuf
;
// Two end-of-input tests: the index reaching the known text length, or a NUL
// code unit when the text length was given as -1 (NUL-terminated).
874 if (srcIdx
== regexp
->fTextLength
) {
877 UChar c
= regexp
->fText
[srcIdx
];
878 if (c
== 0 && regexp
->fTextLength
== -1) {
881 if (destIdx
< destCap
) {
884 // We've overflowed the dest buffer.
885 // If the total input string length is known, we can
886 // compute the total buffer size needed without scanning through the string.
887 if (regexp
->fTextLength
> 0) {
888 destIdx
+= (regexp
->fTextLength
- srcIdx
);
897 // NUL terminate the output string, if possible, otherwise issue the
898 // appropriate error or warning.
900 if (destIdx
< destCap
) {
902 } else if (destIdx
== destCap
) {
903 *status
= U_STRING_NOT_TERMINATED_WARNING
;
905 *status
= U_BUFFER_OVERFLOW_ERROR
;
909 // Update the user's buffer ptr and capacity vars to reflect the
912 if (destIdx
< destCap
) {
914 *destCapacity
-= destIdx
;
// As in appendReplacement: if we came in with a pending overflow, make sure we
// also leave with one.
920 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
921 *status
= U_BUFFER_OVERFLOW_ERROR
;
// Public C entry point: passes all of its arguments unchanged to
// RegexCImpl::appendTail() and returns that function's result.
// NOTE(review): the destBuf parameter line is missing from this listing.
928 U_CAPI
int32_t U_EXPORT2
929 uregex_appendTail(URegularExpression
*regexp
,
931 int32_t *destCapacity
,
932 UErrorCode
*status
) {
933 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
937 //------------------------------------------------------------------------------
939 // copyString Internal utility to copy a string to an output buffer,
940 // while managing buffer overflow and preflight size
941 // computation. NUL termination is added to destination,
942 // and the NUL is counted in the output size.
944 //------------------------------------------------------------------------------
// Copy srcLen UChars from srcPtr into destBuffer starting at *destIndex, with
// capacity checks on the running index both inside the loop and once after it.
// NOTE(review): the loop body, the statements inside the two capacity checks,
// the declaration of si and the write-back of *destIndex are missing from this
// listing; the file comment above says a NUL is added and counted in the
// output size.
945 static void copyString(UChar
*destBuffer
, // Destination buffer.
946 int32_t destCapacity
, // Total capacity of dest buffer
947 int32_t *destIndex
, // Index into dest buffer. Updated on return.
948 // Update not clipped to destCapacity.
949 const UChar
*srcPtr
, // Pointer to source string
950 int32_t srcLen
) // Source string len.
953 int32_t di
= *destIndex
;
956 for (si
=0; si
<srcLen
; si
++) {
958 if (di
< destCapacity
) {
966 if (di
<destCapacity
) {
974 //------------------------------------------------------------------------------
978 //------------------------------------------------------------------------------
// uregex_split: split the handle's text around matches of the pattern, copying
// each field (and each capture group of the delimiter) as a separate string
// into destBuf and recording the start of each string in destFields.
//   regexp              handle; text must have been set.
//   destBuf             output buffer for the field strings (parameter line
//                       missing from this listing); may be NULL only when
//                       destCapacity is not positive.
//   destCapacity        capacity of destBuf in UChars.
//   requiredCapacity    optional output; when non-NULL receives the final
//                       destIdx, i.e. the total space the output needed.
//   destFields          output array of pointers into destBuf (parameter line
//                       missing from this listing); must not be NULL.
//   destFieldsCapacity  number of slots in destFields; must be at least 1.
//   status              in/out error code; set to U_BUFFER_OVERFLOW_ERROR when
//                       destIdx ends up larger than destCapacity.
// Fix: three source-pointer arguments to copyString() had been mis-decoded as
// "®exp" (the HTML entity "&reg"); they are restored to "&regexp".
// NOTE(review): the main loop header, several break / return statements, the
// declarations of groupNum and j, and many closing lines are missing from this
// listing.
979 U_CAPI
int32_t U_EXPORT2
980 uregex_split( URegularExpression
*regexp
,
982 int32_t destCapacity
,
983 int32_t *requiredCapacity
,
985 int32_t destFieldsCapacity
,
986 UErrorCode
*status
) {
987 if (validateRE(regexp
, status
) == FALSE
) {
990 if (destBuf
== NULL
&& destCapacity
> 0 ||
992 destFields
== NULL
||
993 destFieldsCapacity
< 1 ) {
994 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
999 // Reset for the input text
1001 regexp
->fMatcher
->reset();
1002 int32_t inputLen
= regexp
->fTextString
.length();
1003 int32_t nextOutputStringStart
= 0;
1004 if (inputLen
== 0) {
1010 // Loop through the input text, searching for the delimiter pattern
1012 int32_t i
; // Index of the field being processed.
1013 int32_t destIdx
= 0; // Next available position in destBuf;
1014 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1016 if (i
>=destFieldsCapacity
-1) {
1017 // There is one or zero output strings left.
1018 // Fill the last output string with whatever is left from the input, then exit the loop.
1019 // ( i will be == destFieldsCapacity if we filled the output array while processing
1020 // capture groups of the delimiter expression, in which case we will discard the
1021 // last capture group saved in favor of the unprocessed remainder of the
//    input string.)
1023 int32_t remainingLength
= inputLen
-nextOutputStringStart
;
1024 if (remainingLength
> 0) {
1026 if (i
>= destFieldsCapacity
) {
1027 // No fields are left. Recycle the last one for holding the trailing part of
1028 // the input string.
1029 i
= destFieldsCapacity
-1;
1030 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1033 destFields
[i
] = &destBuf
[destIdx
];
1034 copyString(destBuf
, destCapacity
, &destIdx
,
1035 &regexp
->fText
[nextOutputStringStart
], remainingLength
);
1039 if (regexp
->fMatcher
->find()) {
1040 // We found another delimiter. Move everything from where we started looking
1041 // up until the start of the delimiter into the next output string.
1042 int32_t fieldLen
= regexp
->fMatcher
->start(*status
) - nextOutputStringStart
;
1043 destFields
[i
] = &destBuf
[destIdx
];
1044 copyString(destBuf
, destCapacity
, &destIdx
,
1045 &regexp
->fText
[nextOutputStringStart
], fieldLen
);
1046 nextOutputStringStart
= regexp
->fMatcher
->end(*status
);
1048 // If the delimiter pattern has capturing parentheses, the captured
1049 // text goes out into the next n destination strings.
1051 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1052 // If we've run out of output string slots, bail out.
1053 if (i
==destFieldsCapacity
-1) {
1058 // Set up to extract the capture group contents into the dest buffer.
1059 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow
1060 // error while extracting this group.
1061 int32_t remainingCapacity
= destCapacity
- destIdx
;
1062 if (remainingCapacity
< 0) {
1063 remainingCapacity
= 0;
1065 destFields
[i
] = &destBuf
[destIdx
];
1066 int32_t t
= uregex_group(regexp
, groupNum
, destFields
[i
], remainingCapacity
, &tStatus
);
1067 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1068 // +1 for the NUL that terminates the string.
1071 if (nextOutputStringStart
== inputLen
) {
1072 // The delimiter was at the end of the string. We're done.
1079 // We ran off the end of the input while looking for the next delimiter.
1080 // All the remaining text goes into the current output string.
1081 destFields
[i
] = &destBuf
[destIdx
];
1082 copyString(destBuf
, destCapacity
, &destIdx
,
1083 &regexp
->fText
[nextOutputStringStart
], inputLen
-nextOutputStringStart
);
1088 // Zero out any unused portion of the destFields array
1090 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1091 destFields
[j
] = NULL
;
1094 if (requiredCapacity
!= NULL
) {
1095 *requiredCapacity
= destIdx
;
1097 if (destIdx
> destCapacity
) {
1098 *status
= U_BUFFER_OVERFLOW_ERROR
;
1104 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS