1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2004-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uregex.cpp
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uregex.h"
17 #include "unicode/unistr.h"
18 #include "unicode/ustring.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uobject.h"
21 #include "unicode/utf16.h"
32 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
34 struct RegularExpression
: public UMemory
{
40 u_atomic_int32_t
*fPatRefCount
;
42 int32_t fPatStringLen
;
43 RegexMatcher
*fMatcher
;
44 const UChar
*fText
; // Text from setText()
45 int32_t fTextLength
; // Length provided by user with setText(), which
50 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
52 RegularExpression::RegularExpression() {
64 RegularExpression::~RegularExpression() {
67 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
69 uprv_free(fPatString
);
70 uprv_free((void *)fPatRefCount
);
72 if (fOwnsText
&& fText
!=NULL
) {
73 uprv_free((void *)fText
);
82 //----------------------------------------------------------------------------------------
84 // validateRE Do boilerplate style checks on API function parameters.
85 // Return TRUE if they look OK.
86 //----------------------------------------------------------------------------------------
87 static UBool
validateRE(const RegularExpression
*re
, UBool requiresText
, UErrorCode
*status
) {
88 if (U_FAILURE(*status
)) {
91 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
92 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
95 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
96 if (requiresText
&& re
->fText
== NULL
&& !re
->fOwnsText
) {
97 *status
= U_REGEX_INVALID_STATE
;
103 //----------------------------------------------------------------------------------------
107 //----------------------------------------------------------------------------------------
108 U_CAPI URegularExpression
* U_EXPORT2
109 uregex_open( const UChar
*pattern
,
110 int32_t patternLength
,
113 UErrorCode
*status
) {
115 if (U_FAILURE(*status
)) {
118 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
119 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
122 int32_t actualPatLen
= patternLength
;
123 if (actualPatLen
== -1) {
124 actualPatLen
= u_strlen(pattern
);
127 RegularExpression
*re
= new RegularExpression
;
128 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
129 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
130 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
131 *status
= U_MEMORY_ALLOCATION_ERROR
;
133 uprv_free((void *)refC
);
137 re
->fPatRefCount
= refC
;
138 *re
->fPatRefCount
= 1;
141 // Make a copy of the pattern string, so we can return it later if asked.
142 // For compiling the pattern, we will use a UText wrapper around
143 // this local copy, to avoid making even more copies.
145 re
->fPatString
= patBuf
;
146 re
->fPatStringLen
= patternLength
;
147 u_memcpy(patBuf
, pattern
, actualPatLen
);
148 patBuf
[actualPatLen
] = 0;
150 UText patText
= UTEXT_INITIALIZER
;
151 utext_openUChars(&patText
, patBuf
, patternLength
, status
);
154 // Compile the pattern
157 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
159 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
161 utext_close(&patText
);
163 if (U_FAILURE(*status
)) {
168 // Create the matcher object
170 re
->fMatcher
= re
->fPat
->matcher(*status
);
171 if (U_SUCCESS(*status
)) {
172 return (URegularExpression
*)re
;
181 //----------------------------------------------------------------------------------------
185 //----------------------------------------------------------------------------------------
186 U_CAPI URegularExpression
* U_EXPORT2
187 uregex_openUText(UText
*pattern
,
190 UErrorCode
*status
) {
192 if (U_FAILURE(*status
)) {
195 if (pattern
== NULL
) {
196 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
200 int64_t patternNativeLength
= utext_nativeLength(pattern
);
202 if (patternNativeLength
== 0) {
203 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
207 RegularExpression
*re
= new RegularExpression
;
209 UErrorCode lengthStatus
= U_ZERO_ERROR
;
210 int32_t pattern16Length
= utext_extract(pattern
, 0, patternNativeLength
, NULL
, 0, &lengthStatus
);
212 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
213 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(pattern16Length
+1));
214 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
215 *status
= U_MEMORY_ALLOCATION_ERROR
;
217 uprv_free((void *)refC
);
221 re
->fPatRefCount
= refC
;
222 *re
->fPatRefCount
= 1;
225 // Make a copy of the pattern string, so we can return it later if asked.
226 // For compiling the pattern, we will use a read-only UText wrapper
227 // around this local copy, to avoid making even more copies.
229 re
->fPatString
= patBuf
;
230 re
->fPatStringLen
= pattern16Length
;
231 utext_extract(pattern
, 0, patternNativeLength
, patBuf
, pattern16Length
+1, status
);
233 UText patText
= UTEXT_INITIALIZER
;
234 utext_openUChars(&patText
, patBuf
, pattern16Length
, status
);
237 // Compile the pattern
240 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
242 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
244 utext_close(&patText
);
246 if (U_FAILURE(*status
)) {
251 // Create the matcher object
253 re
->fMatcher
= re
->fPat
->matcher(*status
);
254 if (U_SUCCESS(*status
)) {
255 return (URegularExpression
*)re
;
264 //----------------------------------------------------------------------------------------
268 //----------------------------------------------------------------------------------------
269 U_CAPI
void U_EXPORT2
270 uregex_close(URegularExpression
*re2
) {
271 RegularExpression
*re
= (RegularExpression
*)re2
;
272 UErrorCode status
= U_ZERO_ERROR
;
273 if (validateRE(re
, FALSE
, &status
) == FALSE
) {
280 //----------------------------------------------------------------------------------------
284 //----------------------------------------------------------------------------------------
285 U_CAPI URegularExpression
* U_EXPORT2
286 uregex_clone(const URegularExpression
*source2
, UErrorCode
*status
) {
287 RegularExpression
*source
= (RegularExpression
*)source2
;
288 if (validateRE(source
, FALSE
, status
) == FALSE
) {
292 RegularExpression
*clone
= new RegularExpression
;
294 *status
= U_MEMORY_ALLOCATION_ERROR
;
298 clone
->fMatcher
= source
->fPat
->matcher(*status
);
299 if (U_FAILURE(*status
)) {
304 clone
->fPat
= source
->fPat
;
305 clone
->fPatRefCount
= source
->fPatRefCount
;
306 clone
->fPatString
= source
->fPatString
;
307 clone
->fPatStringLen
= source
->fPatStringLen
;
308 umtx_atomic_inc(source
->fPatRefCount
);
309 // Note: fText is not cloned.
311 return (URegularExpression
*)clone
;
317 //------------------------------------------------------------------------------
321 //------------------------------------------------------------------------------
322 U_CAPI
const UChar
* U_EXPORT2
323 uregex_pattern(const URegularExpression
*regexp2
,
325 UErrorCode
*status
) {
326 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
328 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
331 if (patLength
!= NULL
) {
332 *patLength
= regexp
->fPatStringLen
;
334 return regexp
->fPatString
;
338 //------------------------------------------------------------------------------
340 // uregex_patternUText
342 //------------------------------------------------------------------------------
343 U_CAPI UText
* U_EXPORT2
344 uregex_patternUText(const URegularExpression
*regexp2
,
345 UErrorCode
*status
) {
346 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
347 return regexp
->fPat
->patternText(*status
);
351 //------------------------------------------------------------------------------
355 //------------------------------------------------------------------------------
356 U_CAPI
int32_t U_EXPORT2
357 uregex_flags(const URegularExpression
*regexp2
, UErrorCode
*status
) {
358 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
359 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
362 int32_t flags
= regexp
->fPat
->flags();
367 //------------------------------------------------------------------------------
371 //------------------------------------------------------------------------------
372 U_CAPI
void U_EXPORT2
373 uregex_setText(URegularExpression
*regexp2
,
376 UErrorCode
*status
) {
377 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
378 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
381 if (text
== NULL
|| textLength
< -1) {
382 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
386 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
387 uprv_free((void *)regexp
->fText
);
390 regexp
->fText
= text
;
391 regexp
->fTextLength
= textLength
;
392 regexp
->fOwnsText
= FALSE
;
394 UText input
= UTEXT_INITIALIZER
;
395 utext_openUChars(&input
, text
, textLength
, status
);
396 regexp
->fMatcher
->reset(&input
);
397 utext_close(&input
); // reset() made a shallow clone, so we don't need this copy
401 //------------------------------------------------------------------------------
405 //------------------------------------------------------------------------------
406 U_CAPI
void U_EXPORT2
407 uregex_setUText(URegularExpression
*regexp2
,
409 UErrorCode
*status
) {
410 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
411 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
415 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
419 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
420 uprv_free((void *)regexp
->fText
);
423 regexp
->fText
= NULL
; // only fill it in on request
424 regexp
->fTextLength
= -1;
425 regexp
->fOwnsText
= TRUE
;
426 regexp
->fMatcher
->reset(text
);
431 //------------------------------------------------------------------------------
435 //------------------------------------------------------------------------------
436 U_CAPI
const UChar
* U_EXPORT2
437 uregex_getText(URegularExpression
*regexp2
,
439 UErrorCode
*status
) {
440 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
441 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
445 if (regexp
->fText
== NULL
) {
446 // need to fill in the text
447 UText
*inputText
= regexp
->fMatcher
->inputText();
448 int64_t inputNativeLength
= utext_nativeLength(inputText
);
449 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText
, inputNativeLength
)) {
450 regexp
->fText
= inputText
->chunkContents
;
451 regexp
->fTextLength
= (int32_t)inputNativeLength
;
452 regexp
->fOwnsText
= FALSE
; // because the UText owns it
454 UErrorCode lengthStatus
= U_ZERO_ERROR
;
455 regexp
->fTextLength
= utext_extract(inputText
, 0, inputNativeLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
456 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(regexp
->fTextLength
+1));
458 utext_extract(inputText
, 0, inputNativeLength
, inputChars
, regexp
->fTextLength
+1, status
);
459 regexp
->fText
= inputChars
;
460 regexp
->fOwnsText
= TRUE
; // should already be set but just in case
464 if (textLength
!= NULL
) {
465 *textLength
= regexp
->fTextLength
;
467 return regexp
->fText
;
471 //------------------------------------------------------------------------------
475 //------------------------------------------------------------------------------
476 U_CAPI UText
* U_EXPORT2
477 uregex_getUText(URegularExpression
*regexp2
,
479 UErrorCode
*status
) {
480 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
481 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
484 return regexp
->fMatcher
->getInput(dest
, *status
);
488 //------------------------------------------------------------------------------
490 // uregex_refreshUText
492 //------------------------------------------------------------------------------
493 U_CAPI
void U_EXPORT2
494 uregex_refreshUText(URegularExpression
*regexp2
,
496 UErrorCode
*status
) {
497 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
498 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
501 regexp
->fMatcher
->refreshInputText(text
, *status
);
505 //------------------------------------------------------------------------------
509 //------------------------------------------------------------------------------
510 U_CAPI UBool U_EXPORT2
511 uregex_matches(URegularExpression
*regexp2
,
513 UErrorCode
*status
) {
514 return uregex_matches64( regexp2
, (int64_t)startIndex
, status
);
517 U_CAPI UBool U_EXPORT2
518 uregex_matches64(URegularExpression
*regexp2
,
520 UErrorCode
*status
) {
521 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
522 UBool result
= FALSE
;
523 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
526 if (startIndex
== -1) {
527 result
= regexp
->fMatcher
->matches(*status
);
529 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
535 //------------------------------------------------------------------------------
539 //------------------------------------------------------------------------------
540 U_CAPI UBool U_EXPORT2
541 uregex_lookingAt(URegularExpression
*regexp2
,
543 UErrorCode
*status
) {
544 return uregex_lookingAt64( regexp2
, (int64_t)startIndex
, status
);
547 U_CAPI UBool U_EXPORT2
548 uregex_lookingAt64(URegularExpression
*regexp2
,
550 UErrorCode
*status
) {
551 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
552 UBool result
= FALSE
;
553 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
556 if (startIndex
== -1) {
557 result
= regexp
->fMatcher
->lookingAt(*status
);
559 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
566 //------------------------------------------------------------------------------
570 //------------------------------------------------------------------------------
571 U_CAPI UBool U_EXPORT2
572 uregex_find(URegularExpression
*regexp2
,
574 UErrorCode
*status
) {
575 return uregex_find64( regexp2
, (int64_t)startIndex
, status
);
578 U_CAPI UBool U_EXPORT2
579 uregex_find64(URegularExpression
*regexp2
,
581 UErrorCode
*status
) {
582 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
583 UBool result
= FALSE
;
584 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
587 if (startIndex
== -1) {
588 regexp
->fMatcher
->resetPreserveRegion();
589 result
= regexp
->fMatcher
->find(*status
);
591 result
= regexp
->fMatcher
->find(startIndex
, *status
);
597 //------------------------------------------------------------------------------
601 //------------------------------------------------------------------------------
602 U_CAPI UBool U_EXPORT2
603 uregex_findNext(URegularExpression
*regexp2
,
604 UErrorCode
*status
) {
605 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
606 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
609 UBool result
= regexp
->fMatcher
->find(*status
);
613 //------------------------------------------------------------------------------
617 //------------------------------------------------------------------------------
618 U_CAPI
int32_t U_EXPORT2
619 uregex_groupCount(URegularExpression
*regexp2
,
620 UErrorCode
*status
) {
621 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
622 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
625 int32_t result
= regexp
->fMatcher
->groupCount();
630 //------------------------------------------------------------------------------
632 // uregex_groupNumberFromName
634 //------------------------------------------------------------------------------
636 uregex_groupNumberFromName(URegularExpression
*regexp2
,
637 const UChar
*groupName
,
639 UErrorCode
*status
) {
640 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
641 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
644 int32_t result
= regexp
->fPat
->groupNumberFromName(UnicodeString(groupName
, nameLength
), *status
);
649 uregex_groupNumberFromCName(URegularExpression
*regexp2
,
650 const char *groupName
,
652 UErrorCode
*status
) {
653 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
654 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
657 return regexp
->fPat
->groupNumberFromName(groupName
, nameLength
, *status
);
660 //------------------------------------------------------------------------------
664 //------------------------------------------------------------------------------
665 U_CAPI
int32_t U_EXPORT2
666 uregex_group(URegularExpression
*regexp2
,
669 int32_t destCapacity
,
670 UErrorCode
*status
) {
671 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
672 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
675 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
676 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
680 if (destCapacity
== 0 || regexp
->fText
!= NULL
) {
681 // If preflighting or if we already have the text as UChars,
682 // this is a little cheaper than extracting from the UText
685 // Pick up the range of characters from the matcher
687 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
688 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
689 if (U_FAILURE(*status
)) {
694 // Trim length based on buffer capacity
696 int32_t fullLength
= endIx
- startIx
;
697 int32_t copyLength
= fullLength
;
698 if (copyLength
< destCapacity
) {
699 dest
[copyLength
] = 0;
700 } else if (copyLength
== destCapacity
) {
701 *status
= U_STRING_NOT_TERMINATED_WARNING
;
703 copyLength
= destCapacity
;
704 *status
= U_BUFFER_OVERFLOW_ERROR
;
708 // Copy capture group to user's buffer
710 if (copyLength
> 0) {
711 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
715 int64_t start
= regexp
->fMatcher
->start64(groupNum
, *status
);
716 int64_t limit
= regexp
->fMatcher
->end64(groupNum
, *status
);
717 if (U_FAILURE(*status
)) {
721 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
722 // Zero Length Match: start == end.
723 int32_t length
= utext_extract(regexp
->fMatcher
->inputText(), start
, limit
, dest
, destCapacity
, status
);
730 //------------------------------------------------------------------------------
734 //------------------------------------------------------------------------------
735 U_CAPI UText
* U_EXPORT2
736 uregex_groupUText(URegularExpression
*regexp2
,
739 int64_t *groupLength
,
740 UErrorCode
*status
) {
741 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
742 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
743 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
744 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
747 return regexp
->fMatcher
->group(groupNum
, dest
, *groupLength
, *status
);
750 //------------------------------------------------------------------------------
754 //------------------------------------------------------------------------------
755 U_CAPI
int32_t U_EXPORT2
756 uregex_start(URegularExpression
*regexp2
,
758 UErrorCode
*status
) {
759 return (int32_t)uregex_start64( regexp2
, groupNum
, status
);
762 U_CAPI
int64_t U_EXPORT2
763 uregex_start64(URegularExpression
*regexp2
,
765 UErrorCode
*status
) {
766 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
767 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
770 int64_t result
= regexp
->fMatcher
->start64(groupNum
, *status
);
774 //------------------------------------------------------------------------------
778 //------------------------------------------------------------------------------
779 U_CAPI
int32_t U_EXPORT2
780 uregex_end(URegularExpression
*regexp2
,
782 UErrorCode
*status
) {
783 return (int32_t)uregex_end64( regexp2
, groupNum
, status
);
786 U_CAPI
int64_t U_EXPORT2
787 uregex_end64(URegularExpression
*regexp2
,
789 UErrorCode
*status
) {
790 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
791 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
794 int64_t result
= regexp
->fMatcher
->end64(groupNum
, *status
);
798 //------------------------------------------------------------------------------
802 //------------------------------------------------------------------------------
803 U_CAPI
void U_EXPORT2
804 uregex_reset(URegularExpression
*regexp2
,
806 UErrorCode
*status
) {
807 uregex_reset64( regexp2
, (int64_t)index
, status
);
810 U_CAPI
void U_EXPORT2
811 uregex_reset64(URegularExpression
*regexp2
,
813 UErrorCode
*status
) {
814 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
815 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
818 regexp
->fMatcher
->reset(index
, *status
);
822 //------------------------------------------------------------------------------
826 //------------------------------------------------------------------------------
827 U_CAPI
void U_EXPORT2
828 uregex_setRegion(URegularExpression
*regexp2
,
831 UErrorCode
*status
) {
832 uregex_setRegion64( regexp2
, (int64_t)regionStart
, (int64_t)regionLimit
, status
);
835 U_CAPI
void U_EXPORT2
836 uregex_setRegion64(URegularExpression
*regexp2
,
839 UErrorCode
*status
) {
840 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
841 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
844 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
848 //------------------------------------------------------------------------------
850 // uregex_setRegionAndStart
852 //------------------------------------------------------------------------------
853 U_CAPI
void U_EXPORT2
854 uregex_setRegionAndStart(URegularExpression
*regexp2
,
858 UErrorCode
*status
) {
859 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
860 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
863 regexp
->fMatcher
->region(regionStart
, regionLimit
, startIndex
, *status
);
866 //------------------------------------------------------------------------------
868 // uregex_regionStart
870 //------------------------------------------------------------------------------
871 U_CAPI
int32_t U_EXPORT2
872 uregex_regionStart(const URegularExpression
*regexp2
,
873 UErrorCode
*status
) {
874 return (int32_t)uregex_regionStart64(regexp2
, status
);
877 U_CAPI
int64_t U_EXPORT2
878 uregex_regionStart64(const URegularExpression
*regexp2
,
879 UErrorCode
*status
) {
880 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
881 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
884 return regexp
->fMatcher
->regionStart();
888 //------------------------------------------------------------------------------
892 //------------------------------------------------------------------------------
893 U_CAPI
int32_t U_EXPORT2
894 uregex_regionEnd(const URegularExpression
*regexp2
,
895 UErrorCode
*status
) {
896 return (int32_t)uregex_regionEnd64(regexp2
, status
);
899 U_CAPI
int64_t U_EXPORT2
900 uregex_regionEnd64(const URegularExpression
*regexp2
,
901 UErrorCode
*status
) {
902 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
903 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
906 return regexp
->fMatcher
->regionEnd();
910 //------------------------------------------------------------------------------
912 // uregex_hasTransparentBounds
914 //------------------------------------------------------------------------------
915 U_CAPI UBool U_EXPORT2
916 uregex_hasTransparentBounds(const URegularExpression
*regexp2
,
917 UErrorCode
*status
) {
918 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
919 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
922 return regexp
->fMatcher
->hasTransparentBounds();
926 //------------------------------------------------------------------------------
928 // uregex_useTransparentBounds
930 //------------------------------------------------------------------------------
931 U_CAPI
void U_EXPORT2
932 uregex_useTransparentBounds(URegularExpression
*regexp2
,
934 UErrorCode
*status
) {
935 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
936 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
939 regexp
->fMatcher
->useTransparentBounds(b
);
943 //------------------------------------------------------------------------------
945 // uregex_hasAnchoringBounds
947 //------------------------------------------------------------------------------
948 U_CAPI UBool U_EXPORT2
949 uregex_hasAnchoringBounds(const URegularExpression
*regexp2
,
950 UErrorCode
*status
) {
951 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
952 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
955 return regexp
->fMatcher
->hasAnchoringBounds();
959 //------------------------------------------------------------------------------
961 // uregex_useAnchoringBounds
963 //------------------------------------------------------------------------------
964 U_CAPI
void U_EXPORT2
965 uregex_useAnchoringBounds(URegularExpression
*regexp2
,
967 UErrorCode
*status
) {
968 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
969 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
972 regexp
->fMatcher
->useAnchoringBounds(b
);
976 //------------------------------------------------------------------------------
980 //------------------------------------------------------------------------------
981 U_CAPI UBool U_EXPORT2
982 uregex_hitEnd(const URegularExpression
*regexp2
,
983 UErrorCode
*status
) {
984 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
985 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
988 return regexp
->fMatcher
->hitEnd();
992 //------------------------------------------------------------------------------
996 //------------------------------------------------------------------------------
997 U_CAPI UBool U_EXPORT2
998 uregex_requireEnd(const URegularExpression
*regexp2
,
999 UErrorCode
*status
) {
1000 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1001 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1004 return regexp
->fMatcher
->requireEnd();
1008 //------------------------------------------------------------------------------
1010 // uregex_setTimeLimit
1012 //------------------------------------------------------------------------------
1013 U_CAPI
void U_EXPORT2
1014 uregex_setTimeLimit(URegularExpression
*regexp2
,
1016 UErrorCode
*status
) {
1017 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1018 if (validateRE(regexp
, FALSE
, status
)) {
1019 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
1025 //------------------------------------------------------------------------------
1027 // uregex_getTimeLimit
1029 //------------------------------------------------------------------------------
1030 U_CAPI
int32_t U_EXPORT2
1031 uregex_getTimeLimit(const URegularExpression
*regexp2
,
1032 UErrorCode
*status
) {
1034 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1035 if (validateRE(regexp
, FALSE
, status
)) {
1036 retVal
= regexp
->fMatcher
->getTimeLimit();
1043 //------------------------------------------------------------------------------
1045 // uregex_setStackLimit
1047 //------------------------------------------------------------------------------
1048 U_CAPI
void U_EXPORT2
1049 uregex_setStackLimit(URegularExpression
*regexp2
,
1051 UErrorCode
*status
) {
1052 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1053 if (validateRE(regexp
, FALSE
, status
)) {
1054 regexp
->fMatcher
->setStackLimit(limit
, *status
);
1060 //------------------------------------------------------------------------------
1062 // uregex_getStackLimit
1064 //------------------------------------------------------------------------------
1065 U_CAPI
int32_t U_EXPORT2
1066 uregex_getStackLimit(const URegularExpression
*regexp2
,
1067 UErrorCode
*status
) {
1069 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1070 if (validateRE(regexp
, FALSE
, status
)) {
1071 retVal
= regexp
->fMatcher
->getStackLimit();
1077 //------------------------------------------------------------------------------
1079 // uregex_setMatchCallback
1081 //------------------------------------------------------------------------------
1082 U_CAPI
void U_EXPORT2
1083 uregex_setMatchCallback(URegularExpression
*regexp2
,
1084 URegexMatchCallback
*callback
,
1085 const void *context
,
1086 UErrorCode
*status
) {
1087 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1088 if (validateRE(regexp
, FALSE
, status
)) {
1089 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
1094 //------------------------------------------------------------------------------
1096 // uregex_getMatchCallback
1098 //------------------------------------------------------------------------------
1099 U_CAPI
void U_EXPORT2
1100 uregex_getMatchCallback(const URegularExpression
*regexp2
,
1101 URegexMatchCallback
**callback
,
1102 const void **context
,
1103 UErrorCode
*status
) {
1104 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1105 if (validateRE(regexp
, FALSE
, status
)) {
1106 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
1111 //------------------------------------------------------------------------------
1113 // uregex_setFindProgressCallback
1115 //------------------------------------------------------------------------------
1116 U_CAPI
void U_EXPORT2
1117 uregex_setFindProgressCallback(URegularExpression
*regexp2
,
1118 URegexFindProgressCallback
*callback
,
1119 const void *context
,
1120 UErrorCode
*status
) {
1121 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1122 if (validateRE(regexp
, FALSE
, status
)) {
1123 regexp
->fMatcher
->setFindProgressCallback(callback
, context
, *status
);
1128 //------------------------------------------------------------------------------
1130 // uregex_getFindProgressCallback
1132 //------------------------------------------------------------------------------
1133 U_CAPI
void U_EXPORT2
1134 uregex_getFindProgressCallback(const URegularExpression
*regexp2
,
1135 URegexFindProgressCallback
**callback
,
1136 const void **context
,
1137 UErrorCode
*status
) {
1138 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1139 if (validateRE(regexp
, FALSE
, status
)) {
1140 regexp
->fMatcher
->getFindProgressCallback(*callback
, *context
, *status
);
1145 //------------------------------------------------------------------------------
1147 // uregex_replaceAll
1149 //------------------------------------------------------------------------------
1150 U_CAPI
int32_t U_EXPORT2
1151 uregex_replaceAll(URegularExpression
*regexp2
,
1152 const UChar
*replacementText
,
1153 int32_t replacementLength
,
1155 int32_t destCapacity
,
1156 UErrorCode
*status
) {
1157 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1158 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1161 if (replacementText
== NULL
|| replacementLength
< -1 ||
1162 (destBuf
== NULL
&& destCapacity
> 0) ||
1164 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1170 uregex_reset(regexp2
, 0, status
);
1172 // Note: Seperate error code variables for findNext() and appendReplacement()
1173 // are used so that destination buffer overflow errors
1174 // in appendReplacement won't stop findNext() from working.
1175 // appendReplacement() and appendTail() special case incoming buffer
1176 // overflow errors, continuing to return the correct length.
1177 UErrorCode findStatus
= *status
;
1178 while (uregex_findNext(regexp2
, &findStatus
)) {
1179 len
+= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1180 &destBuf
, &destCapacity
, status
);
1182 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1184 if (U_FAILURE(findStatus
)) {
1185 // If anything went wrong with the findNext(), make that error trump
1186 // whatever may have happened with the append() operations.
1187 // Errors in findNext() are not expected.
1188 *status
= findStatus
;
1195 //------------------------------------------------------------------------------
1197 // uregex_replaceAllUText
1199 //------------------------------------------------------------------------------
1200 U_CAPI UText
* U_EXPORT2
1201 uregex_replaceAllUText(URegularExpression
*regexp2
,
1202 UText
*replacementText
,
1204 UErrorCode
*status
) {
1205 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1206 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1209 if (replacementText
== NULL
) {
1210 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1214 dest
= regexp
->fMatcher
->replaceAll(replacementText
, dest
, *status
);
1219 //------------------------------------------------------------------------------
1221 // uregex_replaceFirst
1223 //------------------------------------------------------------------------------
1224 U_CAPI
int32_t U_EXPORT2
1225 uregex_replaceFirst(URegularExpression
*regexp2
,
1226 const UChar
*replacementText
,
1227 int32_t replacementLength
,
1229 int32_t destCapacity
,
1230 UErrorCode
*status
) {
1231 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1232 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1235 if (replacementText
== NULL
|| replacementLength
< -1 ||
1236 (destBuf
== NULL
&& destCapacity
> 0) ||
1238 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1243 UBool findSucceeded
;
1244 uregex_reset(regexp2
, 0, status
);
1245 findSucceeded
= uregex_find(regexp2
, 0, status
);
1246 if (findSucceeded
) {
1247 len
= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1248 &destBuf
, &destCapacity
, status
);
1250 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1256 //------------------------------------------------------------------------------
1258 // uregex_replaceFirstUText
1260 //------------------------------------------------------------------------------
1261 U_CAPI UText
* U_EXPORT2
1262 uregex_replaceFirstUText(URegularExpression
*regexp2
,
1263 UText
*replacementText
,
1265 UErrorCode
*status
) {
1266 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1267 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1270 if (replacementText
== NULL
) {
1271 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1275 dest
= regexp
->fMatcher
->replaceFirst(replacementText
, dest
, *status
);
1280 //------------------------------------------------------------------------------
1282 // uregex_appendReplacement
1284 //------------------------------------------------------------------------------
1288 // Dummy class, because these functions need to be friends of class RegexMatcher,
1289 // and stand-alone C functions don't work as friends
// The three static members declared here are defined later in this file as
// RegexCImpl::appendReplacement, RegexCImpl::appendTail and RegexCImpl::split.
// They read RegexMatcher members such as fMatch, fMatchStart, fMatchEnd,
// fLastMatchEnd, fInputText and fInputLength directly.

// appendReplacement: implementation behind uregex_appendReplacement().
// Returns the number of UChars the appended text requires.
1293 inline static int32_t appendReplacement(RegularExpression
*regexp
,
1294 const UChar
*replacementText
,
1295 int32_t replacementLength
,
1297 int32_t *destCapacity
,
1298 UErrorCode
*status
);
// appendTail: implementation behind uregex_appendTail().
// Returns the number of UChars the appended tail requires.
1300 inline static int32_t appendTail(RegularExpression
*regexp
,
1302 int32_t *destCapacity
,
1303 UErrorCode
*status
);
// split: implementation behind uregex_split().  The required buffer size is
// reported through requiredCapacity when that pointer is not NULL.
1305 inline static int32_t split(RegularExpression
*regexp
,
1307 int32_t destCapacity
,
1308 int32_t *requiredCapacity
,
1309 UChar
*destFields
[],
1310 int32_t destFieldsCapacity
,
1311 UErrorCode
*status
);
// Code units with special meaning while scanning replacement text in
// RegexCImpl::appendReplacement().
// U+005C REVERSE SOLIDUS: introduces an escape sequence.
1318 static const UChar BACKSLASH
= 0x5c;
// U+0024 DOLLAR SIGN: introduces a capture group reference.
1319 static const UChar DOLLARSIGN
= 0x24;
// U+007B LEFT CURLY BRACKET: opens a named group reference, ${name}.
1320 static const UChar LEFTBRACKET
= 0x7b;
// U+007D RIGHT CURLY BRACKET: closes a named group reference.
1321 static const UChar RIGHTBRACKET
= 0x7d;
1324 // Move a character to an output buffer, with bounds checking on the index.
1325 // Index advances even if capacity is exceeded, for preflight size computations.
1326 // This little sequence is used a LOT.
1328 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
1329 if (*idx
< bufCapacity
) {
1337 // appendReplacement, the actual implementation.
1339 int32_t RegexCImpl::appendReplacement(RegularExpression
*regexp
,
1340 const UChar
*replacementText
,
1341 int32_t replacementLength
,
1343 int32_t *destCapacity
,
1344 UErrorCode
*status
) {
1346 // If we come in with a buffer overflow error, don't suppress the operation.
1347 // A series of appendReplacements, appendTail need to correctly preflight
1348 // the buffer size when an overflow happens somewhere in the middle.
1349 UBool pendingBufferOverflow
= FALSE
;
1350 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1351 pendingBufferOverflow
= TRUE
;
1352 *status
= U_ZERO_ERROR
;
1356 // Validate all paramters
1358 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1361 if (replacementText
== NULL
|| replacementLength
< -1 ||
1362 destCapacity
== NULL
|| destBuf
== NULL
||
1363 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1364 *destCapacity
< 0) {
1365 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1369 RegexMatcher
*m
= regexp
->fMatcher
;
1370 if (m
->fMatch
== FALSE
) {
1371 *status
= U_REGEX_INVALID_STATE
;
1375 UChar
*dest
= *destBuf
;
1376 int32_t capacity
= *destCapacity
;
1377 int32_t destIdx
= 0;
1380 // If it wasn't supplied by the caller, get the length of the replacement text.
1381 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1382 // the fly and avoid this step.
1383 if (replacementLength
== -1) {
1384 replacementLength
= u_strlen(replacementText
);
1387 // Copy input string from the end of previous match to start of current match
1388 if (regexp
->fText
!= NULL
) {
1390 int32_t lastMatchEnd
;
1391 if (UTEXT_USES_U16(m
->fInputText
)) {
1392 lastMatchEnd
= (int32_t)m
->fLastMatchEnd
;
1393 matchStart
= (int32_t)m
->fMatchStart
;
1395 // !!!: Would like a better way to do this!
1396 UErrorCode tempStatus
= U_ZERO_ERROR
;
1397 lastMatchEnd
= utext_extract(m
->fInputText
, 0, m
->fLastMatchEnd
, NULL
, 0, &tempStatus
);
1398 tempStatus
= U_ZERO_ERROR
;
1399 matchStart
= lastMatchEnd
+ utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
, NULL
, 0, &tempStatus
);
1401 for (i
=lastMatchEnd
; i
<matchStart
; i
++) {
1402 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
1405 UErrorCode possibleOverflowError
= U_ZERO_ERROR
; // ignore
1406 destIdx
+= utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
,
1407 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
),
1408 &possibleOverflowError
);
1410 U_ASSERT(destIdx
>= 0);
1412 // scan the replacement text, looking for substitutions ($n) and \escapes.
1413 int32_t replIdx
= 0;
1414 while (replIdx
< replacementLength
&& U_SUCCESS(*status
)) {
1415 UChar c
= replacementText
[replIdx
];
1417 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
1418 // Common case, no substitution, no escaping,
1419 // just copy the char to the dest buf.
1420 appendToBuf(c
, &destIdx
, dest
, capacity
);
1424 if (c
== BACKSLASH
) {
1425 // Backslash Escape. Copy the following char out without further checks.
1426 // Note: Surrogate pairs don't need any special handling
1427 // The second half wont be a '$' or a '\', and
1428 // will move to the dest normally on the next
1430 if (replIdx
>= replacementLength
) {
1433 c
= replacementText
[replIdx
];
1435 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
1436 // We have a \udddd or \Udddddddd escape sequence.
1437 UChar32 escapedChar
=
1438 u_unescapeAt(uregex_ucstr_unescape_charAt
,
1439 &replIdx
, // Index is updated by unescapeAt
1440 replacementLength
, // Length of replacement text
1441 (void *)replacementText
);
1443 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
1444 if (escapedChar
<= 0xffff) {
1445 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
1447 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
1448 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
1452 // Note: if the \u escape was invalid, just fall through and
1453 // treat it as a plain \<anything> escape.
1456 // Plain backslash escape. Just put out the escaped character.
1457 appendToBuf(c
, &destIdx
, dest
, capacity
);
1463 // We've got a $. Pick up the following capture group name or number.
1464 // For numbers, consume only digits that produce a valid capture group for the pattern.
1466 int32_t groupNum
= 0;
1467 U_ASSERT(c
== DOLLARSIGN
);
1469 if (replIdx
< replacementLength
) {
1470 U16_GET(replacementText
, 0, replIdx
, replacementLength
, c32
);
1472 if (u_isdigit(c32
)) {
1473 int32_t numDigits
= 0;
1474 int32_t numCaptureGroups
= m
->fPattern
->fGroupMap
->size();
1476 if (replIdx
>= replacementLength
) {
1479 U16_GET(replacementText
, 0, replIdx
, replacementLength
, c32
);
1480 if (u_isdigit(c32
) == FALSE
) {
1484 int32_t digitVal
= u_charDigitValue(c32
);
1485 if (groupNum
* 10 + digitVal
<= numCaptureGroups
) {
1486 groupNum
= groupNum
* 10 + digitVal
;
1487 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1490 if (numDigits
== 0) {
1491 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1496 } else if (c32
== LEFTBRACKET
) {
1497 // Scan for Named Capture Group, ${name}.
1498 UnicodeString groupName
;
1499 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1500 while (U_SUCCESS(*status
) && c32
!= RIGHTBRACKET
) {
1501 if (replIdx
>= replacementLength
) {
1502 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1505 U16_NEXT(replacementText
, replIdx
, replacementLength
, c32
);
1506 if ((c32
>= 0x41 && c32
<= 0x5a) || // A..Z
1507 (c32
>= 0x61 && c32
<= 0x7a) || // a..z
1508 (c32
>= 0x31 && c32
<= 0x39)) { // 0..9
1509 groupName
.append(c32
);
1510 } else if (c32
== RIGHTBRACKET
) {
1511 groupNum
= uhash_geti(regexp
->fPat
->fNamedCaptureMap
, &groupName
);
1512 if (groupNum
== 0) {
1513 // Name not defined by pattern.
1514 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1517 // Character was something other than a name char or a closing '}'
1518 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1522 // $ not followed by {name} or digits.
1523 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1527 // Finally, append the capture group data to the destination.
1528 if (U_SUCCESS(*status
)) {
1529 destIdx
+= uregex_group((URegularExpression
*)regexp
, groupNum
,
1530 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), status
);
1531 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1532 // Ignore buffer overflow when extracting the group. We need to
1533 // continue on to get full size of the untruncated result. We will
1534 // raise our own buffer overflow error at the end.
1535 *status
= U_ZERO_ERROR
;
1539 if (U_FAILURE(*status
)) {
1540 // bad group number or name.
1546 // Nul Terminate the dest buffer if possible.
1547 // Set the appropriate buffer overflow or not terminated error, if needed.
1549 if (destIdx
< capacity
) {
1551 } else if (U_SUCCESS(*status
)) {
1552 if (destIdx
== *destCapacity
) {
1553 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1555 *status
= U_BUFFER_OVERFLOW_ERROR
;
1560 // Return an updated dest buffer and capacity to the caller.
1562 if (destIdx
> 0 && *destCapacity
> 0) {
1563 if (destIdx
< capacity
) {
1564 *destBuf
+= destIdx
;
1565 *destCapacity
-= destIdx
;
1567 *destBuf
+= capacity
;
1572 // If we came in with a buffer overflow, make sure we go out with one also.
1573 // (A zero length match right at the end of the previous match could
1574 // make this function succeed even though a previous call had overflowed the buf)
1575 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1576 *status
= U_BUFFER_OVERFLOW_ERROR
;
1583 // appendReplacement the actual API function,
1585 U_CAPI
int32_t U_EXPORT2
1586 uregex_appendReplacement(URegularExpression
*regexp2
,
1587 const UChar
*replacementText
,
1588 int32_t replacementLength
,
1590 int32_t *destCapacity
,
1591 UErrorCode
*status
) {
1593 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1594 return RegexCImpl::appendReplacement(
1595 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1599 // uregex_appendReplacementUText...can just use the normal C++ method
1601 U_CAPI
void U_EXPORT2
1602 uregex_appendReplacementUText(URegularExpression
*regexp2
,
1605 UErrorCode
*status
) {
1606 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1607 regexp
->fMatcher
->appendReplacement(dest
, replText
, *status
);
1611 //------------------------------------------------------------------------------
1613 // uregex_appendTail
1615 //------------------------------------------------------------------------------
1616 int32_t RegexCImpl::appendTail(RegularExpression
*regexp
,
1618 int32_t *destCapacity
,
1622 // If we come in with a buffer overflow error, don't suppress the operation.
1623 // A series of appendReplacements, appendTail need to correctly preflight
1624 // the buffer size when an overflow happens somewhere in the middle.
1625 UBool pendingBufferOverflow
= FALSE
;
1626 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1627 pendingBufferOverflow
= TRUE
;
1628 *status
= U_ZERO_ERROR
;
1631 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1635 if (destCapacity
== NULL
|| destBuf
== NULL
||
1636 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1639 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1643 RegexMatcher
*m
= regexp
->fMatcher
;
1645 int32_t destIdx
= 0;
1646 int32_t destCap
= *destCapacity
;
1647 UChar
*dest
= *destBuf
;
1649 if (regexp
->fText
!= NULL
) {
1651 int64_t nativeIdx
= (m
->fMatch
? m
->fMatchEnd
: m
->fLastMatchEnd
);
1652 if (nativeIdx
== -1) {
1654 } else if (UTEXT_USES_U16(m
->fInputText
)) {
1655 srcIdx
= (int32_t)nativeIdx
;
1657 UErrorCode newStatus
= U_ZERO_ERROR
;
1658 srcIdx
= utext_extract(m
->fInputText
, 0, nativeIdx
, NULL
, 0, &newStatus
);
1662 U_ASSERT(destIdx
>= 0);
1664 if (srcIdx
== regexp
->fTextLength
) {
1667 UChar c
= regexp
->fText
[srcIdx
];
1668 if (c
== 0 && regexp
->fTextLength
== -1) {
1669 regexp
->fTextLength
= srcIdx
;
1673 if (destIdx
< destCap
) {
1676 // We've overflowed the dest buffer.
1677 // If the total input string length is known, we can
1678 // compute the total buffer size needed without scanning through the string.
1679 if (regexp
->fTextLength
> 0) {
1680 destIdx
+= (regexp
->fTextLength
- srcIdx
);
1690 // The most recent call to find() succeeded.
1691 srcIdx
= m
->fMatchEnd
;
1693 // The last call to find() on this matcher failed().
1694 // Look back to the end of the last find() that succeeded for src index.
1695 srcIdx
= m
->fLastMatchEnd
;
1697 // There has been no successful match with this matcher.
1698 // We want to copy the whole string.
1703 destIdx
= utext_extract(m
->fInputText
, srcIdx
, m
->fInputLength
, dest
, destCap
, status
);
1707 // NUL terminate the output string, if possible, otherwise issue the
1708 // appropriate error or warning.
1710 if (destIdx
< destCap
) {
1712 } else if (destIdx
== destCap
) {
1713 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1715 *status
= U_BUFFER_OVERFLOW_ERROR
;
1719 // Update the user's buffer ptr and capacity vars to reflect the
1722 if (destIdx
< destCap
) {
1723 *destBuf
+= destIdx
;
1724 *destCapacity
-= destIdx
;
1725 } else if (*destBuf
!= NULL
) {
1726 *destBuf
+= destCap
;
1730 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1731 *status
= U_BUFFER_OVERFLOW_ERROR
;
1739 // appendTail the actual API function
1741 U_CAPI
int32_t U_EXPORT2
1742 uregex_appendTail(URegularExpression
*regexp2
,
1744 int32_t *destCapacity
,
1745 UErrorCode
*status
) {
1746 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1747 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1752 // uregex_appendTailUText...can just use the normal C++ method
1754 U_CAPI UText
* U_EXPORT2
1755 uregex_appendTailUText(URegularExpression
*regexp2
,
1757 UErrorCode
*status
) {
1758 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1759 return regexp
->fMatcher
->appendTail(dest
, *status
);
1763 //------------------------------------------------------------------------------
1765 // copyString Internal utility to copy a string to an output buffer,
1766 // while managing buffer overflow and preflight size
1767 // computation. NUL termination is added to destination,
1768 // and the NUL is counted in the output size.
1770 //------------------------------------------------------------------------------
1772 static void copyString(UChar
*destBuffer
, // Destination buffer.
1773 int32_t destCapacity
, // Total capacity of dest buffer
1774 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1775 // Update not clipped to destCapacity.
1776 const UChar
*srcPtr
, // Pointer to source string
1777 int32_t srcLen
) // Source string len.
1780 int32_t di
= *destIndex
;
1783 for (si
=0; si
<srcLen
; si
++) {
1785 if (di
< destCapacity
) {
1793 if (di
<destCapacity
) {
1801 //------------------------------------------------------------------------------
1805 //------------------------------------------------------------------------------
1806 int32_t RegexCImpl::split(RegularExpression
*regexp
,
1808 int32_t destCapacity
,
1809 int32_t *requiredCapacity
,
1810 UChar
*destFields
[],
1811 int32_t destFieldsCapacity
,
1812 UErrorCode
*status
) {
1814 // Reset for the input text
1816 regexp
->fMatcher
->reset();
1817 UText
*inputText
= regexp
->fMatcher
->fInputText
;
1818 int64_t nextOutputStringStart
= 0;
1819 int64_t inputLen
= regexp
->fMatcher
->fInputLength
;
1820 if (inputLen
== 0) {
1825 // Loop through the input text, searching for the delimiter pattern
1827 int32_t i
; // Index of the field being processed.
1828 int32_t destIdx
= 0; // Next available position in destBuf;
1829 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1830 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow errors so that the strings are still counted
1832 if (i
>=destFieldsCapacity
-1) {
1833 // There are one or zero output strings left.
1834 // Fill the last output string with whatever is left from the input, then exit the loop.
1835 // ( i will be == destFieldsCapacity if we filled the output array while processing
1836 // capture groups of the delimiter expression, in which case we will discard the
1837 // last capture group saved in favor of the unprocessed remainder of the
1839 if (inputLen
> nextOutputStringStart
) {
1840 if (i
!= destFieldsCapacity
-1) {
1841 // No fields are left. Recycle the last one for holding the trailing part of
1842 // the input string.
1843 i
= destFieldsCapacity
-1;
1844 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1847 destFields
[i
] = &destBuf
[destIdx
];
1848 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1849 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1854 if (regexp
->fMatcher
->find()) {
1855 // We found another delimiter. Move everything from where we started looking
1856 // up until the start of the delimiter into the next output string.
1857 destFields
[i
] = &destBuf
[destIdx
];
1859 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, regexp
->fMatcher
->fMatchStart
,
1860 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1861 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1862 tStatus
= U_ZERO_ERROR
;
1866 nextOutputStringStart
= regexp
->fMatcher
->fMatchEnd
;
1868 // If the delimiter pattern has capturing parentheses, the captured
1869 // text goes out into the next n destination strings.
1871 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1872 // If we've run out of output string slots, bail out.
1873 if (i
==destFieldsCapacity
-1) {
1878 // Set up to extract the capture group contents into the dest buffer.
1879 destFields
[i
] = &destBuf
[destIdx
];
1880 tStatus
= U_ZERO_ERROR
;
1881 int32_t t
= uregex_group((URegularExpression
*)regexp
,
1884 REMAINING_CAPACITY(destIdx
, destCapacity
),
1886 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1887 // +1 for the NUL that terminates the string.
1888 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1889 tStatus
= U_ZERO_ERROR
;
1895 if (nextOutputStringStart
== inputLen
) {
1896 // The delimiter was at the end of the string.
1897 // Output an empty string, and then we are done.
1898 if (destIdx
< destCapacity
) {
1899 destBuf
[destIdx
] = 0;
1901 if (i
< destFieldsCapacity
-1) {
1904 if (destIdx
< destCapacity
) {
1905 destFields
[i
] = destBuf
+ destIdx
;
1914 // We ran off the end of the input while looking for the next delimiter.
1915 // All the remaining text goes into the current output string.
1916 destFields
[i
] = &destBuf
[destIdx
];
1917 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1918 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1923 // Zero out any unused portion of the destFields array
1925 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1926 destFields
[j
] = NULL
;
1929 if (requiredCapacity
!= NULL
) {
1930 *requiredCapacity
= destIdx
;
1932 if (destIdx
> destCapacity
) {
1933 *status
= U_BUFFER_OVERFLOW_ERROR
;
1939 // uregex_split The actual API function
1941 U_CAPI
int32_t U_EXPORT2
1942 uregex_split(URegularExpression
*regexp2
,
1944 int32_t destCapacity
,
1945 int32_t *requiredCapacity
,
1946 UChar
*destFields
[],
1947 int32_t destFieldsCapacity
,
1948 UErrorCode
*status
) {
1949 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1950 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1953 if ((destBuf
== NULL
&& destCapacity
> 0) ||
1955 destFields
== NULL
||
1956 destFieldsCapacity
< 1 ) {
1957 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1961 return RegexCImpl::split(regexp
, destBuf
, destCapacity
, requiredCapacity
, destFields
, destFieldsCapacity
, status
);
1966 // uregex_splitUText...can just use the normal C++ method
1968 U_CAPI
int32_t U_EXPORT2
1969 uregex_splitUText(URegularExpression
*regexp2
,
1970 UText
*destFields
[],
1971 int32_t destFieldsCapacity
,
1972 UErrorCode
*status
) {
1973 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1974 return regexp
->fMatcher
->split(regexp
->fMatcher
->inputText(), destFields
, destFieldsCapacity
, *status
);
1978 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS