2 *******************************************************************************
3 * Copyright (C) 2004-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
31 struct RegularExpression
: public UMemory
{
37 int32_t *fPatRefCount
;
39 int32_t fPatStringLen
;
40 RegexMatcher
*fMatcher
;
41 const UChar
*fText
; // Text from setText()
42 int32_t fTextLength
; // Length provided by user with setText(), which
47 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
49 RegularExpression::RegularExpression() {
61 RegularExpression::~RegularExpression() {
64 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
66 uprv_free(fPatString
);
67 uprv_free(fPatRefCount
);
69 if (fOwnsText
&& fText
!=NULL
) {
70 uprv_free((void *)fText
);
79 //----------------------------------------------------------------------------------------
81 // validateRE Do boilerplate style checks on API function parameters.
82 // Return TRUE if they look OK.
83 //----------------------------------------------------------------------------------------
84 static UBool
validateRE(const RegularExpression
*re
, UErrorCode
*status
, UBool requiresText
= TRUE
) {
85 if (U_FAILURE(*status
)) {
88 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
89 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
92 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
93 if (requiresText
&& re
->fText
== NULL
&& !re
->fOwnsText
) {
94 *status
= U_REGEX_INVALID_STATE
;
100 //----------------------------------------------------------------------------------------
104 //----------------------------------------------------------------------------------------
105 U_CAPI URegularExpression
* U_EXPORT2
106 uregex_open( const UChar
*pattern
,
107 int32_t patternLength
,
110 UErrorCode
*status
) {
112 if (U_FAILURE(*status
)) {
115 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
116 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
119 int32_t actualPatLen
= patternLength
;
120 if (actualPatLen
== -1) {
121 actualPatLen
= u_strlen(pattern
);
124 RegularExpression
*re
= new RegularExpression
;
125 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
126 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
127 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
128 *status
= U_MEMORY_ALLOCATION_ERROR
;
134 re
->fPatRefCount
= refC
;
135 *re
->fPatRefCount
= 1;
138 // Make a copy of the pattern string, so we can return it later if asked.
139 // For compiling the pattern, we will use a UText wrapper around
140 // this local copy, to avoid making even more copies.
142 re
->fPatString
= patBuf
;
143 re
->fPatStringLen
= patternLength
;
144 u_memcpy(patBuf
, pattern
, actualPatLen
);
145 patBuf
[actualPatLen
] = 0;
147 UText patText
= UTEXT_INITIALIZER
;
148 utext_openUChars(&patText
, patBuf
, patternLength
, status
);
151 // Compile the pattern
154 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
156 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
158 utext_close(&patText
);
160 if (U_FAILURE(*status
)) {
165 // Create the matcher object
167 re
->fMatcher
= re
->fPat
->matcher(*status
);
168 if (U_SUCCESS(*status
)) {
169 return (URegularExpression
*)re
;
178 //----------------------------------------------------------------------------------------
182 //----------------------------------------------------------------------------------------
183 U_CAPI URegularExpression
* U_EXPORT2
184 uregex_openUText(UText
*pattern
,
187 UErrorCode
*status
) {
189 if (U_FAILURE(*status
)) {
192 if (pattern
== NULL
) {
193 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
197 int64_t patternNativeLength
= utext_nativeLength(pattern
);
199 if (patternNativeLength
== 0) {
200 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
204 RegularExpression
*re
= new RegularExpression
;
206 UErrorCode lengthStatus
= U_ZERO_ERROR
;
207 int32_t pattern16Length
= utext_extract(pattern
, 0, patternNativeLength
, NULL
, 0, &lengthStatus
);
209 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
210 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(pattern16Length
+1));
211 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
212 *status
= U_MEMORY_ALLOCATION_ERROR
;
218 re
->fPatRefCount
= refC
;
219 *re
->fPatRefCount
= 1;
222 // Make a copy of the pattern string, so we can return it later if asked.
223 // For compiling the pattern, we will use a read-only UText wrapper
224 // around this local copy, to avoid making even more copies.
226 re
->fPatString
= patBuf
;
227 re
->fPatStringLen
= pattern16Length
;
228 utext_extract(pattern
, 0, patternNativeLength
, patBuf
, pattern16Length
+1, status
);
230 UText patText
= UTEXT_INITIALIZER
;
231 utext_openUChars(&patText
, patBuf
, pattern16Length
, status
);
234 // Compile the pattern
237 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
239 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
241 utext_close(&patText
);
243 if (U_FAILURE(*status
)) {
248 // Create the matcher object
250 re
->fMatcher
= re
->fPat
->matcher(*status
);
251 if (U_SUCCESS(*status
)) {
252 return (URegularExpression
*)re
;
261 //----------------------------------------------------------------------------------------
265 //----------------------------------------------------------------------------------------
266 U_CAPI
void U_EXPORT2
267 uregex_close(URegularExpression
*re2
) {
268 RegularExpression
*re
= (RegularExpression
*)re2
;
269 UErrorCode status
= U_ZERO_ERROR
;
270 if (validateRE(re
, &status
, FALSE
) == FALSE
) {
277 //----------------------------------------------------------------------------------------
281 //----------------------------------------------------------------------------------------
282 U_CAPI URegularExpression
* U_EXPORT2
283 uregex_clone(const URegularExpression
*source2
, UErrorCode
*status
) {
284 RegularExpression
*source
= (RegularExpression
*)source2
;
285 if (validateRE(source
, status
, FALSE
) == FALSE
) {
289 RegularExpression
*clone
= new RegularExpression
;
291 *status
= U_MEMORY_ALLOCATION_ERROR
;
295 clone
->fMatcher
= source
->fPat
->matcher(*status
);
296 if (U_FAILURE(*status
)) {
301 clone
->fPat
= source
->fPat
;
302 clone
->fPatRefCount
= source
->fPatRefCount
;
303 clone
->fPatString
= source
->fPatString
;
304 clone
->fPatStringLen
= source
->fPatStringLen
;
305 umtx_atomic_inc(source
->fPatRefCount
);
306 // Note: fText is not cloned.
308 return (URegularExpression
*)clone
;
314 //------------------------------------------------------------------------------
318 //------------------------------------------------------------------------------
319 U_CAPI
const UChar
* U_EXPORT2
320 uregex_pattern(const URegularExpression
*regexp2
,
322 UErrorCode
*status
) {
323 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
325 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
328 if (patLength
!= NULL
) {
329 *patLength
= regexp
->fPatStringLen
;
331 return regexp
->fPatString
;
335 //------------------------------------------------------------------------------
337 // uregex_patternUText
339 //------------------------------------------------------------------------------
340 U_CAPI UText
* U_EXPORT2
341 uregex_patternUText(const URegularExpression
*regexp2
,
342 UErrorCode
*status
) {
343 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
344 return regexp
->fPat
->patternText(*status
);
348 //------------------------------------------------------------------------------
352 //------------------------------------------------------------------------------
353 U_CAPI
int32_t U_EXPORT2
354 uregex_flags(const URegularExpression
*regexp2
, UErrorCode
*status
) {
355 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
356 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
359 int32_t flags
= regexp
->fPat
->flags();
364 //------------------------------------------------------------------------------
368 //------------------------------------------------------------------------------
369 U_CAPI
void U_EXPORT2
370 uregex_setText(URegularExpression
*regexp2
,
373 UErrorCode
*status
) {
374 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
375 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
378 if (text
== NULL
|| textLength
< -1) {
379 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
383 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
384 uprv_free((void *)regexp
->fText
);
387 regexp
->fText
= text
;
388 regexp
->fTextLength
= textLength
;
389 regexp
->fOwnsText
= FALSE
;
391 UText input
= UTEXT_INITIALIZER
;
392 utext_openUChars(&input
, text
, textLength
, status
);
393 regexp
->fMatcher
->reset(&input
);
394 utext_close(&input
); // reset() made a shallow clone, so we don't need this copy
398 //------------------------------------------------------------------------------
402 //------------------------------------------------------------------------------
403 U_CAPI
void U_EXPORT2
404 uregex_setUText(URegularExpression
*regexp2
,
406 UErrorCode
*status
) {
407 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
408 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
412 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
416 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
417 uprv_free((void *)regexp
->fText
);
420 regexp
->fText
= NULL
; // only fill it in on request
421 regexp
->fTextLength
= -1;
422 regexp
->fOwnsText
= TRUE
;
423 regexp
->fMatcher
->reset(text
);
428 //------------------------------------------------------------------------------
432 //------------------------------------------------------------------------------
433 U_CAPI
const UChar
* U_EXPORT2
434 uregex_getText(URegularExpression
*regexp2
,
436 UErrorCode
*status
) {
437 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
438 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
442 if (regexp
->fText
== NULL
) {
443 // need to fill in the text
444 UText
*inputText
= regexp
->fMatcher
->inputText();
445 int64_t inputNativeLength
= utext_nativeLength(inputText
);
446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText
, inputNativeLength
)) {
447 regexp
->fText
= inputText
->chunkContents
;
448 regexp
->fTextLength
= (int32_t)inputNativeLength
;
449 regexp
->fOwnsText
= FALSE
; // because the UText owns it
451 UErrorCode lengthStatus
= U_ZERO_ERROR
;
452 regexp
->fTextLength
= utext_extract(inputText
, 0, inputNativeLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
453 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(regexp
->fTextLength
+1));
455 utext_extract(inputText
, 0, inputNativeLength
, inputChars
, regexp
->fTextLength
+1, status
);
456 regexp
->fText
= inputChars
;
457 regexp
->fOwnsText
= TRUE
; // should already be set but just in case
461 if (textLength
!= NULL
) {
462 *textLength
= regexp
->fTextLength
;
464 return regexp
->fText
;
468 //------------------------------------------------------------------------------
472 //------------------------------------------------------------------------------
473 U_CAPI UText
* U_EXPORT2
474 uregex_getUText(URegularExpression
*regexp2
,
476 UErrorCode
*status
) {
477 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
478 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
481 return regexp
->fMatcher
->getInput(dest
, *status
);
485 //------------------------------------------------------------------------------
489 //------------------------------------------------------------------------------
490 U_CAPI UBool U_EXPORT2
491 uregex_matches(URegularExpression
*regexp2
,
493 UErrorCode
*status
) {
494 return uregex_matches64( regexp2
, (int64_t)startIndex
, status
);
497 U_CAPI UBool U_EXPORT2
498 uregex_matches64(URegularExpression
*regexp2
,
500 UErrorCode
*status
) {
501 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
502 UBool result
= FALSE
;
503 if (validateRE(regexp
, status
) == FALSE
) {
506 if (startIndex
== -1) {
507 result
= regexp
->fMatcher
->matches(*status
);
509 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
515 //------------------------------------------------------------------------------
519 //------------------------------------------------------------------------------
520 U_CAPI UBool U_EXPORT2
521 uregex_lookingAt(URegularExpression
*regexp2
,
523 UErrorCode
*status
) {
524 return uregex_lookingAt64( regexp2
, (int64_t)startIndex
, status
);
527 U_CAPI UBool U_EXPORT2
528 uregex_lookingAt64(URegularExpression
*regexp2
,
530 UErrorCode
*status
) {
531 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
532 UBool result
= FALSE
;
533 if (validateRE(regexp
, status
) == FALSE
) {
536 if (startIndex
== -1) {
537 result
= regexp
->fMatcher
->lookingAt(*status
);
539 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
546 //------------------------------------------------------------------------------
550 //------------------------------------------------------------------------------
551 U_CAPI UBool U_EXPORT2
552 uregex_find(URegularExpression
*regexp2
,
554 UErrorCode
*status
) {
555 return uregex_find64( regexp2
, (int64_t)startIndex
, status
);
558 U_CAPI UBool U_EXPORT2
559 uregex_find64(URegularExpression
*regexp2
,
561 UErrorCode
*status
) {
562 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
563 UBool result
= FALSE
;
564 if (validateRE(regexp
, status
) == FALSE
) {
567 if (startIndex
== -1) {
568 regexp
->fMatcher
->resetPreserveRegion();
569 result
= regexp
->fMatcher
->find();
571 result
= regexp
->fMatcher
->find(startIndex
, *status
);
577 //------------------------------------------------------------------------------
581 //------------------------------------------------------------------------------
582 U_CAPI UBool U_EXPORT2
583 uregex_findNext(URegularExpression
*regexp2
,
584 UErrorCode
*status
) {
585 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
586 if (validateRE(regexp
, status
) == FALSE
) {
589 UBool result
= regexp
->fMatcher
->find();
593 //------------------------------------------------------------------------------
597 //------------------------------------------------------------------------------
598 U_CAPI
int32_t U_EXPORT2
599 uregex_groupCount(URegularExpression
*regexp2
,
600 UErrorCode
*status
) {
601 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
602 if (validateRE(regexp
, status
, FALSE
) == FALSE
) {
605 int32_t result
= regexp
->fMatcher
->groupCount();
610 //------------------------------------------------------------------------------
614 //------------------------------------------------------------------------------
615 U_CAPI
int32_t U_EXPORT2
616 uregex_group(URegularExpression
*regexp2
,
619 int32_t destCapacity
,
620 UErrorCode
*status
) {
621 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
622 if (validateRE(regexp
, status
) == FALSE
) {
625 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
626 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
630 if (destCapacity
== 0 || regexp
->fText
!= NULL
) {
631 // If preflighting or if we already have the text as UChars,
632 // this is a little cheaper than going through uregex_groupUTextDeep()
635 // Pick up the range of characters from the matcher
637 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
638 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
639 if (U_FAILURE(*status
)) {
644 // Trim length based on buffer capacity
646 int32_t fullLength
= endIx
- startIx
;
647 int32_t copyLength
= fullLength
;
648 if (copyLength
< destCapacity
) {
649 dest
[copyLength
] = 0;
650 } else if (copyLength
== destCapacity
) {
651 *status
= U_STRING_NOT_TERMINATED_WARNING
;
653 copyLength
= destCapacity
;
654 *status
= U_BUFFER_OVERFLOW_ERROR
;
658 // Copy capture group to user's buffer
660 if (copyLength
> 0) {
661 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
665 UText
*groupText
= uregex_groupUTextDeep(regexp2
, groupNum
, NULL
, status
);
666 int32_t result
= utext_extract(groupText
, 0, utext_nativeLength(groupText
), dest
, destCapacity
, status
);
667 utext_close(groupText
);
673 //------------------------------------------------------------------------------
677 //------------------------------------------------------------------------------
678 U_CAPI UText
* U_EXPORT2
679 uregex_groupUText(URegularExpression
*regexp2
,
682 int64_t *groupLength
,
683 UErrorCode
*status
) {
684 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
685 if (validateRE(regexp
, status
) == FALSE
) {
686 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
687 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
690 return regexp
->fMatcher
->group(groupNum
, dest
, *groupLength
, *status
);
693 //------------------------------------------------------------------------------
695 // uregex_groupUTextDeep
697 //------------------------------------------------------------------------------
698 U_CAPI UText
* U_EXPORT2
699 uregex_groupUTextDeep(URegularExpression
*regexp2
,
702 UErrorCode
*status
) {
703 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
704 if (validateRE(regexp
, status
) == FALSE
) {
705 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
706 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
709 if (regexp
->fText
!= NULL
) {
711 // Pick up the range of characters from the matcher
712 // and use our already-extracted characters
714 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
715 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
716 if (U_FAILURE(*status
)) {
717 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
718 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
722 utext_replace(dest
, 0, utext_nativeLength(dest
), ®exp
->fText
[startIx
], endIx
- startIx
, status
);
724 UText groupText
= UTEXT_INITIALIZER
;
725 utext_openUChars(&groupText
, ®exp
->fText
[startIx
], endIx
- startIx
, status
);
726 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, status
);
727 utext_close(&groupText
);
732 return regexp
->fMatcher
->group(groupNum
, dest
, *status
);
736 //------------------------------------------------------------------------------
740 //------------------------------------------------------------------------------
741 U_CAPI
int32_t U_EXPORT2
742 uregex_start(URegularExpression
*regexp2
,
744 UErrorCode
*status
) {
745 return (int32_t)uregex_start64( regexp2
, groupNum
, status
);
748 U_CAPI
int64_t U_EXPORT2
749 uregex_start64(URegularExpression
*regexp2
,
751 UErrorCode
*status
) {
752 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
753 if (validateRE(regexp
, status
) == FALSE
) {
756 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
760 //------------------------------------------------------------------------------
764 //------------------------------------------------------------------------------
765 U_CAPI
int32_t U_EXPORT2
766 uregex_end(URegularExpression
*regexp2
,
768 UErrorCode
*status
) {
769 return (int32_t)uregex_end64( regexp2
, groupNum
, status
);
772 U_CAPI
int64_t U_EXPORT2
773 uregex_end64(URegularExpression
*regexp2
,
775 UErrorCode
*status
) {
776 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
777 if (validateRE(regexp
, status
) == FALSE
) {
780 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
784 //------------------------------------------------------------------------------
788 //------------------------------------------------------------------------------
789 U_CAPI
void U_EXPORT2
790 uregex_reset(URegularExpression
*regexp2
,
792 UErrorCode
*status
) {
793 uregex_reset64( regexp2
, (int64_t)index
, status
);
796 U_CAPI
void U_EXPORT2
797 uregex_reset64(URegularExpression
*regexp2
,
799 UErrorCode
*status
) {
800 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
801 if (validateRE(regexp
, status
) == FALSE
) {
804 regexp
->fMatcher
->reset(index
, *status
);
808 //------------------------------------------------------------------------------
812 //------------------------------------------------------------------------------
813 U_CAPI
void U_EXPORT2
814 uregex_setRegion(URegularExpression
*regexp2
,
817 UErrorCode
*status
) {
818 uregex_setRegion64( regexp2
, (int64_t)regionStart
, (int64_t)regionLimit
, status
);
821 U_CAPI
void U_EXPORT2
822 uregex_setRegion64(URegularExpression
*regexp2
,
825 UErrorCode
*status
) {
826 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
827 if (validateRE(regexp
, status
) == FALSE
) {
830 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
834 //------------------------------------------------------------------------------
836 // uregex_setRegionAndStart
838 //------------------------------------------------------------------------------
839 U_DRAFT
void U_EXPORT2
840 uregex_setRegionAndStart(URegularExpression
*regexp2
,
844 UErrorCode
*status
) {
845 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
846 if (validateRE(regexp
, status
) == FALSE
) {
849 regexp
->fMatcher
->region(regionStart
, regionLimit
, startIndex
, *status
);
852 //------------------------------------------------------------------------------
854 // uregex_regionStart
856 //------------------------------------------------------------------------------
857 U_CAPI
int32_t U_EXPORT2
858 uregex_regionStart(const URegularExpression
*regexp2
,
859 UErrorCode
*status
) {
860 return (int32_t)uregex_regionStart64(regexp2
, status
);
863 U_CAPI
int64_t U_EXPORT2
864 uregex_regionStart64(const URegularExpression
*regexp2
,
865 UErrorCode
*status
) {
866 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
867 if (validateRE(regexp
, status
) == FALSE
) {
870 return regexp
->fMatcher
->regionStart();
874 //------------------------------------------------------------------------------
878 //------------------------------------------------------------------------------
879 U_CAPI
int32_t U_EXPORT2
880 uregex_regionEnd(const URegularExpression
*regexp2
,
881 UErrorCode
*status
) {
882 return (int32_t)uregex_regionEnd64(regexp2
, status
);
885 U_CAPI
int64_t U_EXPORT2
886 uregex_regionEnd64(const URegularExpression
*regexp2
,
887 UErrorCode
*status
) {
888 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
889 if (validateRE(regexp
, status
) == FALSE
) {
892 return regexp
->fMatcher
->regionEnd();
896 //------------------------------------------------------------------------------
898 // uregex_hasTransparentBounds
900 //------------------------------------------------------------------------------
901 U_CAPI UBool U_EXPORT2
902 uregex_hasTransparentBounds(const URegularExpression
*regexp2
,
903 UErrorCode
*status
) {
904 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
905 if (validateRE(regexp
, status
) == FALSE
) {
908 return regexp
->fMatcher
->hasTransparentBounds();
912 //------------------------------------------------------------------------------
914 // uregex_useTransparentBounds
916 //------------------------------------------------------------------------------
917 U_CAPI
void U_EXPORT2
918 uregex_useTransparentBounds(URegularExpression
*regexp2
,
920 UErrorCode
*status
) {
921 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
922 if (validateRE(regexp
, status
) == FALSE
) {
925 regexp
->fMatcher
->useTransparentBounds(b
);
929 //------------------------------------------------------------------------------
931 // uregex_hasAnchoringBounds
933 //------------------------------------------------------------------------------
934 U_CAPI UBool U_EXPORT2
935 uregex_hasAnchoringBounds(const URegularExpression
*regexp2
,
936 UErrorCode
*status
) {
937 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
938 if (validateRE(regexp
, status
) == FALSE
) {
941 return regexp
->fMatcher
->hasAnchoringBounds();
945 //------------------------------------------------------------------------------
947 // uregex_useAnchoringBounds
949 //------------------------------------------------------------------------------
950 U_CAPI
void U_EXPORT2
951 uregex_useAnchoringBounds(URegularExpression
*regexp2
,
953 UErrorCode
*status
) {
954 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
955 if (validateRE(regexp
, status
) == FALSE
) {
958 regexp
->fMatcher
->useAnchoringBounds(b
);
962 //------------------------------------------------------------------------------
966 //------------------------------------------------------------------------------
967 U_CAPI UBool U_EXPORT2
968 uregex_hitEnd(const URegularExpression
*regexp2
,
969 UErrorCode
*status
) {
970 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
971 if (validateRE(regexp
, status
) == FALSE
) {
974 return regexp
->fMatcher
->hitEnd();
978 //------------------------------------------------------------------------------
982 //------------------------------------------------------------------------------
983 U_CAPI UBool U_EXPORT2
984 uregex_requireEnd(const URegularExpression
*regexp2
,
985 UErrorCode
*status
) {
986 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
987 if (validateRE(regexp
, status
) == FALSE
) {
990 return regexp
->fMatcher
->requireEnd();
994 //------------------------------------------------------------------------------
996 // uregex_setTimeLimit
998 //------------------------------------------------------------------------------
999 U_CAPI
void U_EXPORT2
1000 uregex_setTimeLimit(URegularExpression
*regexp2
,
1002 UErrorCode
*status
) {
1003 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1004 if (validateRE(regexp
, status
)) {
1005 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
1011 //------------------------------------------------------------------------------
1013 // uregex_getTimeLimit
1015 //------------------------------------------------------------------------------
1016 U_CAPI
int32_t U_EXPORT2
1017 uregex_getTimeLimit(const URegularExpression
*regexp2
,
1018 UErrorCode
*status
) {
1020 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1021 if (validateRE(regexp
, status
)) {
1022 retVal
= regexp
->fMatcher
->getTimeLimit();
1029 //------------------------------------------------------------------------------
1031 // uregex_setStackLimit
1033 //------------------------------------------------------------------------------
1034 U_CAPI
void U_EXPORT2
1035 uregex_setStackLimit(URegularExpression
*regexp2
,
1037 UErrorCode
*status
) {
1038 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1039 if (validateRE(regexp
, status
)) {
1040 regexp
->fMatcher
->setStackLimit(limit
, *status
);
1046 //------------------------------------------------------------------------------
1048 // uregex_getStackLimit
1050 //------------------------------------------------------------------------------
1051 U_CAPI
int32_t U_EXPORT2
1052 uregex_getStackLimit(const URegularExpression
*regexp2
,
1053 UErrorCode
*status
) {
1055 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1056 if (validateRE(regexp
, status
)) {
1057 retVal
= regexp
->fMatcher
->getStackLimit();
1063 //------------------------------------------------------------------------------
1065 // uregex_setMatchCallback
1067 //------------------------------------------------------------------------------
1068 U_CAPI
void U_EXPORT2
1069 uregex_setMatchCallback(URegularExpression
*regexp2
,
1070 URegexMatchCallback
*callback
,
1071 const void *context
,
1072 UErrorCode
*status
) {
1073 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1074 if (validateRE(regexp
, status
)) {
1075 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
1080 //------------------------------------------------------------------------------
1082 // uregex_getMatchCallback
1084 //------------------------------------------------------------------------------
1085 U_CAPI
void U_EXPORT2
1086 uregex_getMatchCallback(const URegularExpression
*regexp2
,
1087 URegexMatchCallback
**callback
,
1088 const void **context
,
1089 UErrorCode
*status
) {
1090 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1091 if (validateRE(regexp
, status
)) {
1092 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
1097 //------------------------------------------------------------------------------
1099 // uregex_setFindProgressCallback
1101 //------------------------------------------------------------------------------
1102 U_CAPI
void U_EXPORT2
1103 uregex_setFindProgressCallback(URegularExpression
*regexp2
,
1104 URegexFindProgressCallback
*callback
,
1105 const void *context
,
1106 UErrorCode
*status
) {
1107 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1108 if (validateRE(regexp
, status
)) {
1109 regexp
->fMatcher
->setFindProgressCallback(callback
, context
, *status
);
1114 //------------------------------------------------------------------------------
1116 // uregex_getFindProgressCallback
1118 //------------------------------------------------------------------------------
1119 U_CAPI
void U_EXPORT2
1120 uregex_getFindProgressCallback(const URegularExpression
*regexp2
,
1121 URegexFindProgressCallback
**callback
,
1122 const void **context
,
1123 UErrorCode
*status
) {
1124 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1125 if (validateRE(regexp
, status
)) {
1126 regexp
->fMatcher
->getFindProgressCallback(*callback
, *context
, *status
);
1131 //------------------------------------------------------------------------------
1133 // uregex_replaceAll
1135 //------------------------------------------------------------------------------
1136 U_CAPI
int32_t U_EXPORT2
1137 uregex_replaceAll(URegularExpression
*regexp2
,
1138 const UChar
*replacementText
,
1139 int32_t replacementLength
,
1141 int32_t destCapacity
,
1142 UErrorCode
*status
) {
1143 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1144 if (validateRE(regexp
, status
) == FALSE
) {
1147 if (replacementText
== NULL
|| replacementLength
< -1 ||
1148 (destBuf
== NULL
&& destCapacity
> 0) ||
1150 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1156 uregex_reset(regexp2
, 0, status
);
1158 // Note: Seperate error code variables for findNext() and appendReplacement()
1159 // are used so that destination buffer overflow errors
1160 // in appendReplacement won't stop findNext() from working.
1161 // appendReplacement() and appendTail() special case incoming buffer
1162 // overflow errors, continuing to return the correct length.
1163 UErrorCode findStatus
= *status
;
1164 while (uregex_findNext(regexp2
, &findStatus
)) {
1165 len
+= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1166 &destBuf
, &destCapacity
, status
);
1168 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1170 if (U_FAILURE(findStatus
)) {
1171 // If anything went wrong with the findNext(), make that error trump
1172 // whatever may have happened with the append() operations.
1173 // Errors in findNext() are not expected.
1174 *status
= findStatus
;
1181 //------------------------------------------------------------------------------
1183 // uregex_replaceAllUText
1185 //------------------------------------------------------------------------------
1186 U_CAPI UText
* U_EXPORT2
1187 uregex_replaceAllUText(URegularExpression
*regexp2
,
1188 UText
*replacementText
,
1190 UErrorCode
*status
) {
1191 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1192 if (validateRE(regexp
, status
) == FALSE
) {
1195 if (replacementText
== NULL
) {
1196 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1200 dest
= regexp
->fMatcher
->replaceAll(replacementText
, dest
, *status
);
1205 //------------------------------------------------------------------------------
1207 // uregex_replaceFirst
1209 //------------------------------------------------------------------------------
1210 U_CAPI
int32_t U_EXPORT2
1211 uregex_replaceFirst(URegularExpression
*regexp2
,
1212 const UChar
*replacementText
,
1213 int32_t replacementLength
,
1215 int32_t destCapacity
,
1216 UErrorCode
*status
) {
1217 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1218 if (validateRE(regexp
, status
) == FALSE
) {
1221 if (replacementText
== NULL
|| replacementLength
< -1 ||
1222 (destBuf
== NULL
&& destCapacity
> 0) ||
1224 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1229 UBool findSucceeded
;
1230 uregex_reset(regexp2
, 0, status
);
1231 findSucceeded
= uregex_find(regexp2
, 0, status
);
1232 if (findSucceeded
) {
1233 len
= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1234 &destBuf
, &destCapacity
, status
);
1236 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1242 //------------------------------------------------------------------------------
1244 // uregex_replaceFirstUText
1246 //------------------------------------------------------------------------------
1247 U_CAPI UText
* U_EXPORT2
1248 uregex_replaceFirstUText(URegularExpression
*regexp2
,
1249 UText
*replacementText
,
1251 UErrorCode
*status
) {
1252 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1253 if (validateRE(regexp
, status
) == FALSE
) {
1256 if (replacementText
== NULL
) {
1257 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1261 dest
= regexp
->fMatcher
->replaceFirst(replacementText
, dest
, *status
);
1266 //------------------------------------------------------------------------------
1268 // uregex_appendReplacement
1270 //------------------------------------------------------------------------------
1274 // Dummy class, because these functions need to be friends of class RegexMatcher,
1275 // and stand-alone C functions don't work as friends
1279 inline static int32_t appendReplacement(RegularExpression
*regexp
,
1280 const UChar
*replacementText
,
1281 int32_t replacementLength
,
1283 int32_t *destCapacity
,
1284 UErrorCode
*status
);
1286 inline static int32_t appendTail(RegularExpression
*regexp
,
1288 int32_t *destCapacity
,
1289 UErrorCode
*status
);
1291 inline static int32_t split(RegularExpression
*regexp
,
1293 int32_t destCapacity
,
1294 int32_t *requiredCapacity
,
1295 UChar
*destFields
[],
1296 int32_t destFieldsCapacity
,
1297 UErrorCode
*status
);
1304 static const UChar BACKSLASH
= 0x5c;
1305 static const UChar DOLLARSIGN
= 0x24;
1308 // Move a character to an output buffer, with bounds checking on the index.
1309 // Index advances even if capacity is exceeded, for preflight size computations.
1310 // This little sequence is used a LOT.
1312 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
1313 if (*idx
< bufCapacity
) {
1321 // appendReplacement, the actual implementation.
1323 int32_t RegexCImpl::appendReplacement(RegularExpression
*regexp
,
1324 const UChar
*replacementText
,
1325 int32_t replacementLength
,
1327 int32_t *destCapacity
,
1328 UErrorCode
*status
) {
1330 // If we come in with a buffer overflow error, don't suppress the operation.
1331 // A series of appendReplacements, appendTail need to correctly preflight
1332 // the buffer size when an overflow happens somewhere in the middle.
1333 UBool pendingBufferOverflow
= FALSE
;
1334 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1335 pendingBufferOverflow
= TRUE
;
1336 *status
= U_ZERO_ERROR
;
1340 // Validate all paramters
1342 if (validateRE(regexp
, status
) == FALSE
) {
1345 if (replacementText
== NULL
|| replacementLength
< -1 ||
1346 destCapacity
== NULL
|| destBuf
== NULL
||
1347 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1348 *destCapacity
< 0) {
1349 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1353 RegexMatcher
*m
= regexp
->fMatcher
;
1354 if (m
->fMatch
== FALSE
) {
1355 *status
= U_REGEX_INVALID_STATE
;
1359 UChar
*dest
= *destBuf
;
1360 int32_t capacity
= *destCapacity
;
1361 int32_t destIdx
= 0;
1364 // If it wasn't supplied by the caller, get the length of the replacement text.
1365 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1366 // the fly and avoid this step.
1367 if (replacementLength
== -1) {
1368 replacementLength
= u_strlen(replacementText
);
1371 // Copy input string from the end of previous match to start of current match
1372 if (regexp
->fText
!= NULL
) {
1374 int32_t lastMatchEnd
;
1375 if (UTEXT_USES_U16(m
->fInputText
)) {
1376 lastMatchEnd
= (int32_t)m
->fLastMatchEnd
;
1377 matchStart
= (int32_t)m
->fMatchStart
;
1379 // !!!: Would like a better way to do this!
1380 UErrorCode status
= U_ZERO_ERROR
;
1381 lastMatchEnd
= utext_extract(m
->fInputText
, 0, m
->fLastMatchEnd
, NULL
, 0, &status
);
1382 status
= U_ZERO_ERROR
;
1383 matchStart
= lastMatchEnd
+ utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
, NULL
, 0, &status
);
1385 for (i
=lastMatchEnd
; i
<matchStart
; i
++) {
1386 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
1389 UErrorCode possibleOverflowError
= U_ZERO_ERROR
; // ignore
1390 destIdx
+= utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
,
1391 &dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), &possibleOverflowError
);
1395 // scan the replacement text, looking for substitutions ($n) and \escapes.
1396 int32_t replIdx
= 0;
1397 while (replIdx
< replacementLength
) {
1398 UChar c
= replacementText
[replIdx
];
1400 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
1401 // Common case, no substitution, no escaping,
1402 // just copy the char to the dest buf.
1403 appendToBuf(c
, &destIdx
, dest
, capacity
);
1407 if (c
== BACKSLASH
) {
1408 // Backslash Escape. Copy the following char out without further checks.
1409 // Note: Surrogate pairs don't need any special handling
1410 // The second half wont be a '$' or a '\', and
1411 // will move to the dest normally on the next
1413 if (replIdx
>= replacementLength
) {
1416 c
= replacementText
[replIdx
];
1418 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
1419 // We have a \udddd or \Udddddddd escape sequence.
1420 UChar32 escapedChar
=
1421 u_unescapeAt(uregex_ucstr_unescape_charAt
,
1422 &replIdx
, // Index is updated by unescapeAt
1423 replacementLength
, // Length of replacement text
1424 (void *)replacementText
);
1426 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
1427 if (escapedChar
<= 0xffff) {
1428 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
1430 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
1431 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
1435 // Note: if the \u escape was invalid, just fall through and
1436 // treat it as a plain \<anything> escape.
1439 // Plain backslash escape. Just put out the escaped character.
1440 appendToBuf(c
, &destIdx
, dest
, capacity
);
1448 // We've got a $. Pick up a capture group number if one follows.
1449 // Consume at most the number of digits necessary for the largest capture
1450 // number that is valid for this pattern.
1452 int32_t numDigits
= 0;
1453 int32_t groupNum
= 0;
1456 if (replIdx
>= replacementLength
) {
1459 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
1460 if (u_isdigit(digitC
) == FALSE
) {
1464 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1465 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
1467 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
1473 if (numDigits
== 0) {
1474 // The $ didn't introduce a group number at all.
1475 // Treat it as just part of the substitution text.
1476 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
1480 // Finally, append the capture group data to the destination.
1481 destIdx
+= uregex_group((URegularExpression
*)regexp
, groupNum
, &dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), status
);
1482 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1483 // Ignore buffer overflow when extracting the group. We need to
1484 // continue on to get full size of the untruncated result. We will
1485 // raise our own buffer overflow error at the end.
1486 *status
= U_ZERO_ERROR
;
1489 if (U_FAILURE(*status
)) {
1490 // Can fail if group number is out of range.
1497 // Nul Terminate the dest buffer if possible.
1498 // Set the appropriate buffer overflow or not terminated error, if needed.
1500 if (destIdx
< capacity
) {
1502 } else if (destIdx
== *destCapacity
) {
1503 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1505 *status
= U_BUFFER_OVERFLOW_ERROR
;
1509 // Return an updated dest buffer and capacity to the caller.
1511 if (destIdx
> 0 && *destCapacity
> 0) {
1512 if (destIdx
< capacity
) {
1513 *destBuf
+= destIdx
;
1514 *destCapacity
-= destIdx
;
1516 *destBuf
+= capacity
;
1521 // If we came in with a buffer overflow, make sure we go out with one also.
1522 // (A zero length match right at the end of the previous match could
1523 // make this function succeed even though a previous call had overflowed the buf)
1524 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1525 *status
= U_BUFFER_OVERFLOW_ERROR
;
1532 // appendReplacement the actual API function,
1534 U_CAPI
int32_t U_EXPORT2
1535 uregex_appendReplacement(URegularExpression
*regexp2
,
1536 const UChar
*replacementText
,
1537 int32_t replacementLength
,
1539 int32_t *destCapacity
,
1540 UErrorCode
*status
) {
1542 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1543 return RegexCImpl::appendReplacement(
1544 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1548 // uregex_appendReplacementUText...can just use the normal C++ method
1550 U_CAPI
void U_EXPORT2
1551 uregex_appendReplacementUText(URegularExpression
*regexp2
,
1554 UErrorCode
*status
) {
1555 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1556 regexp
->fMatcher
->appendReplacement(dest
, replText
, *status
);
1560 //------------------------------------------------------------------------------
1562 // uregex_appendTail
1564 //------------------------------------------------------------------------------
1565 int32_t RegexCImpl::appendTail(RegularExpression
*regexp
,
1567 int32_t *destCapacity
,
1571 // If we come in with a buffer overflow error, don't suppress the operation.
1572 // A series of appendReplacements, appendTail need to correctly preflight
1573 // the buffer size when an overflow happens somewhere in the middle.
1574 UBool pendingBufferOverflow
= FALSE
;
1575 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1576 pendingBufferOverflow
= TRUE
;
1577 *status
= U_ZERO_ERROR
;
1580 if (validateRE(regexp
, status
) == FALSE
) {
1584 if (destCapacity
== NULL
|| destBuf
== NULL
||
1585 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1588 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1592 RegexMatcher
*m
= regexp
->fMatcher
;
1594 int32_t destIdx
= 0;
1595 int32_t destCap
= *destCapacity
;
1596 UChar
*dest
= *destBuf
;
1598 if (regexp
->fText
!= NULL
) {
1600 int64_t nativeIdx
= (m
->fMatch
? m
->fMatchEnd
: m
->fLastMatchEnd
);
1601 if (nativeIdx
== -1) {
1603 } else if (UTEXT_USES_U16(m
->fInputText
)) {
1604 srcIdx
= (int32_t)nativeIdx
;
1606 UErrorCode status
= U_ZERO_ERROR
;
1607 srcIdx
= utext_extract(m
->fInputText
, 0, nativeIdx
, NULL
, 0, &status
);
1611 if (srcIdx
== regexp
->fTextLength
) {
1614 UChar c
= regexp
->fText
[srcIdx
];
1615 if (c
== 0 && regexp
->fTextLength
== -1) {
1616 regexp
->fTextLength
= srcIdx
;
1619 if (destIdx
< destCap
) {
1622 // We've overflowed the dest buffer.
1623 // If the total input string length is known, we can
1624 // compute the total buffer size needed without scanning through the string.
1625 if (regexp
->fTextLength
> 0) {
1626 destIdx
+= (regexp
->fTextLength
- srcIdx
);
1636 // The most recent call to find() succeeded.
1637 srcIdx
= m
->fMatchEnd
;
1639 // The last call to find() on this matcher failed().
1640 // Look back to the end of the last find() that succeeded for src index.
1641 srcIdx
= m
->fLastMatchEnd
;
1643 // There has been no successful match with this matcher.
1644 // We want to copy the whole string.
1649 destIdx
= utext_extract(m
->fInputText
, srcIdx
, m
->fInputLength
, dest
, destCap
, status
);
1653 // NUL terminate the output string, if possible, otherwise issue the
1654 // appropriate error or warning.
1656 if (destIdx
< destCap
) {
1658 } else if (destIdx
== destCap
) {
1659 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1661 *status
= U_BUFFER_OVERFLOW_ERROR
;
1665 // Update the user's buffer ptr and capacity vars to reflect the
1668 if (destIdx
< destCap
) {
1669 *destBuf
+= destIdx
;
1670 *destCapacity
-= destIdx
;
1672 *destBuf
+= destCap
;
1676 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1677 *status
= U_BUFFER_OVERFLOW_ERROR
;
1685 // appendTail the actual API function
1687 U_CAPI
int32_t U_EXPORT2
1688 uregex_appendTail(URegularExpression
*regexp2
,
1690 int32_t *destCapacity
,
1691 UErrorCode
*status
) {
1692 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1693 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1698 // uregex_appendTailUText...can just use the normal C++ method
1700 U_CAPI UText
* U_EXPORT2
1701 uregex_appendTailUText(URegularExpression
*regexp2
,
1703 UErrorCode
*status
) {
1704 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1705 return regexp
->fMatcher
->appendTail(dest
, *status
);
1709 //------------------------------------------------------------------------------
1711 // copyString Internal utility to copy a string to an output buffer,
1712 // while managing buffer overflow and preflight size
1713 // computation. NUL termination is added to destination,
1714 // and the NUL is counted in the output size.
1716 //------------------------------------------------------------------------------
1718 static void copyString(UChar
*destBuffer
, // Destination buffer.
1719 int32_t destCapacity
, // Total capacity of dest buffer
1720 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1721 // Update not clipped to destCapacity.
1722 const UChar
*srcPtr
, // Pointer to source string
1723 int32_t srcLen
) // Source string len.
1726 int32_t di
= *destIndex
;
1729 for (si
=0; si
<srcLen
; si
++) {
1731 if (di
< destCapacity
) {
1739 if (di
<destCapacity
) {
1747 //------------------------------------------------------------------------------
1751 //------------------------------------------------------------------------------
1752 int32_t RegexCImpl::split(RegularExpression
*regexp
,
1754 int32_t destCapacity
,
1755 int32_t *requiredCapacity
,
1756 UChar
*destFields
[],
1757 int32_t destFieldsCapacity
,
1758 UErrorCode
*status
) {
1760 // Reset for the input text
1762 regexp
->fMatcher
->reset();
1763 UText
*inputText
= regexp
->fMatcher
->fInputText
;
1764 int64_t nextOutputStringStart
= 0;
1765 int64_t inputLen
= regexp
->fMatcher
->fInputLength
;
1766 if (inputLen
== 0) {
1771 // Loop through the input text, searching for the delimiter pattern
1773 int32_t i
; // Index of the field being processed.
1774 int32_t destIdx
= 0; // Next available position in destBuf;
1775 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1776 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow errors so that the strings are still counted
1778 if (i
>=destFieldsCapacity
-1) {
1779 // There are one or zero output strings left.
1780 // Fill the last output string with whatever is left from the input, then exit the loop.
1781 // ( i will be == destFieldsCapacity if we filled the output array while processing
1782 // capture groups of the delimiter expression, in which case we will discard the
1783 // last capture group saved in favor of the unprocessed remainder of the
1785 if (inputLen
> nextOutputStringStart
) {
1786 if (i
!= destFieldsCapacity
-1) {
1787 // No fields are left. Recycle the last one for holding the trailing part of
1788 // the input string.
1789 i
= destFieldsCapacity
-1;
1790 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1793 destFields
[i
] = &destBuf
[destIdx
];
1794 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1795 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1800 if (regexp
->fMatcher
->find()) {
1801 // We found another delimiter. Move everything from where we started looking
1802 // up until the start of the delimiter into the next output string.
1803 destFields
[i
] = &destBuf
[destIdx
];
1805 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, regexp
->fMatcher
->fMatchStart
,
1806 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1807 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1808 tStatus
= U_ZERO_ERROR
;
1812 nextOutputStringStart
= regexp
->fMatcher
->fMatchEnd
;
1814 // If the delimiter pattern has capturing parentheses, the captured
1815 // text goes out into the next n destination strings.
1817 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1818 // If we've run out of output string slots, bail out.
1819 if (i
==destFieldsCapacity
-1) {
1824 // Set up to extract the capture group contents into the dest buffer.
1825 destFields
[i
] = &destBuf
[destIdx
];
1826 tStatus
= U_ZERO_ERROR
;
1827 int32_t t
= uregex_group((URegularExpression
*)regexp
, groupNum
, destFields
[i
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1828 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1829 // +1 for the NUL that terminates the string.
1830 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1831 tStatus
= U_ZERO_ERROR
;
1837 if (nextOutputStringStart
== inputLen
) {
1838 // The delimiter was at the end of the string. We're done.
1845 // We ran off the end of the input while looking for the next delimiter.
1846 // All the remaining text goes into the current output string.
1847 destFields
[i
] = &destBuf
[destIdx
];
1848 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1849 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1854 // Zero out any unused portion of the destFields array
1856 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1857 destFields
[j
] = NULL
;
1860 if (requiredCapacity
!= NULL
) {
1861 *requiredCapacity
= destIdx
;
1863 if (destIdx
> destCapacity
) {
1864 *status
= U_BUFFER_OVERFLOW_ERROR
;
1870 // uregex_split The actual API function
1872 U_CAPI
int32_t U_EXPORT2
1873 uregex_split(URegularExpression
*regexp2
,
1875 int32_t destCapacity
,
1876 int32_t *requiredCapacity
,
1877 UChar
*destFields
[],
1878 int32_t destFieldsCapacity
,
1879 UErrorCode
*status
) {
1880 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1881 if (validateRE(regexp
, status
) == FALSE
) {
1884 if ((destBuf
== NULL
&& destCapacity
> 0) ||
1886 destFields
== NULL
||
1887 destFieldsCapacity
< 1 ) {
1888 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1892 return RegexCImpl::split(regexp
, destBuf
, destCapacity
, requiredCapacity
, destFields
, destFieldsCapacity
, status
);
1897 // uregex_splitUText...can just use the normal C++ method
1899 U_CAPI
int32_t U_EXPORT2
1900 uregex_splitUText(URegularExpression
*regexp2
,
1901 UText
*destFields
[],
1902 int32_t destFieldsCapacity
,
1903 UErrorCode
*status
) {
1904 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1905 return regexp
->fMatcher
->split(regexp
->fMatcher
->inputText(), destFields
, destFieldsCapacity
, *status
);
1909 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS