2 *******************************************************************************
3 * Copyright (C) 2004-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: uregex.cpp
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "unicode/utf16.h"
30 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
32 struct RegularExpression
: public UMemory
{
38 u_atomic_int32_t
*fPatRefCount
;
40 int32_t fPatStringLen
;
41 RegexMatcher
*fMatcher
;
42 const UChar
*fText
; // Text from setText()
43 int32_t fTextLength
; // Length provided by user with setText(), which
48 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
50 RegularExpression::RegularExpression() {
62 RegularExpression::~RegularExpression() {
65 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
67 uprv_free(fPatString
);
68 uprv_free((void *)fPatRefCount
);
70 if (fOwnsText
&& fText
!=NULL
) {
71 uprv_free((void *)fText
);
80 //----------------------------------------------------------------------------------------
82 // validateRE Do boilerplate style checks on API function parameters.
83 // Return TRUE if they look OK.
84 //----------------------------------------------------------------------------------------
85 static UBool
validateRE(const RegularExpression
*re
, UBool requiresText
, UErrorCode
*status
) {
86 if (U_FAILURE(*status
)) {
89 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
90 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94 if (requiresText
&& re
->fText
== NULL
&& !re
->fOwnsText
) {
95 *status
= U_REGEX_INVALID_STATE
;
101 //----------------------------------------------------------------------------------------
105 //----------------------------------------------------------------------------------------
106 U_CAPI URegularExpression
* U_EXPORT2
107 uregex_open( const UChar
*pattern
,
108 int32_t patternLength
,
111 UErrorCode
*status
) {
113 if (U_FAILURE(*status
)) {
116 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
117 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
120 int32_t actualPatLen
= patternLength
;
121 if (actualPatLen
== -1) {
122 actualPatLen
= u_strlen(pattern
);
125 RegularExpression
*re
= new RegularExpression
;
126 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
127 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
128 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
129 *status
= U_MEMORY_ALLOCATION_ERROR
;
131 uprv_free((void *)refC
);
135 re
->fPatRefCount
= refC
;
136 *re
->fPatRefCount
= 1;
139 // Make a copy of the pattern string, so we can return it later if asked.
140 // For compiling the pattern, we will use a UText wrapper around
141 // this local copy, to avoid making even more copies.
143 re
->fPatString
= patBuf
;
144 re
->fPatStringLen
= patternLength
;
145 u_memcpy(patBuf
, pattern
, actualPatLen
);
146 patBuf
[actualPatLen
] = 0;
148 UText patText
= UTEXT_INITIALIZER
;
149 utext_openUChars(&patText
, patBuf
, patternLength
, status
);
152 // Compile the pattern
155 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
157 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
159 utext_close(&patText
);
161 if (U_FAILURE(*status
)) {
166 // Create the matcher object
168 re
->fMatcher
= re
->fPat
->matcher(*status
);
169 if (U_SUCCESS(*status
)) {
170 return (URegularExpression
*)re
;
179 //----------------------------------------------------------------------------------------
183 //----------------------------------------------------------------------------------------
184 U_CAPI URegularExpression
* U_EXPORT2
185 uregex_openUText(UText
*pattern
,
188 UErrorCode
*status
) {
190 if (U_FAILURE(*status
)) {
193 if (pattern
== NULL
) {
194 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
198 int64_t patternNativeLength
= utext_nativeLength(pattern
);
200 if (patternNativeLength
== 0) {
201 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
205 RegularExpression
*re
= new RegularExpression
;
207 UErrorCode lengthStatus
= U_ZERO_ERROR
;
208 int32_t pattern16Length
= utext_extract(pattern
, 0, patternNativeLength
, NULL
, 0, &lengthStatus
);
210 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
211 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(pattern16Length
+1));
212 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
213 *status
= U_MEMORY_ALLOCATION_ERROR
;
215 uprv_free((void *)refC
);
219 re
->fPatRefCount
= refC
;
220 *re
->fPatRefCount
= 1;
223 // Make a copy of the pattern string, so we can return it later if asked.
224 // For compiling the pattern, we will use a read-only UText wrapper
225 // around this local copy, to avoid making even more copies.
227 re
->fPatString
= patBuf
;
228 re
->fPatStringLen
= pattern16Length
;
229 utext_extract(pattern
, 0, patternNativeLength
, patBuf
, pattern16Length
+1, status
);
231 UText patText
= UTEXT_INITIALIZER
;
232 utext_openUChars(&patText
, patBuf
, pattern16Length
, status
);
235 // Compile the pattern
238 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
240 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
242 utext_close(&patText
);
244 if (U_FAILURE(*status
)) {
249 // Create the matcher object
251 re
->fMatcher
= re
->fPat
->matcher(*status
);
252 if (U_SUCCESS(*status
)) {
253 return (URegularExpression
*)re
;
262 //----------------------------------------------------------------------------------------
266 //----------------------------------------------------------------------------------------
267 U_CAPI
void U_EXPORT2
268 uregex_close(URegularExpression
*re2
) {
269 RegularExpression
*re
= (RegularExpression
*)re2
;
270 UErrorCode status
= U_ZERO_ERROR
;
271 if (validateRE(re
, FALSE
, &status
) == FALSE
) {
278 //----------------------------------------------------------------------------------------
282 //----------------------------------------------------------------------------------------
283 U_CAPI URegularExpression
* U_EXPORT2
284 uregex_clone(const URegularExpression
*source2
, UErrorCode
*status
) {
285 RegularExpression
*source
= (RegularExpression
*)source2
;
286 if (validateRE(source
, FALSE
, status
) == FALSE
) {
290 RegularExpression
*clone
= new RegularExpression
;
292 *status
= U_MEMORY_ALLOCATION_ERROR
;
296 clone
->fMatcher
= source
->fPat
->matcher(*status
);
297 if (U_FAILURE(*status
)) {
302 clone
->fPat
= source
->fPat
;
303 clone
->fPatRefCount
= source
->fPatRefCount
;
304 clone
->fPatString
= source
->fPatString
;
305 clone
->fPatStringLen
= source
->fPatStringLen
;
306 umtx_atomic_inc(source
->fPatRefCount
);
307 // Note: fText is not cloned.
309 return (URegularExpression
*)clone
;
315 //------------------------------------------------------------------------------
319 //------------------------------------------------------------------------------
320 U_CAPI
const UChar
* U_EXPORT2
321 uregex_pattern(const URegularExpression
*regexp2
,
323 UErrorCode
*status
) {
324 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
326 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
329 if (patLength
!= NULL
) {
330 *patLength
= regexp
->fPatStringLen
;
332 return regexp
->fPatString
;
336 //------------------------------------------------------------------------------
338 // uregex_patternUText
340 //------------------------------------------------------------------------------
341 U_CAPI UText
* U_EXPORT2
342 uregex_patternUText(const URegularExpression
*regexp2
,
343 UErrorCode
*status
) {
344 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
345 return regexp
->fPat
->patternText(*status
);
349 //------------------------------------------------------------------------------
353 //------------------------------------------------------------------------------
354 U_CAPI
int32_t U_EXPORT2
355 uregex_flags(const URegularExpression
*regexp2
, UErrorCode
*status
) {
356 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
357 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
360 int32_t flags
= regexp
->fPat
->flags();
365 //------------------------------------------------------------------------------
369 //------------------------------------------------------------------------------
370 U_CAPI
void U_EXPORT2
371 uregex_setText(URegularExpression
*regexp2
,
374 UErrorCode
*status
) {
375 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
376 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
379 if (text
== NULL
|| textLength
< -1) {
380 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
384 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
385 uprv_free((void *)regexp
->fText
);
388 regexp
->fText
= text
;
389 regexp
->fTextLength
= textLength
;
390 regexp
->fOwnsText
= FALSE
;
392 UText input
= UTEXT_INITIALIZER
;
393 utext_openUChars(&input
, text
, textLength
, status
);
394 regexp
->fMatcher
->reset(&input
);
395 utext_close(&input
); // reset() made a shallow clone, so we don't need this copy
399 //------------------------------------------------------------------------------
403 //------------------------------------------------------------------------------
404 U_CAPI
void U_EXPORT2
405 uregex_setUText(URegularExpression
*regexp2
,
407 UErrorCode
*status
) {
408 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
409 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
413 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
417 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
418 uprv_free((void *)regexp
->fText
);
421 regexp
->fText
= NULL
; // only fill it in on request
422 regexp
->fTextLength
= -1;
423 regexp
->fOwnsText
= TRUE
;
424 regexp
->fMatcher
->reset(text
);
429 //------------------------------------------------------------------------------
433 //------------------------------------------------------------------------------
434 U_CAPI
const UChar
* U_EXPORT2
435 uregex_getText(URegularExpression
*regexp2
,
437 UErrorCode
*status
) {
438 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
439 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
443 if (regexp
->fText
== NULL
) {
444 // need to fill in the text
445 UText
*inputText
= regexp
->fMatcher
->inputText();
446 int64_t inputNativeLength
= utext_nativeLength(inputText
);
447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText
, inputNativeLength
)) {
448 regexp
->fText
= inputText
->chunkContents
;
449 regexp
->fTextLength
= (int32_t)inputNativeLength
;
450 regexp
->fOwnsText
= FALSE
; // because the UText owns it
452 UErrorCode lengthStatus
= U_ZERO_ERROR
;
453 regexp
->fTextLength
= utext_extract(inputText
, 0, inputNativeLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
454 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(regexp
->fTextLength
+1));
456 utext_extract(inputText
, 0, inputNativeLength
, inputChars
, regexp
->fTextLength
+1, status
);
457 regexp
->fText
= inputChars
;
458 regexp
->fOwnsText
= TRUE
; // should already be set but just in case
462 if (textLength
!= NULL
) {
463 *textLength
= regexp
->fTextLength
;
465 return regexp
->fText
;
469 //------------------------------------------------------------------------------
473 //------------------------------------------------------------------------------
474 U_CAPI UText
* U_EXPORT2
475 uregex_getUText(URegularExpression
*regexp2
,
477 UErrorCode
*status
) {
478 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
479 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
482 return regexp
->fMatcher
->getInput(dest
, *status
);
486 //------------------------------------------------------------------------------
488 // uregex_refreshUText
490 //------------------------------------------------------------------------------
491 U_CAPI
void U_EXPORT2
492 uregex_refreshUText(URegularExpression
*regexp2
,
494 UErrorCode
*status
) {
495 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
496 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
499 regexp
->fMatcher
->refreshInputText(text
, *status
);
503 //------------------------------------------------------------------------------
507 //------------------------------------------------------------------------------
508 U_CAPI UBool U_EXPORT2
509 uregex_matches(URegularExpression
*regexp2
,
511 UErrorCode
*status
) {
512 return uregex_matches64( regexp2
, (int64_t)startIndex
, status
);
515 U_CAPI UBool U_EXPORT2
516 uregex_matches64(URegularExpression
*regexp2
,
518 UErrorCode
*status
) {
519 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
520 UBool result
= FALSE
;
521 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
524 if (startIndex
== -1) {
525 result
= regexp
->fMatcher
->matches(*status
);
527 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
533 //------------------------------------------------------------------------------
537 //------------------------------------------------------------------------------
538 U_CAPI UBool U_EXPORT2
539 uregex_lookingAt(URegularExpression
*regexp2
,
541 UErrorCode
*status
) {
542 return uregex_lookingAt64( regexp2
, (int64_t)startIndex
, status
);
545 U_CAPI UBool U_EXPORT2
546 uregex_lookingAt64(URegularExpression
*regexp2
,
548 UErrorCode
*status
) {
549 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
550 UBool result
= FALSE
;
551 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
554 if (startIndex
== -1) {
555 result
= regexp
->fMatcher
->lookingAt(*status
);
557 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
564 //------------------------------------------------------------------------------
568 //------------------------------------------------------------------------------
569 U_CAPI UBool U_EXPORT2
570 uregex_find(URegularExpression
*regexp2
,
572 UErrorCode
*status
) {
573 return uregex_find64( regexp2
, (int64_t)startIndex
, status
);
576 U_CAPI UBool U_EXPORT2
577 uregex_find64(URegularExpression
*regexp2
,
579 UErrorCode
*status
) {
580 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
581 UBool result
= FALSE
;
582 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
585 if (startIndex
== -1) {
586 regexp
->fMatcher
->resetPreserveRegion();
587 result
= regexp
->fMatcher
->find();
589 result
= regexp
->fMatcher
->find(startIndex
, *status
);
595 //------------------------------------------------------------------------------
599 //------------------------------------------------------------------------------
600 U_CAPI UBool U_EXPORT2
601 uregex_findNext(URegularExpression
*regexp2
,
602 UErrorCode
*status
) {
603 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
604 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
607 UBool result
= regexp
->fMatcher
->find();
611 //------------------------------------------------------------------------------
615 //------------------------------------------------------------------------------
616 U_CAPI
int32_t U_EXPORT2
617 uregex_groupCount(URegularExpression
*regexp2
,
618 UErrorCode
*status
) {
619 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
620 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
623 int32_t result
= regexp
->fMatcher
->groupCount();
628 //------------------------------------------------------------------------------
632 //------------------------------------------------------------------------------
633 U_CAPI
int32_t U_EXPORT2
634 uregex_group(URegularExpression
*regexp2
,
637 int32_t destCapacity
,
638 UErrorCode
*status
) {
639 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
640 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
643 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
644 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
648 if (destCapacity
== 0 || regexp
->fText
!= NULL
) {
649 // If preflighting or if we already have the text as UChars,
650 // this is a little cheaper than going through uregex_groupUTextDeep()
653 // Pick up the range of characters from the matcher
655 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
656 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
657 if (U_FAILURE(*status
)) {
662 // Trim length based on buffer capacity
664 int32_t fullLength
= endIx
- startIx
;
665 int32_t copyLength
= fullLength
;
666 if (copyLength
< destCapacity
) {
667 dest
[copyLength
] = 0;
668 } else if (copyLength
== destCapacity
) {
669 *status
= U_STRING_NOT_TERMINATED_WARNING
;
671 copyLength
= destCapacity
;
672 *status
= U_BUFFER_OVERFLOW_ERROR
;
676 // Copy capture group to user's buffer
678 if (copyLength
> 0) {
679 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
684 UText
*groupText
= uregex_groupUTextDeep(regexp2
, groupNum
, NULL
, status
);
685 if (U_SUCCESS(*status
)) {
686 result
= utext_extract(groupText
, 0, utext_nativeLength(groupText
), dest
, destCapacity
, status
);
688 utext_close(groupText
);
694 //------------------------------------------------------------------------------
698 //------------------------------------------------------------------------------
699 U_CAPI UText
* U_EXPORT2
700 uregex_groupUText(URegularExpression
*regexp2
,
703 int64_t *groupLength
,
704 UErrorCode
*status
) {
705 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
706 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
707 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
708 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
711 return regexp
->fMatcher
->group(groupNum
, dest
, *groupLength
, *status
);
714 //------------------------------------------------------------------------------
716 // uregex_groupUTextDeep
718 //------------------------------------------------------------------------------
719 U_CAPI UText
* U_EXPORT2
720 uregex_groupUTextDeep(URegularExpression
*regexp2
,
723 UErrorCode
*status
) {
724 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
725 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
726 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
727 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
730 if (regexp
->fText
!= NULL
) {
732 // Pick up the range of characters from the matcher
733 // and use our already-extracted characters
735 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
736 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
737 if (U_FAILURE(*status
)) {
738 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
739 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
743 utext_replace(dest
, 0, utext_nativeLength(dest
), ®exp
->fText
[startIx
], endIx
- startIx
, status
);
745 UText groupText
= UTEXT_INITIALIZER
;
746 utext_openUChars(&groupText
, ®exp
->fText
[startIx
], endIx
- startIx
, status
);
747 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, status
);
748 utext_close(&groupText
);
753 return regexp
->fMatcher
->group(groupNum
, dest
, *status
);
757 //------------------------------------------------------------------------------
761 //------------------------------------------------------------------------------
762 U_CAPI
int32_t U_EXPORT2
763 uregex_start(URegularExpression
*regexp2
,
765 UErrorCode
*status
) {
766 return (int32_t)uregex_start64( regexp2
, groupNum
, status
);
769 U_CAPI
int64_t U_EXPORT2
770 uregex_start64(URegularExpression
*regexp2
,
772 UErrorCode
*status
) {
773 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
774 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
777 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
781 //------------------------------------------------------------------------------
785 //------------------------------------------------------------------------------
786 U_CAPI
int32_t U_EXPORT2
787 uregex_end(URegularExpression
*regexp2
,
789 UErrorCode
*status
) {
790 return (int32_t)uregex_end64( regexp2
, groupNum
, status
);
793 U_CAPI
int64_t U_EXPORT2
794 uregex_end64(URegularExpression
*regexp2
,
796 UErrorCode
*status
) {
797 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
798 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
801 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
805 //------------------------------------------------------------------------------
809 //------------------------------------------------------------------------------
810 U_CAPI
void U_EXPORT2
811 uregex_reset(URegularExpression
*regexp2
,
813 UErrorCode
*status
) {
814 uregex_reset64( regexp2
, (int64_t)index
, status
);
817 U_CAPI
void U_EXPORT2
818 uregex_reset64(URegularExpression
*regexp2
,
820 UErrorCode
*status
) {
821 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
822 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
825 regexp
->fMatcher
->reset(index
, *status
);
829 //------------------------------------------------------------------------------
833 //------------------------------------------------------------------------------
834 U_CAPI
void U_EXPORT2
835 uregex_setRegion(URegularExpression
*regexp2
,
838 UErrorCode
*status
) {
839 uregex_setRegion64( regexp2
, (int64_t)regionStart
, (int64_t)regionLimit
, status
);
842 U_CAPI
void U_EXPORT2
843 uregex_setRegion64(URegularExpression
*regexp2
,
846 UErrorCode
*status
) {
847 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
848 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
851 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
855 //------------------------------------------------------------------------------
857 // uregex_setRegionAndStart
859 //------------------------------------------------------------------------------
860 U_CAPI
void U_EXPORT2
861 uregex_setRegionAndStart(URegularExpression
*regexp2
,
865 UErrorCode
*status
) {
866 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
867 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
870 regexp
->fMatcher
->region(regionStart
, regionLimit
, startIndex
, *status
);
873 //------------------------------------------------------------------------------
875 // uregex_regionStart
877 //------------------------------------------------------------------------------
878 U_CAPI
int32_t U_EXPORT2
879 uregex_regionStart(const URegularExpression
*regexp2
,
880 UErrorCode
*status
) {
881 return (int32_t)uregex_regionStart64(regexp2
, status
);
884 U_CAPI
int64_t U_EXPORT2
885 uregex_regionStart64(const URegularExpression
*regexp2
,
886 UErrorCode
*status
) {
887 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
888 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
891 return regexp
->fMatcher
->regionStart();
895 //------------------------------------------------------------------------------
899 //------------------------------------------------------------------------------
900 U_CAPI
int32_t U_EXPORT2
901 uregex_regionEnd(const URegularExpression
*regexp2
,
902 UErrorCode
*status
) {
903 return (int32_t)uregex_regionEnd64(regexp2
, status
);
906 U_CAPI
int64_t U_EXPORT2
907 uregex_regionEnd64(const URegularExpression
*regexp2
,
908 UErrorCode
*status
) {
909 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
910 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
913 return regexp
->fMatcher
->regionEnd();
917 //------------------------------------------------------------------------------
919 // uregex_hasTransparentBounds
921 //------------------------------------------------------------------------------
922 U_CAPI UBool U_EXPORT2
923 uregex_hasTransparentBounds(const URegularExpression
*regexp2
,
924 UErrorCode
*status
) {
925 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
926 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
929 return regexp
->fMatcher
->hasTransparentBounds();
933 //------------------------------------------------------------------------------
935 // uregex_useTransparentBounds
937 //------------------------------------------------------------------------------
938 U_CAPI
void U_EXPORT2
939 uregex_useTransparentBounds(URegularExpression
*regexp2
,
941 UErrorCode
*status
) {
942 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
943 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
946 regexp
->fMatcher
->useTransparentBounds(b
);
950 //------------------------------------------------------------------------------
952 // uregex_hasAnchoringBounds
954 //------------------------------------------------------------------------------
955 U_CAPI UBool U_EXPORT2
956 uregex_hasAnchoringBounds(const URegularExpression
*regexp2
,
957 UErrorCode
*status
) {
958 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
959 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
962 return regexp
->fMatcher
->hasAnchoringBounds();
966 //------------------------------------------------------------------------------
968 // uregex_useAnchoringBounds
970 //------------------------------------------------------------------------------
971 U_CAPI
void U_EXPORT2
972 uregex_useAnchoringBounds(URegularExpression
*regexp2
,
974 UErrorCode
*status
) {
975 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
976 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
979 regexp
->fMatcher
->useAnchoringBounds(b
);
983 //------------------------------------------------------------------------------
987 //------------------------------------------------------------------------------
988 U_CAPI UBool U_EXPORT2
989 uregex_hitEnd(const URegularExpression
*regexp2
,
990 UErrorCode
*status
) {
991 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
992 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
995 return regexp
->fMatcher
->hitEnd();
999 //------------------------------------------------------------------------------
1001 // uregex_requireEnd
1003 //------------------------------------------------------------------------------
1004 U_CAPI UBool U_EXPORT2
1005 uregex_requireEnd(const URegularExpression
*regexp2
,
1006 UErrorCode
*status
) {
1007 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1008 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1011 return regexp
->fMatcher
->requireEnd();
1015 //------------------------------------------------------------------------------
1017 // uregex_setTimeLimit
1019 //------------------------------------------------------------------------------
1020 U_CAPI
void U_EXPORT2
1021 uregex_setTimeLimit(URegularExpression
*regexp2
,
1023 UErrorCode
*status
) {
1024 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1025 if (validateRE(regexp
, FALSE
, status
)) {
1026 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
1032 //------------------------------------------------------------------------------
1034 // uregex_getTimeLimit
1036 //------------------------------------------------------------------------------
1037 U_CAPI
int32_t U_EXPORT2
1038 uregex_getTimeLimit(const URegularExpression
*regexp2
,
1039 UErrorCode
*status
) {
1041 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1042 if (validateRE(regexp
, FALSE
, status
)) {
1043 retVal
= regexp
->fMatcher
->getTimeLimit();
1050 //------------------------------------------------------------------------------
1052 // uregex_setStackLimit
1054 //------------------------------------------------------------------------------
1055 U_CAPI
void U_EXPORT2
1056 uregex_setStackLimit(URegularExpression
*regexp2
,
1058 UErrorCode
*status
) {
1059 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1060 if (validateRE(regexp
, FALSE
, status
)) {
1061 regexp
->fMatcher
->setStackLimit(limit
, *status
);
1067 //------------------------------------------------------------------------------
1069 // uregex_getStackLimit
1071 //------------------------------------------------------------------------------
1072 U_CAPI
int32_t U_EXPORT2
1073 uregex_getStackLimit(const URegularExpression
*regexp2
,
1074 UErrorCode
*status
) {
1076 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1077 if (validateRE(regexp
, FALSE
, status
)) {
1078 retVal
= regexp
->fMatcher
->getStackLimit();
1084 //------------------------------------------------------------------------------
1086 // uregex_setMatchCallback
1088 //------------------------------------------------------------------------------
1089 U_CAPI
void U_EXPORT2
1090 uregex_setMatchCallback(URegularExpression
*regexp2
,
1091 URegexMatchCallback
*callback
,
1092 const void *context
,
1093 UErrorCode
*status
) {
1094 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1095 if (validateRE(regexp
, FALSE
, status
)) {
1096 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
1101 //------------------------------------------------------------------------------
1103 // uregex_getMatchCallback
1105 //------------------------------------------------------------------------------
1106 U_CAPI
void U_EXPORT2
1107 uregex_getMatchCallback(const URegularExpression
*regexp2
,
1108 URegexMatchCallback
**callback
,
1109 const void **context
,
1110 UErrorCode
*status
) {
1111 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1112 if (validateRE(regexp
, FALSE
, status
)) {
1113 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
1118 //------------------------------------------------------------------------------
1120 // uregex_setFindProgressCallback
1122 //------------------------------------------------------------------------------
1123 U_CAPI
void U_EXPORT2
1124 uregex_setFindProgressCallback(URegularExpression
*regexp2
,
1125 URegexFindProgressCallback
*callback
,
1126 const void *context
,
1127 UErrorCode
*status
) {
1128 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1129 if (validateRE(regexp
, FALSE
, status
)) {
1130 regexp
->fMatcher
->setFindProgressCallback(callback
, context
, *status
);
1135 //------------------------------------------------------------------------------
1137 // uregex_getFindProgressCallback
1139 //------------------------------------------------------------------------------
// uregex_getFindProgressCallback: C API entry point that reads back the
// find-progress callback and its context from the underlying matcher.
// callback and context are output parameters; they are dereferenced without a
// NULL check when passed on to fMatcher->getFindProgressCallback().
// The handle is checked with validateRE first (text not required).
1140 U_CAPI
void U_EXPORT2
1141 uregex_getFindProgressCallback(const URegularExpression
*regexp2
,
1142 URegexFindProgressCallback
**callback
,
1143 const void **context
,
1144 UErrorCode
*status
) {
// The cast drops the const qualifier of the incoming handle.
1145 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1146 if (validateRE(regexp
, FALSE
, status
)) {
1147 regexp
->fMatcher
->getFindProgressCallback(*callback
, *context
, *status
);
1152 //------------------------------------------------------------------------------
1154 // uregex_replaceAll
1156 //------------------------------------------------------------------------------
// uregex_replaceAll: replaces every match in the input text with the
// replacement text, writing the result into a caller-supplied UChar buffer.
//
//   regexp2            compiled expression with input text set (validateRE is
//                      called with requiresText == TRUE).
//   replacementText    replacement pattern; must not be NULL.
//   replacementLength  length of replacementText; values below -1 are rejected.
//   destBuf/destCapacity  output buffer; a NULL buffer is only accepted
//                      together with a zero capacity.
//   status             ICU error code, in/out.
//
// The accumulated length of all appended pieces is kept in len.
// NOTE(review): this listing omits some original lines (the destBuf parameter,
// the declaration of len and the return statements); confirm the exact return
// value against the full source.
1157 U_CAPI
int32_t U_EXPORT2
1158 uregex_replaceAll(URegularExpression
*regexp2
,
1159 const UChar
*replacementText
,
1160 int32_t replacementLength
,
1162 int32_t destCapacity
,
1163 UErrorCode
*status
) {
1164 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1165 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
// Argument sanity checks; any violation is reported as
// U_ILLEGAL_ARGUMENT_ERROR.
1168 if (replacementText
== NULL
|| replacementLength
< -1 ||
1169 (destBuf
== NULL
&& destCapacity
> 0) ||
1171 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// Restart matching from index 0 of the input.
1177 uregex_reset(regexp2
, 0, status
);
1179 // Note: Separate error code variables for findNext() and appendReplacement()
1180 // are used so that destination buffer overflow errors
1181 // in appendReplacement won't stop findNext() from working.
1182 // appendReplacement() and appendTail() special case incoming buffer
1183 // overflow errors, continuing to return the correct length.
1184 UErrorCode findStatus
= *status
;
// Each append call advances destBuf and shrinks destCapacity in place, which
// is why their addresses are passed.
1185 while (uregex_findNext(regexp2
, &findStatus
)) {
1186 len
+= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1187 &destBuf
, &destCapacity
, status
);
// Copy whatever input follows the last match.
1189 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1191 if (U_FAILURE(findStatus
)) {
1192 // If anything went wrong with the findNext(), make that error trump
1193 // whatever may have happened with the append() operations.
1194 // Errors in findNext() are not expected.
1195 *status
= findStatus
;
1202 //------------------------------------------------------------------------------
1204 // uregex_replaceAllUText
1206 //------------------------------------------------------------------------------
// uregex_replaceAllUText: UText flavour of replace-all. After validating the
// handle (input text required) and rejecting a NULL replacementText with
// U_ILLEGAL_ARGUMENT_ERROR, the work is delegated to
// RegexMatcher::replaceAll(), whose result is stored back into dest.
// NOTE(review): the dest parameter line and the return statements are not
// present in this listing; presumably dest is returned - confirm against the
// full source.
1207 U_CAPI UText
* U_EXPORT2
1208 uregex_replaceAllUText(URegularExpression
*regexp2
,
1209 UText
*replacementText
,
1211 UErrorCode
*status
) {
1212 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1213 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1216 if (replacementText
== NULL
) {
1217 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1221 dest
= regexp
->fMatcher
->replaceAll(replacementText
, dest
, *status
);
1226 //------------------------------------------------------------------------------
1228 // uregex_replaceFirst
1230 //------------------------------------------------------------------------------
// uregex_replaceFirst: replaces only the first match in the input text,
// writing the result into a caller-supplied UChar buffer.
//
// Parameter checks mirror uregex_replaceAll: the handle must validate with
// input text set, replacementText must not be NULL, replacementLength must be
// >= -1, and a NULL destBuf is only accepted together with a zero capacity.
//
// NOTE(review): this listing omits some original lines (the destBuf parameter,
// the declaration of len and the return statements); confirm against the full
// source.
1231 U_CAPI
int32_t U_EXPORT2
1232 uregex_replaceFirst(URegularExpression
*regexp2
,
1233 const UChar
*replacementText
,
1234 int32_t replacementLength
,
1236 int32_t destCapacity
,
1237 UErrorCode
*status
) {
1238 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1239 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1242 if (replacementText
== NULL
|| replacementLength
< -1 ||
1243 (destBuf
== NULL
&& destCapacity
> 0) ||
1245 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1250 UBool findSucceeded
;
// Restart at index 0 and look for the first match from there.
1251 uregex_reset(regexp2
, 0, status
);
1252 findSucceeded
= uregex_find(regexp2
, 0, status
);
// The replacement is appended only when a match was found; the tail is
// appended in either case, so with no match it is the only piece added.
1253 if (findSucceeded
) {
1254 len
= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1255 &destBuf
, &destCapacity
, status
);
1257 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1263 //------------------------------------------------------------------------------
1265 // uregex_replaceFirstUText
1267 //------------------------------------------------------------------------------
// uregex_replaceFirstUText: UText flavour of replace-first. After validating
// the handle (input text required) and rejecting a NULL replacementText with
// U_ILLEGAL_ARGUMENT_ERROR, the work is delegated to
// RegexMatcher::replaceFirst(), whose result is stored back into dest.
// NOTE(review): the dest parameter line and the return statements are not
// present in this listing; presumably dest is returned - confirm against the
// full source.
1268 U_CAPI UText
* U_EXPORT2
1269 uregex_replaceFirstUText(URegularExpression
*regexp2
,
1270 UText
*replacementText
,
1272 UErrorCode
*status
) {
1273 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1274 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1277 if (replacementText
== NULL
) {
1278 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1282 dest
= regexp
->fMatcher
->replaceFirst(replacementText
, dest
, *status
);
1287 //------------------------------------------------------------------------------
1289 // uregex_appendReplacement
1291 //------------------------------------------------------------------------------
1295 // Dummy class, because these functions need to be friends of class RegexMatcher,
1296 // and stand-alone C functions don't work as friends
// The three static helpers declared here (appendReplacement, appendTail and
// split) are defined further down, qualified as RegexCImpl::, and are invoked
// by the uregex_appendReplacement / uregex_appendTail / uregex_split API
// functions.
// NOTE(review): the class header and the destBuf parameter lines are not
// present in this listing.
1300 inline static int32_t appendReplacement(RegularExpression
*regexp
,
1301 const UChar
*replacementText
,
1302 int32_t replacementLength
,
1304 int32_t *destCapacity
,
1305 UErrorCode
*status
);
1307 inline static int32_t appendTail(RegularExpression
*regexp
,
1309 int32_t *destCapacity
,
1310 UErrorCode
*status
);
1312 inline static int32_t split(RegularExpression
*regexp
,
1314 int32_t destCapacity
,
1315 int32_t *requiredCapacity
,
1316 UChar
*destFields
[],
1317 int32_t destFieldsCapacity
,
1318 UErrorCode
*status
);
// Code units with special meaning in replacement text:
// 0x5c is '\' (escape introducer) and 0x24 is '$' (capture group reference).
1325 static const UChar BACKSLASH
= 0x5c;
1326 static const UChar DOLLARSIGN
= 0x24;
1329 // Move a character to an output buffer, with bounds checking on the index.
1330 // Index advances even if capacity is exceeded, for preflight size computations.
1331 // This little sequence is used a LOT.
//
//   c            code unit to append.
//   idx          in/out position in buf.
//   buf          destination buffer.
//   bufCapacity  number of UChars available in buf; the store is guarded by
//                the (*idx < bufCapacity) test below.
// NOTE(review): the store and the index increment are not present in this
// listing.
1333 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
1334 if (*idx
< bufCapacity
) {
1342 // appendReplacement, the actual implementation.
//
// Appends to the caller's buffer (a) the input text lying between the end of
// the previous match and the start of the current match, and (b) the
// replacement text with "$n" capture group references and backslash escapes
// expanded.
//
//   regexp             expression with input text set and a current match.
//   replacementText    replacement pattern; must not be NULL.
//   replacementLength  length of replacementText, or -1 to use u_strlen().
//   destBuf            in/out pointer to the write position in the output.
//   destCapacity       in/out remaining capacity of the output buffer.
//   status             ICU error code, in/out.
//
// destIdx keeps counting past the end of the buffer so that the required size
// can still be computed when the output does not fit.
// NOTE(review): several original lines (the destBuf parameter, some local
// declarations, loop headers and the return statements) are not present in
// this listing; presumably destIdx is the return value - confirm against the
// full source.
1344 int32_t RegexCImpl::appendReplacement(RegularExpression
*regexp
,
1345 const UChar
*replacementText
,
1346 int32_t replacementLength
,
1348 int32_t *destCapacity
,
1349 UErrorCode
*status
) {
1351 // If we come in with a buffer overflow error, don't suppress the operation.
1352 // A series of appendReplacements, appendTail need to correctly preflight
1353 // the buffer size when an overflow happens somewhere in the middle.
1354 UBool pendingBufferOverflow
= FALSE
;
1355 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1356 pendingBufferOverflow
= TRUE
;
1357 *status
= U_ZERO_ERROR
;
1361 // Validate all parameters
1363 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1366 if (replacementText
== NULL
|| replacementLength
< -1 ||
1367 destCapacity
== NULL
|| destBuf
== NULL
||
1368 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1369 *destCapacity
< 0) {
1370 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1374 RegexMatcher
*m
= regexp
->fMatcher
;
// Without a current match (fMatch is FALSE) the call is rejected with
// U_REGEX_INVALID_STATE.
1375 if (m
->fMatch
== FALSE
) {
1376 *status
= U_REGEX_INVALID_STATE
;
// Work on local copies; the caller's pointer and capacity are updated at the
// end of the function.
1380 UChar
*dest
= *destBuf
;
1381 int32_t capacity
= *destCapacity
;
1382 int32_t destIdx
= 0;
1385 // If it wasn't supplied by the caller, get the length of the replacement text.
1386 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1387 // the fly and avoid this step.
1388 if (replacementLength
== -1) {
1389 replacementLength
= u_strlen(replacementText
);
1392 // Copy input string from the end of previous match to start of current match
// When the original UChar text is available (fText != NULL) it is copied from
// directly; the match positions are first turned into UChar offsets.
1393 if (regexp
->fText
!= NULL
) {
1395 int32_t lastMatchEnd
;
// If UTEXT_USES_U16 holds for the input, the matcher's positions are used as
// UChar offsets as-is; otherwise preflight calls to utext_extract (NULL
// buffer, zero capacity) supply the equivalent UChar lengths.
1396 if (UTEXT_USES_U16(m
->fInputText
)) {
1397 lastMatchEnd
= (int32_t)m
->fLastMatchEnd
;
1398 matchStart
= (int32_t)m
->fMatchStart
;
1400 // !!!: Would like a better way to do this!
// This local status shadows the status parameter; the overflow reported by
// the preflight calls is deliberately discarded.
1401 UErrorCode status
= U_ZERO_ERROR
;
1402 lastMatchEnd
= utext_extract(m
->fInputText
, 0, m
->fLastMatchEnd
, NULL
, 0, &status
);
1403 status
= U_ZERO_ERROR
;
1404 matchStart
= lastMatchEnd
+ utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
, NULL
, 0, &status
);
1406 for (i
=lastMatchEnd
; i
<matchStart
; i
++) {
1407 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
// No UChar text pointer: extract the gap straight from the UText into the
// destination, ignoring any overflow reported by the extraction.
1410 UErrorCode possibleOverflowError
= U_ZERO_ERROR
; // ignore
1411 destIdx
+= utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
,
1412 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
),
1413 &possibleOverflowError
);
1415 U_ASSERT(destIdx
>= 0);
1417 // scan the replacement text, looking for substitutions ($n) and \escapes.
1418 int32_t replIdx
= 0;
1419 while (replIdx
< replacementLength
) {
1420 UChar c
= replacementText
[replIdx
];
1422 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
1423 // Common case, no substitution, no escaping,
1424 // just copy the char to the dest buf.
1425 appendToBuf(c
, &destIdx
, dest
, capacity
);
1429 if (c
== BACKSLASH
) {
1430 // Backslash Escape. Copy the following char out without further checks.
1431 // Note: Surrogate pairs don't need any special handling
1432 // The second half won't be a '$' or a '\', and
1433 // will move to the dest normally on the next
// A backslash at the very end of the replacement text has nothing to escape.
1435 if (replIdx
>= replacementLength
) {
1438 c
= replacementText
[replIdx
];
1440 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
1441 // We have a \udddd or \Udddddddd escape sequence.
1442 UChar32 escapedChar
=
1443 u_unescapeAt(uregex_ucstr_unescape_charAt
,
1444 &replIdx
, // Index is updated by unescapeAt
1445 replacementLength
, // Length of replacement text
1446 (void *)replacementText
);
// 0xFFFFFFFF is the failure value tested for here; supplementary code points
// (above 0xffff) are written as a lead/trail surrogate pair.
1448 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
1449 if (escapedChar
<= 0xffff) {
1450 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
1452 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
1453 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
1457 // Note: if the \u escape was invalid, just fall through and
1458 // treat it as a plain \<anything> escape.
1461 // Plain backslash escape. Just put out the escaped character.
1462 appendToBuf(c
, &destIdx
, dest
, capacity
);
1470 // We've got a $. Pick up a capture group number if one follows.
1471 // Consume at most the number of digits necessary for the largest capture
1472 // number that is valid for this pattern.
1474 int32_t numDigits
= 0;
1475 int32_t groupNum
= 0;
1478 if (replIdx
>= replacementLength
) {
// Read the next code point without advancing; the index only moves forward
// (U16_FWD_1) once the code point is known to be a digit.
1481 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
1482 if (u_isdigit(digitC
) == FALSE
) {
1486 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1487 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
1489 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
1495 if (numDigits
== 0) {
1496 // The $ didn't introduce a group number at all.
1497 // Treat it as just part of the substitution text.
1498 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
1502 // Finally, append the capture group data to the destination.
1503 destIdx
+= uregex_group((URegularExpression
*)regexp
, groupNum
,
1504 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), status
);
1505 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1506 // Ignore buffer overflow when extracting the group. We need to
1507 // continue on to get full size of the untruncated result. We will
1508 // raise our own buffer overflow error at the end.
1509 *status
= U_ZERO_ERROR
;
1512 if (U_FAILURE(*status
)) {
1513 // Can fail if group number is out of range.
1520 // Nul Terminate the dest buffer if possible.
1521 // Set the appropriate buffer overflow or not terminated error, if needed.
// Three outcomes: room left for the terminator, output exactly fills the
// buffer (warning only), or output is larger than the buffer (error).
1523 if (destIdx
< capacity
) {
1525 } else if (destIdx
== *destCapacity
) {
1526 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1528 *status
= U_BUFFER_OVERFLOW_ERROR
;
1532 // Return an updated dest buffer and capacity to the caller.
// When everything fit, the pointer advances by the amount written; otherwise
// it advances by the full capacity.
1534 if (destIdx
> 0 && *destCapacity
> 0) {
1535 if (destIdx
< capacity
) {
1536 *destBuf
+= destIdx
;
1537 *destCapacity
-= destIdx
;
1539 *destBuf
+= capacity
;
1544 // If we came in with a buffer overflow, make sure we go out with one also.
1545 // (A zero length match right at the end of the previous match could
1546 // make this function succeed even though a previous call had overflowed the buf)
1547 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1548 *status
= U_BUFFER_OVERFLOW_ERROR
;
1555 // appendReplacement the actual API function,
// Thin wrapper: casts the opaque handle to RegularExpression and forwards
// every argument unchanged to RegexCImpl::appendReplacement, which performs
// all validation.
// NOTE(review): the destBuf parameter line is not present in this listing.
1557 U_CAPI
int32_t U_EXPORT2
1558 uregex_appendReplacement(URegularExpression
*regexp2
,
1559 const UChar
*replacementText
,
1560 int32_t replacementLength
,
1562 int32_t *destCapacity
,
1563 UErrorCode
*status
) {
1565 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1566 return RegexCImpl::appendReplacement(
1567 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1571 // uregex_appendReplacementUText...can just use the normal C++ method
// Forwards dest and replText to RegexMatcher::appendReplacement().
// NOTE(review): unlike the other entry points in this file, no validateRE()
// call is visible here, so a NULL or invalid regexp2 would be dereferenced -
// confirm whether that is intended.
// NOTE(review): the replText and dest parameter lines are not present in this
// listing.
1573 U_CAPI
void U_EXPORT2
1574 uregex_appendReplacementUText(URegularExpression
*regexp2
,
1577 UErrorCode
*status
) {
1578 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1579 regexp
->fMatcher
->appendReplacement(dest
, replText
, *status
);
1583 //------------------------------------------------------------------------------
1585 // uregex_appendTail
1587 //------------------------------------------------------------------------------
// RegexCImpl::appendTail: appends the input text that follows the most recent
// match to the caller's buffer, NUL-terminating it when there is room.
//
//   regexp        expression with input text set.
//   destBuf       in/out pointer to the write position in the output.
//   destCapacity  in/out remaining capacity of the output buffer.
//   status        ICU error code, in/out.
//
// NOTE(review): several original lines (the destBuf and status parameters,
// the srcIdx declaration, loop headers and the return statements) are not
// present in this listing; presumably destIdx is the return value - confirm
// against the full source.
1588 int32_t RegexCImpl::appendTail(RegularExpression
*regexp
,
1590 int32_t *destCapacity
,
1594 // If we come in with a buffer overflow error, don't suppress the operation.
1595 // A series of appendReplacements, appendTail need to correctly preflight
1596 // the buffer size when an overflow happens somewhere in the middle.
1597 UBool pendingBufferOverflow
= FALSE
;
1598 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1599 pendingBufferOverflow
= TRUE
;
1600 *status
= U_ZERO_ERROR
;
1603 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1607 if (destCapacity
== NULL
|| destBuf
== NULL
||
1608 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1611 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1615 RegexMatcher
*m
= regexp
->fMatcher
;
1617 int32_t destIdx
= 0;
1618 int32_t destCap
= *destCapacity
;
1619 UChar
*dest
= *destBuf
;
// Path 1: the original UChar text is available and is copied from directly.
1621 if (regexp
->fText
!= NULL
) {
// Start after the current match if there is one, otherwise after the last
// successful match. -1 is handled as a separate case.
1623 int64_t nativeIdx
= (m
->fMatch
? m
->fMatchEnd
: m
->fLastMatchEnd
);
1624 if (nativeIdx
== -1) {
1626 } else if (UTEXT_USES_U16(m
->fInputText
)) {
1627 srcIdx
= (int32_t)nativeIdx
;
// Otherwise a preflight utext_extract (NULL buffer, zero capacity) converts
// the native position into a UChar offset. The local status shadows the
// status parameter and its result is discarded.
1629 UErrorCode status
= U_ZERO_ERROR
;
1630 srcIdx
= utext_extract(m
->fInputText
, 0, nativeIdx
, NULL
, 0, &status
);
1634 U_ASSERT(destIdx
>= 0);
1636 if (srcIdx
== regexp
->fTextLength
) {
1639 UChar c
= regexp
->fText
[srcIdx
];
// For NUL-terminated input (fTextLength == -1) the real length is recorded
// once the terminating NUL is reached.
1640 if (c
== 0 && regexp
->fTextLength
== -1) {
1641 regexp
->fTextLength
= srcIdx
;
1645 if (destIdx
< destCap
) {
1648 // We've overflowed the dest buffer.
1649 // If the total input string length is known, we can
1650 // compute the total buffer size needed without scanning through the string.
1651 if (regexp
->fTextLength
> 0) {
1652 destIdx
+= (regexp
->fTextLength
- srcIdx
);
// Path 2: no UChar text pointer; the tail is extracted from the UText.
1662 // The most recent call to find() succeeded.
1663 srcIdx
= m
->fMatchEnd
;
1665 // The last call to find() on this matcher failed().
1666 // Look back to the end of the last find() that succeeded for src index.
1667 srcIdx
= m
->fLastMatchEnd
;
1669 // There has been no successful match with this matcher.
1670 // We want to copy the whole string.
1675 destIdx
= utext_extract(m
->fInputText
, srcIdx
, m
->fInputLength
, dest
, destCap
, status
);
1679 // NUL terminate the output string, if possible, otherwise issue the
1680 // appropriate error or warning.
1682 if (destIdx
< destCap
) {
1684 } else if (destIdx
== destCap
) {
1685 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1687 *status
= U_BUFFER_OVERFLOW_ERROR
;
1691 // Update the user's buffer ptr and capacity vars to reflect the
// When everything fit, the pointer advances by the amount written; on
// overflow it advances by the full capacity, and only if it is non-NULL.
1694 if (destIdx
< destCap
) {
1695 *destBuf
+= destIdx
;
1696 *destCapacity
-= destIdx
;
1697 } else if (*destBuf
!= NULL
) {
1698 *destBuf
+= destCap
;
// An overflow that was pending on entry is restored so that it is not lost
// when this call itself succeeded.
1702 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1703 *status
= U_BUFFER_OVERFLOW_ERROR
;
1711 // appendTail the actual API function
// Thin wrapper: casts the opaque handle to RegularExpression and forwards
// every argument unchanged to RegexCImpl::appendTail, which performs all
// validation.
// NOTE(review): the destBuf parameter line is not present in this listing.
1713 U_CAPI
int32_t U_EXPORT2
1714 uregex_appendTail(URegularExpression
*regexp2
,
1716 int32_t *destCapacity
,
1717 UErrorCode
*status
) {
1718 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1719 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1724 // uregex_appendTailUText...can just use the normal C++ method
// Forwards dest to RegexMatcher::appendTail() and returns its result.
// NOTE(review): no validateRE() call is visible here, so a NULL or invalid
// regexp2 would be dereferenced - confirm whether that is intended.
// NOTE(review): the dest parameter line is not present in this listing.
1726 U_CAPI UText
* U_EXPORT2
1727 uregex_appendTailUText(URegularExpression
*regexp2
,
1729 UErrorCode
*status
) {
1730 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1731 return regexp
->fMatcher
->appendTail(dest
, *status
);
1735 //------------------------------------------------------------------------------
1737 // copyString Internal utility to copy a string to an output buffer,
1738 // while managing buffer overflow and preflight size
1739 // computation. NUL termination is added to destination,
1740 // and the NUL is counted in the output size.
1742 //------------------------------------------------------------------------------
// copyString: copies srcLen UChars from srcPtr into destBuffer starting at
// *destIndex. Stores are guarded by (di < destCapacity).
// NOTE(review): the loop body, the terminating-NUL store and the write-back
// to *destIndex are not present in this listing; confirm the details against
// the full source.
1744 static void copyString(UChar
*destBuffer
, // Destination buffer.
1745 int32_t destCapacity
, // Total capacity of dest buffer
1746 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1747 // Update not clipped to destCapacity.
1748 const UChar
*srcPtr
, // Pointer to source string
1749 int32_t srcLen
) // Source string len.
// Work on a local copy of the destination index.
1752 int32_t di
= *destIndex
;
1755 for (si
=0; si
<srcLen
; si
++) {
1757 if (di
< destCapacity
) {
// A second capacity check follows the copy loop (presumably guarding the
// terminating NUL described in the header comment).
1765 if (di
<destCapacity
) {
1773 //------------------------------------------------------------------------------
1777 //------------------------------------------------------------------------------
// RegexCImpl::split: splits the input text around matches of the pattern.
// Each field is written into destBuf followed by a NUL, and destFields[i]
// receives a pointer to the start of field i inside destBuf.
//
//   regexp              expression with input text set.
//   destBuf             buffer receiving all field strings back to back.
//   destCapacity        capacity of destBuf in UChars.
//   requiredCapacity    optional out: total UChars needed (set from destIdx).
//   destFields          out array of field start pointers; unused trailing
//                       entries are set to NULL.
//   destFieldsCapacity  number of entries in destFields.
//   status              ICU error code; U_BUFFER_OVERFLOW_ERROR is set when
//                       destIdx exceeds destCapacity.
//
// NOTE(review): several original lines (the destBuf parameter, loop headers,
// increments of i, break and return statements) are not present in this
// listing; confirm the return value against the full source.
1778 int32_t RegexCImpl::split(RegularExpression
*regexp
,
1780 int32_t destCapacity
,
1781 int32_t *requiredCapacity
,
1782 UChar
*destFields
[],
1783 int32_t destFieldsCapacity
,
1784 UErrorCode
*status
) {
1786 // Reset for the input text
1788 regexp
->fMatcher
->reset();
1789 UText
*inputText
= regexp
->fMatcher
->fInputText
;
1790 int64_t nextOutputStringStart
= 0;
1791 int64_t inputLen
= regexp
->fMatcher
->fInputLength
;
// Empty input is handled as a special case.
1792 if (inputLen
== 0) {
1797 // Loop through the input text, searching for the delimiter pattern
1799 int32_t i
; // Index of the field being processed.
1800 int32_t destIdx
= 0; // Next available position in destBuf;
1801 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1802 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow errors so that the strings are still counted
1804 if (i
>=destFieldsCapacity
-1) {
1805 // There are one or zero output strings left.
1806 // Fill the last output string with whatever is left from the input, then exit the loop.
1807 // ( i will be == destFieldsCapacity if we filled the output array while processing
1808 // capture groups of the delimiter expression, in which case we will discard the
1809 // last capture group saved in favor of the unprocessed remainder of the
1811 if (inputLen
> nextOutputStringStart
) {
1812 if (i
!= destFieldsCapacity
-1) {
1813 // No fields are left. Recycle the last one for holding the trailing part of
1814 // the input string.
1815 i
= destFieldsCapacity
-1;
// Rewind destIdx to where the recycled field begins inside destBuf.
1816 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1819 destFields
[i
] = &destBuf
[destIdx
];
// Note: this extraction reports through the caller's status, not tStatus.
1820 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1821 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1826 if (regexp
->fMatcher
->find()) {
1827 // We found another delimiter. Move everything from where we started looking
1828 // up until the start of the delimiter into the next output string.
1829 destFields
[i
] = &destBuf
[destIdx
];
// The "1 +" accounts for the NUL that terminates each field. Overflow from
// the extraction is cleared so that sizes keep being counted.
1831 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, regexp
->fMatcher
->fMatchStart
,
1832 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1833 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1834 tStatus
= U_ZERO_ERROR
;
// The next field starts right after the delimiter just found.
1838 nextOutputStringStart
= regexp
->fMatcher
->fMatchEnd
;
1840 // If the delimiter pattern has capturing parentheses, the captured
1841 // text goes out into the next n destination strings.
1843 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1844 // If we've run out of output string slots, bail out.
1845 if (i
==destFieldsCapacity
-1) {
1850 // Set up to extract the capture group contents into the dest buffer.
1851 destFields
[i
] = &destBuf
[destIdx
];
1852 tStatus
= U_ZERO_ERROR
;
1853 int32_t t
= uregex_group((URegularExpression
*)regexp
,
1856 REMAINING_CAPACITY(destIdx
, destCapacity
),
1858 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1859 // +1 for the NUL that terminates the string.
1860 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1861 tStatus
= U_ZERO_ERROR
;
1867 if (nextOutputStringStart
== inputLen
) {
1868 // The delimiter was at the end of the string.
1869 // Output an empty string, and then we are done.
1870 if (destIdx
< destCapacity
) {
1871 destBuf
[destIdx
] = 0;
1873 if (i
< destFieldsCapacity
-1) {
1876 if (destIdx
< destCapacity
) {
1877 destFields
[i
] = destBuf
+ destIdx
;
1886 // We ran off the end of the input while looking for the next delimiter.
1887 // All the remaining text goes into the current output string.
1888 destFields
[i
] = &destBuf
[destIdx
];
// Note: this extraction reports through the caller's status, not tStatus.
1889 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1890 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1895 // Zero out any unused portion of the destFields array
1897 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1898 destFields
[j
] = NULL
;
// Report the total space needed, including the terminating NULs counted above.
1901 if (requiredCapacity
!= NULL
) {
1902 *requiredCapacity
= destIdx
;
1904 if (destIdx
> destCapacity
) {
1905 *status
= U_BUFFER_OVERFLOW_ERROR
;
1911 // uregex_split The actual API function
// Validates the handle (input text required) and the buffer arguments, then
// delegates to RegexCImpl::split. A NULL destBuf with a positive capacity, a
// NULL destFields array, or a destFieldsCapacity below 1 is rejected with
// U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the destBuf parameter line, one line of the argument check
// and the early-return statements are not present in this listing.
1913 U_CAPI
int32_t U_EXPORT2
1914 uregex_split(URegularExpression
*regexp2
,
1916 int32_t destCapacity
,
1917 int32_t *requiredCapacity
,
1918 UChar
*destFields
[],
1919 int32_t destFieldsCapacity
,
1920 UErrorCode
*status
) {
1921 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1922 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1925 if ((destBuf
== NULL
&& destCapacity
> 0) ||
1927 destFields
== NULL
||
1928 destFieldsCapacity
< 1 ) {
1929 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1933 return RegexCImpl::split(regexp
, destBuf
, destCapacity
, requiredCapacity
, destFields
, destFieldsCapacity
, status
);
1938 // uregex_splitUText...can just use the normal C++ method
// Splits the matcher's own input text (fMatcher->inputText()) by delegating
// to RegexMatcher::split(), and returns that call's result.
// NOTE(review): no validateRE() call is visible here, so a NULL or invalid
// regexp2 would be dereferenced - confirm whether that is intended.
1940 U_CAPI
int32_t U_EXPORT2
1941 uregex_splitUText(URegularExpression
*regexp2
,
1942 UText
*destFields
[],
1943 int32_t destFieldsCapacity
,
1944 UErrorCode
*status
) {
1945 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1946 return regexp
->fMatcher
->split(regexp
->fMatcher
->inputText(), destFields
, destFieldsCapacity
, *status
);
1950 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS