2 *******************************************************************************
3 * Copyright (C) 2004-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "unicode/utf16.h"
30 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
32 struct RegularExpression
: public UMemory
{
38 int32_t *fPatRefCount
;
40 int32_t fPatStringLen
;
41 RegexMatcher
*fMatcher
;
42 const UChar
*fText
; // Text from setText()
43 int32_t fTextLength
; // Length provided by user with setText(), which
48 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
50 RegularExpression::RegularExpression() {
62 RegularExpression::~RegularExpression() {
65 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
67 uprv_free(fPatString
);
68 uprv_free(fPatRefCount
);
70 if (fOwnsText
&& fText
!=NULL
) {
71 uprv_free((void *)fText
);
80 //----------------------------------------------------------------------------------------
82 // validateRE Do boilerplate style checks on API function parameters.
83 // Return TRUE if they look OK.
84 //----------------------------------------------------------------------------------------
85 static UBool
validateRE(const RegularExpression
*re
, UBool requiresText
, UErrorCode
*status
) {
86 if (U_FAILURE(*status
)) {
89 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
90 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94 if (requiresText
&& re
->fText
== NULL
&& !re
->fOwnsText
) {
95 *status
= U_REGEX_INVALID_STATE
;
101 //----------------------------------------------------------------------------------------
105 //----------------------------------------------------------------------------------------
106 U_CAPI URegularExpression
* U_EXPORT2
107 uregex_open( const UChar
*pattern
,
108 int32_t patternLength
,
111 UErrorCode
*status
) {
113 if (U_FAILURE(*status
)) {
116 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
117 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
120 int32_t actualPatLen
= patternLength
;
121 if (actualPatLen
== -1) {
122 actualPatLen
= u_strlen(pattern
);
125 RegularExpression
*re
= new RegularExpression
;
126 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
127 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
128 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
129 *status
= U_MEMORY_ALLOCATION_ERROR
;
135 re
->fPatRefCount
= refC
;
136 *re
->fPatRefCount
= 1;
139 // Make a copy of the pattern string, so we can return it later if asked.
140 // For compiling the pattern, we will use a UText wrapper around
141 // this local copy, to avoid making even more copies.
143 re
->fPatString
= patBuf
;
144 re
->fPatStringLen
= patternLength
;
145 u_memcpy(patBuf
, pattern
, actualPatLen
);
146 patBuf
[actualPatLen
] = 0;
148 UText patText
= UTEXT_INITIALIZER
;
149 utext_openUChars(&patText
, patBuf
, patternLength
, status
);
152 // Compile the pattern
155 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
157 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
159 utext_close(&patText
);
161 if (U_FAILURE(*status
)) {
166 // Create the matcher object
168 re
->fMatcher
= re
->fPat
->matcher(*status
);
169 if (U_SUCCESS(*status
)) {
170 return (URegularExpression
*)re
;
179 //----------------------------------------------------------------------------------------
183 //----------------------------------------------------------------------------------------
184 U_CAPI URegularExpression
* U_EXPORT2
185 uregex_openUText(UText
*pattern
,
188 UErrorCode
*status
) {
190 if (U_FAILURE(*status
)) {
193 if (pattern
== NULL
) {
194 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
198 int64_t patternNativeLength
= utext_nativeLength(pattern
);
200 if (patternNativeLength
== 0) {
201 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
205 RegularExpression
*re
= new RegularExpression
;
207 UErrorCode lengthStatus
= U_ZERO_ERROR
;
208 int32_t pattern16Length
= utext_extract(pattern
, 0, patternNativeLength
, NULL
, 0, &lengthStatus
);
210 int32_t *refC
= (int32_t *)uprv_malloc(sizeof(int32_t));
211 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(pattern16Length
+1));
212 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
213 *status
= U_MEMORY_ALLOCATION_ERROR
;
219 re
->fPatRefCount
= refC
;
220 *re
->fPatRefCount
= 1;
223 // Make a copy of the pattern string, so we can return it later if asked.
224 // For compiling the pattern, we will use a read-only UText wrapper
225 // around this local copy, to avoid making even more copies.
227 re
->fPatString
= patBuf
;
228 re
->fPatStringLen
= pattern16Length
;
229 utext_extract(pattern
, 0, patternNativeLength
, patBuf
, pattern16Length
+1, status
);
231 UText patText
= UTEXT_INITIALIZER
;
232 utext_openUChars(&patText
, patBuf
, pattern16Length
, status
);
235 // Compile the pattern
238 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
240 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
242 utext_close(&patText
);
244 if (U_FAILURE(*status
)) {
249 // Create the matcher object
251 re
->fMatcher
= re
->fPat
->matcher(*status
);
252 if (U_SUCCESS(*status
)) {
253 return (URegularExpression
*)re
;
262 //----------------------------------------------------------------------------------------
266 //----------------------------------------------------------------------------------------
267 U_CAPI
void U_EXPORT2
268 uregex_close(URegularExpression
*re2
) {
269 RegularExpression
*re
= (RegularExpression
*)re2
;
270 UErrorCode status
= U_ZERO_ERROR
;
271 if (validateRE(re
, FALSE
, &status
) == FALSE
) {
278 //----------------------------------------------------------------------------------------
282 //----------------------------------------------------------------------------------------
283 U_CAPI URegularExpression
* U_EXPORT2
284 uregex_clone(const URegularExpression
*source2
, UErrorCode
*status
) {
285 RegularExpression
*source
= (RegularExpression
*)source2
;
286 if (validateRE(source
, FALSE
, status
) == FALSE
) {
290 RegularExpression
*clone
= new RegularExpression
;
292 *status
= U_MEMORY_ALLOCATION_ERROR
;
296 clone
->fMatcher
= source
->fPat
->matcher(*status
);
297 if (U_FAILURE(*status
)) {
302 clone
->fPat
= source
->fPat
;
303 clone
->fPatRefCount
= source
->fPatRefCount
;
304 clone
->fPatString
= source
->fPatString
;
305 clone
->fPatStringLen
= source
->fPatStringLen
;
306 umtx_atomic_inc(source
->fPatRefCount
);
307 // Note: fText is not cloned.
309 return (URegularExpression
*)clone
;
315 //------------------------------------------------------------------------------
319 //------------------------------------------------------------------------------
320 U_CAPI
const UChar
* U_EXPORT2
321 uregex_pattern(const URegularExpression
*regexp2
,
323 UErrorCode
*status
) {
324 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
326 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
329 if (patLength
!= NULL
) {
330 *patLength
= regexp
->fPatStringLen
;
332 return regexp
->fPatString
;
336 //------------------------------------------------------------------------------
338 // uregex_patternUText
340 //------------------------------------------------------------------------------
341 U_CAPI UText
* U_EXPORT2
342 uregex_patternUText(const URegularExpression
*regexp2
,
343 UErrorCode
*status
) {
344 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
345 return regexp
->fPat
->patternText(*status
);
349 //------------------------------------------------------------------------------
353 //------------------------------------------------------------------------------
354 U_CAPI
int32_t U_EXPORT2
355 uregex_flags(const URegularExpression
*regexp2
, UErrorCode
*status
) {
356 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
357 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
360 int32_t flags
= regexp
->fPat
->flags();
365 //------------------------------------------------------------------------------
369 //------------------------------------------------------------------------------
370 U_CAPI
void U_EXPORT2
371 uregex_setText(URegularExpression
*regexp2
,
374 UErrorCode
*status
) {
375 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
376 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
379 if (text
== NULL
|| textLength
< -1) {
380 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
384 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
385 uprv_free((void *)regexp
->fText
);
388 regexp
->fText
= text
;
389 regexp
->fTextLength
= textLength
;
390 regexp
->fOwnsText
= FALSE
;
392 UText input
= UTEXT_INITIALIZER
;
393 utext_openUChars(&input
, text
, textLength
, status
);
394 regexp
->fMatcher
->reset(&input
);
395 utext_close(&input
); // reset() made a shallow clone, so we don't need this copy
399 //------------------------------------------------------------------------------
403 //------------------------------------------------------------------------------
404 U_CAPI
void U_EXPORT2
405 uregex_setUText(URegularExpression
*regexp2
,
407 UErrorCode
*status
) {
408 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
409 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
413 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
417 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
418 uprv_free((void *)regexp
->fText
);
421 regexp
->fText
= NULL
; // only fill it in on request
422 regexp
->fTextLength
= -1;
423 regexp
->fOwnsText
= TRUE
;
424 regexp
->fMatcher
->reset(text
);
429 //------------------------------------------------------------------------------
433 //------------------------------------------------------------------------------
434 U_CAPI
const UChar
* U_EXPORT2
435 uregex_getText(URegularExpression
*regexp2
,
437 UErrorCode
*status
) {
438 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
439 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
443 if (regexp
->fText
== NULL
) {
444 // need to fill in the text
445 UText
*inputText
= regexp
->fMatcher
->inputText();
446 int64_t inputNativeLength
= utext_nativeLength(inputText
);
447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText
, inputNativeLength
)) {
448 regexp
->fText
= inputText
->chunkContents
;
449 regexp
->fTextLength
= (int32_t)inputNativeLength
;
450 regexp
->fOwnsText
= FALSE
; // because the UText owns it
452 UErrorCode lengthStatus
= U_ZERO_ERROR
;
453 regexp
->fTextLength
= utext_extract(inputText
, 0, inputNativeLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
454 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(regexp
->fTextLength
+1));
456 utext_extract(inputText
, 0, inputNativeLength
, inputChars
, regexp
->fTextLength
+1, status
);
457 regexp
->fText
= inputChars
;
458 regexp
->fOwnsText
= TRUE
; // should already be set but just in case
462 if (textLength
!= NULL
) {
463 *textLength
= regexp
->fTextLength
;
465 return regexp
->fText
;
469 //------------------------------------------------------------------------------
473 //------------------------------------------------------------------------------
474 U_CAPI UText
* U_EXPORT2
475 uregex_getUText(URegularExpression
*regexp2
,
477 UErrorCode
*status
) {
478 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
479 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
482 return regexp
->fMatcher
->getInput(dest
, *status
);
486 //------------------------------------------------------------------------------
488 // uregex_refreshUText
490 //------------------------------------------------------------------------------
491 U_CAPI
void U_EXPORT2
492 uregex_refreshUText(URegularExpression
*regexp2
,
494 UErrorCode
*status
) {
495 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
496 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
499 regexp
->fMatcher
->refreshInputText(text
, *status
);
503 //------------------------------------------------------------------------------
507 //------------------------------------------------------------------------------
508 U_CAPI UBool U_EXPORT2
509 uregex_matches(URegularExpression
*regexp2
,
511 UErrorCode
*status
) {
512 return uregex_matches64( regexp2
, (int64_t)startIndex
, status
);
515 U_CAPI UBool U_EXPORT2
516 uregex_matches64(URegularExpression
*regexp2
,
518 UErrorCode
*status
) {
519 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
520 UBool result
= FALSE
;
521 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
524 if (startIndex
== -1) {
525 result
= regexp
->fMatcher
->matches(*status
);
527 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
533 //------------------------------------------------------------------------------
537 //------------------------------------------------------------------------------
538 U_CAPI UBool U_EXPORT2
539 uregex_lookingAt(URegularExpression
*regexp2
,
541 UErrorCode
*status
) {
542 return uregex_lookingAt64( regexp2
, (int64_t)startIndex
, status
);
545 U_CAPI UBool U_EXPORT2
546 uregex_lookingAt64(URegularExpression
*regexp2
,
548 UErrorCode
*status
) {
549 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
550 UBool result
= FALSE
;
551 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
554 if (startIndex
== -1) {
555 result
= regexp
->fMatcher
->lookingAt(*status
);
557 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
564 //------------------------------------------------------------------------------
568 //------------------------------------------------------------------------------
569 U_CAPI UBool U_EXPORT2
570 uregex_find(URegularExpression
*regexp2
,
572 UErrorCode
*status
) {
573 return uregex_find64( regexp2
, (int64_t)startIndex
, status
);
576 U_CAPI UBool U_EXPORT2
577 uregex_find64(URegularExpression
*regexp2
,
579 UErrorCode
*status
) {
580 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
581 UBool result
= FALSE
;
582 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
585 if (startIndex
== -1) {
586 regexp
->fMatcher
->resetPreserveRegion();
587 result
= regexp
->fMatcher
->find();
589 result
= regexp
->fMatcher
->find(startIndex
, *status
);
595 //------------------------------------------------------------------------------
599 //------------------------------------------------------------------------------
600 U_CAPI UBool U_EXPORT2
601 uregex_findNext(URegularExpression
*regexp2
,
602 UErrorCode
*status
) {
603 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
604 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
607 UBool result
= regexp
->fMatcher
->find();
611 //------------------------------------------------------------------------------
615 //------------------------------------------------------------------------------
616 U_CAPI
int32_t U_EXPORT2
617 uregex_groupCount(URegularExpression
*regexp2
,
618 UErrorCode
*status
) {
619 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
620 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
623 int32_t result
= regexp
->fMatcher
->groupCount();
628 //------------------------------------------------------------------------------
632 //------------------------------------------------------------------------------
633 U_CAPI
int32_t U_EXPORT2
634 uregex_group(URegularExpression
*regexp2
,
637 int32_t destCapacity
,
638 UErrorCode
*status
) {
639 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
640 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
643 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
644 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
648 if (destCapacity
== 0 || regexp
->fText
!= NULL
) {
649 // If preflighting or if we already have the text as UChars,
650 // this is a little cheaper than going through uregex_groupUTextDeep()
653 // Pick up the range of characters from the matcher
655 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
656 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
657 if (U_FAILURE(*status
)) {
662 // Trim length based on buffer capacity
664 int32_t fullLength
= endIx
- startIx
;
665 int32_t copyLength
= fullLength
;
666 if (copyLength
< destCapacity
) {
667 dest
[copyLength
] = 0;
668 } else if (copyLength
== destCapacity
) {
669 *status
= U_STRING_NOT_TERMINATED_WARNING
;
671 copyLength
= destCapacity
;
672 *status
= U_BUFFER_OVERFLOW_ERROR
;
676 // Copy capture group to user's buffer
678 if (copyLength
> 0) {
679 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
683 UText
*groupText
= uregex_groupUTextDeep(regexp2
, groupNum
, NULL
, status
);
684 int32_t result
= utext_extract(groupText
, 0, utext_nativeLength(groupText
), dest
, destCapacity
, status
);
685 utext_close(groupText
);
691 //------------------------------------------------------------------------------
695 //------------------------------------------------------------------------------
696 U_CAPI UText
* U_EXPORT2
697 uregex_groupUText(URegularExpression
*regexp2
,
700 int64_t *groupLength
,
701 UErrorCode
*status
) {
702 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
703 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
704 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
705 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
708 return regexp
->fMatcher
->group(groupNum
, dest
, *groupLength
, *status
);
711 //------------------------------------------------------------------------------
713 // uregex_groupUTextDeep
715 //------------------------------------------------------------------------------
716 U_CAPI UText
* U_EXPORT2
717 uregex_groupUTextDeep(URegularExpression
*regexp2
,
720 UErrorCode
*status
) {
721 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
722 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
723 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
724 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
727 if (regexp
->fText
!= NULL
) {
729 // Pick up the range of characters from the matcher
730 // and use our already-extracted characters
732 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
733 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
734 if (U_FAILURE(*status
)) {
735 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
736 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
740 utext_replace(dest
, 0, utext_nativeLength(dest
), ®exp
->fText
[startIx
], endIx
- startIx
, status
);
742 UText groupText
= UTEXT_INITIALIZER
;
743 utext_openUChars(&groupText
, ®exp
->fText
[startIx
], endIx
- startIx
, status
);
744 dest
= utext_clone(NULL
, &groupText
, TRUE
, FALSE
, status
);
745 utext_close(&groupText
);
750 return regexp
->fMatcher
->group(groupNum
, dest
, *status
);
754 //------------------------------------------------------------------------------
758 //------------------------------------------------------------------------------
759 U_CAPI
int32_t U_EXPORT2
760 uregex_start(URegularExpression
*regexp2
,
762 UErrorCode
*status
) {
763 return (int32_t)uregex_start64( regexp2
, groupNum
, status
);
766 U_CAPI
int64_t U_EXPORT2
767 uregex_start64(URegularExpression
*regexp2
,
769 UErrorCode
*status
) {
770 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
771 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
774 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
778 //------------------------------------------------------------------------------
782 //------------------------------------------------------------------------------
783 U_CAPI
int32_t U_EXPORT2
784 uregex_end(URegularExpression
*regexp2
,
786 UErrorCode
*status
) {
787 return (int32_t)uregex_end64( regexp2
, groupNum
, status
);
790 U_CAPI
int64_t U_EXPORT2
791 uregex_end64(URegularExpression
*regexp2
,
793 UErrorCode
*status
) {
794 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
795 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
798 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
802 //------------------------------------------------------------------------------
806 //------------------------------------------------------------------------------
807 U_CAPI
void U_EXPORT2
808 uregex_reset(URegularExpression
*regexp2
,
810 UErrorCode
*status
) {
811 uregex_reset64( regexp2
, (int64_t)index
, status
);
814 U_CAPI
void U_EXPORT2
815 uregex_reset64(URegularExpression
*regexp2
,
817 UErrorCode
*status
) {
818 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
819 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
822 regexp
->fMatcher
->reset(index
, *status
);
826 //------------------------------------------------------------------------------
830 //------------------------------------------------------------------------------
831 U_CAPI
void U_EXPORT2
832 uregex_setRegion(URegularExpression
*regexp2
,
835 UErrorCode
*status
) {
836 uregex_setRegion64( regexp2
, (int64_t)regionStart
, (int64_t)regionLimit
, status
);
839 U_CAPI
void U_EXPORT2
840 uregex_setRegion64(URegularExpression
*regexp2
,
843 UErrorCode
*status
) {
844 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
845 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
848 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
852 //------------------------------------------------------------------------------
854 // uregex_setRegionAndStart
856 //------------------------------------------------------------------------------
857 U_CAPI
void U_EXPORT2
858 uregex_setRegionAndStart(URegularExpression
*regexp2
,
862 UErrorCode
*status
) {
863 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
864 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
867 regexp
->fMatcher
->region(regionStart
, regionLimit
, startIndex
, *status
);
870 //------------------------------------------------------------------------------
872 // uregex_regionStart
874 //------------------------------------------------------------------------------
875 U_CAPI
int32_t U_EXPORT2
876 uregex_regionStart(const URegularExpression
*regexp2
,
877 UErrorCode
*status
) {
878 return (int32_t)uregex_regionStart64(regexp2
, status
);
881 U_CAPI
int64_t U_EXPORT2
882 uregex_regionStart64(const URegularExpression
*regexp2
,
883 UErrorCode
*status
) {
884 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
885 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
888 return regexp
->fMatcher
->regionStart();
892 //------------------------------------------------------------------------------
896 //------------------------------------------------------------------------------
897 U_CAPI
int32_t U_EXPORT2
898 uregex_regionEnd(const URegularExpression
*regexp2
,
899 UErrorCode
*status
) {
900 return (int32_t)uregex_regionEnd64(regexp2
, status
);
903 U_CAPI
int64_t U_EXPORT2
904 uregex_regionEnd64(const URegularExpression
*regexp2
,
905 UErrorCode
*status
) {
906 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
907 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
910 return regexp
->fMatcher
->regionEnd();
914 //------------------------------------------------------------------------------
916 // uregex_hasTransparentBounds
918 //------------------------------------------------------------------------------
919 U_CAPI UBool U_EXPORT2
920 uregex_hasTransparentBounds(const URegularExpression
*regexp2
,
921 UErrorCode
*status
) {
922 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
923 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
926 return regexp
->fMatcher
->hasTransparentBounds();
930 //------------------------------------------------------------------------------
932 // uregex_useTransparentBounds
934 //------------------------------------------------------------------------------
935 U_CAPI
void U_EXPORT2
936 uregex_useTransparentBounds(URegularExpression
*regexp2
,
938 UErrorCode
*status
) {
939 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
940 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
943 regexp
->fMatcher
->useTransparentBounds(b
);
947 //------------------------------------------------------------------------------
949 // uregex_hasAnchoringBounds
951 //------------------------------------------------------------------------------
952 U_CAPI UBool U_EXPORT2
953 uregex_hasAnchoringBounds(const URegularExpression
*regexp2
,
954 UErrorCode
*status
) {
955 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
956 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
959 return regexp
->fMatcher
->hasAnchoringBounds();
963 //------------------------------------------------------------------------------
965 // uregex_useAnchoringBounds
967 //------------------------------------------------------------------------------
968 U_CAPI
void U_EXPORT2
969 uregex_useAnchoringBounds(URegularExpression
*regexp2
,
971 UErrorCode
*status
) {
972 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
973 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
976 regexp
->fMatcher
->useAnchoringBounds(b
);
980 //------------------------------------------------------------------------------
984 //------------------------------------------------------------------------------
985 U_CAPI UBool U_EXPORT2
986 uregex_hitEnd(const URegularExpression
*regexp2
,
987 UErrorCode
*status
) {
988 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
989 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
992 return regexp
->fMatcher
->hitEnd();
996 //------------------------------------------------------------------------------
1000 //------------------------------------------------------------------------------
1001 U_CAPI UBool U_EXPORT2
1002 uregex_requireEnd(const URegularExpression
*regexp2
,
1003 UErrorCode
*status
) {
1004 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1005 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1008 return regexp
->fMatcher
->requireEnd();
1012 //------------------------------------------------------------------------------
1014 // uregex_setTimeLimit
1016 //------------------------------------------------------------------------------
1017 U_CAPI
void U_EXPORT2
1018 uregex_setTimeLimit(URegularExpression
*regexp2
,
1020 UErrorCode
*status
) {
1021 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1022 if (validateRE(regexp
, FALSE
, status
)) {
1023 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
1029 //------------------------------------------------------------------------------
1031 // uregex_getTimeLimit
1033 //------------------------------------------------------------------------------
1034 U_CAPI
int32_t U_EXPORT2
1035 uregex_getTimeLimit(const URegularExpression
*regexp2
,
1036 UErrorCode
*status
) {
1038 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1039 if (validateRE(regexp
, FALSE
, status
)) {
1040 retVal
= regexp
->fMatcher
->getTimeLimit();
1047 //------------------------------------------------------------------------------
1049 // uregex_setStackLimit
1051 //------------------------------------------------------------------------------
1052 U_CAPI
void U_EXPORT2
1053 uregex_setStackLimit(URegularExpression
*regexp2
,
1055 UErrorCode
*status
) {
1056 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1057 if (validateRE(regexp
, FALSE
, status
)) {
1058 regexp
->fMatcher
->setStackLimit(limit
, *status
);
1064 //------------------------------------------------------------------------------
1066 // uregex_getStackLimit
1068 //------------------------------------------------------------------------------
1069 U_CAPI
int32_t U_EXPORT2
1070 uregex_getStackLimit(const URegularExpression
*regexp2
,
1071 UErrorCode
*status
) {
1073 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1074 if (validateRE(regexp
, FALSE
, status
)) {
1075 retVal
= regexp
->fMatcher
->getStackLimit();
1081 //------------------------------------------------------------------------------
1083 // uregex_setMatchCallback
1085 //------------------------------------------------------------------------------
1086 U_CAPI
void U_EXPORT2
1087 uregex_setMatchCallback(URegularExpression
*regexp2
,
1088 URegexMatchCallback
*callback
,
1089 const void *context
,
1090 UErrorCode
*status
) {
1091 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1092 if (validateRE(regexp
, FALSE
, status
)) {
1093 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
1098 //------------------------------------------------------------------------------
1100 // uregex_getMatchCallback
1102 //------------------------------------------------------------------------------
1103 U_CAPI
void U_EXPORT2
1104 uregex_getMatchCallback(const URegularExpression
*regexp2
,
1105 URegexMatchCallback
**callback
,
1106 const void **context
,
1107 UErrorCode
*status
) {
1108 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1109 if (validateRE(regexp
, FALSE
, status
)) {
1110 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
1115 //------------------------------------------------------------------------------
1117 // uregex_setFindProgressCallback
1119 //------------------------------------------------------------------------------
1120 U_CAPI
void U_EXPORT2
1121 uregex_setFindProgressCallback(URegularExpression
*regexp2
,
1122 URegexFindProgressCallback
*callback
,
1123 const void *context
,
1124 UErrorCode
*status
) {
1125 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1126 if (validateRE(regexp
, FALSE
, status
)) {
1127 regexp
->fMatcher
->setFindProgressCallback(callback
, context
, *status
);
1132 //------------------------------------------------------------------------------
1134 // uregex_getFindProgressCallback
1136 //------------------------------------------------------------------------------
1137 U_CAPI
void U_EXPORT2
1138 uregex_getFindProgressCallback(const URegularExpression
*regexp2
,
1139 URegexFindProgressCallback
**callback
,
1140 const void **context
,
1141 UErrorCode
*status
) {
1142 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1143 if (validateRE(regexp
, FALSE
, status
)) {
1144 regexp
->fMatcher
->getFindProgressCallback(*callback
, *context
, *status
);
1149 //------------------------------------------------------------------------------
1151 // uregex_replaceAll
1153 //------------------------------------------------------------------------------
1154 U_CAPI
int32_t U_EXPORT2
1155 uregex_replaceAll(URegularExpression
*regexp2
,
1156 const UChar
*replacementText
,
1157 int32_t replacementLength
,
1159 int32_t destCapacity
,
1160 UErrorCode
*status
) {
1161 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1162 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1165 if (replacementText
== NULL
|| replacementLength
< -1 ||
1166 (destBuf
== NULL
&& destCapacity
> 0) ||
1168 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1174 uregex_reset(regexp2
, 0, status
);
1176 // Note: Seperate error code variables for findNext() and appendReplacement()
1177 // are used so that destination buffer overflow errors
1178 // in appendReplacement won't stop findNext() from working.
1179 // appendReplacement() and appendTail() special case incoming buffer
1180 // overflow errors, continuing to return the correct length.
1181 UErrorCode findStatus
= *status
;
1182 while (uregex_findNext(regexp2
, &findStatus
)) {
1183 len
+= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1184 &destBuf
, &destCapacity
, status
);
1186 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1188 if (U_FAILURE(findStatus
)) {
1189 // If anything went wrong with the findNext(), make that error trump
1190 // whatever may have happened with the append() operations.
1191 // Errors in findNext() are not expected.
1192 *status
= findStatus
;
1199 //------------------------------------------------------------------------------
1201 // uregex_replaceAllUText
1203 //------------------------------------------------------------------------------
1204 U_CAPI UText
* U_EXPORT2
1205 uregex_replaceAllUText(URegularExpression
*regexp2
,
1206 UText
*replacementText
,
1208 UErrorCode
*status
) {
1209 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1210 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1213 if (replacementText
== NULL
) {
1214 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1218 dest
= regexp
->fMatcher
->replaceAll(replacementText
, dest
, *status
);
1223 //------------------------------------------------------------------------------
1225 // uregex_replaceFirst
1227 //------------------------------------------------------------------------------
1228 U_CAPI
int32_t U_EXPORT2
1229 uregex_replaceFirst(URegularExpression
*regexp2
,
1230 const UChar
*replacementText
,
1231 int32_t replacementLength
,
1233 int32_t destCapacity
,
1234 UErrorCode
*status
) {
1235 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1236 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1239 if (replacementText
== NULL
|| replacementLength
< -1 ||
1240 (destBuf
== NULL
&& destCapacity
> 0) ||
1242 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1247 UBool findSucceeded
;
1248 uregex_reset(regexp2
, 0, status
);
1249 findSucceeded
= uregex_find(regexp2
, 0, status
);
1250 if (findSucceeded
) {
1251 len
= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1252 &destBuf
, &destCapacity
, status
);
1254 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1260 //------------------------------------------------------------------------------
1262 // uregex_replaceFirstUText
1264 //------------------------------------------------------------------------------
1265 U_CAPI UText
* U_EXPORT2
1266 uregex_replaceFirstUText(URegularExpression
*regexp2
,
1267 UText
*replacementText
,
1269 UErrorCode
*status
) {
1270 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1271 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1274 if (replacementText
== NULL
) {
1275 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1279 dest
= regexp
->fMatcher
->replaceFirst(replacementText
, dest
, *status
);
1284 //------------------------------------------------------------------------------
1286 // uregex_appendReplacement
1288 //------------------------------------------------------------------------------
1292 // Dummy class, because these functions need to be friends of class RegexMatcher,
1293 // and stand-alone C functions don't work as friends
1297 inline static int32_t appendReplacement(RegularExpression
*regexp
,
1298 const UChar
*replacementText
,
1299 int32_t replacementLength
,
1301 int32_t *destCapacity
,
1302 UErrorCode
*status
);
1304 inline static int32_t appendTail(RegularExpression
*regexp
,
1306 int32_t *destCapacity
,
1307 UErrorCode
*status
);
1309 inline static int32_t split(RegularExpression
*regexp
,
1311 int32_t destCapacity
,
1312 int32_t *requiredCapacity
,
1313 UChar
*destFields
[],
1314 int32_t destFieldsCapacity
,
1315 UErrorCode
*status
);
1322 static const UChar BACKSLASH
= 0x5c;
1323 static const UChar DOLLARSIGN
= 0x24;
1326 // Move a character to an output buffer, with bounds checking on the index.
1327 // Index advances even if capacity is exceeded, for preflight size computations.
1328 // This little sequence is used a LOT.
1330 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
1331 if (*idx
< bufCapacity
) {
1339 // appendReplacement, the actual implementation.
1341 int32_t RegexCImpl::appendReplacement(RegularExpression
*regexp
,
1342 const UChar
*replacementText
,
1343 int32_t replacementLength
,
1345 int32_t *destCapacity
,
1346 UErrorCode
*status
) {
1348 // If we come in with a buffer overflow error, don't suppress the operation.
1349 // A series of appendReplacements, appendTail need to correctly preflight
1350 // the buffer size when an overflow happens somewhere in the middle.
1351 UBool pendingBufferOverflow
= FALSE
;
1352 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1353 pendingBufferOverflow
= TRUE
;
1354 *status
= U_ZERO_ERROR
;
1358 // Validate all paramters
1360 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1363 if (replacementText
== NULL
|| replacementLength
< -1 ||
1364 destCapacity
== NULL
|| destBuf
== NULL
||
1365 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1366 *destCapacity
< 0) {
1367 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1371 RegexMatcher
*m
= regexp
->fMatcher
;
1372 if (m
->fMatch
== FALSE
) {
1373 *status
= U_REGEX_INVALID_STATE
;
1377 UChar
*dest
= *destBuf
;
1378 int32_t capacity
= *destCapacity
;
1379 int32_t destIdx
= 0;
1382 // If it wasn't supplied by the caller, get the length of the replacement text.
1383 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1384 // the fly and avoid this step.
1385 if (replacementLength
== -1) {
1386 replacementLength
= u_strlen(replacementText
);
1389 // Copy input string from the end of previous match to start of current match
1390 if (regexp
->fText
!= NULL
) {
1392 int32_t lastMatchEnd
;
1393 if (UTEXT_USES_U16(m
->fInputText
)) {
1394 lastMatchEnd
= (int32_t)m
->fLastMatchEnd
;
1395 matchStart
= (int32_t)m
->fMatchStart
;
1397 // !!!: Would like a better way to do this!
1398 UErrorCode status
= U_ZERO_ERROR
;
1399 lastMatchEnd
= utext_extract(m
->fInputText
, 0, m
->fLastMatchEnd
, NULL
, 0, &status
);
1400 status
= U_ZERO_ERROR
;
1401 matchStart
= lastMatchEnd
+ utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
, NULL
, 0, &status
);
1403 for (i
=lastMatchEnd
; i
<matchStart
; i
++) {
1404 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
1407 UErrorCode possibleOverflowError
= U_ZERO_ERROR
; // ignore
1408 destIdx
+= utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
,
1409 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
),
1410 &possibleOverflowError
);
1412 U_ASSERT(destIdx
>= 0);
1414 // scan the replacement text, looking for substitutions ($n) and \escapes.
1415 int32_t replIdx
= 0;
1416 while (replIdx
< replacementLength
) {
1417 UChar c
= replacementText
[replIdx
];
1419 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
1420 // Common case, no substitution, no escaping,
1421 // just copy the char to the dest buf.
1422 appendToBuf(c
, &destIdx
, dest
, capacity
);
1426 if (c
== BACKSLASH
) {
1427 // Backslash Escape. Copy the following char out without further checks.
1428 // Note: Surrogate pairs don't need any special handling
1429 // The second half wont be a '$' or a '\', and
1430 // will move to the dest normally on the next
1432 if (replIdx
>= replacementLength
) {
1435 c
= replacementText
[replIdx
];
1437 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
1438 // We have a \udddd or \Udddddddd escape sequence.
1439 UChar32 escapedChar
=
1440 u_unescapeAt(uregex_ucstr_unescape_charAt
,
1441 &replIdx
, // Index is updated by unescapeAt
1442 replacementLength
, // Length of replacement text
1443 (void *)replacementText
);
1445 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
1446 if (escapedChar
<= 0xffff) {
1447 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
1449 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
1450 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
1454 // Note: if the \u escape was invalid, just fall through and
1455 // treat it as a plain \<anything> escape.
1458 // Plain backslash escape. Just put out the escaped character.
1459 appendToBuf(c
, &destIdx
, dest
, capacity
);
1467 // We've got a $. Pick up a capture group number if one follows.
1468 // Consume at most the number of digits necessary for the largest capture
1469 // number that is valid for this pattern.
1471 int32_t numDigits
= 0;
1472 int32_t groupNum
= 0;
1475 if (replIdx
>= replacementLength
) {
1478 U16_GET(replacementText
, 0, replIdx
, replacementLength
, digitC
);
1479 if (u_isdigit(digitC
) == FALSE
) {
1483 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1484 groupNum
=groupNum
*10 + u_charDigitValue(digitC
);
1486 if (numDigits
>= m
->fPattern
->fMaxCaptureDigits
) {
1492 if (numDigits
== 0) {
1493 // The $ didn't introduce a group number at all.
1494 // Treat it as just part of the substitution text.
1495 appendToBuf(DOLLARSIGN
, &destIdx
, dest
, capacity
);
1499 // Finally, append the capture group data to the destination.
1500 destIdx
+= uregex_group((URegularExpression
*)regexp
, groupNum
,
1501 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), status
);
1502 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1503 // Ignore buffer overflow when extracting the group. We need to
1504 // continue on to get full size of the untruncated result. We will
1505 // raise our own buffer overflow error at the end.
1506 *status
= U_ZERO_ERROR
;
1509 if (U_FAILURE(*status
)) {
1510 // Can fail if group number is out of range.
1517 // Nul Terminate the dest buffer if possible.
1518 // Set the appropriate buffer overflow or not terminated error, if needed.
1520 if (destIdx
< capacity
) {
1522 } else if (destIdx
== *destCapacity
) {
1523 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1525 *status
= U_BUFFER_OVERFLOW_ERROR
;
1529 // Return an updated dest buffer and capacity to the caller.
1531 if (destIdx
> 0 && *destCapacity
> 0) {
1532 if (destIdx
< capacity
) {
1533 *destBuf
+= destIdx
;
1534 *destCapacity
-= destIdx
;
1536 *destBuf
+= capacity
;
1541 // If we came in with a buffer overflow, make sure we go out with one also.
1542 // (A zero length match right at the end of the previous match could
1543 // make this function succeed even though a previous call had overflowed the buf)
1544 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1545 *status
= U_BUFFER_OVERFLOW_ERROR
;
1552 // appendReplacement the actual API function,
1554 U_CAPI
int32_t U_EXPORT2
1555 uregex_appendReplacement(URegularExpression
*regexp2
,
1556 const UChar
*replacementText
,
1557 int32_t replacementLength
,
1559 int32_t *destCapacity
,
1560 UErrorCode
*status
) {
1562 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1563 return RegexCImpl::appendReplacement(
1564 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1568 // uregex_appendReplacementUText...can just use the normal C++ method
1570 U_CAPI
void U_EXPORT2
1571 uregex_appendReplacementUText(URegularExpression
*regexp2
,
1574 UErrorCode
*status
) {
1575 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1576 regexp
->fMatcher
->appendReplacement(dest
, replText
, *status
);
1580 //------------------------------------------------------------------------------
1582 // uregex_appendTail
1584 //------------------------------------------------------------------------------
1585 int32_t RegexCImpl::appendTail(RegularExpression
*regexp
,
1587 int32_t *destCapacity
,
1591 // If we come in with a buffer overflow error, don't suppress the operation.
1592 // A series of appendReplacements, appendTail need to correctly preflight
1593 // the buffer size when an overflow happens somewhere in the middle.
1594 UBool pendingBufferOverflow
= FALSE
;
1595 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1596 pendingBufferOverflow
= TRUE
;
1597 *status
= U_ZERO_ERROR
;
1600 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1604 if (destCapacity
== NULL
|| destBuf
== NULL
||
1605 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1608 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1612 RegexMatcher
*m
= regexp
->fMatcher
;
1614 int32_t destIdx
= 0;
1615 int32_t destCap
= *destCapacity
;
1616 UChar
*dest
= *destBuf
;
1618 if (regexp
->fText
!= NULL
) {
1620 int64_t nativeIdx
= (m
->fMatch
? m
->fMatchEnd
: m
->fLastMatchEnd
);
1621 if (nativeIdx
== -1) {
1623 } else if (UTEXT_USES_U16(m
->fInputText
)) {
1624 srcIdx
= (int32_t)nativeIdx
;
1626 UErrorCode status
= U_ZERO_ERROR
;
1627 srcIdx
= utext_extract(m
->fInputText
, 0, nativeIdx
, NULL
, 0, &status
);
1631 U_ASSERT(destIdx
>= 0);
1633 if (srcIdx
== regexp
->fTextLength
) {
1636 UChar c
= regexp
->fText
[srcIdx
];
1637 if (c
== 0 && regexp
->fTextLength
== -1) {
1638 regexp
->fTextLength
= srcIdx
;
1642 if (destIdx
< destCap
) {
1645 // We've overflowed the dest buffer.
1646 // If the total input string length is known, we can
1647 // compute the total buffer size needed without scanning through the string.
1648 if (regexp
->fTextLength
> 0) {
1649 destIdx
+= (regexp
->fTextLength
- srcIdx
);
1659 // The most recent call to find() succeeded.
1660 srcIdx
= m
->fMatchEnd
;
1662 // The last call to find() on this matcher failed().
1663 // Look back to the end of the last find() that succeeded for src index.
1664 srcIdx
= m
->fLastMatchEnd
;
1666 // There has been no successful match with this matcher.
1667 // We want to copy the whole string.
1672 destIdx
= utext_extract(m
->fInputText
, srcIdx
, m
->fInputLength
, dest
, destCap
, status
);
1676 // NUL terminate the output string, if possible, otherwise issue the
1677 // appropriate error or warning.
1679 if (destIdx
< destCap
) {
1681 } else if (destIdx
== destCap
) {
1682 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1684 *status
= U_BUFFER_OVERFLOW_ERROR
;
1688 // Update the user's buffer ptr and capacity vars to reflect the
1691 if (destIdx
< destCap
) {
1692 *destBuf
+= destIdx
;
1693 *destCapacity
-= destIdx
;
1694 } else if (*destBuf
!= NULL
) {
1695 *destBuf
+= destCap
;
1699 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1700 *status
= U_BUFFER_OVERFLOW_ERROR
;
1708 // appendTail the actual API function
1710 U_CAPI
int32_t U_EXPORT2
1711 uregex_appendTail(URegularExpression
*regexp2
,
1713 int32_t *destCapacity
,
1714 UErrorCode
*status
) {
1715 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1716 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1721 // uregex_appendTailUText...can just use the normal C++ method
1723 U_CAPI UText
* U_EXPORT2
1724 uregex_appendTailUText(URegularExpression
*regexp2
,
1726 UErrorCode
*status
) {
1727 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1728 return regexp
->fMatcher
->appendTail(dest
, *status
);
1732 //------------------------------------------------------------------------------
1734 // copyString Internal utility to copy a string to an output buffer,
1735 // while managing buffer overflow and preflight size
1736 // computation. NUL termination is added to destination,
1737 // and the NUL is counted in the output size.
1739 //------------------------------------------------------------------------------
1741 static void copyString(UChar
*destBuffer
, // Destination buffer.
1742 int32_t destCapacity
, // Total capacity of dest buffer
1743 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1744 // Update not clipped to destCapacity.
1745 const UChar
*srcPtr
, // Pointer to source string
1746 int32_t srcLen
) // Source string len.
1749 int32_t di
= *destIndex
;
1752 for (si
=0; si
<srcLen
; si
++) {
1754 if (di
< destCapacity
) {
1762 if (di
<destCapacity
) {
1770 //------------------------------------------------------------------------------
1774 //------------------------------------------------------------------------------
1775 int32_t RegexCImpl::split(RegularExpression
*regexp
,
1777 int32_t destCapacity
,
1778 int32_t *requiredCapacity
,
1779 UChar
*destFields
[],
1780 int32_t destFieldsCapacity
,
1781 UErrorCode
*status
) {
1783 // Reset for the input text
1785 regexp
->fMatcher
->reset();
1786 UText
*inputText
= regexp
->fMatcher
->fInputText
;
1787 int64_t nextOutputStringStart
= 0;
1788 int64_t inputLen
= regexp
->fMatcher
->fInputLength
;
1789 if (inputLen
== 0) {
1794 // Loop through the input text, searching for the delimiter pattern
1796 int32_t i
; // Index of the field being processed.
1797 int32_t destIdx
= 0; // Next available position in destBuf;
1798 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1799 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow errors so that the strings are still counted
1801 if (i
>=destFieldsCapacity
-1) {
1802 // There are one or zero output strings left.
1803 // Fill the last output string with whatever is left from the input, then exit the loop.
1804 // ( i will be == destFieldsCapacity if we filled the output array while processing
1805 // capture groups of the delimiter expression, in which case we will discard the
1806 // last capture group saved in favor of the unprocessed remainder of the
1808 if (inputLen
> nextOutputStringStart
) {
1809 if (i
!= destFieldsCapacity
-1) {
1810 // No fields are left. Recycle the last one for holding the trailing part of
1811 // the input string.
1812 i
= destFieldsCapacity
-1;
1813 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1816 destFields
[i
] = &destBuf
[destIdx
];
1817 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1818 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1823 if (regexp
->fMatcher
->find()) {
1824 // We found another delimiter. Move everything from where we started looking
1825 // up until the start of the delimiter into the next output string.
1826 destFields
[i
] = &destBuf
[destIdx
];
1828 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, regexp
->fMatcher
->fMatchStart
,
1829 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1830 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1831 tStatus
= U_ZERO_ERROR
;
1835 nextOutputStringStart
= regexp
->fMatcher
->fMatchEnd
;
1837 // If the delimiter pattern has capturing parentheses, the captured
1838 // text goes out into the next n destination strings.
1840 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1841 // If we've run out of output string slots, bail out.
1842 if (i
==destFieldsCapacity
-1) {
1847 // Set up to extract the capture group contents into the dest buffer.
1848 destFields
[i
] = &destBuf
[destIdx
];
1849 tStatus
= U_ZERO_ERROR
;
1850 int32_t t
= uregex_group((URegularExpression
*)regexp
,
1853 REMAINING_CAPACITY(destIdx
, destCapacity
),
1855 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1856 // +1 for the NUL that terminates the string.
1857 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1858 tStatus
= U_ZERO_ERROR
;
1864 if (nextOutputStringStart
== inputLen
) {
1865 // The delimiter was at the end of the string.
1866 // Output an empty string, and then we are done.
1867 if (destIdx
< destCapacity
) {
1868 destBuf
[destIdx
] = 0;
1870 if (i
< destFieldsCapacity
-1) {
1873 if (destIdx
< destCapacity
) {
1874 destFields
[i
] = destBuf
+ destIdx
;
1883 // We ran off the end of the input while looking for the next delimiter.
1884 // All the remaining text goes into the current output string.
1885 destFields
[i
] = &destBuf
[destIdx
];
1886 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1887 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1892 // Zero out any unused portion of the destFields array
1894 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1895 destFields
[j
] = NULL
;
1898 if (requiredCapacity
!= NULL
) {
1899 *requiredCapacity
= destIdx
;
1901 if (destIdx
> destCapacity
) {
1902 *status
= U_BUFFER_OVERFLOW_ERROR
;
1908 // uregex_split The actual API function
1910 U_CAPI
int32_t U_EXPORT2
1911 uregex_split(URegularExpression
*regexp2
,
1913 int32_t destCapacity
,
1914 int32_t *requiredCapacity
,
1915 UChar
*destFields
[],
1916 int32_t destFieldsCapacity
,
1917 UErrorCode
*status
) {
1918 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1919 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1922 if ((destBuf
== NULL
&& destCapacity
> 0) ||
1924 destFields
== NULL
||
1925 destFieldsCapacity
< 1 ) {
1926 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1930 return RegexCImpl::split(regexp
, destBuf
, destCapacity
, requiredCapacity
, destFields
, destFieldsCapacity
, status
);
1935 // uregex_splitUText...can just use the normal C++ method
1937 U_CAPI
int32_t U_EXPORT2
1938 uregex_splitUText(URegularExpression
*regexp2
,
1939 UText
*destFields
[],
1940 int32_t destFieldsCapacity
,
1941 UErrorCode
*status
) {
1942 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1943 return regexp
->fMatcher
->split(regexp
->fMatcher
->inputText(), destFields
, destFieldsCapacity
, *status
);
1947 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS