2 *******************************************************************************
3 * Copyright (C) 2004-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: uregex.cpp
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "unicode/utf16.h"
30 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
32 struct RegularExpression
: public UMemory
{
38 u_atomic_int32_t
*fPatRefCount
;
40 int32_t fPatStringLen
;
41 RegexMatcher
*fMatcher
;
42 const UChar
*fText
; // Text from setText()
43 int32_t fTextLength
; // Length provided by user with setText(), which
48 static const int32_t REXP_MAGIC
= 0x72657870; // "rexp" in ASCII
50 RegularExpression::RegularExpression() {
62 RegularExpression::~RegularExpression() {
65 if (fPatRefCount
!=NULL
&& umtx_atomic_dec(fPatRefCount
)==0) {
67 uprv_free(fPatString
);
68 uprv_free((void *)fPatRefCount
);
70 if (fOwnsText
&& fText
!=NULL
) {
71 uprv_free((void *)fText
);
80 //----------------------------------------------------------------------------------------
82 // validateRE Do boilerplate style checks on API function parameters.
83 // Return TRUE if they look OK.
84 //----------------------------------------------------------------------------------------
85 static UBool
validateRE(const RegularExpression
*re
, UBool requiresText
, UErrorCode
*status
) {
86 if (U_FAILURE(*status
)) {
89 if (re
== NULL
|| re
->fMagic
!= REXP_MAGIC
) {
90 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94 if (requiresText
&& re
->fText
== NULL
&& !re
->fOwnsText
) {
95 *status
= U_REGEX_INVALID_STATE
;
101 //----------------------------------------------------------------------------------------
105 //----------------------------------------------------------------------------------------
106 U_CAPI URegularExpression
* U_EXPORT2
107 uregex_open( const UChar
*pattern
,
108 int32_t patternLength
,
111 UErrorCode
*status
) {
113 if (U_FAILURE(*status
)) {
116 if (pattern
== NULL
|| patternLength
< -1 || patternLength
== 0) {
117 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
120 int32_t actualPatLen
= patternLength
;
121 if (actualPatLen
== -1) {
122 actualPatLen
= u_strlen(pattern
);
125 RegularExpression
*re
= new RegularExpression
;
126 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
127 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(actualPatLen
+1));
128 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
129 *status
= U_MEMORY_ALLOCATION_ERROR
;
131 uprv_free((void *)refC
);
135 re
->fPatRefCount
= refC
;
136 *re
->fPatRefCount
= 1;
139 // Make a copy of the pattern string, so we can return it later if asked.
140 // For compiling the pattern, we will use a UText wrapper around
141 // this local copy, to avoid making even more copies.
143 re
->fPatString
= patBuf
;
144 re
->fPatStringLen
= patternLength
;
145 u_memcpy(patBuf
, pattern
, actualPatLen
);
146 patBuf
[actualPatLen
] = 0;
148 UText patText
= UTEXT_INITIALIZER
;
149 utext_openUChars(&patText
, patBuf
, patternLength
, status
);
152 // Compile the pattern
155 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
157 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
159 utext_close(&patText
);
161 if (U_FAILURE(*status
)) {
166 // Create the matcher object
168 re
->fMatcher
= re
->fPat
->matcher(*status
);
169 if (U_SUCCESS(*status
)) {
170 return (URegularExpression
*)re
;
179 //----------------------------------------------------------------------------------------
183 //----------------------------------------------------------------------------------------
184 U_CAPI URegularExpression
* U_EXPORT2
185 uregex_openUText(UText
*pattern
,
188 UErrorCode
*status
) {
190 if (U_FAILURE(*status
)) {
193 if (pattern
== NULL
) {
194 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
198 int64_t patternNativeLength
= utext_nativeLength(pattern
);
200 if (patternNativeLength
== 0) {
201 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
205 RegularExpression
*re
= new RegularExpression
;
207 UErrorCode lengthStatus
= U_ZERO_ERROR
;
208 int32_t pattern16Length
= utext_extract(pattern
, 0, patternNativeLength
, NULL
, 0, &lengthStatus
);
210 u_atomic_int32_t
*refC
= (u_atomic_int32_t
*)uprv_malloc(sizeof(int32_t));
211 UChar
*patBuf
= (UChar
*)uprv_malloc(sizeof(UChar
)*(pattern16Length
+1));
212 if (re
== NULL
|| refC
== NULL
|| patBuf
== NULL
) {
213 *status
= U_MEMORY_ALLOCATION_ERROR
;
215 uprv_free((void *)refC
);
219 re
->fPatRefCount
= refC
;
220 *re
->fPatRefCount
= 1;
223 // Make a copy of the pattern string, so we can return it later if asked.
224 // For compiling the pattern, we will use a read-only UText wrapper
225 // around this local copy, to avoid making even more copies.
227 re
->fPatString
= patBuf
;
228 re
->fPatStringLen
= pattern16Length
;
229 utext_extract(pattern
, 0, patternNativeLength
, patBuf
, pattern16Length
+1, status
);
231 UText patText
= UTEXT_INITIALIZER
;
232 utext_openUChars(&patText
, patBuf
, pattern16Length
, status
);
235 // Compile the pattern
238 re
->fPat
= RegexPattern::compile(&patText
, flags
, *pe
, *status
);
240 re
->fPat
= RegexPattern::compile(&patText
, flags
, *status
);
242 utext_close(&patText
);
244 if (U_FAILURE(*status
)) {
249 // Create the matcher object
251 re
->fMatcher
= re
->fPat
->matcher(*status
);
252 if (U_SUCCESS(*status
)) {
253 return (URegularExpression
*)re
;
262 //----------------------------------------------------------------------------------------
266 //----------------------------------------------------------------------------------------
267 U_CAPI
void U_EXPORT2
268 uregex_close(URegularExpression
*re2
) {
269 RegularExpression
*re
= (RegularExpression
*)re2
;
270 UErrorCode status
= U_ZERO_ERROR
;
271 if (validateRE(re
, FALSE
, &status
) == FALSE
) {
278 //----------------------------------------------------------------------------------------
282 //----------------------------------------------------------------------------------------
283 U_CAPI URegularExpression
* U_EXPORT2
284 uregex_clone(const URegularExpression
*source2
, UErrorCode
*status
) {
285 RegularExpression
*source
= (RegularExpression
*)source2
;
286 if (validateRE(source
, FALSE
, status
) == FALSE
) {
290 RegularExpression
*clone
= new RegularExpression
;
292 *status
= U_MEMORY_ALLOCATION_ERROR
;
296 clone
->fMatcher
= source
->fPat
->matcher(*status
);
297 if (U_FAILURE(*status
)) {
302 clone
->fPat
= source
->fPat
;
303 clone
->fPatRefCount
= source
->fPatRefCount
;
304 clone
->fPatString
= source
->fPatString
;
305 clone
->fPatStringLen
= source
->fPatStringLen
;
306 umtx_atomic_inc(source
->fPatRefCount
);
307 // Note: fText is not cloned.
309 return (URegularExpression
*)clone
;
315 //------------------------------------------------------------------------------
319 //------------------------------------------------------------------------------
320 U_CAPI
const UChar
* U_EXPORT2
321 uregex_pattern(const URegularExpression
*regexp2
,
323 UErrorCode
*status
) {
324 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
326 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
329 if (patLength
!= NULL
) {
330 *patLength
= regexp
->fPatStringLen
;
332 return regexp
->fPatString
;
336 //------------------------------------------------------------------------------
338 // uregex_patternUText
340 //------------------------------------------------------------------------------
341 U_CAPI UText
* U_EXPORT2
342 uregex_patternUText(const URegularExpression
*regexp2
,
343 UErrorCode
*status
) {
344 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
345 return regexp
->fPat
->patternText(*status
);
349 //------------------------------------------------------------------------------
353 //------------------------------------------------------------------------------
354 U_CAPI
int32_t U_EXPORT2
355 uregex_flags(const URegularExpression
*regexp2
, UErrorCode
*status
) {
356 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
357 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
360 int32_t flags
= regexp
->fPat
->flags();
365 //------------------------------------------------------------------------------
369 //------------------------------------------------------------------------------
370 U_CAPI
void U_EXPORT2
371 uregex_setText(URegularExpression
*regexp2
,
374 UErrorCode
*status
) {
375 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
376 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
379 if (text
== NULL
|| textLength
< -1) {
380 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
384 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
385 uprv_free((void *)regexp
->fText
);
388 regexp
->fText
= text
;
389 regexp
->fTextLength
= textLength
;
390 regexp
->fOwnsText
= FALSE
;
392 UText input
= UTEXT_INITIALIZER
;
393 utext_openUChars(&input
, text
, textLength
, status
);
394 regexp
->fMatcher
->reset(&input
);
395 utext_close(&input
); // reset() made a shallow clone, so we don't need this copy
399 //------------------------------------------------------------------------------
403 //------------------------------------------------------------------------------
404 U_CAPI
void U_EXPORT2
405 uregex_setUText(URegularExpression
*regexp2
,
407 UErrorCode
*status
) {
408 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
409 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
413 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
417 if (regexp
->fOwnsText
&& regexp
->fText
!= NULL
) {
418 uprv_free((void *)regexp
->fText
);
421 regexp
->fText
= NULL
; // only fill it in on request
422 regexp
->fTextLength
= -1;
423 regexp
->fOwnsText
= TRUE
;
424 regexp
->fMatcher
->reset(text
);
429 //------------------------------------------------------------------------------
433 //------------------------------------------------------------------------------
434 U_CAPI
const UChar
* U_EXPORT2
435 uregex_getText(URegularExpression
*regexp2
,
437 UErrorCode
*status
) {
438 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
439 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
443 if (regexp
->fText
== NULL
) {
444 // need to fill in the text
445 UText
*inputText
= regexp
->fMatcher
->inputText();
446 int64_t inputNativeLength
= utext_nativeLength(inputText
);
447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText
, inputNativeLength
)) {
448 regexp
->fText
= inputText
->chunkContents
;
449 regexp
->fTextLength
= (int32_t)inputNativeLength
;
450 regexp
->fOwnsText
= FALSE
; // because the UText owns it
452 UErrorCode lengthStatus
= U_ZERO_ERROR
;
453 regexp
->fTextLength
= utext_extract(inputText
, 0, inputNativeLength
, NULL
, 0, &lengthStatus
); // buffer overflow error
454 UChar
*inputChars
= (UChar
*)uprv_malloc(sizeof(UChar
)*(regexp
->fTextLength
+1));
456 utext_extract(inputText
, 0, inputNativeLength
, inputChars
, regexp
->fTextLength
+1, status
);
457 regexp
->fText
= inputChars
;
458 regexp
->fOwnsText
= TRUE
; // should already be set but just in case
462 if (textLength
!= NULL
) {
463 *textLength
= regexp
->fTextLength
;
465 return regexp
->fText
;
469 //------------------------------------------------------------------------------
473 //------------------------------------------------------------------------------
474 U_CAPI UText
* U_EXPORT2
475 uregex_getUText(URegularExpression
*regexp2
,
477 UErrorCode
*status
) {
478 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
479 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
482 return regexp
->fMatcher
->getInput(dest
, *status
);
486 //------------------------------------------------------------------------------
488 // uregex_refreshUText
490 //------------------------------------------------------------------------------
491 U_CAPI
void U_EXPORT2
492 uregex_refreshUText(URegularExpression
*regexp2
,
494 UErrorCode
*status
) {
495 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
496 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
499 regexp
->fMatcher
->refreshInputText(text
, *status
);
503 //------------------------------------------------------------------------------
507 //------------------------------------------------------------------------------
508 U_CAPI UBool U_EXPORT2
509 uregex_matches(URegularExpression
*regexp2
,
511 UErrorCode
*status
) {
512 return uregex_matches64( regexp2
, (int64_t)startIndex
, status
);
515 U_CAPI UBool U_EXPORT2
516 uregex_matches64(URegularExpression
*regexp2
,
518 UErrorCode
*status
) {
519 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
520 UBool result
= FALSE
;
521 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
524 if (startIndex
== -1) {
525 result
= regexp
->fMatcher
->matches(*status
);
527 result
= regexp
->fMatcher
->matches(startIndex
, *status
);
533 //------------------------------------------------------------------------------
537 //------------------------------------------------------------------------------
538 U_CAPI UBool U_EXPORT2
539 uregex_lookingAt(URegularExpression
*regexp2
,
541 UErrorCode
*status
) {
542 return uregex_lookingAt64( regexp2
, (int64_t)startIndex
, status
);
545 U_CAPI UBool U_EXPORT2
546 uregex_lookingAt64(URegularExpression
*regexp2
,
548 UErrorCode
*status
) {
549 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
550 UBool result
= FALSE
;
551 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
554 if (startIndex
== -1) {
555 result
= regexp
->fMatcher
->lookingAt(*status
);
557 result
= regexp
->fMatcher
->lookingAt(startIndex
, *status
);
564 //------------------------------------------------------------------------------
568 //------------------------------------------------------------------------------
569 U_CAPI UBool U_EXPORT2
570 uregex_find(URegularExpression
*regexp2
,
572 UErrorCode
*status
) {
573 return uregex_find64( regexp2
, (int64_t)startIndex
, status
);
576 U_CAPI UBool U_EXPORT2
577 uregex_find64(URegularExpression
*regexp2
,
579 UErrorCode
*status
) {
580 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
581 UBool result
= FALSE
;
582 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
585 if (startIndex
== -1) {
586 regexp
->fMatcher
->resetPreserveRegion();
587 result
= regexp
->fMatcher
->find(*status
);
589 result
= regexp
->fMatcher
->find(startIndex
, *status
);
595 //------------------------------------------------------------------------------
599 //------------------------------------------------------------------------------
600 U_CAPI UBool U_EXPORT2
601 uregex_findNext(URegularExpression
*regexp2
,
602 UErrorCode
*status
) {
603 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
604 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
607 UBool result
= regexp
->fMatcher
->find(*status
);
611 //------------------------------------------------------------------------------
615 //------------------------------------------------------------------------------
616 U_CAPI
int32_t U_EXPORT2
617 uregex_groupCount(URegularExpression
*regexp2
,
618 UErrorCode
*status
) {
619 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
620 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
623 int32_t result
= regexp
->fMatcher
->groupCount();
628 //------------------------------------------------------------------------------
630 // uregex_groupNumberFromName
632 //------------------------------------------------------------------------------
634 uregex_groupNumberFromName(URegularExpression
*regexp2
,
635 const UChar
*groupName
,
637 UErrorCode
*status
) {
638 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
639 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
642 int32_t result
= regexp
->fPat
->groupNumberFromName(UnicodeString(groupName
, nameLength
), *status
);
647 uregex_groupNumberFromCName(URegularExpression
*regexp2
,
648 const char *groupName
,
650 UErrorCode
*status
) {
651 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
652 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
655 return regexp
->fPat
->groupNumberFromName(groupName
, nameLength
, *status
);
658 //------------------------------------------------------------------------------
662 //------------------------------------------------------------------------------
663 U_CAPI
int32_t U_EXPORT2
664 uregex_group(URegularExpression
*regexp2
,
667 int32_t destCapacity
,
668 UErrorCode
*status
) {
669 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
670 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
673 if (destCapacity
< 0 || (destCapacity
> 0 && dest
== NULL
)) {
674 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
678 if (destCapacity
== 0 || regexp
->fText
!= NULL
) {
679 // If preflighting or if we already have the text as UChars,
680 // this is a little cheaper than extracting from the UText
683 // Pick up the range of characters from the matcher
685 int32_t startIx
= regexp
->fMatcher
->start(groupNum
, *status
);
686 int32_t endIx
= regexp
->fMatcher
->end (groupNum
, *status
);
687 if (U_FAILURE(*status
)) {
692 // Trim length based on buffer capacity
694 int32_t fullLength
= endIx
- startIx
;
695 int32_t copyLength
= fullLength
;
696 if (copyLength
< destCapacity
) {
697 dest
[copyLength
] = 0;
698 } else if (copyLength
== destCapacity
) {
699 *status
= U_STRING_NOT_TERMINATED_WARNING
;
701 copyLength
= destCapacity
;
702 *status
= U_BUFFER_OVERFLOW_ERROR
;
706 // Copy capture group to user's buffer
708 if (copyLength
> 0) {
709 u_memcpy(dest
, ®exp
->fText
[startIx
], copyLength
);
713 int64_t start
= regexp
->fMatcher
->start64(groupNum
, *status
);
714 int64_t limit
= regexp
->fMatcher
->end64(groupNum
, *status
);
715 if (U_FAILURE(*status
)) {
719 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
720 // Zero Length Match: start == end.
721 int32_t length
= utext_extract(regexp
->fMatcher
->inputText(), start
, limit
, dest
, destCapacity
, status
);
728 //------------------------------------------------------------------------------
732 //------------------------------------------------------------------------------
733 U_CAPI UText
* U_EXPORT2
734 uregex_groupUText(URegularExpression
*regexp2
,
737 int64_t *groupLength
,
738 UErrorCode
*status
) {
739 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
740 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
741 UErrorCode emptyTextStatus
= U_ZERO_ERROR
;
742 return (dest
? dest
: utext_openUChars(NULL
, NULL
, 0, &emptyTextStatus
));
745 return regexp
->fMatcher
->group(groupNum
, dest
, *groupLength
, *status
);
748 //------------------------------------------------------------------------------
752 //------------------------------------------------------------------------------
753 U_CAPI
int32_t U_EXPORT2
754 uregex_start(URegularExpression
*regexp2
,
756 UErrorCode
*status
) {
757 return (int32_t)uregex_start64( regexp2
, groupNum
, status
);
760 U_CAPI
int64_t U_EXPORT2
761 uregex_start64(URegularExpression
*regexp2
,
763 UErrorCode
*status
) {
764 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
765 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
768 int32_t result
= regexp
->fMatcher
->start(groupNum
, *status
);
772 //------------------------------------------------------------------------------
776 //------------------------------------------------------------------------------
777 U_CAPI
int32_t U_EXPORT2
778 uregex_end(URegularExpression
*regexp2
,
780 UErrorCode
*status
) {
781 return (int32_t)uregex_end64( regexp2
, groupNum
, status
);
784 U_CAPI
int64_t U_EXPORT2
785 uregex_end64(URegularExpression
*regexp2
,
787 UErrorCode
*status
) {
788 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
789 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
792 int32_t result
= regexp
->fMatcher
->end(groupNum
, *status
);
796 //------------------------------------------------------------------------------
800 //------------------------------------------------------------------------------
801 U_CAPI
void U_EXPORT2
802 uregex_reset(URegularExpression
*regexp2
,
804 UErrorCode
*status
) {
805 uregex_reset64( regexp2
, (int64_t)index
, status
);
808 U_CAPI
void U_EXPORT2
809 uregex_reset64(URegularExpression
*regexp2
,
811 UErrorCode
*status
) {
812 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
813 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
816 regexp
->fMatcher
->reset(index
, *status
);
820 //------------------------------------------------------------------------------
824 //------------------------------------------------------------------------------
825 U_CAPI
void U_EXPORT2
826 uregex_setRegion(URegularExpression
*regexp2
,
829 UErrorCode
*status
) {
830 uregex_setRegion64( regexp2
, (int64_t)regionStart
, (int64_t)regionLimit
, status
);
833 U_CAPI
void U_EXPORT2
834 uregex_setRegion64(URegularExpression
*regexp2
,
837 UErrorCode
*status
) {
838 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
839 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
842 regexp
->fMatcher
->region(regionStart
, regionLimit
, *status
);
846 //------------------------------------------------------------------------------
848 // uregex_setRegionAndStart
850 //------------------------------------------------------------------------------
851 U_CAPI
void U_EXPORT2
852 uregex_setRegionAndStart(URegularExpression
*regexp2
,
856 UErrorCode
*status
) {
857 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
858 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
861 regexp
->fMatcher
->region(regionStart
, regionLimit
, startIndex
, *status
);
864 //------------------------------------------------------------------------------
866 // uregex_regionStart
868 //------------------------------------------------------------------------------
869 U_CAPI
int32_t U_EXPORT2
870 uregex_regionStart(const URegularExpression
*regexp2
,
871 UErrorCode
*status
) {
872 return (int32_t)uregex_regionStart64(regexp2
, status
);
875 U_CAPI
int64_t U_EXPORT2
876 uregex_regionStart64(const URegularExpression
*regexp2
,
877 UErrorCode
*status
) {
878 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
879 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
882 return regexp
->fMatcher
->regionStart();
886 //------------------------------------------------------------------------------
890 //------------------------------------------------------------------------------
891 U_CAPI
int32_t U_EXPORT2
892 uregex_regionEnd(const URegularExpression
*regexp2
,
893 UErrorCode
*status
) {
894 return (int32_t)uregex_regionEnd64(regexp2
, status
);
897 U_CAPI
int64_t U_EXPORT2
898 uregex_regionEnd64(const URegularExpression
*regexp2
,
899 UErrorCode
*status
) {
900 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
901 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
904 return regexp
->fMatcher
->regionEnd();
908 //------------------------------------------------------------------------------
910 // uregex_hasTransparentBounds
912 //------------------------------------------------------------------------------
913 U_CAPI UBool U_EXPORT2
914 uregex_hasTransparentBounds(const URegularExpression
*regexp2
,
915 UErrorCode
*status
) {
916 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
917 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
920 return regexp
->fMatcher
->hasTransparentBounds();
924 //------------------------------------------------------------------------------
926 // uregex_useTransparentBounds
928 //------------------------------------------------------------------------------
929 U_CAPI
void U_EXPORT2
930 uregex_useTransparentBounds(URegularExpression
*regexp2
,
932 UErrorCode
*status
) {
933 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
934 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
937 regexp
->fMatcher
->useTransparentBounds(b
);
941 //------------------------------------------------------------------------------
943 // uregex_hasAnchoringBounds
945 //------------------------------------------------------------------------------
946 U_CAPI UBool U_EXPORT2
947 uregex_hasAnchoringBounds(const URegularExpression
*regexp2
,
948 UErrorCode
*status
) {
949 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
950 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
953 return regexp
->fMatcher
->hasAnchoringBounds();
957 //------------------------------------------------------------------------------
959 // uregex_useAnchoringBounds
961 //------------------------------------------------------------------------------
962 U_CAPI
void U_EXPORT2
963 uregex_useAnchoringBounds(URegularExpression
*regexp2
,
965 UErrorCode
*status
) {
966 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
967 if (validateRE(regexp
, FALSE
, status
) == FALSE
) {
970 regexp
->fMatcher
->useAnchoringBounds(b
);
974 //------------------------------------------------------------------------------
978 //------------------------------------------------------------------------------
979 U_CAPI UBool U_EXPORT2
980 uregex_hitEnd(const URegularExpression
*regexp2
,
981 UErrorCode
*status
) {
982 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
983 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
986 return regexp
->fMatcher
->hitEnd();
990 //------------------------------------------------------------------------------
994 //------------------------------------------------------------------------------
995 U_CAPI UBool U_EXPORT2
996 uregex_requireEnd(const URegularExpression
*regexp2
,
997 UErrorCode
*status
) {
998 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
999 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1002 return regexp
->fMatcher
->requireEnd();
1006 //------------------------------------------------------------------------------
1008 // uregex_setTimeLimit
1010 //------------------------------------------------------------------------------
1011 U_CAPI
void U_EXPORT2
1012 uregex_setTimeLimit(URegularExpression
*regexp2
,
1014 UErrorCode
*status
) {
1015 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1016 if (validateRE(regexp
, FALSE
, status
)) {
1017 regexp
->fMatcher
->setTimeLimit(limit
, *status
);
1023 //------------------------------------------------------------------------------
1025 // uregex_getTimeLimit
1027 //------------------------------------------------------------------------------
1028 U_CAPI
int32_t U_EXPORT2
1029 uregex_getTimeLimit(const URegularExpression
*regexp2
,
1030 UErrorCode
*status
) {
1032 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1033 if (validateRE(regexp
, FALSE
, status
)) {
1034 retVal
= regexp
->fMatcher
->getTimeLimit();
1041 //------------------------------------------------------------------------------
1043 // uregex_setStackLimit
1045 //------------------------------------------------------------------------------
1046 U_CAPI
void U_EXPORT2
1047 uregex_setStackLimit(URegularExpression
*regexp2
,
1049 UErrorCode
*status
) {
1050 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1051 if (validateRE(regexp
, FALSE
, status
)) {
1052 regexp
->fMatcher
->setStackLimit(limit
, *status
);
1058 //------------------------------------------------------------------------------
1060 // uregex_getStackLimit
1062 //------------------------------------------------------------------------------
1063 U_CAPI
int32_t U_EXPORT2
1064 uregex_getStackLimit(const URegularExpression
*regexp2
,
1065 UErrorCode
*status
) {
1067 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1068 if (validateRE(regexp
, FALSE
, status
)) {
1069 retVal
= regexp
->fMatcher
->getStackLimit();
1075 //------------------------------------------------------------------------------
1077 // uregex_setMatchCallback
1079 //------------------------------------------------------------------------------
1080 U_CAPI
void U_EXPORT2
1081 uregex_setMatchCallback(URegularExpression
*regexp2
,
1082 URegexMatchCallback
*callback
,
1083 const void *context
,
1084 UErrorCode
*status
) {
1085 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1086 if (validateRE(regexp
, FALSE
, status
)) {
1087 regexp
->fMatcher
->setMatchCallback(callback
, context
, *status
);
1092 //------------------------------------------------------------------------------
1094 // uregex_getMatchCallback
1096 //------------------------------------------------------------------------------
1097 U_CAPI
void U_EXPORT2
1098 uregex_getMatchCallback(const URegularExpression
*regexp2
,
1099 URegexMatchCallback
**callback
,
1100 const void **context
,
1101 UErrorCode
*status
) {
1102 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1103 if (validateRE(regexp
, FALSE
, status
)) {
1104 regexp
->fMatcher
->getMatchCallback(*callback
, *context
, *status
);
1109 //------------------------------------------------------------------------------
1111 // uregex_setFindProgressCallback
1113 //------------------------------------------------------------------------------
1114 U_CAPI
void U_EXPORT2
1115 uregex_setFindProgressCallback(URegularExpression
*regexp2
,
1116 URegexFindProgressCallback
*callback
,
1117 const void *context
,
1118 UErrorCode
*status
) {
1119 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1120 if (validateRE(regexp
, FALSE
, status
)) {
1121 regexp
->fMatcher
->setFindProgressCallback(callback
, context
, *status
);
1126 //------------------------------------------------------------------------------
1128 // uregex_getFindProgressCallback
1130 //------------------------------------------------------------------------------
1131 U_CAPI
void U_EXPORT2
1132 uregex_getFindProgressCallback(const URegularExpression
*regexp2
,
1133 URegexFindProgressCallback
**callback
,
1134 const void **context
,
1135 UErrorCode
*status
) {
1136 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1137 if (validateRE(regexp
, FALSE
, status
)) {
1138 regexp
->fMatcher
->getFindProgressCallback(*callback
, *context
, *status
);
1143 //------------------------------------------------------------------------------
1145 // uregex_replaceAll
1147 //------------------------------------------------------------------------------
1148 U_CAPI
int32_t U_EXPORT2
1149 uregex_replaceAll(URegularExpression
*regexp2
,
1150 const UChar
*replacementText
,
1151 int32_t replacementLength
,
1153 int32_t destCapacity
,
1154 UErrorCode
*status
) {
1155 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1156 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1159 if (replacementText
== NULL
|| replacementLength
< -1 ||
1160 (destBuf
== NULL
&& destCapacity
> 0) ||
1162 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1168 uregex_reset(regexp2
, 0, status
);
1170 // Note: Seperate error code variables for findNext() and appendReplacement()
1171 // are used so that destination buffer overflow errors
1172 // in appendReplacement won't stop findNext() from working.
1173 // appendReplacement() and appendTail() special case incoming buffer
1174 // overflow errors, continuing to return the correct length.
1175 UErrorCode findStatus
= *status
;
1176 while (uregex_findNext(regexp2
, &findStatus
)) {
1177 len
+= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1178 &destBuf
, &destCapacity
, status
);
1180 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1182 if (U_FAILURE(findStatus
)) {
1183 // If anything went wrong with the findNext(), make that error trump
1184 // whatever may have happened with the append() operations.
1185 // Errors in findNext() are not expected.
1186 *status
= findStatus
;
1193 //------------------------------------------------------------------------------
1195 // uregex_replaceAllUText
1197 //------------------------------------------------------------------------------
1198 U_CAPI UText
* U_EXPORT2
1199 uregex_replaceAllUText(URegularExpression
*regexp2
,
1200 UText
*replacementText
,
1202 UErrorCode
*status
) {
1203 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1204 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1207 if (replacementText
== NULL
) {
1208 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1212 dest
= regexp
->fMatcher
->replaceAll(replacementText
, dest
, *status
);
1217 //------------------------------------------------------------------------------
1219 // uregex_replaceFirst
1221 //------------------------------------------------------------------------------
1222 U_CAPI
int32_t U_EXPORT2
1223 uregex_replaceFirst(URegularExpression
*regexp2
,
1224 const UChar
*replacementText
,
1225 int32_t replacementLength
,
1227 int32_t destCapacity
,
1228 UErrorCode
*status
) {
1229 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1230 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1233 if (replacementText
== NULL
|| replacementLength
< -1 ||
1234 (destBuf
== NULL
&& destCapacity
> 0) ||
1236 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1241 UBool findSucceeded
;
1242 uregex_reset(regexp2
, 0, status
);
1243 findSucceeded
= uregex_find(regexp2
, 0, status
);
1244 if (findSucceeded
) {
1245 len
= uregex_appendReplacement(regexp2
, replacementText
, replacementLength
,
1246 &destBuf
, &destCapacity
, status
);
1248 len
+= uregex_appendTail(regexp2
, &destBuf
, &destCapacity
, status
);
1254 //------------------------------------------------------------------------------
1256 // uregex_replaceFirstUText
1258 //------------------------------------------------------------------------------
1259 U_CAPI UText
* U_EXPORT2
1260 uregex_replaceFirstUText(URegularExpression
*regexp2
,
1261 UText
*replacementText
,
1263 UErrorCode
*status
) {
1264 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1265 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1268 if (replacementText
== NULL
) {
1269 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1273 dest
= regexp
->fMatcher
->replaceFirst(replacementText
, dest
, *status
);
1278 //------------------------------------------------------------------------------
1280 // uregex_appendReplacement
1282 //------------------------------------------------------------------------------
1286 // Dummy class, because these functions need to be friends of class RegexMatcher,
1287 // and stand-alone C functions don't work as friends
// The three static members declared below are defined later in this file as
// RegexCImpl::appendReplacement, RegexCImpl::appendTail and RegexCImpl::split.
// NOTE(review): the class header line and the destBuf parameter lines are not
// visible in this view of the file.

// Implementation behind uregex_appendReplacement().
1291 inline static int32_t appendReplacement(RegularExpression
*regexp
,
1292 const UChar
*replacementText
,
1293 int32_t replacementLength
,
1295 int32_t *destCapacity
,
1296 UErrorCode
*status
);
// Implementation behind uregex_appendTail().
1298 inline static int32_t appendTail(RegularExpression
*regexp
,
1300 int32_t *destCapacity
,
1301 UErrorCode
*status
);
// Implementation behind uregex_split().
1303 inline static int32_t split(RegularExpression
*regexp
,
1305 int32_t destCapacity
,
1306 int32_t *requiredCapacity
,
1307 UChar
*destFields
[],
1308 int32_t destFieldsCapacity
,
1309 UErrorCode
*status
);
// UTF-16 code units that appendReplacement() tests for while scanning
// replacement text.
// U+005C '\'
1316 static const UChar BACKSLASH
= 0x5c;
// U+0024 '$'
1317 static const UChar DOLLARSIGN
= 0x24;
// U+007B '{' (a curly brace, despite the "BRACKET" in the name)
1318 static const UChar LEFTBRACKET
= 0x7b;
// U+007D '}'
1319 static const UChar RIGHTBRACKET
= 0x7d;
1322 // Move a character to an output buffer, with bounds checking on the index.
1323 // Index advances even if capacity is exceeded, for preflight size computations.
1324 // This little sequence is used a LOT.
//
//   c            the UChar to append
//   idx          pointer to the caller's running output index
//   buf          destination buffer
//   bufCapacity  capacity that *idx is checked against
//
// NOTE(review): only the bounds test is visible in this view; the store into
// buf and the increment of *idx described above are presumably on lines that
// are not shown here -- confirm against the full file.
1326 static inline void appendToBuf(UChar c
, int32_t *idx
, UChar
*buf
, int32_t bufCapacity
) {
1327 if (*idx
< bufCapacity
) {
1335 // appendReplacement, the actual implementation.
1337 int32_t RegexCImpl::appendReplacement(RegularExpression
*regexp
,
1338 const UChar
*replacementText
,
1339 int32_t replacementLength
,
1341 int32_t *destCapacity
,
1342 UErrorCode
*status
) {
1344 // If we come in with a buffer overflow error, don't suppress the operation.
1345 // A series of appendReplacements, appendTail need to correctly preflight
1346 // the buffer size when an overflow happens somewhere in the middle.
1347 UBool pendingBufferOverflow
= FALSE
;
1348 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1349 pendingBufferOverflow
= TRUE
;
1350 *status
= U_ZERO_ERROR
;
1354 // Validate all paramters
1356 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1359 if (replacementText
== NULL
|| replacementLength
< -1 ||
1360 destCapacity
== NULL
|| destBuf
== NULL
||
1361 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1362 *destCapacity
< 0) {
1363 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1367 RegexMatcher
*m
= regexp
->fMatcher
;
1368 if (m
->fMatch
== FALSE
) {
1369 *status
= U_REGEX_INVALID_STATE
;
1373 UChar
*dest
= *destBuf
;
1374 int32_t capacity
= *destCapacity
;
1375 int32_t destIdx
= 0;
1378 // If it wasn't supplied by the caller, get the length of the replacement text.
1379 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1380 // the fly and avoid this step.
1381 if (replacementLength
== -1) {
1382 replacementLength
= u_strlen(replacementText
);
1385 // Copy input string from the end of previous match to start of current match
1386 if (regexp
->fText
!= NULL
) {
1388 int32_t lastMatchEnd
;
1389 if (UTEXT_USES_U16(m
->fInputText
)) {
1390 lastMatchEnd
= (int32_t)m
->fLastMatchEnd
;
1391 matchStart
= (int32_t)m
->fMatchStart
;
1393 // !!!: Would like a better way to do this!
1394 UErrorCode tempStatus
= U_ZERO_ERROR
;
1395 lastMatchEnd
= utext_extract(m
->fInputText
, 0, m
->fLastMatchEnd
, NULL
, 0, &tempStatus
);
1396 tempStatus
= U_ZERO_ERROR
;
1397 matchStart
= lastMatchEnd
+ utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
, NULL
, 0, &tempStatus
);
1399 for (i
=lastMatchEnd
; i
<matchStart
; i
++) {
1400 appendToBuf(regexp
->fText
[i
], &destIdx
, dest
, capacity
);
1403 UErrorCode possibleOverflowError
= U_ZERO_ERROR
; // ignore
1404 destIdx
+= utext_extract(m
->fInputText
, m
->fLastMatchEnd
, m
->fMatchStart
,
1405 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
),
1406 &possibleOverflowError
);
1408 U_ASSERT(destIdx
>= 0);
1410 // scan the replacement text, looking for substitutions ($n) and \escapes.
1411 int32_t replIdx
= 0;
1412 while (replIdx
< replacementLength
&& U_SUCCESS(*status
)) {
1413 UChar c
= replacementText
[replIdx
];
1415 if (c
!= DOLLARSIGN
&& c
!= BACKSLASH
) {
1416 // Common case, no substitution, no escaping,
1417 // just copy the char to the dest buf.
1418 appendToBuf(c
, &destIdx
, dest
, capacity
);
1422 if (c
== BACKSLASH
) {
1423 // Backslash Escape. Copy the following char out without further checks.
1424 // Note: Surrogate pairs don't need any special handling
1425 // The second half wont be a '$' or a '\', and
1426 // will move to the dest normally on the next
1428 if (replIdx
>= replacementLength
) {
1431 c
= replacementText
[replIdx
];
1433 if (c
==0x55/*U*/ || c
==0x75/*u*/) {
1434 // We have a \udddd or \Udddddddd escape sequence.
1435 UChar32 escapedChar
=
1436 u_unescapeAt(uregex_ucstr_unescape_charAt
,
1437 &replIdx
, // Index is updated by unescapeAt
1438 replacementLength
, // Length of replacement text
1439 (void *)replacementText
);
1441 if (escapedChar
!= (UChar32
)0xFFFFFFFF) {
1442 if (escapedChar
<= 0xffff) {
1443 appendToBuf((UChar
)escapedChar
, &destIdx
, dest
, capacity
);
1445 appendToBuf(U16_LEAD(escapedChar
), &destIdx
, dest
, capacity
);
1446 appendToBuf(U16_TRAIL(escapedChar
), &destIdx
, dest
, capacity
);
1450 // Note: if the \u escape was invalid, just fall through and
1451 // treat it as a plain \<anything> escape.
1454 // Plain backslash escape. Just put out the escaped character.
1455 appendToBuf(c
, &destIdx
, dest
, capacity
);
1461 // We've got a $. Pick up the following capture group name or number.
1462 // For numbers, consume only digits that produce a valid capture group for the pattern.
1464 int32_t groupNum
= 0;
1465 U_ASSERT(c
== DOLLARSIGN
);
1467 U16_GET(replacementText
, 0, replIdx
, replacementLength
, c32
);
1468 if (u_isdigit(c32
)) {
1469 int32_t numDigits
= 0;
1470 int32_t numCaptureGroups
= m
->fPattern
->fGroupMap
->size();
1472 if (replIdx
>= replacementLength
) {
1475 U16_GET(replacementText
, 0, replIdx
, replacementLength
, c32
);
1476 if (u_isdigit(c32
) == FALSE
) {
1480 int32_t digitVal
= u_charDigitValue(c32
);
1481 if (groupNum
* 10 + digitVal
<= numCaptureGroups
) {
1482 groupNum
= groupNum
* 10 + digitVal
;
1483 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1486 if (numDigits
== 0) {
1487 *status
= U_INDEX_OUTOFBOUNDS_ERROR
;
1492 } else if (c32
== LEFTBRACKET
) {
1493 // Scan for Named Capture Group, ${name}.
1494 UnicodeString groupName
;
1495 U16_FWD_1(replacementText
, replIdx
, replacementLength
);
1496 while (U_SUCCESS(*status
) && c32
!= RIGHTBRACKET
) {
1497 if (replIdx
>= replacementLength
) {
1498 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1501 U16_NEXT(replacementText
, replIdx
, replacementLength
, c32
);
1502 if ((c32
>= 0x41 && c32
<= 0x5a) || // A..Z
1503 (c32
>= 0x61 && c32
<= 0x7a) || // a..z
1504 (c32
>= 0x31 && c32
<= 0x39)) { // 0..9
1505 groupName
.append(c32
);
1506 } else if (c32
== RIGHTBRACKET
) {
1507 groupNum
= uhash_geti(regexp
->fPat
->fNamedCaptureMap
, &groupName
);
1508 if (groupNum
== 0) {
1509 // Name not defined by pattern.
1510 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1513 // Character was something other than a name char or a closing '}'
1514 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1518 // $ not followed by {name} or digits.
1519 *status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
1523 // Finally, append the capture group data to the destination.
1524 if (U_SUCCESS(*status
)) {
1525 destIdx
+= uregex_group((URegularExpression
*)regexp
, groupNum
,
1526 dest
==NULL
?NULL
:&dest
[destIdx
], REMAINING_CAPACITY(destIdx
, capacity
), status
);
1527 if (*status
== U_BUFFER_OVERFLOW_ERROR
) {
1528 // Ignore buffer overflow when extracting the group. We need to
1529 // continue on to get full size of the untruncated result. We will
1530 // raise our own buffer overflow error at the end.
1531 *status
= U_ZERO_ERROR
;
1535 if (U_FAILURE(*status
)) {
1536 // bad group number or name.
1542 // Nul Terminate the dest buffer if possible.
1543 // Set the appropriate buffer overflow or not terminated error, if needed.
1545 if (destIdx
< capacity
) {
1547 } else if (U_SUCCESS(*status
)) {
1548 if (destIdx
== *destCapacity
) {
1549 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1551 *status
= U_BUFFER_OVERFLOW_ERROR
;
1556 // Return an updated dest buffer and capacity to the caller.
1558 if (destIdx
> 0 && *destCapacity
> 0) {
1559 if (destIdx
< capacity
) {
1560 *destBuf
+= destIdx
;
1561 *destCapacity
-= destIdx
;
1563 *destBuf
+= capacity
;
1568 // If we came in with a buffer overflow, make sure we go out with one also.
1569 // (A zero length match right at the end of the previous match could
1570 // make this function succeed even though a previous call had overflowed the buf)
1571 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1572 *status
= U_BUFFER_OVERFLOW_ERROR
;
1579 // appendReplacement the actual API function,
1581 U_CAPI
int32_t U_EXPORT2
1582 uregex_appendReplacement(URegularExpression
*regexp2
,
1583 const UChar
*replacementText
,
1584 int32_t replacementLength
,
1586 int32_t *destCapacity
,
1587 UErrorCode
*status
) {
1589 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1590 return RegexCImpl::appendReplacement(
1591 regexp
, replacementText
, replacementLength
,destBuf
, destCapacity
, status
);
1595 // uregex_appendReplacementUText...can just use the normal C++ method
1597 U_CAPI
void U_EXPORT2
1598 uregex_appendReplacementUText(URegularExpression
*regexp2
,
1601 UErrorCode
*status
) {
1602 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1603 regexp
->fMatcher
->appendReplacement(dest
, replText
, *status
);
1607 //------------------------------------------------------------------------------
1609 // uregex_appendTail
1611 //------------------------------------------------------------------------------
// Implementation of appendTail for the UChar-buffer API.
//   regexp        the regular expression; validated with requiresText = TRUE
//   destCapacity  in/out: remaining capacity of the caller's buffer
//   status        ICU error code
// The source position is the end of the current match when the last find
// succeeded (m->fMatch), otherwise the end of the last successful match.
// NOTE(review): several lines of this function (among them the destBuf
// parameter, the copy loop header, some else branches and the return) are
// not visible in this view; comments below describe only the visible code.
1612 int32_t RegexCImpl::appendTail(RegularExpression
*regexp
,
1614 int32_t *destCapacity
,
1618 // If we come in with a buffer overflow error, don't suppress the operation.
1619 // A series of appendReplacements, appendTail need to correctly preflight
1620 // the buffer size when an overflow happens somewhere in the middle.
1621 UBool pendingBufferOverflow
= FALSE
;
1622 if (*status
== U_BUFFER_OVERFLOW_ERROR
&& destCapacity
!= NULL
&& *destCapacity
== 0) {
1623 pendingBufferOverflow
= TRUE
;
1624 *status
= U_ZERO_ERROR
;
1627 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
// Reject NULL in/out pointers and a NULL buffer with non-zero capacity.
1631 if (destCapacity
== NULL
|| destBuf
== NULL
||
1632 (*destBuf
== NULL
&& *destCapacity
> 0) ||
1635 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1639 RegexMatcher
*m
= regexp
->fMatcher
;
// Work on local copies; the caller's pointer and capacity are updated at the end.
1641 int32_t destIdx
= 0;
1642 int32_t destCap
= *destCapacity
;
1643 UChar
*dest
= *destBuf
;
// Path 1: the input was supplied as a UChar string (fText is set).
1645 if (regexp
->fText
!= NULL
) {
1647 int64_t nativeIdx
= (m
->fMatch
? m
->fMatchEnd
: m
->fLastMatchEnd
);
1648 if (nativeIdx
== -1) {
1650 } else if (UTEXT_USES_U16(m
->fInputText
)) {
1651 srcIdx
= (int32_t)nativeIdx
;
// NOTE(review): this local 'status' shadows the function's UErrorCode *status
// parameter; the preflighting utext_extract() below reports into the local only.
1653 UErrorCode status
= U_ZERO_ERROR
;
1654 srcIdx
= utext_extract(m
->fInputText
, 0, nativeIdx
, NULL
, 0, &status
);
1658 U_ASSERT(destIdx
>= 0);
1660 if (srcIdx
== regexp
->fTextLength
) {
1663 UChar c
= regexp
->fText
[srcIdx
];
// A NUL found while the length is still unknown (-1) fixes the text length.
1664 if (c
== 0 && regexp
->fTextLength
== -1) {
1665 regexp
->fTextLength
= srcIdx
;
1669 if (destIdx
< destCap
) {
1672 // We've overflowed the dest buffer.
1673 // If the total input string length is known, we can
1674 // compute the total buffer size needed without scanning through the string.
1675 if (regexp
->fTextLength
> 0) {
1676 destIdx
+= (regexp
->fTextLength
- srcIdx
);
// Path 2 (presumably the else branch for UText input -- confirm): pick the
// native source index and let utext_extract() do the copy.
1686 // The most recent call to find() succeeded.
1687 srcIdx
= m
->fMatchEnd
;
1689 // The last call to find() on this matcher failed().
1690 // Look back to the end of the last find() that succeeded for src index.
1691 srcIdx
= m
->fLastMatchEnd
;
1693 // There has been no successful match with this matcher.
1694 // We want to copy the whole string.
1699 destIdx
= utext_extract(m
->fInputText
, srcIdx
, m
->fInputLength
, dest
, destCap
, status
);
1703 // NUL terminate the output string, if possible, otherwise issue the
1704 // appropriate error or warning.
1706 if (destIdx
< destCap
) {
1708 } else if (destIdx
== destCap
) {
1709 *status
= U_STRING_NOT_TERMINATED_WARNING
;
1711 *status
= U_BUFFER_OVERFLOW_ERROR
;
1715 // Update the user's buffer ptr and capacity vars to reflect the
1718 if (destIdx
< destCap
) {
1719 *destBuf
+= destIdx
;
1720 *destCapacity
-= destIdx
;
1721 } else if (*destBuf
!= NULL
) {
1722 *destBuf
+= destCap
;
// Re-raise the overflow that was cleared on entry if nothing else failed.
1726 if (pendingBufferOverflow
&& U_SUCCESS(*status
)) {
1727 *status
= U_BUFFER_OVERFLOW_ERROR
;
1735 // appendTail the actual API function
1737 U_CAPI
int32_t U_EXPORT2
1738 uregex_appendTail(URegularExpression
*regexp2
,
1740 int32_t *destCapacity
,
1741 UErrorCode
*status
) {
1742 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1743 return RegexCImpl::appendTail(regexp
, destBuf
, destCapacity
, status
);
1748 // uregex_appendTailUText...can just use the normal C++ method
1750 U_CAPI UText
* U_EXPORT2
1751 uregex_appendTailUText(URegularExpression
*regexp2
,
1753 UErrorCode
*status
) {
1754 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1755 return regexp
->fMatcher
->appendTail(dest
, *status
);
1759 //------------------------------------------------------------------------------
1761 // copyString Internal utility to copy a string to an output buffer,
1762 // while managing buffer overflow and preflight size
1763 // computation. NUL termination is added to destination,
1764 // and the NUL is counted in the output size.
1766 //------------------------------------------------------------------------------
// File-local helper; see the parameter comments on each line below.
// NOTE(review): the statements inside the loop and after it (the stores into
// destBuffer and the write-back of *destIndex) are not visible in this view.
1768 static void copyString(UChar
*destBuffer
, // Destination buffer.
1769 int32_t destCapacity
, // Total capacity of dest buffer
1770 int32_t *destIndex
, // Index into dest buffer. Updated on return.
1771 // Update not clipped to destCapacity.
1772 const UChar
*srcPtr
, // Pointer to source string
1773 int32_t srcLen
) // Source string len.
// di is the working copy of the caller's destination index.
1776 int32_t di
= *destIndex
;
// Visit each of the srcLen source code units.
1779 for (si
=0; si
<srcLen
; si
++) {
// Bounds test against the destination capacity.
1781 if (di
< destCapacity
) {
// Bounds test after the loop -- presumably guards the terminating NUL
// described in the header comment; confirm against the full file.
1789 if (di
<destCapacity
) {
1797 //------------------------------------------------------------------------------
1801 //------------------------------------------------------------------------------
// Implementation of split for the UChar-buffer API: the input text is cut at
// each match of the pattern, the pieces (and the delimiter's capture groups)
// are written NUL-separated into destBuf, and destFields[] receives a pointer
// to the start of each piece.
//   regexp              the regular expression
//   destCapacity        capacity of destBuf in UChars
//   requiredCapacity    optional out: receives destIdx, the total space used/needed
//   destFields          out: array of pointers into destBuf, one per field
//   destFieldsCapacity  number of slots in destFields
//   status              ICU error code
// NOTE(review): several lines of this function (among them the destBuf
// parameter, the main loop header, some else branches and the return) are
// not visible in this view; comments below describe only the visible code.
1802 int32_t RegexCImpl::split(RegularExpression
*regexp
,
1804 int32_t destCapacity
,
1805 int32_t *requiredCapacity
,
1806 UChar
*destFields
[],
1807 int32_t destFieldsCapacity
,
1808 UErrorCode
*status
) {
1810 // Reset for the input text
1812 regexp
->fMatcher
->reset();
1813 UText
*inputText
= regexp
->fMatcher
->fInputText
;
// Native index in the input where the next output field begins.
1814 int64_t nextOutputStringStart
= 0;
1815 int64_t inputLen
= regexp
->fMatcher
->fInputLength
;
1816 if (inputLen
== 0) {
1821 // Loop through the input text, searching for the delimiter pattern
1823 int32_t i
; // Index of the field being processed.
1824 int32_t destIdx
= 0; // Next available position in destBuf;
1825 int32_t numCaptureGroups
= regexp
->fMatcher
->groupCount();
1826 UErrorCode tStatus
= U_ZERO_ERROR
; // Want to ignore any buffer overflow errors so that the strings are still counted
1828 if (i
>=destFieldsCapacity
-1) {
1829 // There are one or zero output strings left.
1830 // Fill the last output string with whatever is left from the input, then exit the loop.
1831 // ( i will be == destFieldsCapacity if we filled the output array while processing
1832 // capture groups of the delimiter expression, in which case we will discard the
1833 // last capture group saved in favor of the unprocessed remainder of the
1835 if (inputLen
> nextOutputStringStart
) {
1836 if (i
!= destFieldsCapacity
-1) {
1837 // No fields are left. Recycle the last one for holding the trailing part of
1838 // the input string.
1839 i
= destFieldsCapacity
-1;
1840 destIdx
= (int32_t)(destFields
[i
] - destFields
[0]);
1843 destFields
[i
] = &destBuf
[destIdx
];
// Unlike the per-delimiter extraction further down, this extraction reports
// into the caller's status rather than tStatus.
1844 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1845 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1850 if (regexp
->fMatcher
->find()) {
1851 // We found another delimiter. Move everything from where we started looking
1852 // up until the start of the delimiter into the next output string.
1853 destFields
[i
] = &destBuf
[destIdx
];
// The "1 +" accounts for the NUL that separates fields in destBuf.
1855 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, regexp
->fMatcher
->fMatchStart
,
1856 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), &tStatus
);
1857 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1858 tStatus
= U_ZERO_ERROR
;
// The next field starts where this delimiter match ended.
1862 nextOutputStringStart
= regexp
->fMatcher
->fMatchEnd
;
1864 // If the delimiter pattern has capturing parentheses, the captured
1865 // text goes out into the next n destination strings.
1867 for (groupNum
=1; groupNum
<=numCaptureGroups
; groupNum
++) {
1868 // If we've run out of output string slots, bail out.
1869 if (i
==destFieldsCapacity
-1) {
1874 // Set up to extract the capture group contents into the dest buffer.
1875 destFields
[i
] = &destBuf
[destIdx
];
1876 tStatus
= U_ZERO_ERROR
;
1877 int32_t t
= uregex_group((URegularExpression
*)regexp
,
1880 REMAINING_CAPACITY(destIdx
, destCapacity
),
1882 destIdx
+= t
+ 1; // Record the space used in the output string buffer.
1883 // +1 for the NUL that terminates the string.
1884 if (tStatus
== U_BUFFER_OVERFLOW_ERROR
) {
1885 tStatus
= U_ZERO_ERROR
;
1891 if (nextOutputStringStart
== inputLen
) {
1892 // The delimiter was at the end of the string.
1893 // Output an empty string, and then we are done.
1894 if (destIdx
< destCapacity
) {
1895 destBuf
[destIdx
] = 0;
1897 if (i
< destFieldsCapacity
-1) {
1900 if (destIdx
< destCapacity
) {
1901 destFields
[i
] = destBuf
+ destIdx
;
1910 // We ran off the end of the input while looking for the next delimiter.
1911 // All the remaining text goes into the current output string.
1912 destFields
[i
] = &destBuf
[destIdx
];
// As with the earlier "last field" case, this extraction reports into the
// caller's status rather than tStatus.
1913 destIdx
+= 1 + utext_extract(inputText
, nextOutputStringStart
, inputLen
,
1914 &destBuf
[destIdx
], REMAINING_CAPACITY(destIdx
, destCapacity
), status
);
1919 // Zero out any unused portion of the destFields array
1921 for (j
=i
+1; j
<destFieldsCapacity
; j
++) {
1922 destFields
[j
] = NULL
;
// Report the total space used or needed, when the caller asked for it.
1925 if (requiredCapacity
!= NULL
) {
1926 *requiredCapacity
= destIdx
;
// More space was needed than the caller supplied.
1928 if (destIdx
> destCapacity
) {
1929 *status
= U_BUFFER_OVERFLOW_ERROR
;
1935 // uregex_split The actual API function
1937 U_CAPI
int32_t U_EXPORT2
1938 uregex_split(URegularExpression
*regexp2
,
1940 int32_t destCapacity
,
1941 int32_t *requiredCapacity
,
1942 UChar
*destFields
[],
1943 int32_t destFieldsCapacity
,
1944 UErrorCode
*status
) {
1945 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1946 if (validateRE(regexp
, TRUE
, status
) == FALSE
) {
1949 if ((destBuf
== NULL
&& destCapacity
> 0) ||
1951 destFields
== NULL
||
1952 destFieldsCapacity
< 1 ) {
1953 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
1957 return RegexCImpl::split(regexp
, destBuf
, destCapacity
, requiredCapacity
, destFields
, destFieldsCapacity
, status
);
1962 // uregex_splitUText...can just use the normal C++ method
1964 U_CAPI
int32_t U_EXPORT2
1965 uregex_splitUText(URegularExpression
*regexp2
,
1966 UText
*destFields
[],
1967 int32_t destFieldsCapacity
,
1968 UErrorCode
*status
) {
1969 RegularExpression
*regexp
= (RegularExpression
*)regexp2
;
1970 return regexp
->fMatcher
->split(regexp
->fMatcher
->inputText(), destFields
, destFieldsCapacity
, *status
);
1974 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS