X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..2ca993e82fb37b597a3c73ecd1586a139a6579c5:/icuSources/i18n/uregex.cpp diff --git a/icuSources/i18n/uregex.cpp b/icuSources/i18n/uregex.cpp index c2249eb1..99e94283 100644 --- a/icuSources/i18n/uregex.cpp +++ b/icuSources/i18n/uregex.cpp @@ -1,9 +1,9 @@ /* ******************************************************************************* -* Copyright (C) 2004-2010, International Business Machines +* Copyright (C) 2004-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* -* file name: regex.cpp +* file name: uregex.cpp */ #include "unicode/utypes.h" @@ -16,14 +16,15 @@ #include "unicode/ustring.h" #include "unicode/uchar.h" #include "unicode/uobject.h" -#include "umutex.h" -#include "uassert.h" +#include "unicode/utf16.h" #include "cmemory.h" +#include "uassert.h" +#include "uhash.h" +#include "umutex.h" +#include "uvectr32.h" #include "regextxt.h" -#include - U_NAMESPACE_BEGIN #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) @@ -34,7 +35,7 @@ public: ~RegularExpression(); int32_t fMagic; RegexPattern *fPat; - int32_t *fPatRefCount; + u_atomic_int32_t *fPatRefCount; UChar *fPatString; int32_t fPatStringLen; RegexMatcher *fMatcher; @@ -64,7 +65,7 @@ RegularExpression::~RegularExpression() { if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { delete fPat; uprv_free(fPatString); - uprv_free(fPatRefCount); + uprv_free((void *)fPatRefCount); } if (fOwnsText && fText!=NULL) { uprv_free((void *)fText); @@ -81,7 +82,7 @@ U_NAMESPACE_USE // validateRE Do boilerplate style checks on API function parameters. // Return TRUE if they look OK. //---------------------------------------------------------------------------------------- -static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { +static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { if (U_FAILURE(*status)) { return FALSE; } @@ -121,13 +122,13 @@ uregex_open( const UChar *pattern, actualPatLen = u_strlen(pattern); } - RegularExpression *re = new RegularExpression; - int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); + RegularExpression *re = new RegularExpression; + u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); if (re == NULL || refC == NULL || patBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; - uprv_free(refC); + uprv_free((void *)refC); uprv_free(patBuf); return NULL; } @@ -143,7 +144,7 @@ uregex_open( const UChar *pattern, re->fPatStringLen = patternLength; u_memcpy(patBuf, pattern, actualPatLen); patBuf[actualPatLen] = 0; - + UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, patternLength, status); @@ -156,7 +157,7 @@ uregex_open( const UChar *pattern, re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); - + if (U_FAILURE(*status)) { goto ErrorExit; } @@ -185,7 +186,7 @@ uregex_openUText(UText *pattern, uint32_t flags, UParseError *pe, UErrorCode *status) { - + if (U_FAILURE(*status)) { return NULL; } @@ -193,31 +194,31 @@ uregex_openUText(UText *pattern, *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } - + int64_t patternNativeLength = utext_nativeLength(pattern); - + if (patternNativeLength == 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return NULL; } - + RegularExpression *re = new RegularExpression; - + UErrorCode lengthStatus = U_ZERO_ERROR; int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); - - int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); + + u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); if (re == NULL || refC == NULL || patBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; - uprv_free(refC); + uprv_free((void *)refC); uprv_free(patBuf); return NULL; } re->fPatRefCount = refC; *re->fPatRefCount = 1; - + // // Make a copy of the pattern string, so we can return it later if asked. // For compiling the pattern, we will use a read-only UText wrapper @@ -226,10 +227,10 @@ uregex_openUText(UText *pattern, re->fPatString = patBuf; re->fPatStringLen = pattern16Length; utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); - + UText patText = UTEXT_INITIALIZER; utext_openUChars(&patText, patBuf, pattern16Length, status); - + // // Compile the pattern // @@ -239,11 +240,11 @@ uregex_openUText(UText *pattern, re->fPat = RegexPattern::compile(&patText, flags, *status); } utext_close(&patText); - + if (U_FAILURE(*status)) { goto ErrorExit; } - + // // Create the matcher object // @@ -251,11 +252,11 @@ uregex_openUText(UText *pattern, if (U_SUCCESS(*status)) { return (URegularExpression*)re; } - + ErrorExit: delete re; return NULL; - + } //---------------------------------------------------------------------------------------- @@ -267,7 +268,7 @@ U_CAPI void U_EXPORT2 uregex_close(URegularExpression *re2) { RegularExpression *re = (RegularExpression*)re2; UErrorCode status = U_ZERO_ERROR; - if (validateRE(re, &status, FALSE) == FALSE) { + if (validateRE(re, FALSE, &status) == FALSE) { return; } delete re; @@ -279,10 +280,10 @@ uregex_close(URegularExpression *re2) { // uregex_clone // //---------------------------------------------------------------------------------------- -U_CAPI URegularExpression * U_EXPORT2 +U_CAPI URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *source2, UErrorCode *status) { RegularExpression *source = (RegularExpression*)source2; - if (validateRE(source, status, FALSE) == FALSE) { + if (validateRE(source, FALSE, status) == FALSE) { return NULL; } @@ -299,7 +300,7 @@ uregex_clone(const URegularExpression *source2, UErrorCode *status) { } clone->fPat = source->fPat; - clone->fPatRefCount = source->fPatRefCount; + clone->fPatRefCount = source->fPatRefCount; clone->fPatString = source->fPatString; clone->fPatStringLen = source->fPatStringLen; umtx_atomic_inc(source->fPatRefCount); @@ -316,13 +317,13 @@ uregex_clone(const URegularExpression *source2, UErrorCode *status) { // uregex_pattern // //------------------------------------------------------------------------------ -U_CAPI const UChar * U_EXPORT2 +U_CAPI const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp2, int32_t *patLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - - if (validateRE(regexp, status, FALSE) == FALSE) { + + if (validateRE(regexp, FALSE, status) == FALSE) { return NULL; } if (patLength != NULL) { @@ -350,10 +351,10 @@ uregex_patternUText(const URegularExpression *regexp2, // uregex_flags // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return 0; } int32_t flags = regexp->fPat->flags(); @@ -366,28 +367,28 @@ uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { // uregex_setText // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setText(URegularExpression *regexp2, const UChar *text, int32_t textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return; } if (text == NULL || textLength < -1) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } - + if (regexp->fOwnsText && regexp->fText != NULL) { uprv_free((void *)regexp->fText); } - + regexp->fText = text; regexp->fTextLength = textLength; regexp->fOwnsText = FALSE; - + UText input = UTEXT_INITIALIZER; utext_openUChars(&input, text, textLength, status); regexp->fMatcher->reset(&input); @@ -400,23 +401,23 @@ uregex_setText(URegularExpression *regexp2, // uregex_setUText // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setUText(URegularExpression *regexp2, UText *text, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return; } if (text == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } - + if (regexp->fOwnsText && regexp->fText != NULL) { uprv_free((void *)regexp->fText); } - + regexp->fText = NULL; // only fill it in on request regexp->fTextLength = -1; regexp->fOwnsText = TRUE; @@ -430,15 +431,15 @@ uregex_setUText(URegularExpression *regexp2, // uregex_getText // //------------------------------------------------------------------------------ -U_CAPI const UChar * U_EXPORT2 +U_CAPI const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp2, int32_t *textLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return NULL; } - + if (regexp->fText == NULL) { // need to fill in the text UText *inputText = regexp->fMatcher->inputText(); @@ -451,13 +452,13 @@ uregex_getText(URegularExpression *regexp2, UErrorCode lengthStatus = U_ZERO_ERROR; regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); - + utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); regexp->fText = inputChars; regexp->fOwnsText = TRUE; // should already be set but just in case } } - + if (textLength != NULL) { *textLength = regexp->fTextLength; } @@ -470,37 +471,54 @@ uregex_getText(URegularExpression *regexp2, // uregex_getUText // //------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 +U_CAPI UText * U_EXPORT2 uregex_getUText(URegularExpression *regexp2, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return dest; } return regexp->fMatcher->getInput(dest, *status); } +//------------------------------------------------------------------------------ +// +// uregex_refreshUText +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_refreshUText(URegularExpression *regexp2, + UText *text, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return; + } + regexp->fMatcher->refreshInputText(text, *status); +} + + //------------------------------------------------------------------------------ // // uregex_matches // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_matches(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { return uregex_matches64( regexp2, (int64_t)startIndex, status); } -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_matches64(URegularExpression *regexp2, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return result; } if (startIndex == -1) { @@ -517,20 +535,20 @@ uregex_matches64(URegularExpression *regexp2, // uregex_lookingAt // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_lookingAt(URegularExpression *regexp2, int32_t startIndex, UErrorCode *status) { return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); } -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_lookingAt64(URegularExpression *regexp2, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return result; } if (startIndex == -1) { @@ -548,25 +566,25 @@ uregex_lookingAt64(URegularExpression *regexp2, // uregex_find // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_find(URegularExpression *regexp2, - int32_t startIndex, + int32_t startIndex, UErrorCode *status) { return uregex_find64( regexp2, (int64_t)startIndex, status); } -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_find64(URegularExpression *regexp2, - int64_t startIndex, + int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; UBool result = FALSE; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return result; } if (startIndex == -1) { regexp->fMatcher->resetPreserveRegion(); - result = regexp->fMatcher->find(); + result = regexp->fMatcher->find(*status); } else { result = regexp->fMatcher->find(startIndex, *status); } @@ -579,14 +597,14 @@ uregex_find64(URegularExpression *regexp2, // uregex_findNext // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_findNext(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return FALSE; } - UBool result = regexp->fMatcher->find(); + UBool result = regexp->fMatcher->find(*status); return result; } @@ -595,11 +613,11 @@ uregex_findNext(URegularExpression *regexp2, // uregex_groupCount // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_groupCount(URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status, FALSE) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return 0; } int32_t result = regexp->fMatcher->groupCount(); @@ -607,30 +625,60 @@ uregex_groupCount(URegularExpression *regexp2, } +//------------------------------------------------------------------------------ +// +// uregex_groupNumberFromName +// +//------------------------------------------------------------------------------ +int32_t +uregex_groupNumberFromName(URegularExpression *regexp2, + const UChar *groupName, + int32_t nameLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return 0; + } + int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); + return result; +} + +int32_t +uregex_groupNumberFromCName(URegularExpression *regexp2, + const char *groupName, + int32_t nameLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return 0; + } + return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); +} + //------------------------------------------------------------------------------ // // uregex_group // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_group(URegularExpression *regexp2, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - + if (destCapacity == 0 || regexp->fText != NULL) { // If preflighting or if we already have the text as UChars, - // this is a little cheaper than going through uregex_groupUTextDeep() - + // this is a little cheaper than extracting from the UText + // // Pick up the range of characters from the matcher // @@ -642,7 +690,7 @@ uregex_group(URegularExpression *regexp2, // // Trim length based on buffer capacity - // + // int32_t fullLength = endIx - startIx; int32_t copyLength = fullLength; if (copyLength < destCapacity) { @@ -653,7 +701,7 @@ uregex_group(URegularExpression *regexp2, copyLength = destCapacity; *status = U_BUFFER_OVERFLOW_ERROR; } - + // // Copy capture group to user's buffer // @@ -662,11 +710,18 @@ uregex_group(URegularExpression *regexp2, } return fullLength; } else { - UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status); - int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status); - utext_close(groupText); - return result; + int64_t start = regexp->fMatcher->start64(groupNum, *status); + int64_t limit = regexp->fMatcher->end64(groupNum, *status); + if (U_FAILURE(*status)) { + return 0; + } + // Note edge cases: + // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. + // Zero Length Match: start == end. + int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); + return length; } + } @@ -675,14 +730,14 @@ uregex_group(URegularExpression *regexp2, // uregex_groupUText // //------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 +U_CAPI UText * U_EXPORT2 uregex_groupUText(URegularExpression *regexp2, int32_t groupNum, UText *dest, int64_t *groupLength, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { UErrorCode emptyTextStatus = U_ZERO_ERROR; return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); } @@ -690,67 +745,24 @@ uregex_groupUText(URegularExpression *regexp2, return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); } -//------------------------------------------------------------------------------ -// -// uregex_groupUTextDeep -// -//------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 -uregex_groupUTextDeep(URegularExpression *regexp2, - int32_t groupNum, - UText *dest, - UErrorCode *status) { - RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { - UErrorCode emptyTextStatus = U_ZERO_ERROR; - return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); - } - - if (regexp->fText != NULL) { - // - // Pick up the range of characters from the matcher - // and use our already-extracted characters - // - int32_t startIx = regexp->fMatcher->start(groupNum, *status); - int32_t endIx = regexp->fMatcher->end (groupNum, *status); - if (U_FAILURE(*status)) { - UErrorCode emptyTextStatus = U_ZERO_ERROR; - return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); - } - - if (dest) { - utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[startIx], endIx - startIx, status); - } else { - UText groupText = UTEXT_INITIALIZER; - utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startIx, status); - dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); - utext_close(&groupText); - } - - return dest; - } else { - return regexp->fMatcher->group(groupNum, dest, *status); - } -} - //------------------------------------------------------------------------------ // // uregex_start // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_start(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { return (int32_t)uregex_start64( regexp2, groupNum, status); } -U_CAPI int64_t U_EXPORT2 +U_CAPI int64_t U_EXPORT2 uregex_start64(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } int32_t result = regexp->fMatcher->start(groupNum, *status); @@ -762,19 +774,19 @@ uregex_start64(URegularExpression *regexp2, // uregex_end // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_end(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { return (int32_t)uregex_end64( regexp2, groupNum, status); } -U_CAPI int64_t U_EXPORT2 +U_CAPI int64_t U_EXPORT2 uregex_end64(URegularExpression *regexp2, int32_t groupNum, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } int32_t result = regexp->fMatcher->end(groupNum, *status); @@ -786,19 +798,19 @@ uregex_end64(URegularExpression *regexp2, // uregex_reset // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_reset(URegularExpression *regexp2, int32_t index, UErrorCode *status) { uregex_reset64( regexp2, (int64_t)index, status); } -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_reset64(URegularExpression *regexp2, int64_t index, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return; } regexp->fMatcher->reset(index, *status); @@ -810,7 +822,7 @@ uregex_reset64(URegularExpression *regexp2, // uregex_setRegion // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setRegion(URegularExpression *regexp2, int32_t regionStart, int32_t regionLimit, @@ -818,13 +830,13 @@ uregex_setRegion(URegularExpression *regexp2, uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); } -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setRegion64(URegularExpression *regexp2, int64_t regionStart, int64_t regionLimit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return; } regexp->fMatcher->region(regionStart, regionLimit, *status); @@ -836,14 +848,14 @@ uregex_setRegion64(URegularExpression *regexp2, // uregex_setRegionAndStart // //------------------------------------------------------------------------------ -U_DRAFT void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setRegionAndStart(URegularExpression *regexp2, int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return; } regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); @@ -854,17 +866,17 @@ uregex_setRegionAndStart(URegularExpression *regexp2, // uregex_regionStart // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_regionStart(const URegularExpression *regexp2, UErrorCode *status) { return (int32_t)uregex_regionStart64(regexp2, status); } -U_CAPI int64_t U_EXPORT2 +U_CAPI int64_t U_EXPORT2 uregex_regionStart64(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } return regexp->fMatcher->regionStart(); @@ -876,17 +888,17 @@ uregex_regionStart64(const URegularExpression *regexp2, // uregex_regionEnd // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_regionEnd(const URegularExpression *regexp2, UErrorCode *status) { return (int32_t)uregex_regionEnd64(regexp2, status); } -U_CAPI int64_t U_EXPORT2 +U_CAPI int64_t U_EXPORT2 uregex_regionEnd64(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } return regexp->fMatcher->regionEnd(); @@ -898,11 +910,11 @@ uregex_regionEnd64(const URegularExpression *regexp2, // uregex_hasTransparentBounds // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_hasTransparentBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return FALSE; } return regexp->fMatcher->hasTransparentBounds(); @@ -914,12 +926,12 @@ uregex_hasTransparentBounds(const URegularExpression *regexp2, // uregex_useTransparentBounds // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_useTransparentBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return; } regexp->fMatcher->useTransparentBounds(b); @@ -931,11 +943,11 @@ uregex_useTransparentBounds(URegularExpression *regexp2, // uregex_hasAnchoringBounds // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_hasAnchoringBounds(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return FALSE; } return regexp->fMatcher->hasAnchoringBounds(); @@ -947,12 +959,12 @@ uregex_hasAnchoringBounds(const URegularExpression *regexp2, // uregex_useAnchoringBounds // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_useAnchoringBounds(URegularExpression *regexp2, UBool b, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, FALSE, status) == FALSE) { return; } regexp->fMatcher->useAnchoringBounds(b); @@ -964,11 +976,11 @@ uregex_useAnchoringBounds(URegularExpression *regexp2, // uregex_hitEnd // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_hitEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return FALSE; } return regexp->fMatcher->hitEnd(); @@ -980,11 +992,11 @@ uregex_hitEnd(const URegularExpression *regexp2, // uregex_requireEnd // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 uregex_requireEnd(const URegularExpression *regexp2, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return FALSE; } return regexp->fMatcher->requireEnd(); @@ -996,12 +1008,12 @@ uregex_requireEnd(const URegularExpression *regexp2, // uregex_setTimeLimit // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setTimeLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->setTimeLimit(limit, *status); } } @@ -1013,12 +1025,12 @@ uregex_setTimeLimit(URegularExpression *regexp2, // uregex_getTimeLimit // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_getTimeLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { retVal = regexp->fMatcher->getTimeLimit(); } return retVal; @@ -1031,12 +1043,12 @@ uregex_getTimeLimit(const URegularExpression *regexp2, // uregex_setStackLimit // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_setStackLimit(URegularExpression *regexp2, int32_t limit, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->setStackLimit(limit, *status); } } @@ -1048,12 +1060,12 @@ uregex_setStackLimit(URegularExpression *regexp2, // uregex_getStackLimit // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_getStackLimit(const URegularExpression *regexp2, UErrorCode *status) { int32_t retVal = 0; RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { retVal = regexp->fMatcher->getStackLimit(); } return retVal; @@ -1071,7 +1083,7 @@ uregex_setMatchCallback(URegularExpression *regexp2, const void *context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->setMatchCallback(callback, context, *status); } } @@ -1082,13 +1094,13 @@ uregex_setMatchCallback(URegularExpression *regexp2, // uregex_getMatchCallback // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_getMatchCallback(const URegularExpression *regexp2, URegexMatchCallback **callback, const void **context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->getMatchCallback(*callback, *context, *status); } } @@ -1105,7 +1117,7 @@ uregex_setFindProgressCallback(URegularExpression *regexp2, const void *context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->setFindProgressCallback(callback, context, *status); } } @@ -1116,13 +1128,13 @@ uregex_setFindProgressCallback(URegularExpression *regexp2, // uregex_getMatchCallback // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_getFindProgressCallback(const URegularExpression *regexp2, URegexFindProgressCallback **callback, const void **context, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status)) { + if (validateRE(regexp, FALSE, status)) { regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); } } @@ -1133,7 +1145,7 @@ uregex_getFindProgressCallback(const URegularExpression *regexp2, // uregex_replaceAll // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_replaceAll(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, @@ -1141,7 +1153,7 @@ uregex_replaceAll(URegularExpression *regexp2, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || @@ -1166,7 +1178,7 @@ uregex_replaceAll(URegularExpression *regexp2, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); - + if (U_FAILURE(findStatus)) { // If anything went wrong with the findNext(), make that error trump // whatever may have happened with the append() operations. @@ -1183,31 +1195,31 @@ uregex_replaceAll(URegularExpression *regexp2, // uregex_replaceAllUText // //------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 +U_CAPI UText * U_EXPORT2 uregex_replaceAllUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - + dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); return dest; } - + //------------------------------------------------------------------------------ // // uregex_replaceFirst // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_replaceFirst(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, @@ -1215,7 +1227,7 @@ uregex_replaceFirst(URegularExpression *regexp2, int32_t destCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || @@ -1230,7 +1242,7 @@ uregex_replaceFirst(URegularExpression *regexp2, uregex_reset(regexp2, 0, status); findSucceeded = uregex_find(regexp2, 0, status); if (findSucceeded) { - len = uregex_appendReplacement(regexp2, replacementText, replacementLength, + len = uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); @@ -1244,20 +1256,20 @@ uregex_replaceFirst(URegularExpression *regexp2, // uregex_replaceFirstUText // //------------------------------------------------------------------------------ -U_CAPI UText * U_EXPORT2 +U_CAPI UText * U_EXPORT2 uregex_replaceFirstUText(URegularExpression *regexp2, UText *replacementText, UText *dest, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - + dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); return dest; } @@ -1287,7 +1299,7 @@ class RegexCImpl { UChar **destBuf, int32_t *destCapacity, UErrorCode *status); - + inline static int32_t split(RegularExpression *regexp, UChar *destBuf, int32_t destCapacity, @@ -1303,6 +1315,8 @@ U_NAMESPACE_END static const UChar BACKSLASH = 0x5c; static const UChar DOLLARSIGN = 0x24; +static const UChar LEFTBRACKET = 0x7b; +static const UChar RIGHTBRACKET = 0x7d; // // Move a character to an output buffer, with bounds checking on the index. @@ -1339,11 +1353,11 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, // // Validate all paramters // - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || - destCapacity == NULL || destBuf == NULL || + destCapacity == NULL || destBuf == NULL || (*destBuf == NULL && *destCapacity > 0) || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -1360,7 +1374,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, int32_t capacity = *destCapacity; int32_t destIdx = 0; int32_t i; - + // If it wasn't supplied by the caller, get the length of the replacement text. // TODO: slightly smarter logic in the copy loop could watch for the NUL on // the fly and avoid this step. @@ -1377,28 +1391,29 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, matchStart = (int32_t)m->fMatchStart; } else { // !!!: Would like a better way to do this! - UErrorCode status = U_ZERO_ERROR; - lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status); - status = U_ZERO_ERROR; - matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status); + UErrorCode tempStatus = U_ZERO_ERROR; + lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus); + tempStatus = U_ZERO_ERROR; + matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus); } for (i=lastMatchEnd; ifText[i], &destIdx, dest, capacity); - } + } } else { UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, - &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError); + dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), + &possibleOverflowError); } - + U_ASSERT(destIdx >= 0); // scan the replacement text, looking for substitutions ($n) and \escapes. int32_t replIdx = 0; - while (replIdx < replacementLength) { + while (replIdx < replacementLength && U_SUCCESS(*status)) { UChar c = replacementText[replIdx]; replIdx++; if (c != DOLLARSIGN && c != BACKSLASH) { - // Common case, no substitution, no escaping, + // Common case, no substitution, no escaping, // just copy the char to the dest buf. appendToBuf(c, &destIdx, dest, capacity); continue; @@ -1417,9 +1432,9 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. - UChar32 escapedChar = + UChar32 escapedChar = u_unescapeAt(uregex_ucstr_unescape_charAt, - &replIdx, // Index is updated by unescapeAt + &replIdx, // Index is updated by unescapeAt replacementLength, // Length of replacement text (void *)replacementText); @@ -1443,54 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, continue; } + // We've got a $. Pick up the following capture group name or number. + // For numbers, consume only digits that produce a valid capture group for the pattern. - - // We've got a $. Pick up a capture group number if one follows. - // Consume at most the number of digits necessary for the largest capture - // number that is valid for this pattern. - - int32_t numDigits = 0; int32_t groupNum = 0; - UChar32 digitC; - for (;;) { - if (replIdx >= replacementLength) { - break; - } - U16_GET(replacementText, 0, replIdx, replacementLength, digitC); - if (u_isdigit(digitC) == FALSE) { - break; - } + U_ASSERT(c == DOLLARSIGN); + UChar32 c32; + U16_GET(replacementText, 0, replIdx, replacementLength, c32); + if (u_isdigit(c32)) { + int32_t numDigits = 0; + int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); + for (;;) { + if (replIdx >= replacementLength) { + break; + } + U16_GET(replacementText, 0, replIdx, replacementLength, c32); + if (u_isdigit(c32) == FALSE) { + break; + } + int32_t digitVal = u_charDigitValue(c32); + if (groupNum * 10 + digitVal <= numCaptureGroups) { + groupNum = groupNum * 10 + digitVal; + U16_FWD_1(replacementText, replIdx, replacementLength); + numDigits++; + } else { + if (numDigits == 0) { + *status = U_INDEX_OUTOFBOUNDS_ERROR; + } + break; + } + } + } else if (c32 == LEFTBRACKET) { + // Scan for Named Capture Group, ${name}. + UnicodeString groupName; U16_FWD_1(replacementText, replIdx, replacementLength); - groupNum=groupNum*10 + u_charDigitValue(digitC); - numDigits++; - if (numDigits >= m->fPattern->fMaxCaptureDigits) { - break; + while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { + if (replIdx >= replacementLength) { + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + break; + } + U16_NEXT(replacementText, replIdx, replacementLength, c32); + if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z + (c32 >= 0x61 && c32 <= 0x7a) || // a..z + (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 + groupName.append(c32); + } else if (c32 == RIGHTBRACKET) { + groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); + if (groupNum == 0) { + // Name not defined by pattern. + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } + } else { + // Character was something other than a name char or a closing '}' + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } } + } else { + // $ not followed by {name} or digits. + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } - if (numDigits == 0) { - // The $ didn't introduce a group number at all. - // Treat it as just part of the substitution text. - appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); - continue; - } - // Finally, append the capture group data to the destination. - destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - // Ignore buffer overflow when extracting the group. We need to - // continue on to get full size of the untruncated result. We will - // raise our own buffer overflow error at the end. - *status = U_ZERO_ERROR; + if (U_SUCCESS(*status)) { + destIdx += uregex_group((URegularExpression*)regexp, groupNum, + dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); + if (*status == U_BUFFER_OVERFLOW_ERROR) { + // Ignore buffer overflow when extracting the group. We need to + // continue on to get full size of the untruncated result. We will + // raise our own buffer overflow error at the end. + *status = U_ZERO_ERROR; + } } if (U_FAILURE(*status)) { - // Can fail if group number is out of range. + // bad group number or name. break; } - } // @@ -1499,12 +1544,14 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, // if (destIdx < capacity) { dest[destIdx] = 0; - } else if (destIdx == *destCapacity) { - *status = U_STRING_NOT_TERMINATED_WARNING; - } else { - *status = U_BUFFER_OVERFLOW_ERROR; + } else if (U_SUCCESS(*status)) { + if (destIdx == *destCapacity) { + *status = U_STRING_NOT_TERMINATED_WARNING; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } } - + // // Return an updated dest buffer and capacity to the caller. // @@ -1531,14 +1578,14 @@ int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, // // appendReplacement the actual API function, // -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_appendReplacement(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { - + RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendReplacement( regexp, replacementText, replacementLength,destBuf, destCapacity, status); @@ -1547,7 +1594,7 @@ uregex_appendReplacement(URegularExpression *regexp2, // // uregex_appendReplacementUText...can just use the normal C++ method // -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 uregex_appendReplacementUText(URegularExpression *regexp2, UText *replText, UText *dest, @@ -1577,11 +1624,11 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, *status = U_ZERO_ERROR; } - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } - - if (destCapacity == NULL || destBuf == NULL || + + if (destCapacity == NULL || destBuf == NULL || (*destBuf == NULL && *destCapacity > 0) || *destCapacity < 0) { @@ -1594,7 +1641,7 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, int32_t destIdx = 0; int32_t destCap = *destCapacity; UChar *dest = *destBuf; - + if (regexp->fText != NULL) { int32_t srcIdx; int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); @@ -1606,8 +1653,10 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, UErrorCode status = U_ZERO_ERROR; srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status); } - + for (;;) { + U_ASSERT(destIdx >= 0); + if (srcIdx == regexp->fTextLength) { break; } @@ -1616,6 +1665,7 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, regexp->fTextLength = srcIdx; break; } + if (destIdx < destCap) { dest[destIdx] = c; } else { @@ -1629,11 +1679,11 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, } srcIdx++; destIdx++; - } + } } else { int64_t srcIdx; if (m->fMatch) { - // The most recent call to find() succeeded. + // The most recent call to find() succeeded. srcIdx = m->fMatchEnd; } else { // The last call to find() on this matcher failed(). @@ -1668,7 +1718,7 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, if (destIdx < destCap) { *destBuf += destIdx; *destCapacity -= destIdx; - } else { + } else if (*destBuf != NULL) { *destBuf += destCap; *destCapacity = 0; } @@ -1684,7 +1734,7 @@ int32_t RegexCImpl::appendTail(RegularExpression *regexp, // // appendTail the actual API function // -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_appendTail(URegularExpression *regexp2, UChar **destBuf, int32_t *destCapacity, @@ -1697,7 +1747,7 @@ uregex_appendTail(URegularExpression *regexp2, // // uregex_appendTailUText...can just use the normal C++ method // -U_CAPI UText * U_EXPORT2 +U_CAPI UText * U_EXPORT2 uregex_appendTailUText(URegularExpression *regexp2, UText *dest, UErrorCode *status) { @@ -1789,19 +1839,19 @@ int32_t RegexCImpl::split(RegularExpression *regexp, i = destFieldsCapacity-1; destIdx = (int32_t)(destFields[i] - destFields[0]); } - + destFields[i] = &destBuf[destIdx]; destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); } break; } - + if (regexp->fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. destFields[i] = &destBuf[destIdx]; - + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); if (tStatus == U_BUFFER_OVERFLOW_ERROR) { @@ -1810,7 +1860,7 @@ int32_t RegexCImpl::split(RegularExpression *regexp, *status = tStatus; } nextOutputStringStart = regexp->fMatcher->fMatchEnd; - + // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; @@ -1820,11 +1870,15 @@ int32_t RegexCImpl::split(RegularExpression *regexp, break; } i++; - + // Set up to extract the capture group contents into the dest buffer. destFields[i] = &destBuf[destIdx]; tStatus = U_ZERO_ERROR; - int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); + int32_t t = uregex_group((URegularExpression*)regexp, + groupNum, + destFields[i], + REMAINING_CAPACITY(destIdx, destCapacity), + &tStatus); destIdx += t + 1; // Record the space used in the output string buffer. // +1 for the NUL that terminates the string. if (tStatus == U_BUFFER_OVERFLOW_ERROR) { @@ -1835,7 +1889,18 @@ int32_t RegexCImpl::split(RegularExpression *regexp, } if (nextOutputStringStart == inputLen) { - // The delimiter was at the end of the string. We're done. + // The delimiter was at the end of the string. + // Output an empty string, and then we are done. + if (destIdx < destCapacity) { + destBuf[destIdx] = 0; + } + if (i < destFieldsCapacity-1) { + ++i; + } + if (destIdx < destCapacity) { + destFields[i] = destBuf + destIdx; + } + ++destIdx; break; } @@ -1869,7 +1934,7 @@ int32_t RegexCImpl::split(RegularExpression *regexp, // // uregex_split The actual API function // -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_split(URegularExpression *regexp2, UChar *destBuf, int32_t destCapacity, @@ -1878,7 +1943,7 @@ uregex_split(URegularExpression *regexp2, int32_t destFieldsCapacity, UErrorCode *status) { RegularExpression *regexp = (RegularExpression*)regexp2; - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if ((destBuf == NULL && destCapacity > 0) || @@ -1888,15 +1953,15 @@ uregex_split(URegularExpression *regexp2, *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - + return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); } - + // // uregex_splitUText...can just use the normal C++ method // -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 uregex_splitUText(URegularExpression *regexp2, UText *destFields[], int32_t destFieldsCapacity,