X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/i18n/uregex.cpp diff --git a/icuSources/i18n/uregex.cpp b/icuSources/i18n/uregex.cpp index 9aab85e5..57c2febe 100644 --- a/icuSources/i18n/uregex.cpp +++ b/icuSources/i18n/uregex.cpp @@ -1,9 +1,11 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* -* Copyright (C) 2004-2005, International Business Machines +* Copyright (C) 2004-2015, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* -* file name: regex.cpp +* file name: uregex.cpp */ #include "unicode/utypes.h" @@ -16,33 +18,38 @@ #include "unicode/ustring.h" #include "unicode/uchar.h" #include "unicode/uobject.h" -#include "umutex.h" -#include "uassert.h" +#include "unicode/utf16.h" #include "cmemory.h" +#include "uassert.h" +#include "uhash.h" +#include "umutex.h" +#include "uvectr32.h" + +#include "regextxt.h" + +U_NAMESPACE_BEGIN + +#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0) -struct URegularExpression: public UMemory { +struct RegularExpression: public UMemory { public: - URegularExpression(); - ~URegularExpression(); + RegularExpression(); + ~RegularExpression(); int32_t fMagic; RegexPattern *fPat; - int32_t *fPatRefCount; + u_atomic_int32_t *fPatRefCount; UChar *fPatString; int32_t fPatStringLen; RegexMatcher *fMatcher; const UChar *fText; // Text from setText() int32_t fTextLength; // Length provided by user with setText(), which // may be -1. - - UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString. - // TODO: regexp engine should not depend on UnicodeString. + UBool fOwnsText; }; static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII -U_NAMESPACE_USE - -URegularExpression::URegularExpression() { +RegularExpression::RegularExpression() { fMagic = REXP_MAGIC; fPat = NULL; fPatRefCount = NULL; @@ -51,34 +58,42 @@ URegularExpression::URegularExpression() { fMatcher = NULL; fText = NULL; fTextLength = 0; + fOwnsText = FALSE; } -URegularExpression::~URegularExpression() { +RegularExpression::~RegularExpression() { delete fMatcher; fMatcher = NULL; if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { delete fPat; uprv_free(fPatString); - uprv_free(fPatRefCount); + uprv_free((void *)fPatRefCount); + } + if (fOwnsText && fText!=NULL) { + uprv_free((void *)fText); } fMagic = 0; } +U_NAMESPACE_END + +U_NAMESPACE_USE + //---------------------------------------------------------------------------------------- // // validateRE Do boilerplate style checks on API function parameters. // Return TRUE if they look OK. //---------------------------------------------------------------------------------------- -static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { +static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) { if (U_FAILURE(*status)) { return FALSE; } if (re == NULL || re->fMagic != REXP_MAGIC) { - // U_ASSERT(FALSE); *status = U_ILLEGAL_ARGUMENT_ERROR; return FALSE; } - if (requiresText && re->fText == NULL) { + // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway + if (requiresText && re->fText == NULL && !re->fOwnsText) { *status = U_REGEX_INVALID_STATE; return FALSE; } @@ -109,13 +124,13 @@ uregex_open( const UChar *pattern, actualPatLen = u_strlen(pattern); } - URegularExpression *re = new URegularExpression; - int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); + RegularExpression *re = new RegularExpression; + u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); if (re == NULL || refC == NULL || patBuf == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; delete re; - uprv_free(refC); + uprv_free((void *)refC); uprv_free(patBuf); return NULL; } @@ -124,23 +139,110 @@ uregex_open( const UChar *pattern, // // Make a copy of the pattern string, so we can return it later if asked. - // For compiling the pattern, we will use a read-only-aliased UnicodeString - // of this local copy, to avoid making even more copies. + // For compiling the pattern, we will use a UText wrapper around + // this local copy, to avoid making even more copies. // re->fPatString = patBuf; re->fPatStringLen = patternLength; u_memcpy(patBuf, pattern, actualPatLen); patBuf[actualPatLen] = 0; - UnicodeString patString(patternLength==-1, patBuf, patternLength); + + UText patText = UTEXT_INITIALIZER; + utext_openUChars(&patText, patBuf, patternLength, status); + + // + // Compile the pattern + // + if (pe != NULL) { + re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); + } else { + re->fPat = RegexPattern::compile(&patText, flags, *status); + } + utext_close(&patText); + + if (U_FAILURE(*status)) { + goto ErrorExit; + } + + // + // Create the matcher object + // + re->fMatcher = re->fPat->matcher(*status); + if (U_SUCCESS(*status)) { + return (URegularExpression*)re; + } + +ErrorExit: + delete re; + return NULL; + +} + +//---------------------------------------------------------------------------------------- +// +// uregex_openUText +// +//---------------------------------------------------------------------------------------- +U_CAPI URegularExpression * U_EXPORT2 +uregex_openUText(UText *pattern, + uint32_t flags, + UParseError *pe, + UErrorCode *status) { + + if (U_FAILURE(*status)) { + return NULL; + } + if (pattern == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + int64_t patternNativeLength = utext_nativeLength(pattern); + + if (patternNativeLength == 0) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + RegularExpression *re = new RegularExpression; + + UErrorCode lengthStatus = U_ZERO_ERROR; + int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus); + + u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t)); + UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1)); + if (re == NULL || refC == NULL || patBuf == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + delete re; + uprv_free((void *)refC); + uprv_free(patBuf); + return NULL; + } + re->fPatRefCount = refC; + *re->fPatRefCount = 1; + + // + // Make a copy of the pattern string, so we can return it later if asked. + // For compiling the pattern, we will use a read-only UText wrapper + // around this local copy, to avoid making even more copies. + // + re->fPatString = patBuf; + re->fPatStringLen = pattern16Length; + utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status); + + UText patText = UTEXT_INITIALIZER; + utext_openUChars(&patText, patBuf, pattern16Length, status); // // Compile the pattern // if (pe != NULL) { - re->fPat = RegexPattern::compile(patString, flags, *pe, *status); + re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); } else { - re->fPat = RegexPattern::compile(patString, flags, *status); + re->fPat = RegexPattern::compile(&patText, flags, *status); } + utext_close(&patText); + if (U_FAILURE(*status)) { goto ErrorExit; } @@ -150,7 +252,7 @@ uregex_open( const UChar *pattern, // re->fMatcher = re->fPat->matcher(*status); if (U_SUCCESS(*status)) { - return re; + return (URegularExpression*)re; } ErrorExit: @@ -165,9 +267,10 @@ ErrorExit: // //---------------------------------------------------------------------------------------- U_CAPI void U_EXPORT2 -uregex_close(URegularExpression *re) { +uregex_close(URegularExpression *re2) { + RegularExpression *re = (RegularExpression*)re2; UErrorCode status = U_ZERO_ERROR; - if (validateRE(re, &status, FALSE) == FALSE) { + if (validateRE(re, FALSE, &status) == FALSE) { return; } delete re; @@ -179,13 +282,14 @@ uregex_close(URegularExpression *re) { // uregex_clone // //---------------------------------------------------------------------------------------- -U_CAPI URegularExpression * U_EXPORT2 -uregex_clone(const URegularExpression *source, UErrorCode *status) { - if (validateRE(source, status, FALSE) == FALSE) { +U_CAPI URegularExpression * U_EXPORT2 +uregex_clone(const URegularExpression *source2, UErrorCode *status) { + RegularExpression *source = (RegularExpression*)source2; + if (validateRE(source, FALSE, status) == FALSE) { return NULL; } - URegularExpression *clone = new URegularExpression; + RegularExpression *clone = new RegularExpression; if (clone == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; return NULL; @@ -196,19 +300,15 @@ uregex_clone(const URegularExpression *source, UErrorCode *status) { delete clone; return NULL; } - if (clone == NULL) { - *status = U_MEMORY_ALLOCATION_ERROR; - return NULL; - } clone->fPat = source->fPat; - clone->fPatRefCount = source->fPatRefCount; + clone->fPatRefCount = source->fPatRefCount; clone->fPatString = source->fPatString; clone->fPatStringLen = source->fPatStringLen; umtx_atomic_inc(source->fPatRefCount); // Note: fText is not cloned. - return clone; + return (URegularExpression*)clone; } @@ -219,12 +319,13 @@ uregex_clone(const URegularExpression *source, UErrorCode *status) { // uregex_pattern // //------------------------------------------------------------------------------ -U_CAPI const UChar * U_EXPORT2 -uregex_pattern(const URegularExpression *regexp, - int32_t *patLength, - UErrorCode *status) { - - if (validateRE(regexp, status, FALSE) == FALSE) { +U_CAPI const UChar * U_EXPORT2 +uregex_pattern(const URegularExpression *regexp2, + int32_t *patLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + + if (validateRE(regexp, FALSE, status) == FALSE) { return NULL; } if (patLength != NULL) { @@ -234,14 +335,28 @@ uregex_pattern(const URegularExpression *regexp, } +//------------------------------------------------------------------------------ +// +// uregex_patternUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_patternUText(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + return regexp->fPat->patternText(*status); +} + + //------------------------------------------------------------------------------ // // uregex_flags // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_flags(const URegularExpression *regexp, UErrorCode *status) { - if (validateRE(regexp, status, FALSE) == FALSE) { +U_CAPI int32_t U_EXPORT2 +uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { return 0; } int32_t flags = regexp->fPat->flags(); @@ -254,24 +369,61 @@ uregex_flags(const URegularExpression *regexp, UErrorCode *status) { // uregex_setText // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 -uregex_setText(URegularExpression *regexp, +U_CAPI void U_EXPORT2 +uregex_setText(URegularExpression *regexp2, const UChar *text, int32_t textLength, UErrorCode *status) { - if (validateRE(regexp, status, FALSE) == FALSE) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { return; } if (text == NULL || textLength < -1) { *status = U_ILLEGAL_ARGUMENT_ERROR; return; } + + if (regexp->fOwnsText && regexp->fText != NULL) { + uprv_free((void *)regexp->fText); + } + regexp->fText = text; regexp->fTextLength = textLength; - UBool isTerminated = (textLength == -1); + regexp->fOwnsText = FALSE; - regexp->fTextString.setTo(isTerminated, text, textLength); - regexp->fMatcher->reset(regexp->fTextString); + UText input = UTEXT_INITIALIZER; + utext_openUChars(&input, text, textLength, status); + regexp->fMatcher->reset(&input); + utext_close(&input); // reset() made a shallow clone, so we don't need this copy +} + + +//------------------------------------------------------------------------------ +// +// uregex_setUText +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setUText(URegularExpression *regexp2, + UText *text, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return; + } + if (text == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if (regexp->fOwnsText && regexp->fText != NULL) { + uprv_free((void *)regexp->fText); + } + + regexp->fText = NULL; // only fill it in on request + regexp->fTextLength = -1; + regexp->fOwnsText = TRUE; + regexp->fMatcher->reset(text); } @@ -281,201 +433,712 @@ uregex_setText(URegularExpression *regexp, // uregex_getText // //------------------------------------------------------------------------------ -U_CAPI const UChar * U_EXPORT2 -uregex_getText(URegularExpression *regexp, +U_CAPI const UChar * U_EXPORT2 +uregex_getText(URegularExpression *regexp2, int32_t *textLength, UErrorCode *status) { - if (validateRE(regexp, status, FALSE) == FALSE) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { return NULL; } + + if (regexp->fText == NULL) { + // need to fill in the text + UText *inputText = regexp->fMatcher->inputText(); + int64_t inputNativeLength = utext_nativeLength(inputText); + if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { + regexp->fText = inputText->chunkContents; + regexp->fTextLength = (int32_t)inputNativeLength; + regexp->fOwnsText = FALSE; // because the UText owns it + } else { + UErrorCode lengthStatus = U_ZERO_ERROR; + regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error + UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1)); + + utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status); + regexp->fText = inputChars; + regexp->fOwnsText = TRUE; // should already be set but just in case + } + } + if (textLength != NULL) { *textLength = regexp->fTextLength; } - return regexp->fText; + return regexp->fText; +} + + +//------------------------------------------------------------------------------ +// +// uregex_getUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_getUText(URegularExpression *regexp2, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return dest; + } + return regexp->fMatcher->getInput(dest, *status); +} + + +//------------------------------------------------------------------------------ +// +// uregex_refreshUText +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_refreshUText(URegularExpression *regexp2, + UText *text, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return; + } + regexp->fMatcher->refreshInputText(text, *status); +} + + +//------------------------------------------------------------------------------ +// +// uregex_matches +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_matches(URegularExpression *regexp2, + int32_t startIndex, + UErrorCode *status) { + return uregex_matches64( regexp2, (int64_t)startIndex, status); +} + +U_CAPI UBool U_EXPORT2 +uregex_matches64(URegularExpression *regexp2, + int64_t startIndex, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + UBool result = FALSE; + if (validateRE(regexp, TRUE, status) == FALSE) { + return result; + } + if (startIndex == -1) { + result = regexp->fMatcher->matches(*status); + } else { + result = regexp->fMatcher->matches(startIndex, *status); + } + return result; +} + + +//------------------------------------------------------------------------------ +// +// uregex_lookingAt +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_lookingAt(URegularExpression *regexp2, + int32_t startIndex, + UErrorCode *status) { + return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); +} + +U_CAPI UBool U_EXPORT2 +uregex_lookingAt64(URegularExpression *regexp2, + int64_t startIndex, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + UBool result = FALSE; + if (validateRE(regexp, TRUE, status) == FALSE) { + return result; + } + if (startIndex == -1) { + result = regexp->fMatcher->lookingAt(*status); + } else { + result = regexp->fMatcher->lookingAt(startIndex, *status); + } + return result; +} + + + +//------------------------------------------------------------------------------ +// +// uregex_find +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_find(URegularExpression *regexp2, + int32_t startIndex, + UErrorCode *status) { + return uregex_find64( regexp2, (int64_t)startIndex, status); +} + +U_CAPI UBool U_EXPORT2 +uregex_find64(URegularExpression *regexp2, + int64_t startIndex, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + UBool result = FALSE; + if (validateRE(regexp, TRUE, status) == FALSE) { + return result; + } + if (startIndex == -1) { + regexp->fMatcher->resetPreserveRegion(); + result = regexp->fMatcher->find(*status); + } else { + result = regexp->fMatcher->find(startIndex, *status); + } + return result; +} + + +//------------------------------------------------------------------------------ +// +// uregex_findNext +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_findNext(URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return FALSE; + } + UBool result = regexp->fMatcher->find(*status); + return result; +} + +//------------------------------------------------------------------------------ +// +// uregex_groupCount +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_groupCount(URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return 0; + } + int32_t result = regexp->fMatcher->groupCount(); + return result; +} + + +//------------------------------------------------------------------------------ +// +// uregex_groupNumberFromName +// +//------------------------------------------------------------------------------ +int32_t +uregex_groupNumberFromName(URegularExpression *regexp2, + const UChar *groupName, + int32_t nameLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return 0; + } + int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status); + return result; +} + +int32_t +uregex_groupNumberFromCName(URegularExpression *regexp2, + const char *groupName, + int32_t nameLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return 0; + } + return regexp->fPat->groupNumberFromName(groupName, nameLength, *status); +} + +//------------------------------------------------------------------------------ +// +// uregex_group +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_group(URegularExpression *regexp2, + int32_t groupNum, + UChar *dest, + int32_t destCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if (destCapacity == 0 || regexp->fText != NULL) { + // If preflighting or if we already have the text as UChars, + // this is a little cheaper than extracting from the UText + + // + // Pick up the range of characters from the matcher + // + int32_t startIx = regexp->fMatcher->start(groupNum, *status); + int32_t endIx = regexp->fMatcher->end (groupNum, *status); + if (U_FAILURE(*status)) { + return 0; + } + + // + // Trim length based on buffer capacity + // + int32_t fullLength = endIx - startIx; + int32_t copyLength = fullLength; + if (copyLength < destCapacity) { + dest[copyLength] = 0; + } else if (copyLength == destCapacity) { + *status = U_STRING_NOT_TERMINATED_WARNING; + } else { + copyLength = destCapacity; + *status = U_BUFFER_OVERFLOW_ERROR; + } + + // + // Copy capture group to user's buffer + // + if (copyLength > 0) { + u_memcpy(dest, ®exp->fText[startIx], copyLength); + } + return fullLength; + } else { + int64_t start = regexp->fMatcher->start64(groupNum, *status); + int64_t limit = regexp->fMatcher->end64(groupNum, *status); + if (U_FAILURE(*status)) { + return 0; + } + // Note edge cases: + // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result. + // Zero Length Match: start == end. + int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status); + return length; + } + +} + + +//------------------------------------------------------------------------------ +// +// uregex_groupUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_groupUText(URegularExpression *regexp2, + int32_t groupNum, + UText *dest, + int64_t *groupLength, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + UErrorCode emptyTextStatus = U_ZERO_ERROR; + return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus)); + } + + return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); +} + +//------------------------------------------------------------------------------ +// +// uregex_start +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_start(URegularExpression *regexp2, + int32_t groupNum, + UErrorCode *status) { + return (int32_t)uregex_start64( regexp2, groupNum, status); +} + +U_CAPI int64_t U_EXPORT2 +uregex_start64(URegularExpression *regexp2, + int32_t groupNum, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + int64_t result = regexp->fMatcher->start64(groupNum, *status); + return result; +} + +//------------------------------------------------------------------------------ +// +// uregex_end +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_end(URegularExpression *regexp2, + int32_t groupNum, + UErrorCode *status) { + return (int32_t)uregex_end64( regexp2, groupNum, status); +} + +U_CAPI int64_t U_EXPORT2 +uregex_end64(URegularExpression *regexp2, + int32_t groupNum, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + int64_t result = regexp->fMatcher->end64(groupNum, *status); + return result; +} + +//------------------------------------------------------------------------------ +// +// uregex_reset +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_reset(URegularExpression *regexp2, + int32_t index, + UErrorCode *status) { + uregex_reset64( regexp2, (int64_t)index, status); +} + +U_CAPI void U_EXPORT2 +uregex_reset64(URegularExpression *regexp2, + int64_t index, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return; + } + regexp->fMatcher->reset(index, *status); +} + + +//------------------------------------------------------------------------------ +// +// uregex_setRegion +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setRegion(URegularExpression *regexp2, + int32_t regionStart, + int32_t regionLimit, + UErrorCode *status) { + uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status); +} + +U_CAPI void U_EXPORT2 +uregex_setRegion64(URegularExpression *regexp2, + int64_t regionStart, + int64_t regionLimit, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return; + } + regexp->fMatcher->region(regionStart, regionLimit, *status); +} + + +//------------------------------------------------------------------------------ +// +// uregex_setRegionAndStart +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_setRegionAndStart(URegularExpression *regexp2, + int64_t regionStart, + int64_t regionLimit, + int64_t startIndex, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return; + } + regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); +} + +//------------------------------------------------------------------------------ +// +// uregex_regionStart +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_regionStart(const URegularExpression *regexp2, + UErrorCode *status) { + return (int32_t)uregex_regionStart64(regexp2, status); +} + +U_CAPI int64_t U_EXPORT2 +uregex_regionStart64(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + return regexp->fMatcher->regionStart(); +} + + +//------------------------------------------------------------------------------ +// +// uregex_regionEnd +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_regionEnd(const URegularExpression *regexp2, + UErrorCode *status) { + return (int32_t)uregex_regionEnd64(regexp2, status); +} + +U_CAPI int64_t U_EXPORT2 +uregex_regionEnd64(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + return regexp->fMatcher->regionEnd(); +} + + +//------------------------------------------------------------------------------ +// +// uregex_hasTransparentBounds +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_hasTransparentBounds(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return FALSE; + } + return regexp->fMatcher->hasTransparentBounds(); +} + + +//------------------------------------------------------------------------------ +// +// uregex_useTransparentBounds +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_useTransparentBounds(URegularExpression *regexp2, + UBool b, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return; + } + regexp->fMatcher->useTransparentBounds(b); +} + + +//------------------------------------------------------------------------------ +// +// uregex_hasAnchoringBounds +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_hasAnchoringBounds(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return FALSE; + } + return regexp->fMatcher->hasAnchoringBounds(); +} + + +//------------------------------------------------------------------------------ +// +// uregex_useAnchoringBounds +// +//------------------------------------------------------------------------------ +U_CAPI void U_EXPORT2 +uregex_useAnchoringBounds(URegularExpression *regexp2, + UBool b, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status) == FALSE) { + return; + } + regexp->fMatcher->useAnchoringBounds(b); +} + + +//------------------------------------------------------------------------------ +// +// uregex_hitEnd +// +//------------------------------------------------------------------------------ +U_CAPI UBool U_EXPORT2 +uregex_hitEnd(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return FALSE; + } + return regexp->fMatcher->hitEnd(); } //------------------------------------------------------------------------------ // -// uregex_matches +// uregex_requireEnd // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 -uregex_matches(URegularExpression *regexp, - int32_t startIndex, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { +U_CAPI UBool U_EXPORT2 +uregex_requireEnd(const URegularExpression *regexp2, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { return FALSE; } - UBool result = regexp->fMatcher->matches(startIndex, *status); - return result; + return regexp->fMatcher->requireEnd(); } - //------------------------------------------------------------------------------ // -// uregex_lookingAt +// uregex_setTimeLimit // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 -uregex_lookingAt(URegularExpression *regexp, - int32_t startIndex, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return FALSE; +U_CAPI void U_EXPORT2 +uregex_setTimeLimit(URegularExpression *regexp2, + int32_t limit, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->setTimeLimit(limit, *status); } - UBool result = regexp->fMatcher->lookingAt(startIndex, *status); - return result; } //------------------------------------------------------------------------------ // -// uregex_find +// uregex_getTimeLimit // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 -uregex_find(URegularExpression *regexp, - int32_t startIndex, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return FALSE; - } - UBool result = regexp->fMatcher->find(startIndex, *status); - return result; +U_CAPI int32_t U_EXPORT2 +uregex_getTimeLimit(const URegularExpression *regexp2, + UErrorCode *status) { + int32_t retVal = 0; + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + retVal = regexp->fMatcher->getTimeLimit(); + } + return retVal; } + + //------------------------------------------------------------------------------ // -// uregex_findNext +// uregex_setStackLimit // //------------------------------------------------------------------------------ -U_CAPI UBool U_EXPORT2 -uregex_findNext(URegularExpression *regexp, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return FALSE; +U_CAPI void U_EXPORT2 +uregex_setStackLimit(URegularExpression *regexp2, + int32_t limit, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->setStackLimit(limit, *status); } - UBool result = regexp->fMatcher->find(); - return result; } + + //------------------------------------------------------------------------------ // -// uregex_groupCount +// uregex_getStackLimit // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_groupCount(URegularExpression *regexp, - UErrorCode *status) { - if (validateRE(regexp, status, FALSE) == FALSE) { - return 0; - } - int32_t result = regexp->fMatcher->groupCount(); - return result; +U_CAPI int32_t U_EXPORT2 +uregex_getStackLimit(const URegularExpression *regexp2, + UErrorCode *status) { + int32_t retVal = 0; + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + retVal = regexp->fMatcher->getStackLimit(); + } + return retVal; } //------------------------------------------------------------------------------ // -// uregex_group +// uregex_setMatchCallback // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_group(URegularExpression *regexp, - int32_t groupNum, - UChar *dest, - int32_t destCapacity, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return 0; - } - if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - // - // Pick up the range of characters from the matcher - // - int32_t startIx = regexp->fMatcher->start(groupNum, *status); - int32_t endIx = regexp->fMatcher->end (groupNum, *status); - if (U_FAILURE(*status)) { - return 0; - } - - // - // Trim length based on buffer capacity - // - int32_t fullLength = endIx - startIx; - int32_t copyLength = fullLength; - if (copyLength < destCapacity) { - dest[copyLength] = 0; - } else if (copyLength == destCapacity) { - *status = U_STRING_NOT_TERMINATED_WARNING; - } else { - copyLength = destCapacity; - *status = U_BUFFER_OVERFLOW_ERROR; - } - - // - // Copy capture group to user's buffer - // - if (copyLength > 0) { - u_memcpy(dest, ®exp->fText[startIx], copyLength); +U_CAPI void U_EXPORT2 +uregex_setMatchCallback(URegularExpression *regexp2, + URegexMatchCallback *callback, + const void *context, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->setMatchCallback(callback, context, *status); } - return fullLength; } //------------------------------------------------------------------------------ // -// uregex_start +// uregex_getMatchCallback // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_start(URegularExpression *regexp, - int32_t groupNum, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return 0; - } - int32_t result = regexp->fMatcher->start(groupNum, *status); - return result; +U_CAPI void U_EXPORT2 +uregex_getMatchCallback(const URegularExpression *regexp2, + URegexMatchCallback **callback, + const void **context, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->getMatchCallback(*callback, *context, *status); + } } //------------------------------------------------------------------------------ // -// uregex_end +// uregex_setMatchProgressCallback // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_end(URegularExpression *regexp, - int32_t groupNum, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return 0; +U_CAPI void U_EXPORT2 +uregex_setFindProgressCallback(URegularExpression *regexp2, + URegexFindProgressCallback *callback, + const void *context, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->setFindProgressCallback(callback, context, *status); } - int32_t result = regexp->fMatcher->end(groupNum, *status); - return result; } + //------------------------------------------------------------------------------ // -// uregex_reset +// uregex_getMatchCallback // //------------------------------------------------------------------------------ -U_CAPI void U_EXPORT2 -uregex_reset(URegularExpression *regexp, - int32_t index, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return; - } - regexp->fMatcher->reset(index, *status); +U_CAPI void U_EXPORT2 +uregex_getFindProgressCallback(const URegularExpression *regexp2, + URegexFindProgressCallback **callback, + const void **context, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, FALSE, status)) { + regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); + } } @@ -484,52 +1147,93 @@ uregex_reset(URegularExpression *regexp, // uregex_replaceAll // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_replaceAll(URegularExpression *regexp, +U_CAPI int32_t U_EXPORT2 +uregex_replaceAll(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || - destBuf == NULL && destCapacity > 0 || + (destBuf == NULL && destCapacity > 0) || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } int32_t len = 0; - uregex_reset(regexp, 0, status); - while (uregex_findNext(regexp, status)) { - len += uregex_appendReplacement(regexp, replacementText, replacementLength, + + uregex_reset(regexp2, 0, status); + + // Note: Seperate error code variables for findNext() and appendReplacement() + // are used so that destination buffer overflow errors + // in appendReplacement won't stop findNext() from working. + // appendReplacement() and appendTail() special case incoming buffer + // overflow errors, continuing to return the correct length. + UErrorCode findStatus = *status; + while (uregex_findNext(regexp2, &findStatus)) { + len += uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } - len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); + len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); + + if (U_FAILURE(findStatus)) { + // If anything went wrong with the findNext(), make that error trump + // whatever may have happened with the append() operations. + // Errors in findNext() are not expected. + *status = findStatus; + } return len; } +//------------------------------------------------------------------------------ +// +// uregex_replaceAllUText +// +//------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_replaceAllUText(URegularExpression *regexp2, + UText *replacementText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + if (replacementText == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); + return dest; +} + + //------------------------------------------------------------------------------ // // uregex_replaceFirst // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_replaceFirst(URegularExpression *regexp, +U_CAPI int32_t U_EXPORT2 +uregex_replaceFirst(URegularExpression *regexp2, const UChar *replacementText, int32_t replacementLength, UChar *destBuf, int32_t destCapacity, UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || - destBuf == NULL && destCapacity > 0 || + (destBuf == NULL && destCapacity > 0) || destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; @@ -537,13 +1241,13 @@ uregex_replaceFirst(URegularExpression *regexp, int32_t len = 0; UBool findSucceeded; - uregex_reset(regexp, 0, status); - findSucceeded = uregex_find(regexp, 0, status); + uregex_reset(regexp2, 0, status); + findSucceeded = uregex_find(regexp2, 0, status); if (findSucceeded) { - len = uregex_appendReplacement(regexp, replacementText, replacementLength, + len = uregex_appendReplacement(regexp2, replacementText, replacementLength, &destBuf, &destCapacity, status); } - len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); + len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); return len; } @@ -551,48 +1255,70 @@ uregex_replaceFirst(URegularExpression *regexp, //------------------------------------------------------------------------------ // -// uregex_appendReplacement +// uregex_replaceFirstUText // //------------------------------------------------------------------------------ +U_CAPI UText * U_EXPORT2 +uregex_replaceFirstUText(URegularExpression *regexp2, + UText *replacementText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + if (replacementText == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); + return dest; +} + + +//------------------------------------------------------------------------------ +// +// uregex_appendReplacement +// +//------------------------------------------------------------------------------ +U_NAMESPACE_BEGIN // // Dummy class, because these functions need to be friends of class RegexMatcher, // and stand-alone C functions don't work as friends // -U_NAMESPACE_BEGIN class RegexCImpl { public: - inline static int32_t appendReplacement(URegularExpression *regexp, + inline static int32_t appendReplacement(RegularExpression *regexp, const UChar *replacementText, int32_t replacementLength, UChar **destBuf, int32_t *destCapacity, UErrorCode *status); - inline static int32_t appendTail(URegularExpression *regexp, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status); + inline static int32_t appendTail(RegularExpression *regexp, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status); + + inline static int32_t split(RegularExpression *regexp, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status); }; -U_NAMESPACE_END +U_NAMESPACE_END -// -// Call-back function for u_unescapeAt(), used when we encounter -// \uxxxx or \Uxxxxxxxxx escapes in the replacement text. -// -U_CDECL_BEGIN -static UChar U_CALLCONV -unescape_charAt(int32_t offset, void *context) { - UChar c16 = ((UChar *)context)[offset]; - return c16; -} -U_CDECL_END static const UChar BACKSLASH = 0x5c; static const UChar DOLLARSIGN = 0x24; +static const UChar LEFTBRACKET = 0x7b; +static const UChar RIGHTBRACKET = 0x7d; // // Move a character to an output buffer, with bounds checking on the index. @@ -610,18 +1336,18 @@ static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCap // // appendReplacement, the actual implementation. // -int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, - const UChar *replacementText, - int32_t replacementLength, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) { +int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, + const UChar *replacementText, + int32_t replacementLength, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) { // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. UBool pendingBufferOverflow = FALSE; - if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) { + if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { pendingBufferOverflow = TRUE; *status = U_ZERO_ERROR; } @@ -629,12 +1355,12 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, // // Validate all paramters // - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } if (replacementText == NULL || replacementLength < -1 || - destCapacity == NULL || destBuf == NULL || - *destBuf == NULL && *destCapacity > 0 || + destCapacity == NULL || destBuf == NULL || + (*destBuf == NULL && *destCapacity > 0) || *destCapacity < 0) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; @@ -650,7 +1376,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, int32_t capacity = *destCapacity; int32_t destIdx = 0; int32_t i; - + // If it wasn't supplied by the caller, get the length of the replacement text. // TODO: slightly smarter logic in the copy loop could watch for the NUL on // the fly and avoid this step. @@ -659,19 +1385,37 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, } // Copy input string from the end of previous match to start of current match - for (i=m->fLastMatchEnd; ifMatchStart; i++) { - appendToBuf(regexp->fText[i], &destIdx, dest, capacity); + if (regexp->fText != NULL) { + int32_t matchStart; + int32_t lastMatchEnd; + if (UTEXT_USES_U16(m->fInputText)) { + lastMatchEnd = (int32_t)m->fLastMatchEnd; + matchStart = (int32_t)m->fMatchStart; + } else { + // !!!: Would like a better way to do this! + UErrorCode tempStatus = U_ZERO_ERROR; + lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus); + tempStatus = U_ZERO_ERROR; + matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus); + } + for (i=lastMatchEnd; ifText[i], &destIdx, dest, capacity); + } + } else { + UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore + destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, + dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), + &possibleOverflowError); } - - + U_ASSERT(destIdx >= 0); // scan the replacement text, looking for substitutions ($n) and \escapes. int32_t replIdx = 0; - while (replIdx < replacementLength) { + while (replIdx < replacementLength && U_SUCCESS(*status)) { UChar c = replacementText[replIdx]; replIdx++; if (c != DOLLARSIGN && c != BACKSLASH) { - // Common case, no substitution, no escaping, + // Common case, no substitution, no escaping, // just copy the char to the dest buf. appendToBuf(c, &destIdx, dest, capacity); continue; @@ -690,9 +1434,9 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, if (c==0x55/*U*/ || c==0x75/*u*/) { // We have a \udddd or \Udddddddd escape sequence. - UChar32 escapedChar = - u_unescapeAt(unescape_charAt, - &replIdx, // Index is updated by unescapeAt + UChar32 escapedChar = + u_unescapeAt(uregex_ucstr_unescape_charAt, + &replIdx, // Index is updated by unescapeAt replacementLength, // Length of replacement text (void *)replacementText); @@ -716,58 +1460,86 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, continue; } + // We've got a $. Pick up the following capture group name or number. + // For numbers, consume only digits that produce a valid capture group for the pattern. - - // We've got a $. Pick up a capture group number if one follows. - // Consume at most the number of digits necessary for the largest capture - // number that is valid for this pattern. - - int32_t numDigits = 0; int32_t groupNum = 0; - UChar32 digitC; - for (;;) { - if (replIdx >= replacementLength) { - break; - } - U16_GET(replacementText, 0, replIdx, replacementLength, digitC); - if (u_isdigit(digitC) == FALSE) { - break; - } + U_ASSERT(c == DOLLARSIGN); + UChar32 c32 = -1; + if (replIdx < replacementLength) { + U16_GET(replacementText, 0, replIdx, replacementLength, c32); + } + if (u_isdigit(c32)) { + int32_t numDigits = 0; + int32_t numCaptureGroups = m->fPattern->fGroupMap->size(); + for (;;) { + if (replIdx >= replacementLength) { + break; + } + U16_GET(replacementText, 0, replIdx, replacementLength, c32); + if (u_isdigit(c32) == FALSE) { + break; + } + int32_t digitVal = u_charDigitValue(c32); + if (groupNum * 10 + digitVal <= numCaptureGroups) { + groupNum = groupNum * 10 + digitVal; + U16_FWD_1(replacementText, replIdx, replacementLength); + numDigits++; + } else { + if (numDigits == 0) { + *status = U_INDEX_OUTOFBOUNDS_ERROR; + } + break; + } + } + } else if (c32 == LEFTBRACKET) { + // Scan for Named Capture Group, ${name}. + UnicodeString groupName; U16_FWD_1(replacementText, replIdx, replacementLength); - groupNum=groupNum*10 + u_charDigitValue(digitC); - numDigits++; - if (numDigits >= m->fPattern->fMaxCaptureDigits) { - break; + while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { + if (replIdx >= replacementLength) { + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + break; + } + U16_NEXT(replacementText, replIdx, replacementLength, c32); + if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z + (c32 >= 0x61 && c32 <= 0x7a) || // a..z + (c32 >= 0x31 && c32 <= 0x39)) { // 0..9 + groupName.append(c32); + } else if (c32 == RIGHTBRACKET) { + groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName); + if (groupNum == 0) { + // Name not defined by pattern. + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } + } else { + // Character was something other than a name char or a closing '}' + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; + } } + } else { + // $ not followed by {name} or digits. + *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; } - if (numDigits == 0) { - // The $ didn't introduce a group number at all. - // Treat it as just part of the substitution text. - appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); - continue; - } - // Finally, append the capture group data to the destination. - int32_t capacityRemaining = capacity - destIdx; - if (capacityRemaining < 0) { - capacityRemaining = 0; - } - destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status); - if (*status == U_BUFFER_OVERFLOW_ERROR) { - // Ignore buffer overflow when extracting the group. We need to - // continue on to get full size of the untruncated result. We will - // raise our own buffer overflow error at the end. - *status = U_ZERO_ERROR; + if (U_SUCCESS(*status)) { + destIdx += uregex_group((URegularExpression*)regexp, groupNum, + dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status); + if (*status == U_BUFFER_OVERFLOW_ERROR) { + // Ignore buffer overflow when extracting the group. We need to + // continue on to get full size of the untruncated result. We will + // raise our own buffer overflow error at the end. + *status = U_ZERO_ERROR; + } } if (U_FAILURE(*status)) { - // Can fail if group number is out of range. + // bad group number or name. break; } - } // @@ -776,12 +1548,14 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, // if (destIdx < capacity) { dest[destIdx] = 0; - } else if (destIdx == *destCapacity) { - *status = U_STRING_NOT_TERMINATED_WARNING; - } else { - *status = U_BUFFER_OVERFLOW_ERROR; + } else if (U_SUCCESS(*status)) { + if (destIdx == *destCapacity) { + *status = U_STRING_NOT_TERMINATED_WARNING; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + } } - + // // Return an updated dest buffer and capacity to the caller. // @@ -806,91 +1580,127 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, } // -// appendReplacement the acutal API function, +// appendReplacement the actual API function, // -U_CAPI int32_t U_EXPORT2 -uregex_appendReplacement(URegularExpression *regexp, - const UChar *replacementText, - int32_t replacementLength, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) { +U_CAPI int32_t U_EXPORT2 +uregex_appendReplacement(URegularExpression *regexp2, + const UChar *replacementText, + int32_t replacementLength, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) { + + RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendReplacement( regexp, replacementText, replacementLength,destBuf, destCapacity, status); } +// +// uregex_appendReplacementUText...can just use the normal C++ method +// +U_CAPI void U_EXPORT2 +uregex_appendReplacementUText(URegularExpression *regexp2, + UText *replText, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + regexp->fMatcher->appendReplacement(dest, replText, *status); +} + //------------------------------------------------------------------------------ // // uregex_appendTail // //------------------------------------------------------------------------------ -int32_t RegexCImpl::appendTail(URegularExpression *regexp, - UChar **destBuf, - int32_t *destCapacity, - UErrorCode *status) { +int32_t RegexCImpl::appendTail(RegularExpression *regexp, + UChar **destBuf, + int32_t *destCapacity, + UErrorCode *status) +{ // If we come in with a buffer overflow error, don't suppress the operation. // A series of appendReplacements, appendTail need to correctly preflight // the buffer size when an overflow happens somewhere in the middle. UBool pendingBufferOverflow = FALSE; - if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) { + if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) { pendingBufferOverflow = TRUE; *status = U_ZERO_ERROR; } - if (validateRE(regexp, status) == FALSE) { + if (validateRE(regexp, TRUE, status) == FALSE) { return 0; } - if (destCapacity == NULL || destBuf == NULL || - *destBuf == NULL && *destCapacity > 0 || - *destCapacity < 0) { + + if (destCapacity == NULL || destBuf == NULL || + (*destBuf == NULL && *destCapacity > 0) || + *destCapacity < 0) + { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } - - RegexMatcher *m = regexp->fMatcher; - int32_t srcIdx; - if (m->fMatch) { - // The most recent call to find() succeeded. - srcIdx = m->fMatchEnd; - } else { - // The last call to find() on this matcher failed(). - // Look back to the end of the last find() that succeeded for src index. - srcIdx = m->fLastMatchEnd; - if (srcIdx == -1) { - // There has been no successful match with this matcher. - // We want to copy the whole string. - srcIdx = 0; - } - } + RegexMatcher *m = regexp->fMatcher; int32_t destIdx = 0; int32_t destCap = *destCapacity; UChar *dest = *destBuf; - for (;;) { - if (srcIdx == regexp->fTextLength) { - break; + if (regexp->fText != NULL) { + int32_t srcIdx; + int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); + if (nativeIdx == -1) { + srcIdx = 0; + } else if (UTEXT_USES_U16(m->fInputText)) { + srcIdx = (int32_t)nativeIdx; + } else { + UErrorCode newStatus = U_ZERO_ERROR; + srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &newStatus); } - UChar c = regexp->fText[srcIdx]; - if (c == 0 && regexp->fTextLength == -1) { - break; + + for (;;) { + U_ASSERT(destIdx >= 0); + + if (srcIdx == regexp->fTextLength) { + break; + } + UChar c = regexp->fText[srcIdx]; + if (c == 0 && regexp->fTextLength == -1) { + regexp->fTextLength = srcIdx; + break; + } + + if (destIdx < destCap) { + dest[destIdx] = c; + } else { + // We've overflowed the dest buffer. + // If the total input string length is known, we can + // compute the total buffer size needed without scanning through the string. + if (regexp->fTextLength > 0) { + destIdx += (regexp->fTextLength - srcIdx); + break; + } + } + srcIdx++; + destIdx++; } - if (destIdx < destCap) { - dest[destIdx] = c; + } else { + int64_t srcIdx; + if (m->fMatch) { + // The most recent call to find() succeeded. + srcIdx = m->fMatchEnd; } else { - // We've overflowed the dest buffer. - // If the total input string length is known, we can - // compute the total buffer size needed without scanning through the string. - if (regexp->fTextLength > 0) { - destIdx += (regexp->fTextLength - srcIdx); - break; + // The last call to find() on this matcher failed(). + // Look back to the end of the last find() that succeeded for src index. + srcIdx = m->fLastMatchEnd; + if (srcIdx == -1) { + // There has been no successful match with this matcher. + // We want to copy the whole string. + srcIdx = 0; } } - srcIdx++; - destIdx++; + + destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status); } // @@ -912,7 +1722,7 @@ int32_t RegexCImpl::appendTail(URegularExpression *regexp, if (destIdx < destCap) { *destBuf += destIdx; *destCapacity -= destIdx; - } else { + } else if (*destBuf != NULL) { *destBuf += destCap; *destCapacity = 0; } @@ -925,15 +1735,31 @@ int32_t RegexCImpl::appendTail(URegularExpression *regexp, } -U_CAPI int32_t U_EXPORT2 -uregex_appendTail(URegularExpression *regexp, +// +// appendTail the actual API function +// +U_CAPI int32_t U_EXPORT2 +uregex_appendTail(URegularExpression *regexp2, UChar **destBuf, int32_t *destCapacity, UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); } +// +// uregex_appendTailUText...can just use the normal C++ method +// +U_CAPI UText * U_EXPORT2 +uregex_appendTailUText(URegularExpression *regexp2, + UText *dest, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + return regexp->fMatcher->appendTail(dest, *status); +} + + //------------------------------------------------------------------------------ // // copyString Internal utility to copy a string to an output buffer, @@ -942,6 +1768,7 @@ uregex_appendTail(URegularExpression *regexp, // and the NUL is counted in the output size. // //------------------------------------------------------------------------------ +#if 0 static void copyString(UChar *destBuffer, // Destination buffer. int32_t destCapacity, // Total capacity of dest buffer int32_t *destIndex, // Index into dest buffer. Updated on return. @@ -969,82 +1796,75 @@ static void copyString(UChar *destBuffer, // Destination buffer. di++; *destIndex = di; } - +#endif //------------------------------------------------------------------------------ // // uregex_split // //------------------------------------------------------------------------------ -U_CAPI int32_t U_EXPORT2 -uregex_split( URegularExpression *regexp, - UChar *destBuf, - int32_t destCapacity, - int32_t *requiredCapacity, - UChar *destFields[], - int32_t destFieldsCapacity, - UErrorCode *status) { - if (validateRE(regexp, status) == FALSE) { - return 0; - } - if (destBuf == NULL && destCapacity > 0 || - destCapacity < 0 || - destFields == NULL || - destFieldsCapacity < 1 ) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - +int32_t RegexCImpl::split(RegularExpression *regexp, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { // // Reset for the input text // regexp->fMatcher->reset(); - int32_t inputLen = regexp->fTextString.length(); - int32_t nextOutputStringStart = 0; + UText *inputText = regexp->fMatcher->fInputText; + int64_t nextOutputStringStart = 0; + int64_t inputLen = regexp->fMatcher->fInputLength; if (inputLen == 0) { return 0; } - // // Loop through the input text, searching for the delimiter pattern // int32_t i; // Index of the field being processed. int32_t destIdx = 0; // Next available position in destBuf; int32_t numCaptureGroups = regexp->fMatcher->groupCount(); + UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted for (i=0; ; i++) { if (i>=destFieldsCapacity-1) { - // There are one or zero output string left. + // There are one or zero output strings left. // Fill the last output string with whatever is left from the input, then exit the loop. // ( i will be == destFieldsCapacity if we filled the output array while processing // capture groups of the delimiter expression, in which case we will discard the // last capture group saved in favor of the unprocessed remainder of the // input string.) - int32_t remainingLength = inputLen-nextOutputStringStart; - if (remainingLength > 0) { - } - if (i >= destFieldsCapacity) { - // No fields are left. Recycle the last one for holding the trailing part of - // the input string. - i = destFieldsCapacity-1; - destIdx = (int32_t)(destFields[i] - destFields[0]); + if (inputLen > nextOutputStringStart) { + if (i != destFieldsCapacity-1) { + // No fields are left. Recycle the last one for holding the trailing part of + // the input string. + i = destFieldsCapacity-1; + destIdx = (int32_t)(destFields[i] - destFields[0]); + } + + destFields[i] = &destBuf[destIdx]; + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); } - - destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - ®exp->fText[nextOutputStringStart], remainingLength); break; } - + if (regexp->fMatcher->find()) { // We found another delimiter. Move everything from where we started looking // up until the start of the delimiter into the next output string. - int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart; destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - ®exp->fText[nextOutputStringStart], fieldLen); - nextOutputStringStart = regexp->fMatcher->end(*status); - + + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); + if (tStatus == U_BUFFER_OVERFLOW_ERROR) { + tStatus = U_ZERO_ERROR; + } else { + *status = tStatus; + } + nextOutputStringStart = regexp->fMatcher->fMatchEnd; + // If the delimiter pattern has capturing parentheses, the captured // text goes out into the next n destination strings. int32_t groupNum; @@ -1054,22 +1874,37 @@ uregex_split( URegularExpression *regexp, break; } i++; - + // Set up to extract the capture group contents into the dest buffer. - UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow - // error while extracting this group. - int32_t remainingCapacity = destCapacity - destIdx; - if (remainingCapacity < 0) { - remainingCapacity = 0; - } destFields[i] = &destBuf[destIdx]; - int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus); + tStatus = U_ZERO_ERROR; + int32_t t = uregex_group((URegularExpression*)regexp, + groupNum, + destFields[i], + REMAINING_CAPACITY(destIdx, destCapacity), + &tStatus); destIdx += t + 1; // Record the space used in the output string buffer. // +1 for the NUL that terminates the string. + if (tStatus == U_BUFFER_OVERFLOW_ERROR) { + tStatus = U_ZERO_ERROR; + } else { + *status = tStatus; + } } if (nextOutputStringStart == inputLen) { - // The delimiter was at the end of the string. We're done. + // The delimiter was at the end of the string. + // Output an empty string, and then we are done. + if (destIdx < destCapacity) { + destBuf[destIdx] = 0; + } + if (i < destFieldsCapacity-1) { + ++i; + } + if (destIdx < destCapacity) { + destFields[i] = destBuf + destIdx; + } + ++destIdx; break; } @@ -1079,8 +1914,8 @@ uregex_split( URegularExpression *regexp, // We ran off the end of the input while looking for the next delimiter. // All the remaining text goes into the current output string. destFields[i] = &destBuf[destIdx]; - copyString(destBuf, destCapacity, &destIdx, - ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart); + destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen, + &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status); break; } } @@ -1100,6 +1935,45 @@ uregex_split( URegularExpression *regexp, return i+1; } +// +// uregex_split The actual API function +// +U_CAPI int32_t U_EXPORT2 +uregex_split(URegularExpression *regexp2, + UChar *destBuf, + int32_t destCapacity, + int32_t *requiredCapacity, + UChar *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + if (validateRE(regexp, TRUE, status) == FALSE) { + return 0; + } + if ((destBuf == NULL && destCapacity > 0) || + destCapacity < 0 || + destFields == NULL || + destFieldsCapacity < 1 ) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); +} + + +// +// uregex_splitUText...can just use the normal C++ method +// +U_CAPI int32_t U_EXPORT2 +uregex_splitUText(URegularExpression *regexp2, + UText *destFields[], + int32_t destFieldsCapacity, + UErrorCode *status) { + RegularExpression *regexp = (RegularExpression*)regexp2; + return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); +} + #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS