+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
-* Copyright (C) 2004-2013, International Business Machines
+* Copyright (C) 2004-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uregex.cpp
#include "unicode/uchar.h"
#include "unicode/uobject.h"
#include "unicode/utf16.h"
-#include "umutex.h"
-#include "uassert.h"
#include "cmemory.h"
+#include "uassert.h"
+#include "uhash.h"
+#include "umutex.h"
+#include "uvectr32.h"
#include "regextxt.h"
-#include <stdio.h>
-
U_NAMESPACE_BEGIN
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
re->fPatStringLen = patternLength;
u_memcpy(patBuf, pattern, actualPatLen);
patBuf[actualPatLen] = 0;
-
+
UText patText = UTEXT_INITIALIZER;
utext_openUChars(&patText, patBuf, patternLength, status);
re->fPat = RegexPattern::compile(&patText, flags, *status);
}
utext_close(&patText);
-
+
if (U_FAILURE(*status)) {
goto ErrorExit;
}
uint32_t flags,
UParseError *pe,
UErrorCode *status) {
-
+
if (U_FAILURE(*status)) {
return NULL;
}
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
-
+
int64_t patternNativeLength = utext_nativeLength(pattern);
-
+
if (patternNativeLength == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
-
+
RegularExpression *re = new RegularExpression;
-
+
UErrorCode lengthStatus = U_ZERO_ERROR;
int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
-
+
u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
if (re == NULL || refC == NULL || patBuf == NULL) {
}
re->fPatRefCount = refC;
*re->fPatRefCount = 1;
-
+
//
// Make a copy of the pattern string, so we can return it later if asked.
// For compiling the pattern, we will use a read-only UText wrapper
re->fPatString = patBuf;
re->fPatStringLen = pattern16Length;
utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
-
+
UText patText = UTEXT_INITIALIZER;
utext_openUChars(&patText, patBuf, pattern16Length, status);
-
+
//
// Compile the pattern
//
re->fPat = RegexPattern::compile(&patText, flags, *status);
}
utext_close(&patText);
-
+
if (U_FAILURE(*status)) {
goto ErrorExit;
}
-
+
//
// Create the matcher object
//
if (U_SUCCESS(*status)) {
return (URegularExpression*)re;
}
-
+
ErrorExit:
delete re;
return NULL;
-
+
}
//----------------------------------------------------------------------------------------
// uregex_clone
//
//----------------------------------------------------------------------------------------
-U_CAPI URegularExpression * U_EXPORT2
+U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *source2, UErrorCode *status) {
RegularExpression *source = (RegularExpression*)source2;
if (validateRE(source, FALSE, status) == FALSE) {
}
clone->fPat = source->fPat;
- clone->fPatRefCount = source->fPatRefCount;
+ clone->fPatRefCount = source->fPatRefCount;
clone->fPatString = source->fPatString;
clone->fPatStringLen = source->fPatStringLen;
umtx_atomic_inc(source->fPatRefCount);
// uregex_pattern
//
//------------------------------------------------------------------------------
-U_CAPI const UChar * U_EXPORT2
+U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression *regexp2,
int32_t *patLength,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
-
+
if (validateRE(regexp, FALSE, status) == FALSE) {
return NULL;
}
// uregex_flags
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, FALSE, status) == FALSE) {
// uregex_setText
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setText(URegularExpression *regexp2,
const UChar *text,
int32_t textLength,
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
-
+
if (regexp->fOwnsText && regexp->fText != NULL) {
uprv_free((void *)regexp->fText);
}
-
+
regexp->fText = text;
regexp->fTextLength = textLength;
regexp->fOwnsText = FALSE;
-
+
UText input = UTEXT_INITIALIZER;
utext_openUChars(&input, text, textLength, status);
regexp->fMatcher->reset(&input);
// uregex_setUText
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression *regexp2,
UText *text,
UErrorCode *status) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
-
+
if (regexp->fOwnsText && regexp->fText != NULL) {
uprv_free((void *)regexp->fText);
}
-
+
regexp->fText = NULL; // only fill it in on request
regexp->fTextLength = -1;
regexp->fOwnsText = TRUE;
// uregex_getText
//
//------------------------------------------------------------------------------
-U_CAPI const UChar * U_EXPORT2
+U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp2,
int32_t *textLength,
UErrorCode *status) {
if (validateRE(regexp, FALSE, status) == FALSE) {
return NULL;
}
-
+
if (regexp->fText == NULL) {
// need to fill in the text
UText *inputText = regexp->fMatcher->inputText();
UErrorCode lengthStatus = U_ZERO_ERROR;
regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
-
+
utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
regexp->fText = inputChars;
regexp->fOwnsText = TRUE; // should already be set but just in case
}
}
-
+
if (textLength != NULL) {
*textLength = regexp->fTextLength;
}
// uregex_getUText
//
//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression *regexp2,
UText *dest,
UErrorCode *status) {
// uregex_refreshUText
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression *regexp2,
UText *text,
UErrorCode *status) {
// uregex_matches
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression *regexp2,
int32_t startIndex,
UErrorCode *status) {
return uregex_matches64( regexp2, (int64_t)startIndex, status);
}
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression *regexp2,
int64_t startIndex,
UErrorCode *status) {
// uregex_lookingAt
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression *regexp2,
int32_t startIndex,
UErrorCode *status) {
return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
}
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression *regexp2,
int64_t startIndex,
UErrorCode *status) {
// uregex_find
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression *regexp2,
- int32_t startIndex,
+ int32_t startIndex,
UErrorCode *status) {
return uregex_find64( regexp2, (int64_t)startIndex, status);
}
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression *regexp2,
- int64_t startIndex,
+ int64_t startIndex,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
UBool result = FALSE;
}
if (startIndex == -1) {
regexp->fMatcher->resetPreserveRegion();
- result = regexp->fMatcher->find();
+ result = regexp->fMatcher->find(*status);
} else {
result = regexp->fMatcher->find(startIndex, *status);
}
// uregex_findNext
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
if (validateRE(regexp, TRUE, status) == FALSE) {
return FALSE;
}
- UBool result = regexp->fMatcher->find();
+ UBool result = regexp->fMatcher->find(*status);
return result;
}
// uregex_groupCount
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
}
+//------------------------------------------------------------------------------
+//
+// uregex_groupNumberFromName
+//
+//------------------------------------------------------------------------------
+// Look up the capture group number for a named capture group, with the name
+// supplied as UTF-16 (groupName, nameLength).
+//
+// Only the compiled pattern is consulted: validateRE() is called with FALSE,
+// the same way the other pattern-only accessors in this file (uregex_pattern,
+// uregex_flags) call it.  Returns 0 when validation fails; otherwise the
+// return value and any error written to *status are whatever
+// RegexPattern::groupNumberFromName() produces.
+//
+// Declared with U_CAPI / U_EXPORT2 like every other public entry point
+// defined in this file.
+U_CAPI int32_t U_EXPORT2
+uregex_groupNumberFromName(URegularExpression *regexp2,
+                           const UChar        *groupName,
+                           int32_t             nameLength,
+                           UErrorCode          *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, FALSE, status) == FALSE) {
+        return 0;
+    }
+    // Wrap the caller's UTF-16 buffer in a UnicodeString for the C++ lookup.
+    int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
+    return result;
+}
+
+//------------------------------------------------------------------------------
+//
+//    uregex_groupNumberFromCName
+//
+//    Same lookup as uregex_groupNumberFromName, but the group name is passed
+//    as a char string (groupName, nameLength) and handed unchanged to
+//    RegexPattern::groupNumberFromName().  Returns 0 when validateRE() rejects
+//    the regular expression; otherwise the result and any error in *status
+//    come from the pattern lookup.
+//
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2
+uregex_groupNumberFromCName(URegularExpression *regexp2,
+                            const char         *groupName,
+                            int32_t             nameLength,
+                            UErrorCode          *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    // FALSE: same validateRE() mode used by the other pattern-only accessors.
+    if (validateRE(regexp, FALSE, status) == FALSE) {
+        return 0;
+    }
+    return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
+}
+
//------------------------------------------------------------------------------
//
// uregex_group
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression *regexp2,
int32_t groupNum,
UChar *dest,
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
-
+
if (destCapacity == 0 || regexp->fText != NULL) {
// If preflighting or if we already have the text as UChars,
- // this is a little cheaper than going through uregex_groupUTextDeep()
-
+ // this is a little cheaper than extracting from the UText
+
//
// Pick up the range of characters from the matcher
//
//
// Trim length based on buffer capacity
- //
+ //
int32_t fullLength = endIx - startIx;
int32_t copyLength = fullLength;
if (copyLength < destCapacity) {
copyLength = destCapacity;
*status = U_BUFFER_OVERFLOW_ERROR;
}
-
+
//
// Copy capture group to user's buffer
//
}
return fullLength;
} else {
- int32_t result = 0;
- UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
- if (U_SUCCESS(*status)) {
- result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
+ int64_t start = regexp->fMatcher->start64(groupNum, *status);
+ int64_t limit = regexp->fMatcher->end64(groupNum, *status);
+ if (U_FAILURE(*status)) {
+ return 0;
}
- utext_close(groupText);
- return result;
+ // Note edge cases:
+ // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
+ // Zero Length Match: start == end.
+ int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
+ return length;
}
+
}
// uregex_groupUText
//
//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression *regexp2,
int32_t groupNum,
UText *dest,
return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
}
-//------------------------------------------------------------------------------
-//
-// uregex_groupUTextDeep
-//
-//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
-uregex_groupUTextDeep(URegularExpression *regexp2,
- int32_t groupNum,
- UText *dest,
- UErrorCode *status) {
- RegularExpression *regexp = (RegularExpression*)regexp2;
- if (validateRE(regexp, TRUE, status) == FALSE) {
- UErrorCode emptyTextStatus = U_ZERO_ERROR;
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
- }
-
- if (regexp->fText != NULL) {
- //
- // Pick up the range of characters from the matcher
- // and use our already-extracted characters
- //
- int32_t startIx = regexp->fMatcher->start(groupNum, *status);
- int32_t endIx = regexp->fMatcher->end (groupNum, *status);
- if (U_FAILURE(*status)) {
- UErrorCode emptyTextStatus = U_ZERO_ERROR;
- return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
- }
-
- if (dest) {
- utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
- } else {
- UText groupText = UTEXT_INITIALIZER;
- utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
- dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
- utext_close(&groupText);
- }
-
- return dest;
- } else {
- return regexp->fMatcher->group(groupNum, dest, *status);
- }
-}
-
//------------------------------------------------------------------------------
//
// uregex_start
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
return (int32_t)uregex_start64( regexp2, groupNum, status);
}
-U_CAPI int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
- int32_t result = regexp->fMatcher->start(groupNum, *status);
+ int64_t result = regexp->fMatcher->start64(groupNum, *status);
return result;
}
// uregex_end
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
return (int32_t)uregex_end64( regexp2, groupNum, status);
}
-U_CAPI int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression *regexp2,
int32_t groupNum,
UErrorCode *status) {
if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
- int32_t result = regexp->fMatcher->end(groupNum, *status);
+ int64_t result = regexp->fMatcher->end64(groupNum, *status);
return result;
}
// uregex_reset
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_reset(URegularExpression *regexp2,
int32_t index,
UErrorCode *status) {
uregex_reset64( regexp2, (int64_t)index, status);
}
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression *regexp2,
int64_t index,
UErrorCode *status) {
// uregex_setRegion
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression *regexp2,
int32_t regionStart,
int32_t regionLimit,
uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
}
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression *regexp2,
int64_t regionStart,
int64_t regionLimit,
// uregex_setRegionAndStart
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setRegionAndStart(URegularExpression *regexp2,
int64_t regionStart,
int64_t regionLimit,
// uregex_regionStart
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp2,
UErrorCode *status) {
return (int32_t)uregex_regionStart64(regexp2, status);
}
-U_CAPI int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_regionEnd
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp2,
UErrorCode *status) {
return (int32_t)uregex_regionEnd64(regexp2, status);
}
-U_CAPI int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_hasTransparentBounds
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_useTransparentBounds
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression *regexp2,
UBool b,
UErrorCode *status) {
// uregex_hasAnchoringBounds
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_useAnchoringBounds
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression *regexp2,
UBool b,
UErrorCode *status) {
// uregex_hitEnd
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_requireEnd
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression *regexp2,
UErrorCode *status) {
RegularExpression *regexp = (RegularExpression*)regexp2;
// uregex_setTimeLimit
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression *regexp2,
int32_t limit,
UErrorCode *status) {
// uregex_getTimeLimit
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression *regexp2,
UErrorCode *status) {
int32_t retVal = 0;
// uregex_setStackLimit
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression *regexp2,
int32_t limit,
UErrorCode *status) {
// uregex_getStackLimit
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression *regexp2,
UErrorCode *status) {
int32_t retVal = 0;
// uregex_getMatchCallback
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression *regexp2,
URegexMatchCallback **callback,
const void **context,
// uregex_getMatchCallback
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression *regexp2,
URegexFindProgressCallback **callback,
const void **context,
// uregex_replaceAll
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression *regexp2,
const UChar *replacementText,
int32_t replacementLength,
&destBuf, &destCapacity, status);
}
len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
-
+
if (U_FAILURE(findStatus)) {
// If anything went wrong with the findNext(), make that error trump
// whatever may have happened with the append() operations.
// uregex_replaceAllUText
//
//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression *regexp2,
UText *replacementText,
UText *dest,
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
-
+
dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
return dest;
}
-
+
//------------------------------------------------------------------------------
//
// uregex_replaceFirst
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression *regexp2,
const UChar *replacementText,
int32_t replacementLength,
uregex_reset(regexp2, 0, status);
findSucceeded = uregex_find(regexp2, 0, status);
if (findSucceeded) {
- len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
+ len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
&destBuf, &destCapacity, status);
}
len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
// uregex_replaceFirstUText
//
//------------------------------------------------------------------------------
-U_CAPI UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression *regexp2,
UText *replacementText,
UText *dest,
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
-
+
dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
return dest;
}
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status);
-
+
inline static int32_t split(RegularExpression *regexp,
UChar *destBuf,
int32_t destCapacity,
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
+static const UChar LEFTBRACKET = 0x7b; // '{' (a curly brace, despite the name): opens a ${name} group reference in replacement text
+static const UChar RIGHTBRACKET = 0x7d; // '}' (a curly brace, despite the name): closes a ${name} group reference
//
// Move a character to an output buffer, with bounds checking on the index.
return 0;
}
if (replacementText == NULL || replacementLength < -1 ||
- destCapacity == NULL || destBuf == NULL ||
+ destCapacity == NULL || destBuf == NULL ||
(*destBuf == NULL && *destCapacity > 0) ||
*destCapacity < 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
int32_t capacity = *destCapacity;
int32_t destIdx = 0;
int32_t i;
-
+
// If it wasn't supplied by the caller, get the length of the replacement text.
// TODO: slightly smarter logic in the copy loop could watch for the NUL on
// the fly and avoid this step.
matchStart = (int32_t)m->fMatchStart;
} else {
// !!!: Would like a better way to do this!
- UErrorCode status = U_ZERO_ERROR;
- lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
- status = U_ZERO_ERROR;
- matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
+ UErrorCode tempStatus = U_ZERO_ERROR;
+ lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
+ tempStatus = U_ZERO_ERROR;
+ matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
}
for (i=lastMatchEnd; i<matchStart; i++) {
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
- }
+ }
} else {
UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
// scan the replacement text, looking for substitutions ($n) and \escapes.
int32_t replIdx = 0;
- while (replIdx < replacementLength) {
+ while (replIdx < replacementLength && U_SUCCESS(*status)) {
UChar c = replacementText[replIdx];
replIdx++;
if (c != DOLLARSIGN && c != BACKSLASH) {
- // Common case, no substitution, no escaping,
+ // Common case, no substitution, no escaping,
// just copy the char to the dest buf.
appendToBuf(c, &destIdx, dest, capacity);
continue;
if (c==0x55/*U*/ || c==0x75/*u*/) {
// We have a \udddd or \Udddddddd escape sequence.
- UChar32 escapedChar =
+ UChar32 escapedChar =
u_unescapeAt(uregex_ucstr_unescape_charAt,
- &replIdx, // Index is updated by unescapeAt
+ &replIdx, // Index is updated by unescapeAt
replacementLength, // Length of replacement text
(void *)replacementText);
continue;
}
+ // We've got a $. Pick up the following capture group name or number.
+ // For numbers, consume only digits that produce a valid capture group for the pattern.
-
- // We've got a $. Pick up a capture group number if one follows.
- // Consume at most the number of digits necessary for the largest capture
- // number that is valid for this pattern.
-
- int32_t numDigits = 0;
int32_t groupNum = 0;
- UChar32 digitC;
- for (;;) {
- if (replIdx >= replacementLength) {
- break;
- }
- U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
- if (u_isdigit(digitC) == FALSE) {
- break;
- }
+ U_ASSERT(c == DOLLARSIGN);
+ UChar32 c32 = -1;
+ if (replIdx < replacementLength) {
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ }
+ if (u_isdigit(c32)) {
+ int32_t numDigits = 0;
+ int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
+ for (;;) {
+ if (replIdx >= replacementLength) {
+ break;
+ }
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ if (u_isdigit(c32) == FALSE) {
+ break;
+ }
+ int32_t digitVal = u_charDigitValue(c32);
+ if (groupNum * 10 + digitVal <= numCaptureGroups) {
+ groupNum = groupNum * 10 + digitVal;
+ U16_FWD_1(replacementText, replIdx, replacementLength);
+ numDigits++;
+ } else {
+ if (numDigits == 0) {
+ *status = U_INDEX_OUTOFBOUNDS_ERROR;
+ }
+ break;
+ }
+ }
+ } else if (c32 == LEFTBRACKET) {
+ // Scan for Named Capture Group, ${name}.
+ UnicodeString groupName;
U16_FWD_1(replacementText, replIdx, replacementLength);
- groupNum=groupNum*10 + u_charDigitValue(digitC);
- numDigits++;
- if (numDigits >= m->fPattern->fMaxCaptureDigits) {
- break;
+                // Collect the characters of the group name up to the closing '}'.
+                // On entry c32 still holds the opening '{', so the loop body runs
+                // at least once.  Any failure is reported through *status, which
+                // also terminates the loop.
+                while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
+                    if (replIdx >= replacementLength) {
+                        // Replacement text ended before the closing '}' was seen.
+                        *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                        break;
+                    }
+                    U16_NEXT(replacementText, replIdx, replacementLength, c32);
+                    // Name characters are ASCII letters and digits.  The digit range
+                    // must begin at 0x30 ('0'); beginning at 0x31 rejected every
+                    // name containing a zero, e.g. ${grp0} or ${item10}.
+                    if ((c32 >= 0x41 && c32 <= 0x5a) ||       // A..Z
+                        (c32 >= 0x61 && c32 <= 0x7a) ||       // a..z
+                        (c32 >= 0x30 && c32 <= 0x39)) {       // 0..9
+                        groupName.append(c32);
+                    } else if (c32 == RIGHTBRACKET) {
+                        // End of name: map it to a group number.  A missing map or
+                        // a lookup result of 0 both mean "no such named group".
+                        groupNum = regexp->fPat->fNamedCaptureMap ?
+                                uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName) : 0;
+                        if (groupNum == 0) {
+                            // Name not defined by pattern.
+                            *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                        }
+                    } else {
+                        // Character was something other than a name char or a closing '}'
+                        *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    }
                 }
+ } else {
+ // $ not followed by {name} or digits.
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
- if (numDigits == 0) {
- // The $ didn't introduce a group number at all.
- // Treat it as just part of the substitution text.
- appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
- continue;
- }
-
// Finally, append the capture group data to the destination.
- destIdx += uregex_group((URegularExpression*)regexp, groupNum,
- dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- // Ignore buffer overflow when extracting the group. We need to
- // continue on to get full size of the untruncated result. We will
- // raise our own buffer overflow error at the end.
- *status = U_ZERO_ERROR;
+ if (U_SUCCESS(*status)) {
+ destIdx += uregex_group((URegularExpression*)regexp, groupNum,
+ dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
+ if (*status == U_BUFFER_OVERFLOW_ERROR) {
+ // Ignore buffer overflow when extracting the group. We need to
+ // continue on to get full size of the untruncated result. We will
+ // raise our own buffer overflow error at the end.
+ *status = U_ZERO_ERROR;
+ }
}
if (U_FAILURE(*status)) {
- // Can fail if group number is out of range.
+ // bad group number or name.
break;
}
-
}
//
//
if (destIdx < capacity) {
dest[destIdx] = 0;
- } else if (destIdx == *destCapacity) {
- *status = U_STRING_NOT_TERMINATED_WARNING;
- } else {
- *status = U_BUFFER_OVERFLOW_ERROR;
+ } else if (U_SUCCESS(*status)) {
+ if (destIdx == *destCapacity) {
+ *status = U_STRING_NOT_TERMINATED_WARNING;
+ } else {
+ *status = U_BUFFER_OVERFLOW_ERROR;
+ }
}
-
+
//
// Return an updated dest buffer and capacity to the caller.
//
//
// appendReplacement the actual API function,
//
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression *regexp2,
const UChar *replacementText,
int32_t replacementLength,
UChar **destBuf,
int32_t *destCapacity,
UErrorCode *status) {
-
+
RegularExpression *regexp = (RegularExpression*)regexp2;
return RegexCImpl::appendReplacement(
regexp, replacementText, replacementLength,destBuf, destCapacity, status);
//
// uregex_appendReplacementUText...can just use the normal C++ method
//
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression *regexp2,
UText *replText,
UText *dest,
if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
-
- if (destCapacity == NULL || destBuf == NULL ||
+
+ if (destCapacity == NULL || destBuf == NULL ||
(*destBuf == NULL && *destCapacity > 0) ||
*destCapacity < 0)
{
int32_t destIdx = 0;
int32_t destCap = *destCapacity;
UChar *dest = *destBuf;
-
+
if (regexp->fText != NULL) {
int32_t srcIdx;
int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
} else if (UTEXT_USES_U16(m->fInputText)) {
srcIdx = (int32_t)nativeIdx;
} else {
- UErrorCode status = U_ZERO_ERROR;
- srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
+ UErrorCode newStatus = U_ZERO_ERROR;
+ srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &newStatus);
}
-
+
for (;;) {
U_ASSERT(destIdx >= 0);
}
srcIdx++;
destIdx++;
- }
+ }
} else {
int64_t srcIdx;
if (m->fMatch) {
- // The most recent call to find() succeeded.
+ // The most recent call to find() succeeded.
srcIdx = m->fMatchEnd;
} else {
// The last call to find() on this matcher failed().
//
// appendTail the actual API function
//
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression *regexp2,
UChar **destBuf,
int32_t *destCapacity,
//
// uregex_appendTailUText...can just use the normal C++ method
//
-U_CAPI UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression *regexp2,
UText *dest,
UErrorCode *status) {
i = destFieldsCapacity-1;
destIdx = (int32_t)(destFields[i] - destFields[0]);
}
-
+
destFields[i] = &destBuf[destIdx];
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
}
break;
}
-
+
if (regexp->fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
destFields[i] = &destBuf[destIdx];
-
+
destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
&destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
*status = tStatus;
}
nextOutputStringStart = regexp->fMatcher->fMatchEnd;
-
+
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
break;
}
i++;
-
+
// Set up to extract the capture group contents into the dest buffer.
destFields[i] = &destBuf[destIdx];
tStatus = U_ZERO_ERROR;
- int32_t t = uregex_group((URegularExpression*)regexp,
- groupNum,
- destFields[i],
- REMAINING_CAPACITY(destIdx, destCapacity),
+ int32_t t = uregex_group((URegularExpression*)regexp,
+ groupNum,
+ destFields[i],
+ REMAINING_CAPACITY(destIdx, destCapacity),
&tStatus);
destIdx += t + 1; // Record the space used in the output string buffer.
// +1 for the NUL that terminates the string.
}
if (nextOutputStringStart == inputLen) {
- // The delimiter was at the end of the string.
+ // The delimiter was at the end of the string.
// Output an empty string, and then we are done.
if (destIdx < destCapacity) {
destBuf[destIdx] = 0;
//
// uregex_split The actual API function
//
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression *regexp2,
UChar *destBuf,
int32_t destCapacity,
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
-
+
return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
}
-
+
//
// uregex_splitUText...can just use the normal C++ method
//
-U_CAPI int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
uregex_splitUText(URegularExpression *regexp2,
UText *destFields[],
int32_t destFieldsCapacity,