+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
-* Copyright (C) 2004-2005, International Business Machines
+* Copyright (C) 2004-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
-* file name: regex.cpp
+* file name: uregex.cpp
*/
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uobject.h"
-#include "umutex.h"
-#include "uassert.h"
+#include "unicode/utf16.h"
#include "cmemory.h"
+#include "uassert.h"
+#include "uhash.h"
+#include "umutex.h"
+#include "uvectr32.h"
+
+#include "regextxt.h"
+
+U_NAMESPACE_BEGIN
+
+#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
-struct URegularExpression: public UMemory {
+struct RegularExpression: public UMemory {   // Implementation object behind the opaque URegularExpression handle; the API functions cast between the two.
public:
- URegularExpression();
- ~URegularExpression();
+ RegularExpression();
+ ~RegularExpression();
int32_t fMagic;
RegexPattern *fPat;
- int32_t *fPatRefCount;
+ u_atomic_int32_t *fPatRefCount;   // Counter shared with clones; the destructor frees the pattern when it drops to zero.
UChar *fPatString;
int32_t fPatStringLen;
RegexMatcher *fMatcher;
const UChar *fText; // Text from setText()
int32_t fTextLength; // Length provided by user with setText(), which
// may be -1.
-
- UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
- // TODO: regexp engine should not depend on UnicodeString.
+ UBool fOwnsText;   // When TRUE, a non-NULL fText is released with uprv_free() by this object.
};
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
-U_NAMESPACE_USE
-
-URegularExpression::URegularExpression() {
+// Default constructor: puts every member into a known state so the object
+// can be inspected or destroyed safely even if later setup is abandoned.
+RegularExpression::RegularExpression() {
fMagic = REXP_MAGIC;
fPat = NULL;
fPatRefCount = NULL;
+    fPatString = NULL;       // was left uninitialized
+    fPatStringLen = 0;       // was left uninitialized
fMatcher = NULL;
fText = NULL;
fTextLength = 0;
+ fOwnsText = FALSE;
}
-URegularExpression::~URegularExpression() {
+RegularExpression::~RegularExpression() {   // Deletes the matcher, drops one reference on the shared pattern, and frees fText when this object owns it.
delete fMatcher;
fMatcher = NULL;
if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
delete fPat;
uprv_free(fPatString);
- uprv_free(fPatRefCount);
+ uprv_free((void *)fPatRefCount);   // pattern, pattern string copy and counter are released together once the count reaches zero
+ }
+ if (fOwnsText && fText!=NULL) {
+ uprv_free((void *)fText);
}
fMagic = 0;   // clears the magic value that validateRE() compares against REXP_MAGIC
}
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
//----------------------------------------------------------------------------------------
//
// validateRE Do boilerplate style checks on API function parameters.
// Return TRUE if they look OK.
//----------------------------------------------------------------------------------------
-static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
+static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
if (U_FAILURE(*status)) {
return FALSE;
}
if (re == NULL || re->fMagic != REXP_MAGIC) {
- // U_ASSERT(FALSE);
*status = U_ILLEGAL_ARGUMENT_ERROR;
return FALSE;
}
- if (requiresText && re->fText == NULL) {
+ // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
+ if (requiresText && re->fText == NULL && !re->fOwnsText) {
*status = U_REGEX_INVALID_STATE;
return FALSE;
}
actualPatLen = u_strlen(pattern);
}
- URegularExpression *re = new URegularExpression;
- int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
+ RegularExpression *re = new RegularExpression;
+ u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
if (re == NULL || refC == NULL || patBuf == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
delete re;
- uprv_free(refC);
+ uprv_free((void *)refC);
uprv_free(patBuf);
return NULL;
}
//
// Make a copy of the pattern string, so we can return it later if asked.
- // For compiling the pattern, we will use a read-only-aliased UnicodeString
- // of this local copy, to avoid making even more copies.
+ // For compiling the pattern, we will use a UText wrapper around
+ // this local copy, to avoid making even more copies.
//
re->fPatString = patBuf;
re->fPatStringLen = patternLength;
u_memcpy(patBuf, pattern, actualPatLen);
patBuf[actualPatLen] = 0;
- UnicodeString patString(patternLength==-1, patBuf, patternLength);
+
+ UText patText = UTEXT_INITIALIZER;
+ utext_openUChars(&patText, patBuf, patternLength, status);
+
+ //
+ // Compile the pattern
+ //
+ if (pe != NULL) {
+ re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
+ } else {
+ re->fPat = RegexPattern::compile(&patText, flags, *status);
+ }
+ utext_close(&patText);
+
+ if (U_FAILURE(*status)) {
+ goto ErrorExit;
+ }
+
+ //
+ // Create the matcher object
+ //
+ re->fMatcher = re->fPat->matcher(*status);
+ if (U_SUCCESS(*status)) {
+ return (URegularExpression*)re;
+ }
+
+ErrorExit:
+ delete re;
+ return NULL;
+
+}
+
+//----------------------------------------------------------------------------------------
+//
+// uregex_openUText
+//
+//----------------------------------------------------------------------------------------
+U_CAPI URegularExpression * U_EXPORT2
+uregex_openUText(UText *pattern,
+ uint32_t flags,
+ UParseError *pe,
+ UErrorCode *status) {
+
+ if (U_FAILURE(*status)) {
+ return NULL;
+ }
+ if (pattern == NULL) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
+ }
+
+ int64_t patternNativeLength = utext_nativeLength(pattern);
+
+ if (patternNativeLength == 0) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
+ }
+
+ RegularExpression *re = new RegularExpression;
+
+ UErrorCode lengthStatus = U_ZERO_ERROR;
+ int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
+
+ u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
+ UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
+ if (re == NULL || refC == NULL || patBuf == NULL) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ delete re;
+ uprv_free((void *)refC);
+ uprv_free(patBuf);
+ return NULL;
+ }
+ re->fPatRefCount = refC;
+ *re->fPatRefCount = 1;
+
+ //
+ // Make a copy of the pattern string, so we can return it later if asked.
+ // For compiling the pattern, we will use a read-only UText wrapper
+ // around this local copy, to avoid making even more copies.
+ //
+ re->fPatString = patBuf;
+ re->fPatStringLen = pattern16Length;
+ utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
+
+ UText patText = UTEXT_INITIALIZER;
+ utext_openUChars(&patText, patBuf, pattern16Length, status);
//
// Compile the pattern
//
if (pe != NULL) {
- re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
+ re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
} else {
- re->fPat = RegexPattern::compile(patString, flags, *status);
+ re->fPat = RegexPattern::compile(&patText, flags, *status);
}
+ utext_close(&patText);
+
if (U_FAILURE(*status)) {
goto ErrorExit;
}
//
re->fMatcher = re->fPat->matcher(*status);
if (U_SUCCESS(*status)) {
- return re;
+ return (URegularExpression*)re;
}
ErrorExit:
//
//----------------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
-uregex_close(URegularExpression *re) {
+uregex_close(URegularExpression *re2) {
+ RegularExpression *re = (RegularExpression*)re2;
UErrorCode status = U_ZERO_ERROR;
- if (validateRE(re, &status, FALSE) == FALSE) {
+ if (validateRE(re, FALSE, &status) == FALSE) {
return;
}
delete re;
// uregex_clone
//
//----------------------------------------------------------------------------------------
-U_CAPI URegularExpression * U_EXPORT2
-uregex_clone(const URegularExpression *source, UErrorCode *status) {
- if (validateRE(source, status, FALSE) == FALSE) {
+U_CAPI URegularExpression * U_EXPORT2
+uregex_clone(const URegularExpression *source2, UErrorCode *status) {
+ RegularExpression *source = (RegularExpression*)source2;
+ if (validateRE(source, FALSE, status) == FALSE) {
return NULL;
}
- URegularExpression *clone = new URegularExpression;
+ RegularExpression *clone = new RegularExpression;
if (clone == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
delete clone;
return NULL;
}
- if (clone == NULL) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
clone->fPat = source->fPat;
- clone->fPatRefCount = source->fPatRefCount;
+ clone->fPatRefCount = source->fPatRefCount;
clone->fPatString = source->fPatString;
clone->fPatStringLen = source->fPatStringLen;
umtx_atomic_inc(source->fPatRefCount);
// Note: fText is not cloned.
- return clone;
+ return (URegularExpression*)clone;
}
// uregex_pattern
//
//------------------------------------------------------------------------------
-U_CAPI const UChar * U_EXPORT2
-uregex_pattern(const URegularExpression *regexp,
- int32_t *patLength,
- UErrorCode *status) {
-
- if (validateRE(regexp, status, FALSE) == FALSE) {
+U_CAPI const UChar * U_EXPORT2
+uregex_pattern(const URegularExpression *regexp2,
+ int32_t *patLength,
+ UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+
+ if (validateRE(regexp, FALSE, status) == FALSE) {
return NULL;
}
if (patLength != NULL) {
}
+//------------------------------------------------------------------------------
+//
+// uregex_patternUText
+//
+//------------------------------------------------------------------------------
+// Returns the pattern's source text as a UText, as produced by
+// RegexPattern::patternText().
+//
+// regexp2  Handle returned by uregex_open()/uregex_openUText().
+// status   In/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when the handle
+//          is NULL or not a live RegularExpression.
+// Returns NULL when validation fails.
+U_CAPI UText * U_EXPORT2
+uregex_patternUText(const URegularExpression *regexp2,
+                    UErrorCode               *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    // Validate like every other entry point instead of dereferencing an
+    // unchecked handle; no input text is required to query the pattern.
+    if (validateRE(regexp, FALSE, status) == FALSE) {
+        return NULL;
+    }
+    return regexp->fPat->patternText(*status);
+}
+
+
//------------------------------------------------------------------------------
//
// uregex_flags
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
- if (validateRE(regexp, status, FALSE) == FALSE) {
+U_CAPI int32_t U_EXPORT2
+uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status) == FALSE) {
return 0;
}
int32_t flags = regexp->fPat->flags();
// uregex_setText
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
-uregex_setText(URegularExpression *regexp,
+U_CAPI void U_EXPORT2   // Points the matcher at a caller-supplied UChar buffer; the pointer and length are remembered, not copied.
+uregex_setText(URegularExpression *regexp2,
const UChar *text,
int32_t textLength,
UErrorCode *status) {
- if (validateRE(regexp, status, FALSE) == FALSE) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status) == FALSE) {
return;
}
if (text == NULL || textLength < -1) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
+
+ if (regexp->fOwnsText && regexp->fText != NULL) {   // free a previous text buffer that this object owns
+ uprv_free((void *)regexp->fText);
+ }
+
regexp->fText = text;
regexp->fTextLength = textLength;
- UBool isTerminated = (textLength == -1);
+ regexp->fOwnsText = FALSE;   // the new buffer is not freed by this object
- regexp->fTextString.setTo(isTerminated, text, textLength);
- regexp->fMatcher->reset(regexp->fTextString);
+ UText input = UTEXT_INITIALIZER;
+ utext_openUChars(&input, text, textLength, status);
+ regexp->fMatcher->reset(&input);
+ utext_close(&input); // reset() made a shallow clone, so we don't need this copy
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_setUText
+//
+//------------------------------------------------------------------------------
+// Makes the matcher operate on the supplied UText. Any UChar copy of an
+// earlier text that this object owns is freed; a new copy is only produced
+// later, on request.
+//
+// Sets U_ILLEGAL_ARGUMENT_ERROR when text is NULL.
+U_CAPI void U_EXPORT2
+uregex_setUText(URegularExpression *regexp2,
+                UText              *text,
+                UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return;
+    }
+    if (text == NULL) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    const UBool hasOwnedBuffer = (re->fOwnsText && re->fText != NULL);
+    if (hasOwnedBuffer) {
+        uprv_free((void *)re->fText);
+    }
+
+    re->fOwnsText   = TRUE;
+    re->fTextLength = -1;
+    re->fText       = NULL;   // only fill it in on request
+    re->fMatcher->reset(text);
}
// uregex_getText
//
//------------------------------------------------------------------------------
-U_CAPI const UChar * U_EXPORT2
-uregex_getText(URegularExpression *regexp,
+// Returns the current input text as UChars, writing its length to *textLength
+// when textLength is non-NULL.
+//
+// If no UChar form is cached (fText == NULL), one is produced from the
+// matcher's UText: either by aliasing the UText's chunk when the whole text
+// sits in it, or by extracting into a newly allocated buffer that this object
+// then owns.
+//
+// Returns NULL with U_MEMORY_ALLOCATION_ERROR when that buffer cannot be
+// allocated, or NULL when the handle fails validation.
+U_CAPI const UChar * U_EXPORT2
+uregex_getText(URegularExpression *regexp2,
int32_t *textLength,
UErrorCode *status) {
- if (validateRE(regexp, status, FALSE) == FALSE) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status) == FALSE) {
return NULL;
}
+
+    if (regexp->fText == NULL) {
+        // need to fill in the text
+        UText *inputText = regexp->fMatcher->inputText();
+        int64_t inputNativeLength = utext_nativeLength(inputText);
+        if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
+            regexp->fText = inputText->chunkContents;
+            regexp->fTextLength = (int32_t)inputNativeLength;
+            regexp->fOwnsText = FALSE; // because the UText owns it
+        } else {
+            // Preflight with a scratch status: the expected buffer-overflow
+            // result must not leak into the caller's status.
+            UErrorCode lengthStatus = U_ZERO_ERROR;
+            int32_t extractedLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus);
+            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(extractedLength+1));
+            if (inputChars == NULL) {
+                // Do not hand a NULL destination to utext_extract(); leave
+                // the cached state untouched and report the failure.
+                *status = U_MEMORY_ALLOCATION_ERROR;
+                return NULL;
+            }
+
+            utext_extract(inputText, 0, inputNativeLength, inputChars, extractedLength+1, status);
+            regexp->fText = inputChars;
+            regexp->fTextLength = extractedLength;
+            regexp->fOwnsText = TRUE; // should already be set but just in case
+        }
+    }
+
if (textLength != NULL) {
*textLength = regexp->fTextLength;
}
- return regexp->fText;
+ return regexp->fText;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_getUText
+//
+//------------------------------------------------------------------------------
+// Returns the matcher's input via RegexMatcher::getInput(dest, status).
+// When the handle fails validation, dest is handed back untouched.
+U_CAPI UText * U_EXPORT2
+uregex_getUText(URegularExpression *regexp2,
+                UText              *dest,
+                UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return dest;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->getInput(dest, *status);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_refreshUText
+//
+//------------------------------------------------------------------------------
+// Forwards text to RegexMatcher::refreshInputText() once the handle has been
+// validated; no prior input text is required.
+U_CAPI void U_EXPORT2
+uregex_refreshUText(URegularExpression *regexp2,
+                    UText              *text,
+                    UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, FALSE, status)) {
+        re->fMatcher->refreshInputText(text, *status);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_matches
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: widens startIndex and defers to uregex_matches64().
+U_CAPI UBool U_EXPORT2
+uregex_matches(URegularExpression *regexp2,
+               int32_t             startIndex,
+               UErrorCode         *status) {
+    const int64_t wideStart = (int64_t)startIndex;
+    return uregex_matches64(regexp2, wideStart, status);
+}
+
+// Runs RegexMatcher::matches(). A startIndex of -1 selects the overload that
+// takes no start position; any other value is passed through as the start.
+// Returns FALSE when the handle fails validation (input text is required).
+U_CAPI UBool U_EXPORT2
+uregex_matches64(URegularExpression *regexp2,
+                 int64_t             startIndex,
+                 UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    if (startIndex != -1) {
+        return matcher->matches(startIndex, *status);
+    }
+    return matcher->matches(*status);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_lookingAt
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: widens startIndex and defers to uregex_lookingAt64().
+U_CAPI UBool U_EXPORT2
+uregex_lookingAt(URegularExpression *regexp2,
+                 int32_t             startIndex,
+                 UErrorCode         *status) {
+    const int64_t wideStart = (int64_t)startIndex;
+    return uregex_lookingAt64(regexp2, wideStart, status);
+}
+
+// Runs RegexMatcher::lookingAt(). A startIndex of -1 selects the overload
+// that takes no start position; any other value is passed through.
+// Returns FALSE when the handle fails validation (input text is required).
+U_CAPI UBool U_EXPORT2
+uregex_lookingAt64(URegularExpression *regexp2,
+                   int64_t             startIndex,
+                   UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    if (startIndex != -1) {
+        return matcher->lookingAt(startIndex, *status);
+    }
+    return matcher->lookingAt(*status);
+}
+
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_find
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: widens startIndex and defers to uregex_find64().
+U_CAPI UBool U_EXPORT2
+uregex_find(URegularExpression *regexp2,
+            int32_t             startIndex,
+            UErrorCode         *status) {
+    const int64_t wideStart = (int64_t)startIndex;
+    return uregex_find64(regexp2, wideStart, status);
+}
+
+// Searches for a match. With startIndex == -1 the matcher is first reset via
+// resetPreserveRegion() and then find(status) is called; otherwise
+// find(startIndex, status) is called directly.
+// Returns FALSE when the handle fails validation (input text is required).
+U_CAPI UBool U_EXPORT2
+uregex_find64(URegularExpression *regexp2,
+              int64_t             startIndex,
+              UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    if (startIndex != -1) {
+        return matcher->find(startIndex, *status);
+    }
+    matcher->resetPreserveRegion();
+    return matcher->find(*status);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_findNext
+//
+//------------------------------------------------------------------------------
+// Calls RegexMatcher::find(status) on the matcher in its current state.
+// Returns FALSE when the handle fails validation (input text is required).
+U_CAPI UBool U_EXPORT2
+uregex_findNext(URegularExpression *regexp2,
+                UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return FALSE;
+    }
+    return re->fMatcher->find(*status);
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_groupCount
+//
+//------------------------------------------------------------------------------
+// Returns RegexMatcher::groupCount(), or 0 when the handle fails validation.
+// No input text is required.
+U_CAPI int32_t U_EXPORT2
+uregex_groupCount(URegularExpression *regexp2,
+                  UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return 0;
+    }
+    return re->fMatcher->groupCount();
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_groupNumberFromName
+//
+//------------------------------------------------------------------------------
+// Looks up a capture group number from its name given as UChars, using
+// RegexPattern::groupNumberFromName(). Returns 0 when the handle fails
+// validation. Decorated with U_CAPI/U_EXPORT2 like the other API definitions
+// in this file.
+U_CAPI int32_t U_EXPORT2
+uregex_groupNumberFromName(URegularExpression *regexp2,
+                           const UChar        *groupName,
+                           int32_t             nameLength,
+                           UErrorCode         *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, FALSE, status) == FALSE) {
+        return 0;
+    }
+    int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
+    return result;
+}
+
+// Looks up a capture group number from its name given as a char string,
+// using RegexPattern::groupNumberFromName(). Returns 0 when the handle fails
+// validation. Decorated with U_CAPI/U_EXPORT2 like the other API definitions
+// in this file.
+U_CAPI int32_t U_EXPORT2
+uregex_groupNumberFromCName(URegularExpression *regexp2,
+                            const char         *groupName,
+                            int32_t             nameLength,
+                            UErrorCode         *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, FALSE, status) == FALSE) {
+        return 0;
+    }
+    return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_group
+//
+//------------------------------------------------------------------------------
+// Copies the text of capture group groupNum from the most recent match into
+// dest.
+//
+// dest / destCapacity  Caller's buffer; destCapacity == 0 preflights the
+//                      length and dest may then be NULL.
+// status               U_ILLEGAL_ARGUMENT_ERROR for a negative capacity or a
+//                      NULL dest with non-zero capacity. On the UChar path,
+//                      U_STRING_NOT_TERMINATED_WARNING when the group fills
+//                      dest exactly, U_BUFFER_OVERFLOW_ERROR when it does not
+//                      fit.
+// Returns the group's length; 0 on validation or matcher failure.
+U_CAPI int32_t U_EXPORT2
+uregex_group(URegularExpression *regexp2,
+             int32_t             groupNum,
+             UChar              *dest,
+             int32_t             destCapacity,
+             UErrorCode         *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, TRUE, status) == FALSE) {
+        return 0;
+    }
+    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    if (destCapacity == 0 || regexp->fText != NULL) {
+        // If preflighting or if we already have the text as UChars,
+        // this is a little cheaper than extracting from the UText
+
+        //
+        // Pick up the range of characters from the matcher
+        //
+        int32_t startIx = regexp->fMatcher->start(groupNum, *status);
+        int32_t endIx   = regexp->fMatcher->end  (groupNum, *status);
+        if (U_FAILURE(*status)) {
+            return 0;
+        }
+
+        //
+        // Trim length based on buffer capacity
+        //
+        int32_t fullLength = endIx - startIx;
+        int32_t copyLength = fullLength;
+        if (copyLength < destCapacity) {
+            dest[copyLength] = 0;
+        } else if (copyLength == destCapacity) {
+            *status = U_STRING_NOT_TERMINATED_WARNING;
+        } else {
+            copyLength = destCapacity;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        }
+
+        //
+        // Copy capture group to user's buffer.
+        // copyLength > 0 implies destCapacity > 0, which on this branch
+        // implies fText is non-NULL.
+        //
+        if (copyLength > 0) {
+            u_memcpy(dest, &regexp->fText[startIx], copyLength);
+        }
+        return fullLength;
+    } else {
+        int64_t start = regexp->fMatcher->start64(groupNum, *status);
+        int64_t limit = regexp->fMatcher->end64(groupNum, *status);
+        if (U_FAILURE(*status)) {
+            return 0;
+        }
+        // Note edge cases:
+        //   Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
+        //   Zero Length Match: start == end.
+        int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
+        return length;
+    }
+
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_groupUText
+//
+//------------------------------------------------------------------------------
+// Returns capture group groupNum of the most recent match as a UText, via
+// RegexMatcher::group(groupNum, dest, length, status).
+//
+// dest         Optional UText passed through to the matcher.
+// groupLength  Receives the length reported by the matcher; may be NULL, in
+//              which case the length is discarded.
+// When the handle fails validation, dest is returned if non-NULL, otherwise a
+// new empty UText is opened and returned.
+U_CAPI UText * U_EXPORT2
+uregex_groupUText(URegularExpression *regexp2,
+                  int32_t             groupNum,
+                  UText              *dest,
+                  int64_t            *groupLength,
+                  UErrorCode         *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, TRUE, status) == FALSE) {
+        UErrorCode emptyTextStatus = U_ZERO_ERROR;
+        return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
+    }
+
+    // Guard against a NULL out-parameter instead of dereferencing it.
+    int64_t discardedLength = 0;
+    int64_t &lengthOut = (groupLength != NULL) ? *groupLength : discardedLength;
+    return regexp->fMatcher->group(groupNum, dest, lengthOut, *status);
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_start
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: calls uregex_start64() and narrows its result.
+U_CAPI int32_t U_EXPORT2
+uregex_start(URegularExpression *regexp2,
+             int32_t             groupNum,
+             UErrorCode         *status) {
+    const int64_t wideStart = uregex_start64(regexp2, groupNum, status);
+    return (int32_t)wideStart;
+}
+
+// Returns RegexMatcher::start64(groupNum, status), or 0 when the handle
+// fails validation (input text is required).
+U_CAPI int64_t U_EXPORT2
+uregex_start64(URegularExpression *regexp2,
+               int32_t             groupNum,
+               UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return 0;
+    }
+    return re->fMatcher->start64(groupNum, *status);
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_end
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: calls uregex_end64() and narrows its result.
+U_CAPI int32_t U_EXPORT2
+uregex_end(URegularExpression *regexp2,
+           int32_t             groupNum,
+           UErrorCode         *status) {
+    const int64_t wideEnd = uregex_end64(regexp2, groupNum, status);
+    return (int32_t)wideEnd;
+}
+
+// Returns RegexMatcher::end64(groupNum, status), or 0 when the handle fails
+// validation (input text is required).
+U_CAPI int64_t U_EXPORT2
+uregex_end64(URegularExpression *regexp2,
+             int32_t             groupNum,
+             UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return 0;
+    }
+    return re->fMatcher->end64(groupNum, *status);
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_reset
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: widens index and defers to uregex_reset64().
+U_CAPI void U_EXPORT2
+uregex_reset(URegularExpression *regexp2,
+             int32_t             index,
+             UErrorCode         *status) {
+    const int64_t wideIndex = (int64_t)index;
+    uregex_reset64(regexp2, wideIndex, status);
+}
+
+// Calls RegexMatcher::reset(index, status) once the handle has been
+// validated (input text is required).
+U_CAPI void U_EXPORT2
+uregex_reset64(URegularExpression *regexp2,
+               int64_t             index,
+               UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, TRUE, status)) {
+        re->fMatcher->reset(index, *status);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_setRegion
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: widens both bounds and defers to uregex_setRegion64().
+U_CAPI void U_EXPORT2
+uregex_setRegion(URegularExpression *regexp2,
+                 int32_t             regionStart,
+                 int32_t             regionLimit,
+                 UErrorCode         *status) {
+    const int64_t wideStart = (int64_t)regionStart;
+    const int64_t wideLimit = (int64_t)regionLimit;
+    uregex_setRegion64(regexp2, wideStart, wideLimit, status);
+}
+
+// Calls RegexMatcher::region(regionStart, regionLimit, status) once the
+// handle has been validated (input text is required).
+U_CAPI void U_EXPORT2
+uregex_setRegion64(URegularExpression *regexp2,
+                   int64_t             regionStart,
+                   int64_t             regionLimit,
+                   UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, TRUE, status)) {
+        re->fMatcher->region(regionStart, regionLimit, *status);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_setRegionAndStart
+//
+//------------------------------------------------------------------------------
+// Calls RegexMatcher::region(regionStart, regionLimit, startIndex, status)
+// once the handle has been validated (input text is required).
+U_CAPI void U_EXPORT2
+uregex_setRegionAndStart(URegularExpression *regexp2,
+                         int64_t             regionStart,
+                         int64_t             regionLimit,
+                         int64_t             startIndex,
+                         UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, TRUE, status)) {
+        re->fMatcher->region(regionStart, regionLimit, startIndex, *status);
+    }
+}
+
+//------------------------------------------------------------------------------
+//
+// uregex_regionStart
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: calls uregex_regionStart64() and narrows its result.
+U_CAPI int32_t U_EXPORT2
+uregex_regionStart(const URegularExpression *regexp2,
+                   UErrorCode               *status) {
+    const int64_t wideStart = uregex_regionStart64(regexp2, status);
+    return (int32_t)wideStart;
+}
+
+// Returns RegexMatcher::regionStart(), or 0 when the handle fails validation
+// (input text is required).
+U_CAPI int64_t U_EXPORT2
+uregex_regionStart64(const URegularExpression *regexp2,
+                     UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return 0;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->regionStart();
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_regionEnd
+//
+//------------------------------------------------------------------------------
+// 32-bit entry point: calls uregex_regionEnd64() and narrows its result.
+U_CAPI int32_t U_EXPORT2
+uregex_regionEnd(const URegularExpression *regexp2,
+                 UErrorCode               *status) {
+    const int64_t wideEnd = uregex_regionEnd64(regexp2, status);
+    return (int32_t)wideEnd;
+}
+
+// Returns RegexMatcher::regionEnd(), or 0 when the handle fails validation
+// (input text is required).
+U_CAPI int64_t U_EXPORT2
+uregex_regionEnd64(const URegularExpression *regexp2,
+                   UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return 0;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->regionEnd();
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_hasTransparentBounds
+//
+//------------------------------------------------------------------------------
+// Returns RegexMatcher::hasTransparentBounds(), or FALSE when the handle
+// fails validation. No input text is required.
+U_CAPI UBool U_EXPORT2
+uregex_hasTransparentBounds(const URegularExpression *regexp2,
+                            UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->hasTransparentBounds();
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_useTransparentBounds
+//
+//------------------------------------------------------------------------------
+// Passes b to RegexMatcher::useTransparentBounds() once the handle has been
+// validated. No input text is required.
+U_CAPI void U_EXPORT2
+uregex_useTransparentBounds(URegularExpression *regexp2,
+                            UBool               b,
+                            UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, FALSE, status)) {
+        re->fMatcher->useTransparentBounds(b);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_hasAnchoringBounds
+//
+//------------------------------------------------------------------------------
+// Returns RegexMatcher::hasAnchoringBounds(), or FALSE when the handle fails
+// validation. No input text is required.
+U_CAPI UBool U_EXPORT2
+uregex_hasAnchoringBounds(const URegularExpression *regexp2,
+                          UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->hasAnchoringBounds();
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_useAnchoringBounds
+//
+//------------------------------------------------------------------------------
+// Passes b to RegexMatcher::useAnchoringBounds() once the handle has been
+// validated. No input text is required.
+U_CAPI void U_EXPORT2
+uregex_useAnchoringBounds(URegularExpression *regexp2,
+                          UBool               b,
+                          UErrorCode         *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (validateRE(re, FALSE, status)) {
+        re->fMatcher->useAnchoringBounds(b);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_hitEnd
+//
+//------------------------------------------------------------------------------
+// Returns RegexMatcher::hitEnd(), or FALSE when the handle fails validation
+// (input text is required).
+U_CAPI UBool U_EXPORT2
+uregex_hitEnd(const URegularExpression *regexp2,
+              UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, TRUE, status)) {
+        return FALSE;
+    }
+    RegexMatcher *matcher = re->fMatcher;
+    return matcher->hitEnd();
}
//------------------------------------------------------------------------------
//
-// uregex_matches
+// uregex_requireEnd
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
-uregex_matches(URegularExpression *regexp,
- int32_t startIndex,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
+U_CAPI UBool U_EXPORT2   // Returns the matcher's requireEnd() value; FALSE when validateRE() rejects the handle.
+uregex_requireEnd(const URegularExpression *regexp2,
+ UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;   // recover the implementation object from the opaque handle
+ if (validateRE(regexp, TRUE, status) == FALSE) {   // TRUE: validateRE also requires that input text has been set
return FALSE;
}
- UBool result = regexp->fMatcher->matches(startIndex, *status);
- return result;
+ return regexp->fMatcher->requireEnd();
}
-
//------------------------------------------------------------------------------
//
-// uregex_lookingAt
+// uregex_setTimeLimit
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
-uregex_lookingAt(URegularExpression *regexp,
- int32_t startIndex,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return FALSE;
+U_CAPI void U_EXPORT2   // Forwards limit to the matcher's setTimeLimit(); the call is skipped when validateRE() rejects the handle.
+uregex_setTimeLimit(URegularExpression *regexp2,
+ int32_t limit,
+ UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;   // recover the implementation object from the opaque handle
+ if (validateRE(regexp, FALSE, status)) {   // FALSE: input text is not required for this call
+ regexp->fMatcher->setTimeLimit(limit, *status);
}
- UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
- return result;
}
//------------------------------------------------------------------------------
//
-// uregex_find
+// uregex_getTimeLimit
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
-uregex_find(URegularExpression *regexp,
- int32_t startIndex,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return FALSE;
- }
- UBool result = regexp->fMatcher->find(startIndex, *status);
- return result;
+// Returns RegexMatcher::getTimeLimit(), or 0 when the handle fails
+// validation. No input text is required.
+U_CAPI int32_t U_EXPORT2
+uregex_getTimeLimit(const URegularExpression *regexp2,
+                    UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return 0;
+    }
+    return re->fMatcher->getTimeLimit();
}
+
+
//------------------------------------------------------------------------------
//
-// uregex_findNext
+// uregex_setStackLimit
//
//------------------------------------------------------------------------------
-U_CAPI UBool U_EXPORT2
-uregex_findNext(URegularExpression *regexp,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return FALSE;
+U_CAPI void U_EXPORT2   // Forwards limit to the matcher's setStackLimit(); the call is skipped when validateRE() rejects the handle.
+uregex_setStackLimit(URegularExpression *regexp2,
+ int32_t limit,
+ UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;   // recover the implementation object from the opaque handle
+ if (validateRE(regexp, FALSE, status)) {   // FALSE: input text is not required for this call
+ regexp->fMatcher->setStackLimit(limit, *status);
}
- UBool result = regexp->fMatcher->find();
- return result;
}
+
+
//------------------------------------------------------------------------------
//
-// uregex_groupCount
+// uregex_getStackLimit
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_groupCount(URegularExpression *regexp,
- UErrorCode *status) {
- if (validateRE(regexp, status, FALSE) == FALSE) {
- return 0;
- }
- int32_t result = regexp->fMatcher->groupCount();
- return result;
+// Returns RegexMatcher::getStackLimit(), or 0 when the handle fails
+// validation. No input text is required.
+U_CAPI int32_t U_EXPORT2
+uregex_getStackLimit(const URegularExpression *regexp2,
+                     UErrorCode               *status) {
+    RegularExpression *re = (RegularExpression*)regexp2;
+    if (!validateRE(re, FALSE, status)) {
+        return 0;
+    }
+    return re->fMatcher->getStackLimit();
}
//------------------------------------------------------------------------------
//
-// uregex_group
+// uregex_setMatchCallback
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_group(URegularExpression *regexp,
- int32_t groupNum,
- UChar *dest,
- int32_t destCapacity,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return 0;
- }
- if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
- *status = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
- //
- // Pick up the range of characters from the matcher
- //
- int32_t startIx = regexp->fMatcher->start(groupNum, *status);
- int32_t endIx = regexp->fMatcher->end (groupNum, *status);
- if (U_FAILURE(*status)) {
- return 0;
- }
-
- //
- // Trim length based on buffer capacity
- //
- int32_t fullLength = endIx - startIx;
- int32_t copyLength = fullLength;
- if (copyLength < destCapacity) {
- dest[copyLength] = 0;
- } else if (copyLength == destCapacity) {
- *status = U_STRING_NOT_TERMINATED_WARNING;
- } else {
- copyLength = destCapacity;
- *status = U_BUFFER_OVERFLOW_ERROR;
- }
-
- //
- // Copy capture group to user's buffer
- //
- if (copyLength > 0) {
- u_memcpy(dest, ®exp->fText[startIx], copyLength);
+U_CAPI void U_EXPORT2 // C API: install a match callback on the wrapped matcher
+uregex_setMatchCallback(URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ URegexMatchCallback *callback, // forwarded as-is; this wrapper does not inspect it
+ const void *context, // forwarded as-is alongside the callback
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status)) { // FALSE: presumably "text not required" - TODO confirm against validateRE()
+ regexp->fMatcher->setMatchCallback(callback, context, *status); // nothing happens when validation fails
 }
- return fullLength;
}
//------------------------------------------------------------------------------
//
-// uregex_start
+// uregex_getMatchCallback
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_start(URegularExpression *regexp,
- int32_t groupNum,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return 0;
- }
- int32_t result = regexp->fMatcher->start(groupNum, *status);
- return result;
+U_CAPI void U_EXPORT2 // C API: read back the match callback and its context
+uregex_getMatchCallback(const URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ URegexMatchCallback **callback, // out: dereferenced below, so it must be non-NULL
+ const void **context, // out: dereferenced below, so it must be non-NULL
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2; // C-style cast; also drops the const qualifier
+ if (validateRE(regexp, FALSE, status)) { // outputs are left untouched when validation fails
+ regexp->fMatcher->getMatchCallback(*callback, *context, *status); // NOTE(review): no NULL check on callback/context before the dereference
+ }
}
//------------------------------------------------------------------------------
//
-// uregex_end
+// uregex_setFindProgressCallback
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_end(URegularExpression *regexp,
- int32_t groupNum,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return 0;
+U_CAPI void U_EXPORT2 // C API: install a find-progress callback on the wrapped matcher
+uregex_setFindProgressCallback(URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ URegexFindProgressCallback *callback, // forwarded as-is; this wrapper does not inspect it
+ const void *context, // forwarded as-is alongside the callback
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, FALSE, status)) { // FALSE: presumably "text not required" - TODO confirm against validateRE()
+ regexp->fMatcher->setFindProgressCallback(callback, context, *status); // nothing happens when validation fails
 }
- int32_t result = regexp->fMatcher->end(groupNum, *status);
- return result;
}
+
//------------------------------------------------------------------------------
//
-// uregex_reset
+// uregex_getFindProgressCallback
//
//------------------------------------------------------------------------------
-U_CAPI void U_EXPORT2
-uregex_reset(URegularExpression *regexp,
- int32_t index,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return;
- }
- regexp->fMatcher->reset(index, *status);
+U_CAPI void U_EXPORT2 // C API: read back the find-progress callback and its context
+uregex_getFindProgressCallback(const URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ URegexFindProgressCallback **callback, // out: dereferenced below, so it must be non-NULL
+ const void **context, // out: dereferenced below, so it must be non-NULL
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2; // C-style cast; also drops the const qualifier
+ if (validateRE(regexp, FALSE, status)) { // outputs are left untouched when validation fails
+ regexp->fMatcher->getFindProgressCallback(*callback, *context, *status); // NOTE(review): no NULL check on callback/context before the dereference
+ }
}
// uregex_replaceAll
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_replaceAll(URegularExpression *regexp,
+U_CAPI int32_t U_EXPORT2 // returns the summed lengths reported by the append calls below
+uregex_replaceAll(URegularExpression *regexp2, // opaque handle; the public uregex_* helpers below take it in this form
 const UChar *replacementText,
 int32_t replacementLength,
 UChar *destBuf,
 int32_t destCapacity,
 UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
+ RegularExpression *regexp = (RegularExpression*)regexp2; // concrete view, used only for validation here
+ if (validateRE(regexp, TRUE, status) == FALSE) { // TRUE: presumably "input text required" - TODO confirm against validateRE()
 return 0;
 }
 if (replacementText == NULL || replacementLength < -1 ||
- destBuf == NULL && destCapacity > 0 ||
+ (destBuf == NULL && destCapacity > 0) || // parentheses only spell out the grouping && already had over ||
 destCapacity < 0) {
 *status = U_ILLEGAL_ARGUMENT_ERROR;
 return 0;
 }
 int32_t len = 0;
- uregex_reset(regexp, 0, status);
- while (uregex_findNext(regexp, status)) {
- len += uregex_appendReplacement(regexp, replacementText, replacementLength,
+
+ uregex_reset(regexp2, 0, status); // restart matching from index 0
+
+ // Note: Separate error code variables for findNext() and appendReplacement()
+ // are used so that destination buffer overflow errors
+ // in appendReplacement won't stop findNext() from working.
+ // appendReplacement() and appendTail() special case incoming buffer
+ // overflow errors, continuing to return the correct length.
+ UErrorCode findStatus = *status; // starts from whatever uregex_reset() left in *status
+ while (uregex_findNext(regexp2, &findStatus)) { // one iteration per match found
+ len += uregex_appendReplacement(regexp2, replacementText, replacementLength, // accumulates each segment's reported length
 &destBuf, &destCapacity, status);
 }
- len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
+ len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); // adds the length of the text after the last match
+
+ if (U_FAILURE(findStatus)) {
+ // If anything went wrong with the findNext(), make that error trump
+ // whatever may have happened with the append() operations.
+ // Errors in findNext() are not expected.
+ *status = findStatus;
+ }
 return len;
}
+//------------------------------------------------------------------------------
+//
+// uregex_replaceAllUText
+//
+//------------------------------------------------------------------------------
+U_CAPI UText * U_EXPORT2 // returns the matcher's replaceAll() result, or 0 when a check below fails
+uregex_replaceAllUText(URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ UText *replacementText, // must be non-NULL (checked below)
+ UText *dest, // not checked here; handed straight to the matcher
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, TRUE, status) == FALSE) { // TRUE: presumably "input text required" - TODO confirm against validateRE()
+ return 0; // null UText*, not the caller's dest
+ }
+ if (replacementText == NULL) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0; // null UText*, not the caller's dest
+ }
+
+ dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); // all real work happens in RegexMatcher
+ return dest;
+}
+
+
//------------------------------------------------------------------------------
//
// uregex_replaceFirst
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_replaceFirst(URegularExpression *regexp,
+U_CAPI int32_t U_EXPORT2
+uregex_replaceFirst(URegularExpression *regexp2,
const UChar *replacementText,
int32_t replacementLength,
UChar *destBuf,
int32_t destCapacity,
UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
if (replacementText == NULL || replacementLength < -1 ||
- destBuf == NULL && destCapacity > 0 ||
+ (destBuf == NULL && destCapacity > 0) ||
destCapacity < 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
int32_t len = 0;
UBool findSucceeded;
- uregex_reset(regexp, 0, status);
- findSucceeded = uregex_find(regexp, 0, status);
+ uregex_reset(regexp2, 0, status);
+ findSucceeded = uregex_find(regexp2, 0, status);
if (findSucceeded) {
- len = uregex_appendReplacement(regexp, replacementText, replacementLength,
+ len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
&destBuf, &destCapacity, status);
}
- len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
+ len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
return len;
}
//------------------------------------------------------------------------------
//
-// uregex_appendReplacement
+// uregex_replaceFirstUText
//
//------------------------------------------------------------------------------
+U_CAPI UText * U_EXPORT2 // returns the matcher's replaceFirst() result, or 0 when a check below fails
+uregex_replaceFirstUText(URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ UText *replacementText, // must be non-NULL (checked below)
+ UText *dest, // not checked here; handed straight to the matcher
+ UErrorCode *status) { // in/out: consulted by validateRE(), then given to the matcher
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, TRUE, status) == FALSE) { // TRUE: presumably "input text required" - TODO confirm against validateRE()
+ return 0; // null UText*, not the caller's dest
+ }
+ if (replacementText == NULL) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0; // null UText*, not the caller's dest
+ }
+ dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); // all real work happens in RegexMatcher
+ return dest;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_appendReplacement
+//
+//------------------------------------------------------------------------------
+U_NAMESPACE_BEGIN
//
// Dummy class, because these functions need to be friends of class RegexMatcher,
// and stand-alone C functions don't work as friends
//
-U_NAMESPACE_BEGIN
class RegexCImpl {
public:
- inline static int32_t appendReplacement(URegularExpression *regexp,
+ inline static int32_t appendReplacement(RegularExpression *regexp, // takes the concrete struct; uregex_appendReplacement() casts before calling
 const UChar *replacementText,
 int32_t replacementLength,
 UChar **destBuf,
 int32_t *destCapacity,
 UErrorCode *status);
- inline static int32_t appendTail(URegularExpression *regexp,
- UChar **destBuf,
- int32_t *destCapacity,
- UErrorCode *status);
+ inline static int32_t appendTail(RegularExpression *regexp, // takes the concrete struct; uregex_appendTail() casts before calling
+ UChar **destBuf, // in/out: the implementation advances *destBuf
+ int32_t *destCapacity, // in/out: the implementation reduces *destCapacity
+ UErrorCode *status);
+
+ inline static int32_t split(RegularExpression *regexp, // implementation behind uregex_split(), which validates arguments first
+ UChar *destBuf, // storage for the field text
+ int32_t destCapacity,
+ int32_t *requiredCapacity,
+ UChar *destFields[], // each used entry is set to point into destBuf
+ int32_t destFieldsCapacity,
+ UErrorCode *status);
};
-U_NAMESPACE_END
+U_NAMESPACE_END
-//
-// Call-back function for u_unescapeAt(), used when we encounter
-// \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
-//
-U_CDECL_BEGIN
-static UChar U_CALLCONV
-unescape_charAt(int32_t offset, void *context) {
- UChar c16 = ((UChar *)context)[offset];
- return c16;
-}
-U_CDECL_END
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
+static const UChar LEFTBRACKET = 0x7b; // '{' (a curly brace, despite the name); opens a ${name} group reference
+static const UChar RIGHTBRACKET = 0x7d; // '}' (a curly brace, despite the name); closes a ${name} group reference
//
// Move a character to an output buffer, with bounds checking on the index.
//
// appendReplacement, the actual implementation.
//
-int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
- const UChar *replacementText,
- int32_t replacementLength,
- UChar **destBuf,
- int32_t *destCapacity,
- UErrorCode *status) {
+int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
+ const UChar *replacementText,
+ int32_t replacementLength,
+ UChar **destBuf,
+ int32_t *destCapacity,
+ UErrorCode *status) {
// If we come in with a buffer overflow error, don't suppress the operation.
// A series of appendReplacements, appendTail need to correctly preflight
// the buffer size when an overflow happens somewhere in the middle.
UBool pendingBufferOverflow = FALSE;
- if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
+ if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
pendingBufferOverflow = TRUE;
*status = U_ZERO_ERROR;
}
//
// Validate all paramters
//
- if (validateRE(regexp, status) == FALSE) {
+ if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
if (replacementText == NULL || replacementLength < -1 ||
- destCapacity == NULL || destBuf == NULL ||
- *destBuf == NULL && *destCapacity > 0 ||
+ destCapacity == NULL || destBuf == NULL ||
+ (*destBuf == NULL && *destCapacity > 0) ||
*destCapacity < 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
int32_t capacity = *destCapacity;
int32_t destIdx = 0;
int32_t i;
-
+
// If it wasn't supplied by the caller, get the length of the replacement text.
// TODO: slightly smarter logic in the copy loop could watch for the NUL on
// the fly and avoid this step.
}
// Copy input string from the end of previous match to start of current match
- for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
- appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
+ if (regexp->fText != NULL) {
+ int32_t matchStart;
+ int32_t lastMatchEnd;
+ if (UTEXT_USES_U16(m->fInputText)) {
+ lastMatchEnd = (int32_t)m->fLastMatchEnd;
+ matchStart = (int32_t)m->fMatchStart;
+ } else {
+ // !!!: Would like a better way to do this!
+ UErrorCode tempStatus = U_ZERO_ERROR;
+ lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
+ tempStatus = U_ZERO_ERROR;
+ matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
+ }
+ for (i=lastMatchEnd; i<matchStart; i++) {
+ appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
+ }
+ } else {
+ UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
+ destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
+ dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
+ &possibleOverflowError);
}
-
-
+ U_ASSERT(destIdx >= 0);
// scan the replacement text, looking for substitutions ($n) and \escapes.
int32_t replIdx = 0;
- while (replIdx < replacementLength) {
+ while (replIdx < replacementLength && U_SUCCESS(*status)) {
UChar c = replacementText[replIdx];
replIdx++;
if (c != DOLLARSIGN && c != BACKSLASH) {
- // Common case, no substitution, no escaping,
+ // Common case, no substitution, no escaping,
// just copy the char to the dest buf.
appendToBuf(c, &destIdx, dest, capacity);
continue;
if (c==0x55/*U*/ || c==0x75/*u*/) {
// We have a \udddd or \Udddddddd escape sequence.
- UChar32 escapedChar =
- u_unescapeAt(unescape_charAt,
- &replIdx, // Index is updated by unescapeAt
+ UChar32 escapedChar =
+ u_unescapeAt(uregex_ucstr_unescape_charAt,
+ &replIdx, // Index is updated by unescapeAt
replacementLength, // Length of replacement text
(void *)replacementText);
continue;
}
+ // We've got a $. Pick up the following capture group name or number.
+ // For numbers, consume only digits that produce a valid capture group for the pattern.
-
- // We've got a $. Pick up a capture group number if one follows.
- // Consume at most the number of digits necessary for the largest capture
- // number that is valid for this pattern.
-
- int32_t numDigits = 0;
int32_t groupNum = 0;
- UChar32 digitC;
- for (;;) {
- if (replIdx >= replacementLength) {
- break;
- }
- U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
- if (u_isdigit(digitC) == FALSE) {
- break;
- }
+ U_ASSERT(c == DOLLARSIGN);
+ UChar32 c32 = -1;
+ if (replIdx < replacementLength) {
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ }
+ if (u_isdigit(c32)) {
+ int32_t numDigits = 0;
+ int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
+ for (;;) {
+ if (replIdx >= replacementLength) {
+ break;
+ }
+ U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+ if (u_isdigit(c32) == FALSE) {
+ break;
+ }
+ int32_t digitVal = u_charDigitValue(c32);
+ if (groupNum * 10 + digitVal <= numCaptureGroups) {
+ groupNum = groupNum * 10 + digitVal;
+ U16_FWD_1(replacementText, replIdx, replacementLength);
+ numDigits++;
+ } else {
+ if (numDigits == 0) {
+ *status = U_INDEX_OUTOFBOUNDS_ERROR;
+ }
+ break;
+ }
+ }
+ } else if (c32 == LEFTBRACKET) {
+ // Scan for Named Capture Group, ${name}.
+ UnicodeString groupName;
U16_FWD_1(replacementText, replIdx, replacementLength);
- groupNum=groupNum*10 + u_charDigitValue(digitC);
- numDigits++;
- if (numDigits >= m->fPattern->fMaxCaptureDigits) {
- break;
+ while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
+ if (replIdx >= replacementLength) {
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ break;
+ }
+ U16_NEXT(replacementText, replIdx, replacementLength, c32);
+ if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
+ (c32 >= 0x61 && c32 <= 0x7a) || // a..z
+ (c32 >= 0x31 && c32 <= 0x39)) { // 0..9
+ groupName.append(c32);
+ } else if (c32 == RIGHTBRACKET) {
+ groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
+ if (groupNum == 0) {
+ // Name not defined by pattern.
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ }
+ } else {
+ // Character was something other than a name char or a closing '}'
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+ }
}
+ } else {
+ // $ not followed by {name} or digits.
+ *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
}
- if (numDigits == 0) {
- // The $ didn't introduce a group number at all.
- // Treat it as just part of the substitution text.
- appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
- continue;
- }
-
// Finally, append the capture group data to the destination.
- int32_t capacityRemaining = capacity - destIdx;
- if (capacityRemaining < 0) {
- capacityRemaining = 0;
- }
- destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
- if (*status == U_BUFFER_OVERFLOW_ERROR) {
- // Ignore buffer overflow when extracting the group. We need to
- // continue on to get full size of the untruncated result. We will
- // raise our own buffer overflow error at the end.
- *status = U_ZERO_ERROR;
+ if (U_SUCCESS(*status)) {
+ destIdx += uregex_group((URegularExpression*)regexp, groupNum,
+ dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
+ if (*status == U_BUFFER_OVERFLOW_ERROR) {
+ // Ignore buffer overflow when extracting the group. We need to
+ // continue on to get full size of the untruncated result. We will
+ // raise our own buffer overflow error at the end.
+ *status = U_ZERO_ERROR;
+ }
}
if (U_FAILURE(*status)) {
- // Can fail if group number is out of range.
+ // bad group number or name.
break;
}
-
}
//
//
if (destIdx < capacity) {
dest[destIdx] = 0;
- } else if (destIdx == *destCapacity) {
- *status = U_STRING_NOT_TERMINATED_WARNING;
- } else {
- *status = U_BUFFER_OVERFLOW_ERROR;
+ } else if (U_SUCCESS(*status)) {
+ if (destIdx == *destCapacity) {
+ *status = U_STRING_NOT_TERMINATED_WARNING;
+ } else {
+ *status = U_BUFFER_OVERFLOW_ERROR;
+ }
}
-
+
//
// Return an updated dest buffer and capacity to the caller.
//
}
//
-// appendReplacement the acutal API function,
+// appendReplacement the actual API function,
//
-U_CAPI int32_t U_EXPORT2
-uregex_appendReplacement(URegularExpression *regexp,
- const UChar *replacementText,
- int32_t replacementLength,
- UChar **destBuf,
- int32_t *destCapacity,
- UErrorCode *status) {
+U_CAPI int32_t U_EXPORT2 // returns whatever RegexCImpl::appendReplacement() returns
+uregex_appendReplacement(URegularExpression *regexp2, // opaque handle; not validated here - the implementation calls validateRE()
+ const UChar *replacementText, // forwarded unchanged
+ int32_t replacementLength, // forwarded unchanged; the implementation accepts -1 and rejects anything smaller
+ UChar **destBuf, // forwarded unchanged
+ int32_t *destCapacity, // forwarded unchanged
+ UErrorCode *status) { // forwarded unchanged
+
+ RegularExpression *regexp = (RegularExpression*)regexp2; // the only work done in this wrapper
 return RegexCImpl::appendReplacement(
 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
}
+//
+// uregex_appendReplacementUText...can just use the normal C++ method
+//
+U_CAPI void U_EXPORT2 // C API: forward straight to RegexMatcher::appendReplacement()
+uregex_appendReplacementUText(URegularExpression *regexp2, // NOTE(review): no validateRE() call here, unlike uregex_replaceAllUText()
+ UText *replText, // replacement text; not NULL-checked in this wrapper
+ UText *dest, // destination; not NULL-checked in this wrapper
+ UErrorCode *status) { // dereferenced unconditionally below
+ RegularExpression *regexp = (RegularExpression*)regexp2; // a NULL handle would be dereferenced on the next line
+ regexp->fMatcher->appendReplacement(dest, replText, *status); // argument order differs from this wrapper's: dest comes first
+}
+
//------------------------------------------------------------------------------
//
// uregex_appendTail
//
//------------------------------------------------------------------------------
-int32_t RegexCImpl::appendTail(URegularExpression *regexp,
- UChar **destBuf,
- int32_t *destCapacity,
- UErrorCode *status) {
+int32_t RegexCImpl::appendTail(RegularExpression *regexp,
+ UChar **destBuf,
+ int32_t *destCapacity,
+ UErrorCode *status)
+{
// If we come in with a buffer overflow error, don't suppress the operation.
// A series of appendReplacements, appendTail need to correctly preflight
// the buffer size when an overflow happens somewhere in the middle.
UBool pendingBufferOverflow = FALSE;
- if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
+ if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
pendingBufferOverflow = TRUE;
*status = U_ZERO_ERROR;
}
- if (validateRE(regexp, status) == FALSE) {
+ if (validateRE(regexp, TRUE, status) == FALSE) {
return 0;
}
- if (destCapacity == NULL || destBuf == NULL ||
- *destBuf == NULL && *destCapacity > 0 ||
- *destCapacity < 0) {
+
+ if (destCapacity == NULL || destBuf == NULL ||
+ (*destBuf == NULL && *destCapacity > 0) ||
+ *destCapacity < 0)
+ {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
-
- RegexMatcher *m = regexp->fMatcher;
- int32_t srcIdx;
- if (m->fMatch) {
- // The most recent call to find() succeeded.
- srcIdx = m->fMatchEnd;
- } else {
- // The last call to find() on this matcher failed().
- // Look back to the end of the last find() that succeeded for src index.
- srcIdx = m->fLastMatchEnd;
- if (srcIdx == -1) {
- // There has been no successful match with this matcher.
- // We want to copy the whole string.
- srcIdx = 0;
- }
- }
+ RegexMatcher *m = regexp->fMatcher;
int32_t destIdx = 0;
int32_t destCap = *destCapacity;
UChar *dest = *destBuf;
- for (;;) {
- if (srcIdx == regexp->fTextLength) {
- break;
+ if (regexp->fText != NULL) {
+ int32_t srcIdx;
+ int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
+ if (nativeIdx == -1) {
+ srcIdx = 0;
+ } else if (UTEXT_USES_U16(m->fInputText)) {
+ srcIdx = (int32_t)nativeIdx;
+ } else {
+ UErrorCode newStatus = U_ZERO_ERROR;
+ srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &newStatus);
}
- UChar c = regexp->fText[srcIdx];
- if (c == 0 && regexp->fTextLength == -1) {
- break;
+
+ for (;;) {
+ U_ASSERT(destIdx >= 0);
+
+ if (srcIdx == regexp->fTextLength) {
+ break;
+ }
+ UChar c = regexp->fText[srcIdx];
+ if (c == 0 && regexp->fTextLength == -1) {
+ regexp->fTextLength = srcIdx;
+ break;
+ }
+
+ if (destIdx < destCap) {
+ dest[destIdx] = c;
+ } else {
+ // We've overflowed the dest buffer.
+ // If the total input string length is known, we can
+ // compute the total buffer size needed without scanning through the string.
+ if (regexp->fTextLength > 0) {
+ destIdx += (regexp->fTextLength - srcIdx);
+ break;
+ }
+ }
+ srcIdx++;
+ destIdx++;
}
- if (destIdx < destCap) {
- dest[destIdx] = c;
+ } else {
+ int64_t srcIdx;
+ if (m->fMatch) {
+ // The most recent call to find() succeeded.
+ srcIdx = m->fMatchEnd;
} else {
- // We've overflowed the dest buffer.
- // If the total input string length is known, we can
- // compute the total buffer size needed without scanning through the string.
- if (regexp->fTextLength > 0) {
- destIdx += (regexp->fTextLength - srcIdx);
- break;
+ // The last call to find() on this matcher failed().
+ // Look back to the end of the last find() that succeeded for src index.
+ srcIdx = m->fLastMatchEnd;
+ if (srcIdx == -1) {
+ // There has been no successful match with this matcher.
+ // We want to copy the whole string.
+ srcIdx = 0;
}
}
- srcIdx++;
- destIdx++;
+
+ destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
}
//
if (destIdx < destCap) {
*destBuf += destIdx;
*destCapacity -= destIdx;
- } else {
+ } else if (*destBuf != NULL) {
*destBuf += destCap;
*destCapacity = 0;
}
}
-U_CAPI int32_t U_EXPORT2
-uregex_appendTail(URegularExpression *regexp,
+//
+// appendTail the actual API function
+//
+U_CAPI int32_t U_EXPORT2 // returns whatever RegexCImpl::appendTail() returns
+uregex_appendTail(URegularExpression *regexp2, // opaque handle; not validated here - the implementation calls validateRE()
 UChar **destBuf,
 int32_t *destCapacity,
 UErrorCode *status) {
+ RegularExpression *regexp = (RegularExpression*)regexp2; // the only work done in this wrapper
 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
}
+//
+// uregex_appendTailUText...can just use the normal C++ method
+//
+U_CAPI UText * U_EXPORT2 // returns whatever RegexMatcher::appendTail() returns
+uregex_appendTailUText(URegularExpression *regexp2, // NOTE(review): no validateRE() call here, unlike uregex_appendTail()'s implementation
+ UText *dest, // not NULL-checked in this wrapper
+ UErrorCode *status) { // dereferenced unconditionally below
+ RegularExpression *regexp = (RegularExpression*)regexp2; // a NULL handle would be dereferenced on the next line
+ return regexp->fMatcher->appendTail(dest, *status);
+}
+
+
//------------------------------------------------------------------------------
//
// copyString Internal utility to copy a string to an output buffer,
// and the NUL is counted in the output size.
//
//------------------------------------------------------------------------------
+#if 0
static void copyString(UChar *destBuffer, // Destination buffer.
int32_t destCapacity, // Total capacity of dest buffer
int32_t *destIndex, // Index into dest buffer. Updated on return.
di++;
*destIndex = di;
}
-
+#endif
//------------------------------------------------------------------------------
//
// uregex_split
//
//------------------------------------------------------------------------------
-U_CAPI int32_t U_EXPORT2
-uregex_split( URegularExpression *regexp,
- UChar *destBuf,
- int32_t destCapacity,
- int32_t *requiredCapacity,
- UChar *destFields[],
- int32_t destFieldsCapacity,
- UErrorCode *status) {
- if (validateRE(regexp, status) == FALSE) {
- return 0;
- }
- if (destBuf == NULL && destCapacity > 0 ||
- destCapacity < 0 ||
- destFields == NULL ||
- destFieldsCapacity < 1 ) {
- *status = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
+int32_t RegexCImpl::split(RegularExpression *regexp,
+ UChar *destBuf,
+ int32_t destCapacity,
+ int32_t *requiredCapacity,
+ UChar *destFields[],
+ int32_t destFieldsCapacity,
+ UErrorCode *status) {
//
// Reset for the input text
//
regexp->fMatcher->reset();
- int32_t inputLen = regexp->fTextString.length();
- int32_t nextOutputStringStart = 0;
+ UText *inputText = regexp->fMatcher->fInputText;
+ int64_t nextOutputStringStart = 0;
+ int64_t inputLen = regexp->fMatcher->fInputLength;
if (inputLen == 0) {
return 0;
}
-
//
// Loop through the input text, searching for the delimiter pattern
//
int32_t i; // Index of the field being processed.
int32_t destIdx = 0; // Next available position in destBuf;
int32_t numCaptureGroups = regexp->fMatcher->groupCount();
+ UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
for (i=0; ; i++) {
if (i>=destFieldsCapacity-1) {
- // There are one or zero output string left.
+ // There are one or zero output strings left.
// Fill the last output string with whatever is left from the input, then exit the loop.
// ( i will be == destFieldsCapacity if we filled the output array while processing
// capture groups of the delimiter expression, in which case we will discard the
// last capture group saved in favor of the unprocessed remainder of the
// input string.)
- int32_t remainingLength = inputLen-nextOutputStringStart;
- if (remainingLength > 0) {
- }
- if (i >= destFieldsCapacity) {
- // No fields are left. Recycle the last one for holding the trailing part of
- // the input string.
- i = destFieldsCapacity-1;
- destIdx = (int32_t)(destFields[i] - destFields[0]);
+ if (inputLen > nextOutputStringStart) {
+ if (i != destFieldsCapacity-1) {
+ // No fields are left. Recycle the last one for holding the trailing part of
+ // the input string.
+ i = destFieldsCapacity-1;
+ destIdx = (int32_t)(destFields[i] - destFields[0]);
+ }
+
+ destFields[i] = &destBuf[destIdx];
+ destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
+ &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
}
-
- destFields[i] = &destBuf[destIdx];
- copyString(destBuf, destCapacity, &destIdx,
- ®exp->fText[nextOutputStringStart], remainingLength);
break;
}
-
+
if (regexp->fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
- int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
destFields[i] = &destBuf[destIdx];
- copyString(destBuf, destCapacity, &destIdx,
- ®exp->fText[nextOutputStringStart], fieldLen);
- nextOutputStringStart = regexp->fMatcher->end(*status);
-
+
+ destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
+ &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
+ if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
+ tStatus = U_ZERO_ERROR;
+ } else {
+ *status = tStatus;
+ }
+ nextOutputStringStart = regexp->fMatcher->fMatchEnd;
+
// If the delimiter pattern has capturing parentheses, the captured
// text goes out into the next n destination strings.
int32_t groupNum;
break;
}
i++;
-
+
// Set up to extract the capture group contents into the dest buffer.
- UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
- // error while extracting this group.
- int32_t remainingCapacity = destCapacity - destIdx;
- if (remainingCapacity < 0) {
- remainingCapacity = 0;
- }
destFields[i] = &destBuf[destIdx];
- int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
+ tStatus = U_ZERO_ERROR;
+ int32_t t = uregex_group((URegularExpression*)regexp,
+ groupNum,
+ destFields[i],
+ REMAINING_CAPACITY(destIdx, destCapacity),
+ &tStatus);
destIdx += t + 1; // Record the space used in the output string buffer.
// +1 for the NUL that terminates the string.
+ if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
+ tStatus = U_ZERO_ERROR;
+ } else {
+ *status = tStatus;
+ }
}
if (nextOutputStringStart == inputLen) {
- // The delimiter was at the end of the string. We're done.
+ // The delimiter was at the end of the string.
+ // Output an empty string, and then we are done.
+ if (destIdx < destCapacity) {
+ destBuf[destIdx] = 0;
+ }
+ if (i < destFieldsCapacity-1) {
+ ++i;
+ }
+ if (destIdx < destCapacity) {
+ destFields[i] = destBuf + destIdx;
+ }
+ ++destIdx;
break;
}
// We ran off the end of the input while looking for the next delimiter.
// All the remaining text goes into the current output string.
destFields[i] = &destBuf[destIdx];
- copyString(destBuf, destCapacity, &destIdx,
- ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
+ destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
+ &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
break;
}
}
return i+1;
}
+//
+// uregex_split The actual API function
+//
+U_CAPI int32_t U_EXPORT2 // returns RegexCImpl::split()'s result, or 0 when a check below fails
+uregex_split(URegularExpression *regexp2, // opaque handle; viewed as RegularExpression below
+ UChar *destBuf, // may be NULL only when destCapacity is 0
+ int32_t destCapacity, // must be >= 0
+ int32_t *requiredCapacity, // not checked here; forwarded unchanged
+ UChar *destFields[], // must be non-NULL
+ int32_t destFieldsCapacity, // must be >= 1
+ UErrorCode *status) { // in/out: consulted by validateRE(), set on bad arguments
+ RegularExpression *regexp = (RegularExpression*)regexp2;
+ if (validateRE(regexp, TRUE, status) == FALSE) { // TRUE: presumably "input text required" - TODO confirm against validateRE()
+ return 0;
+ }
+ if ((destBuf == NULL && destCapacity > 0) || // a NULL buffer cannot have capacity
+ destCapacity < 0 ||
+ destFields == NULL ||
+ destFieldsCapacity < 1 ) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status); // arguments were screened above
+}
+
+
+//
+// uregex_splitUText...can just use the normal C++ method
+//
+U_CAPI int32_t U_EXPORT2 // returns whatever RegexMatcher::split() returns
+uregex_splitUText(URegularExpression *regexp2, // NOTE(review): no validateRE() call here, unlike uregex_split()
+ UText *destFields[], // not NULL-checked in this wrapper
+ int32_t destFieldsCapacity, // not range-checked in this wrapper
+ UErrorCode *status) { // dereferenced unconditionally below
+ RegularExpression *regexp = (RegularExpression*)regexp2; // a NULL handle would be dereferenced on the next line
+ return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status); // splits the matcher's own current input text
+}
+
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS