+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
-* Copyright (C) 2005-2012, International Business Machines
+* Copyright (C) 2005-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utext.cpp
-* encoding: US-ASCII
+* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
U_CAPI void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t index) {
+ // Apple note: at entry, ut->chunkContents may be 0; this is not necessarily a problem
+ // (the CF funcs will have set chunkNativeStart/Limit to 0, forcing a call to access).
if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
// The desired position is outside of the current chunk.
// Access the new position. Assume a forward iteration from here,
// utf-16 indexing.
ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
} else {
- ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
+ ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
}
+ // Apple note: it can still be valid to have ut->chunkContents==0 at this
+ // point (just not inside the next block); see <rdar://problem/53610517>.
+
// The convention is that the index must always be on a code point boundary.
// Adjust the index position if it is in the middle of a surrogate pair.
if (ut->chunkOffset<ut->chunkLength) {
U_CAPI UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
- UText *result;
- result = src->pFuncs->clone(dest, src, deep, status);
+ if (U_FAILURE(*status)) {
+ return dest;
+ }
+ UText *result = src->pFuncs->clone(dest, src, deep, status);
+ if (U_FAILURE(*status)) {
+ return result;
+ }
+ if (result == NULL) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ return result;
+ }
if (readOnly) {
utext_freeze(result);
}
adjustPointer(dest, &dest->r, src);
adjustPointer(dest, (const void **)&dest->chunkContents, src);
+ // The newly shallow-cloned UText does _not_ own the underlying storage for the text.
+ // (The source for the clone may or may not have owned the text.)
+
+ dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
+
return dest;
}
//------------------------------------------------------------------------------
// Chunk size.
-// Must be less than 85, because of byte mapping from UChar indexes to native indexes.
+// Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
// Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
// to two UChars.)
+// The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
+// is a three-byte sequence (truncated four-byte sequence).
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
// Requested index is in this buffer.
u8b = (UTF8Buf *)ut->p; // the current buffer
mapIndex = ix - u8b->toUCharsMapStart;
+ U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
return TRUE;
// Swap the UText buffers.
// We want to fill what was previously the alternate buffer,
// and make what was the current buffer be the new alternate.
- UTF8Buf *u8b = (UTF8Buf *)ut->q;
+ UTF8Buf *u8b_swap = (UTF8Buf *)ut->q;
ut->q = ut->p;
- ut->p = u8b;
+ ut->p = u8b_swap;
int32_t strLen = ut->b;
UBool nulTerminated = FALSE;
nulTerminated = TRUE;
}
- UChar *buf = u8b->buf;
- uint8_t *mapToNative = u8b->mapToNative;
- uint8_t *mapToUChars = u8b->mapToUChars;
+ UChar *buf = u8b_swap->buf;
+ uint8_t *mapToNative = u8b_swap->mapToNative;
+ uint8_t *mapToUChars = u8b_swap->mapToUChars;
int32_t destIx = 0;
int32_t srcIx = ix;
UBool seenNonAscii = FALSE;
// General case, handle everything.
if (seenNonAscii == FALSE) {
seenNonAscii = TRUE;
- u8b->bufNILimit = destIx;
+ u8b_swap->bufNILimit = destIx;
}
int32_t cIx = srcIx;
mapToUChars[srcIx - ix] = (uint8_t)destIx;
// fill in Buffer descriptor
- u8b->bufNativeStart = ix;
- u8b->bufNativeLimit = srcIx;
- u8b->bufStartIdx = 0;
- u8b->bufLimitIdx = destIx;
+ u8b_swap->bufNativeStart = ix;
+ u8b_swap->bufNativeLimit = srcIx;
+ u8b_swap->bufStartIdx = 0;
+ u8b_swap->bufLimitIdx = destIx;
if (seenNonAscii == FALSE) {
- u8b->bufNILimit = destIx;
+ u8b_swap->bufNILimit = destIx;
}
- u8b->toUCharsMapStart = u8b->bufNativeStart;
+ u8b_swap->toUCharsMapStart = u8b_swap->bufNativeStart;
// Set UText chunk to refer to this buffer.
ut->chunkContents = buf;
ut->chunkOffset = 0;
- ut->chunkLength = u8b->bufLimitIdx;
- ut->chunkNativeStart = u8b->bufNativeStart;
- ut->chunkNativeLimit = u8b->bufNativeLimit;
- ut->nativeIndexingLimit = u8b->bufNILimit;
+ ut->chunkLength = u8b_swap->bufLimitIdx;
+ ut->chunkNativeStart = u8b_swap->bufNativeStart;
+ ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
+ ut->nativeIndexingLimit = u8b_swap->bufNILimit;
// For zero terminated strings, keep track of the maximum point
// scanned so far.
// Can only do this if the incoming index is somewhere in the interior of the string.
// If index is at the end, there is no character there to look at.
if (ix != ut->b) {
+ // Note: this function will only move the index back if it is on a trail byte,
+ // there is a preceding lead byte, and the sequence from the lead
+ // through this trail could be part of a valid UTF-8 sequence.
+ // Otherwise the index remains unchanged.
U8_SET_CP_START(s8, 0, ix);
}
// Swap the UText buffers.
// We want to fill what was previously the alternate buffer,
// and make what was the current buffer be the new alternate.
- UTF8Buf *u8b = (UTF8Buf *)ut->q;
+ UTF8Buf *u8b_swap = (UTF8Buf *)ut->q;
ut->q = ut->p;
- ut->p = u8b;
-
- UChar *buf = u8b->buf;
- uint8_t *mapToNative = u8b->mapToNative;
- uint8_t *mapToUChars = u8b->mapToUChars;
- int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
+ ut->p = u8b_swap;
+
+ UChar *buf = u8b_swap->buf;
+ uint8_t *mapToNative = u8b_swap->mapToNative;
+ uint8_t *mapToUChars = u8b_swap->mapToUChars;
+ int32_t toUCharsMapStart = ix - sizeof(UTF8Buf::mapToUChars) + 1;
+ // Note that toUCharsMapStart can be negative. This happens when the remaining
+ // text from the current position to the beginning is less than the buffer size.
+ // The "+ 1" is needed because mapToUChars must have a slot at the end for the bufNativeLimit entry.
int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
// at end of buffer to leave room
// for a surrogate pair at the
if (c<0x80) {
// Special case ASCII range for speed.
buf[destIx] = (UChar)c;
+ U_ASSERT(toUCharsMapStart <= srcIx);
mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
} else {
do {
mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
} while (sIx >= srcIx);
+ U_ASSERT(toUCharsMapStart <= (srcIx+1));
// Set native indexing limit to be the current position.
// We are processing a non-ascii, non-native-indexing char now;
bufNILimit = destIx;
}
}
- u8b->bufNativeStart = srcIx;
- u8b->bufNativeLimit = ix;
- u8b->bufStartIdx = destIx;
- u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
- u8b->bufNILimit = bufNILimit - u8b->bufStartIdx;
- u8b->toUCharsMapStart = toUCharsMapStart;
-
- ut->chunkContents = &buf[u8b->bufStartIdx];
- ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
+ u8b_swap->bufNativeStart = srcIx;
+ u8b_swap->bufNativeLimit = ix;
+ u8b_swap->bufStartIdx = destIx;
+ u8b_swap->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
+ u8b_swap->bufNILimit = bufNILimit - u8b_swap->bufStartIdx;
+ u8b_swap->toUCharsMapStart = toUCharsMapStart;
+
+ ut->chunkContents = &buf[u8b_swap->bufStartIdx];
+ ut->chunkLength = u8b_swap->bufLimitIdx - u8b_swap->bufStartIdx;
ut->chunkOffset = ut->chunkLength;
- ut->chunkNativeStart = u8b->bufNativeStart;
- ut->chunkNativeLimit = u8b->bufNativeLimit;
- ut->nativeIndexingLimit = u8b->bufNILimit;
+ ut->chunkNativeStart = u8b_swap->bufNativeStart;
+ ut->chunkNativeLimit = u8b_swap->bufNativeLimit;
+ ut->nativeIndexingLimit = u8b_swap->bufNILimit;
return TRUE;
}
U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
U_ASSERT(index<=ut->chunkNativeLimit);
int32_t mapIndex = index - u8b->toUCharsMapStart;
+ U_ASSERT(mapIndex < (int32_t)sizeof(UTF8Buf::mapToUChars));
int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
U_ASSERT(offset>=0 && offset<=ut->chunkLength);
return offset;
U_CDECL_END
-static const struct UTextFuncs utf8Funcs =
+static const struct UTextFuncs utf8Funcs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
UnicodeString buffer(dest, 0, destCapacity); // writable alias
rep->extractBetween(start32, limit32, buffer);
repTextAccess(ut, limit32, TRUE);
-
+
return u_terminateUChars(dest, destCapacity, length, status);
}
repTextAccess(ut, nativeIterIndex, TRUE);
}
-static const struct UTextFuncs repFuncs =
+static const struct UTextFuncs repFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
repTextLength,
repTextAccess,
repTextExtract,
- repTextReplace,
- repTextCopy,
+ repTextReplace,
+ repTextCopy,
NULL, // MapOffsetToNative,
NULL, // MapIndexToUTF16,
repTextClose,
return NULL;
}
ut = utext_setup(ut, sizeof(ReplExtra), status);
+ if(U_FAILURE(*status)) {
+ return ut;
+ }
ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
if(rep->hasMetaData()) {
}
if(move) {
- // move: copy to destIndex, then replace original with nothing
+ // move: copy to destIndex, then remove original
int32_t segLength=limit32-start32;
us->copy(start32, limit32, destIndex32);
if(destIndex32<start32) {
start32+=segLength;
}
- us->replace(start32, segLength, NULL, 0);
+ us->remove(start32, segLength);
} else {
// copy
us->copy(start32, limit32, destIndex32);
}
-static const struct UTextFuncs unistrFuncs =
+static const struct UTextFuncs unistrFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
unistrTextLength,
unistrTextAccess,
unistrTextExtract,
- unistrTextReplace,
- unistrTextCopy,
+ unistrTextReplace,
+ unistrTextCopy,
NULL, // MapOffsetToNative,
NULL, // MapIndexToUTF16,
unistrTextClose,
ut->chunkLength = si;
ut->nativeIndexingLimit = si;
strLength = si;
+ limit32 = si;
break;
}
U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
// If the limit index points to a lead surrogate of a pair,
// add the corresponding trail surrogate to the destination.
if (si>0 && U16_IS_LEAD(s[si-1]) &&
- ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
+ ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
{
if (di<destCapacity) {
// store only if there is space in the output buffer.
- dest[di++] = s[si++];
+ dest[di++] = s[si];
}
+ si++;
}
// Put iteration position at the point just following the extracted text
- ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
+ if (si <= ut->chunkNativeLimit) {
+ ut->chunkOffset = si;
+ } else {
+ ucstrTextAccess(ut, si, TRUE);
+ }
// Add a terminating NUL if space in the buffer permits,
// and set the error status as required.
return di;
}
-static const struct UTextFuncs ucstrFuncs =
+static const struct UTextFuncs ucstrFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
CharacterIterator *srcCI =(CharacterIterator *)src->context;
srcCI = srcCI->clone();
dest = utext_openCharacterIterator(dest, srcCI, status);
+ if (U_FAILURE(*status)) {
+ return dest;
+ }
// cast off const on getNativeIndex.
// For CharacterIterator based UTexts, this is safe, the operation is const.
int64_t ix = utext_getNativeIndex((UText *)src);
}
srci += len;
}
-
+
charIterTextAccess(ut, copyLimit, TRUE);
u_terminateUChars(dest, destCapacity, desti, status);
return desti;
}
-static const struct UTextFuncs charIterFuncs =
+static const struct UTextFuncs charIterFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
}
return ut;
}
-
-
-