/*
*******************************************************************************
*
-* Copyright (C) 2005-2006, International Business Machines
+* Copyright (C) 2005-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/utext.h"
+#include "unicode/utf.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
#include "ustr_imp.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
+#include "putilimp.h"
+U_NAMESPACE_USE
#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
-U_DRAFT UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
utext_moveIndex32(UText *ut, int32_t delta) {
UChar32 c;
if (delta > 0) {
}
// utext_nativeLength: return the length of the text as reported by the
// text provider.
//   ut      the UText to query; it is dereferenced without a NULL check.
//   returns whatever the nativeLength entry of the provider function
//           table (ut->pFuncs) returns for this UText.
// The only change recorded in this hunk is the declaration macro,
// U_DRAFT -> U_CAPI; the body is untouched.
-U_DRAFT int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
utext_nativeLength(UText *ut) {
// Pure dispatch: all of the work is done by the provider.
return ut->pFuncs->nativeLength(ut);
}
// utext_isLengthExpensive: report whether the provider has flagged its
// length as expensive.
//   ut      the UText to query; it is read only, and is dereferenced
//           without a NULL check.
//   returns nonzero when the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE bit is set
//           in ut->providerProperties, zero otherwise.
// The only change recorded in this hunk is the declaration macro,
// U_DRAFT -> U_CAPI; the body is untouched.
-U_DRAFT UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
utext_isLengthExpensive(const UText *ut) {
// Mask out the single property bit and collapse it to a boolean.
UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
return r;
}
-U_DRAFT int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
utext_getNativeIndex(const UText *ut) {
if(ut->chunkOffset <= ut->nativeIndexingLimit) {
return ut->chunkNativeStart+ut->chunkOffset;
}
-U_DRAFT void U_EXPORT2
+U_CAPI void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t index) {
if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
// The desired position is outside of the current chunk.
// Adjust the index position if it is in the middle of a surrogate pair.
if (ut->chunkOffset<ut->chunkLength) {
UChar c= ut->chunkContents[ut->chunkOffset];
- if (UTF16_IS_TRAIL(c)) {
+ if (U16_IS_TRAIL(c)) {
if (ut->chunkOffset==0) {
ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
}
if (ut->chunkOffset>0) {
UChar lead = ut->chunkContents[ut->chunkOffset-1];
- if (UTF16_IS_LEAD(lead)) {
+ if (U16_IS_LEAD(lead)) {
ut->chunkOffset--;
}
}
-U_DRAFT int64_t U_EXPORT2
+U_CAPI int64_t U_EXPORT2
utext_getPreviousNativeIndex(UText *ut) {
//
// Fast-path the common case.
// UText iteration position is always on a code point boundary,
// never on the trail half of a surrogate pair.
//
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_current32(UText *ut) {
UChar32 c;
if (ut->chunkOffset==ut->chunkLength) {
}
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_char32At(UText *ut, int64_t nativeIndex) {
UChar32 c = U_SENTINEL;
}
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_next32(UText *ut) {
UChar32 c;
}
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_previous32(UText *ut) {
UChar32 c;
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_next32From(UText *ut, int64_t index) {
UChar32 c = U_SENTINEL;
}
-U_DRAFT UChar32 U_EXPORT2
+U_CAPI UChar32 U_EXPORT2
utext_previous32From(UText *ut, int64_t index) {
//
// Return the character preceding the specified index.
}
-U_DRAFT int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
utext_extract(UText *ut,
int64_t start, int64_t limit,
UChar *dest, int32_t destCapacity,
-U_DRAFT UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
utext_equals(const UText *a, const UText *b) {
if (a==NULL || b==NULL ||
a->magic != UTEXT_MAGIC ||
return TRUE;
}
-U_DRAFT UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
utext_isWritable(const UText *ut)
{
UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
}
// utext_freeze: mark a UText as not writable.
//   ut      the UText to modify in place; it is dereferenced without a
//           NULL check.
// Only the UTEXT_PROVIDER_WRITABLE bit of ut->providerProperties is
// cleared; every other property bit is left as it was. Nothing in this
// function sets the bit again.
// The only change recorded in this hunk is the declaration macro,
// U_DRAFT -> U_CAPI; the body is untouched.
-U_DRAFT void U_EXPORT2
+U_CAPI void U_EXPORT2
utext_freeze(UText *ut) {
// Zero out the WRITABLE flag.
ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
}
-U_DRAFT UBool U_EXPORT2
+U_CAPI UBool U_EXPORT2
utext_hasMetaData(const UText *ut)
{
UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
-U_DRAFT int32_t U_EXPORT2
+U_CAPI int32_t U_EXPORT2
utext_replace(UText *ut,
int64_t nativeStart, int64_t nativeLimit,
const UChar *replacementText, int32_t replacementLength,
return i;
}
-U_DRAFT void U_EXPORT2
+U_CAPI void U_EXPORT2
utext_copy(UText *ut,
int64_t nativeStart, int64_t nativeLimit,
int64_t destIndex,
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
- UText *result;
- result = src->pFuncs->clone(dest, src, deep, status);
+ if (U_FAILURE(*status)) {
+ return dest;
+ }
+ UText *result = src->pFuncs->clone(dest, src, deep, status);
+ if (U_FAILURE(*status)) {
+ return result;
+ }
+ if (result == NULL) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ return result;
+ }
if (readOnly) {
utext_freeze(result);
}
static const UText emptyText = UTEXT_INITIALIZER;
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
if (U_FAILURE(*status)) {
return ut;
ut = (UText *)uprv_malloc(spaceRequired);
if (ut == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
+ return NULL;
} else {
*ut = emptyText;
ut->flags |= UTEXT_HEAP_ALLOCATED;
if (spaceRequired>0) {
ut->extraSize = extraSpace;
ut->pExtra = &((ExtendedUText *)ut)->extension;
- uprv_memset(ut->pExtra, 0, extraSpace); // Purify whines about copying untouched extra [buffer]
- // space when cloning, so init it now.
}
}
} else {
} else {
ut->extraSize = extraSpace;
ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
- uprv_memset(ut->pExtra, 0, extraSpace);
}
}
}
ut->privB = 0;
ut->privC = 0;
ut->privP = NULL;
+ if (ut->pExtra!=NULL && ut->extraSize>0)
+ uprv_memset(ut->pExtra, 0, ut->extraSize);
+
}
return ut;
}
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_close(UText *ut) {
if (ut==NULL ||
ut->magic != UTEXT_MAGIC ||
adjustPointer(dest, &dest->p, src);
adjustPointer(dest, &dest->q, src);
adjustPointer(dest, &dest->r, src);
+ adjustPointer(dest, (const void **)&dest->chunkContents, src);
+
+ // The newly shallow-cloned UText does _not_ own the underlying storage for the text.
+ // (The source for the clone may or may not have owned the text.)
+
+ dest->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
return dest;
}
if (ix>length) {
if (length>=0) {
ix=length;
- } else if (ix>ut->c) {
+ } else if (ix>=ut->c) {
// Zero terminated string, and requested index is beyond
// the region that has already been scanned.
// Scan up to either the end of the string or to the
int32_t destIx = 0;
int32_t srcIx = ix;
UBool seenNonAscii = FALSE;
- UChar32 c;
+ UChar32 c = 0;
// Fill the chunk buffer and mapping arrays.
while (destIx<UTF8_TEXT_CHUNK_SIZE) {
if (c>0 && c<0x80) {
// Special case ASCII range for speed.
// zero is excluded to simplify bounds checking.
- buf[destIx] = c;
- mapToNative[destIx] = srcIx - ix;
- mapToUChars[srcIx-ix] = destIx;
+ buf[destIx] = (UChar)c;
+ mapToNative[destIx] = (uint8_t)(srcIx - ix);
+ mapToUChars[srcIx-ix] = (uint8_t)destIx;
srcIx++;
destIx++;
} else {
int32_t cIx = srcIx;
int32_t dIx = destIx;
int32_t dIxSaved = destIx;
- U8_NEXT(s8, srcIx, strLen, c);
+ U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
if (c==0 && nulTerminated) {
srcIx--;
break;
}
- if (c<0) {
- // Illegal UTF-8. Replace with sub character.
- c = 0x0fffd;
- }
U16_APPEND_UNSAFE(buf, destIx, c);
do {
- mapToNative[dIx++] = cIx - ix;
+ mapToNative[dIx++] = (uint8_t)(cIx - ix);
} while (dIx < destIx);
do {
- mapToUChars[cIx++ - ix] = dIxSaved;
+ mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
} while (cIx < srcIx);
}
if (srcIx>=strLen) {
// store Native <--> Chunk Map entries for the end of the buffer.
// There is no actual character here, but the index position is valid.
- mapToNative[destIx] = srcIx - ix;
- mapToUChars[srcIx - ix] = destIx;
+ mapToNative[destIx] = (uint8_t)(srcIx - ix);
+ mapToUChars[srcIx - ix] = (uint8_t)destIx;
// fill in Buffer descriptor
u8b->bufNativeStart = ix;
// Map to/from Native Indexes, fill in for the position at the end of
// the buffer.
//
- mapToNative[destIx] = srcIx - toUCharsMapStart;
- mapToUChars[srcIx - toUCharsMapStart] = destIx;
+ mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
+ mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
// Fill the chunk buffer
// Work backwards, filling from the end of the buffer towards the front.
c = s8[srcIx];
if (c<0x80) {
// Special case ASCII range for speed.
- buf[destIx] = c;
- mapToUChars[srcIx - toUCharsMapStart] = destIx;
- mapToNative[destIx] = srcIx - toUCharsMapStart;
+ buf[destIx] = (UChar)c;
+ mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
+ mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
} else {
// General case, handle everything non-ASCII.
int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
// Get the full character from the UTF8 string.
- // use code derived from tbe macros in utf.8
+ // use code derived from the macros in utf8.h
// Leaves srcIx pointing at the first byte of the UTF-8 char.
//
- if (c<=0xbf) {
- c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
- // leaves srcIx at first byte of the multi-byte char.
- } else {
- c=0x0fffd;
- }
+ c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
+ // leaves srcIx at first byte of the multi-byte char.
// Store the character in UTF-16 buffer.
if (c<0x10000) {
- buf[destIx] = c;
- mapToNative[destIx] = srcIx - toUCharsMapStart;
+ buf[destIx] = (UChar)c;
+ mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
} else {
buf[destIx] = U16_TRAIL(c);
- mapToNative[destIx] = srcIx - toUCharsMapStart;
+ mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
buf[--destIx] = U16_LEAD(c);
- mapToNative[destIx] = srcIx - toUCharsMapStart;
+ mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
}
// Fill in the map from native indexes to UChars buf index.
do {
- mapToUChars[sIx-- - toUCharsMapStart] = destIx;
+ mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
} while (sIx >= srcIx);
// Set native indexing limit to be the current position.
{
UChar *pDest = dest;
- UChar *pDestLimit = dest+destCapacity;
+ UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
UChar32 ch=0;
int32_t index = 0;
int32_t reqLength = 0;
if(ch <=0x7f){
*pDest++=(UChar)ch;
}else{
- ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
- if(ch<0){
- ch = 0xfffd;
- }
- if(ch<=0xFFFF){
+ ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
+ if(U_IS_BMP(ch)){
*(pDest++)=(UChar)ch;
}else{
- *(pDest++)=UTF16_LEAD(ch);
+ *(pDest++)=U16_LEAD(ch);
if(pDest<pDestLimit){
- *(pDest++)=UTF16_TRAIL(ch);
+ *(pDest++)=U16_TRAIL(ch);
}else{
reqLength++;
break;
if(ch <= 0x7f){
reqLength++;
}else{
- ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
- if(ch<0){
- ch = 0xfffd;
- }
- reqLength+=UTF_CHAR_LENGTH(ch);
+ ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
+ reqLength+=U16_LENGTH(ch);
}
}
int i;
if (start32 < ut->chunkNativeLimit) {
for (i=0; i<3; i++) {
- if (U8_IS_LEAD(buf[start32]) || start32==0) {
+ if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
break;
}
start32--;
if (limit32 < ut->chunkNativeLimit) {
for (i=0; i<3; i++) {
- if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
+ if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
break;
}
limit32--;
utext_strFromUTF8(dest, destCapacity, &destLength,
(const char *)ut->context+start32, limit32-start32,
pErrorCode);
+ utf8TextAccess(ut, limit32, TRUE);
return destLength;
}
U_CDECL_END
-static struct UTextFuncs utf8Funcs =
+static const struct UTextFuncs utf8Funcs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
};
-U_DRAFT UText * U_EXPORT2
+static const char gEmptyString[] = {0};
+
+U_CAPI UText * U_EXPORT2
utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
if(U_FAILURE(*status)) {
return NULL;
}
+ if(s==NULL && length==0) {
+ s = gEmptyString;
+ }
+
if(s==NULL || length<-1 || length>INT32_MAX) {
*status=U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
UnicodeString buffer(dest, 0, destCapacity); // writable alias
rep->extractBetween(start32, limit32, buffer);
+ repTextAccess(ut, limit32, TRUE);
+
return u_terminateUChars(dest, destCapacity, length, status);
}
repTextAccess(ut, nativeIterIndex, TRUE);
}
-static struct UTextFuncs repFuncs =
+static const struct UTextFuncs repFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
repTextLength,
repTextAccess,
repTextExtract,
- repTextReplace,
- repTextCopy,
+ repTextReplace,
+ repTextCopy,
NULL, // MapOffsetToNative,
NULL, // MapIndexToUTF16,
repTextClose,
};
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
{
if(U_FAILURE(*status)) {
trimmedLength=destCapacity;
}
us->extract(start32, trimmedLength, dest);
+ t->chunkOffset = start32+trimmedLength;
+ } else {
+ t->chunkOffset = start32;
}
u_terminateUChars(dest, destCapacity, length, pErrorCode);
return length;
}
-static struct UTextFuncs unistrFuncs =
+static const struct UTextFuncs unistrFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
unistrTextLength,
unistrTextAccess,
unistrTextExtract,
- unistrTextReplace,
- unistrTextCopy,
+ unistrTextReplace,
+ unistrTextCopy,
NULL, // MapOffsetToNative,
NULL, // MapIndexToUTF16,
unistrTextClose,
U_CDECL_END
// utext_openUnicodeString: open a UText over a UnicodeString and mark it
// writable.
//   ut      passed straight through to utext_openConstUnicodeString.
//   s       the UnicodeString to wrap.
//   status  in/out error code; the writable flag is added only when it
//           indicates success after the delegated open.
//   returns the UText pointer produced by utext_openConstUnicodeString.
//
// What this hunk changes: the removed ("-") lines set up the UText by hand
// (utext_setup, function table, context, chunk fields, and both the
// STABLE_CHUNKS and WRITABLE property bits). The added ("+") lines replace
// that with a call to utext_openConstUnicodeString followed by OR-ing in
// the WRITABLE bit, which is what the removed TODO asked for. The
// declaration macro also changes from U_DRAFT to U_CAPI.
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
- // TODO: use openConstUnicodeString, then add in the differences.
- //
- ut = utext_setup(ut, 0, status);
+ ut = utext_openConstUnicodeString(ut, s, status);
if (U_SUCCESS(*status)) {
- ut->pFuncs = &unistrFuncs;
- ut->context = s;
- ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
- I32_FLAG(UTEXT_PROVIDER_WRITABLE);
-
- ut->chunkContents = s->getBuffer();
- ut->chunkLength = s->length();
- ut->chunkNativeStart = 0;
- ut->chunkNativeLimit = ut->chunkLength;
- ut->nativeIndexingLimit = ut->chunkLength;
// The only difference from the const open: add the WRITABLE property bit
// on top of whatever properties the delegated open established.
+ ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
}
return ut;
}
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
+ if (U_SUCCESS(*status) && s->isBogus()) {
+ // The UnicodeString is bogus, but we still need to detach the UText
+ // from whatever it was hooked to before, if anything.
+ utext_openUChars(ut, NULL, 0, status);
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return ut;
+ }
ut = utext_setup(ut, 0, status);
// note: use the standard (writable) function table for UnicodeString.
// The flag settings disable writing, so having the functions in
if (U16_IS_LEAD(str[chunkLimit-1])) {
--chunkLimit;
}
+ // Null-terminated chunk with end still unknown.
+ // Update the chunk length to reflect what has been scanned thus far.
+ // That the full length is still unknown is (still) flagged by
+ // ut->a being < 0.
ut->chunkNativeLimit = chunkLimit;
+ ut->nativeIndexingLimit = chunkLimit;
+ ut->chunkLength = chunkLimit;
}
}
return 0;
}
- const UChar *s=(const UChar *)ut->context;
+ //const UChar *s=(const UChar *)ut->context;
int32_t si, di;
int32_t start32;
// Pins 'start' to the length of the string, if it came in out-of-bounds.
// Snaps 'start' to the beginning of a code point.
ucstrTextAccess(ut, start, TRUE);
- U_ASSERT(start <= INT32_MAX);
- start32 = (int32_t)start;
+ const UChar *s=ut->chunkContents;
+ start32 = ut->chunkOffset;
int32_t strLength=(int32_t)ut->a;
if (strLength >= 0) {
} else {
limit32 = pinIndex(limit, INT32_MAX);
}
-
di = 0;
for (si=start32; si<limit32; si++) {
if (strLength<0 && s[si]==0) {
strLength = si;
break;
}
+ U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
if (di<destCapacity) {
// only store if there is space.
dest[di] = s[si];
if (strLength>=0) {
// We have filled the destination buffer, and the string length is known.
// Cut the loop short. There is no need to scan string termination.
- di = strLength;
+ di = limit32 - start32;
si = limit32;
break;
}
}
// Put iteration position at the point just following the extracted text
- ut->chunkOffset = si;
+ ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
// Add a terminating NUL if space in the buffer permits,
// and set the error status as required.
return di;
}
-static struct UTextFuncs ucstrFuncs =
+static const struct UTextFuncs ucstrFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
U_CDECL_END
+static const UChar gEmptyUString[] = {0};
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
if (U_FAILURE(*status)) {
return NULL;
}
- if (length < -1 || length>INT32_MAX) {
+ if(s==NULL && length==0) {
+ s = gEmptyUString;
+ }
+ if (s==NULL || length < -1 || length>INT32_MAX) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
int32_t limit32 = pinIndex(limit, length);
int32_t desti = 0;
int32_t srci;
+ int32_t copyLimit;
CharacterIterator *ci = (CharacterIterator *)ut->context;
ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
srci = ci->getIndex();
+ copyLimit = srci;
while (srci<limit32) {
UChar32 c = ci->next32PostInc();
int32_t len = U16_LENGTH(c);
+ U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
if (desti+len <= destCapacity) {
U16_APPEND_UNSAFE(dest, desti, c);
+ copyLimit = srci+len;
} else {
desti += len;
*status = U_BUFFER_OVERFLOW_ERROR;
srci += len;
}
+ charIterTextAccess(ut, copyLimit, TRUE);
+
u_terminateUChars(dest, destCapacity, desti, status);
return desti;
}
-static struct UTextFuncs charIterFuncs =
+static const struct UTextFuncs charIterFuncs =
{
sizeof(UTextFuncs),
0, 0, 0, // Reserved alignment padding
U_CDECL_END
-U_DRAFT UText * U_EXPORT2
+U_CAPI UText * U_EXPORT2
utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
if (U_FAILURE(*status)) {
return NULL;
}
return ut;
}
-
-
-