1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2005-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: utext.cpp
12 * tab size: 8 (not used)
15 * created on: 2005apr12
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
20 #include "unicode/ustring.h"
21 #include "unicode/unistr.h"
22 #include "unicode/chariter.h"
23 #include "unicode/utext.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf8.h"
26 #include "unicode/utf16.h"
// I32_FLAG   Build an int32_t mask that has only bit number `bitIndex` set.
//            The shift is done on an unsigned operand so that selecting bit 31
//            never shifts a 1 into the sign bit of a signed value; the result is
//            converted back to int32_t so that expressions such as
//            (props & I32_FLAG(x)) and ~I32_FLAG(x) keep their existing type.
#define I32_FLAG(bitIndex) ((int32_t)((uint32_t)1 << (bitIndex)))
39 utext_access(UText
*ut
, int64_t index
, UBool forward
) {
40 return ut
->pFuncs
->access(ut
, index
, forward
);
45 U_CAPI UBool U_EXPORT2
46 utext_moveIndex32(UText
*ut
, int32_t delta
) {
50 if(ut
->chunkOffset
>=ut
->chunkLength
&& !utext_access(ut
, ut
->chunkNativeLimit
, TRUE
)) {
53 c
= ut
->chunkContents
[ut
->chunkOffset
];
54 if (U16_IS_SURROGATE(c
)) {
56 if (c
== U_SENTINEL
) {
66 if(ut
->chunkOffset
<=0 && !utext_access(ut
, ut
->chunkNativeStart
, FALSE
)) {
69 c
= ut
->chunkContents
[ut
->chunkOffset
-1];
70 if (U16_IS_SURROGATE(c
)) {
71 c
= utext_previous32(ut
);
72 if (c
== U_SENTINEL
) {
85 U_CAPI
int64_t U_EXPORT2
86 utext_nativeLength(UText
*ut
) {
87 return ut
->pFuncs
->nativeLength(ut
);
91 U_CAPI UBool U_EXPORT2
92 utext_isLengthExpensive(const UText
*ut
) {
93 UBool r
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
)) != 0;
98 U_CAPI
int64_t U_EXPORT2
99 utext_getNativeIndex(const UText
*ut
) {
100 if(ut
->chunkOffset
<= ut
->nativeIndexingLimit
) {
101 return ut
->chunkNativeStart
+ut
->chunkOffset
;
103 return ut
->pFuncs
->mapOffsetToNative(ut
);
108 U_CAPI
void U_EXPORT2
109 utext_setNativeIndex(UText
*ut
, int64_t index
) {
110 // Apple note, at entry ut->chunkContents may be 0, not necessarily a problem
111 // (CF funcs will have set chunkNativeStart/Limit to 0 forcing call to access)
112 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
113 // The desired position is outside of the current chunk.
114 // Access the new position. Assume a forward iteration from here,
115 // which will also be optimum for a single random access.
116 // Reverse iterations may suffer slightly.
117 ut
->pFuncs
->access(ut
, index
, TRUE
);
118 } else if((int32_t)(index
- ut
->chunkNativeStart
) <= ut
->nativeIndexingLimit
) {
120 ut
->chunkOffset
=(int32_t)(index
-ut
->chunkNativeStart
);
122 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
124 // Apple note, it can still be valid to have ut->chunkContents==0 at this
125 // point (just not inside the next block), see <rdar://problem/53610517>
127 // The convention is that the index must always be on a code point boundary.
128 // Adjust the index position if it is in the middle of a surrogate pair.
129 if (ut
->chunkOffset
<ut
->chunkLength
) {
130 UChar c
= ut
->chunkContents
[ut
->chunkOffset
];
131 if (U16_IS_TRAIL(c
)) {
132 if (ut
->chunkOffset
==0) {
133 ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
);
135 if (ut
->chunkOffset
>0) {
136 UChar lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
137 if (U16_IS_LEAD(lead
)) {
147 U_CAPI
int64_t U_EXPORT2
148 utext_getPreviousNativeIndex(UText
*ut
) {
150 // Fast-path the common case.
151 // Common means current position is not at the beginning of a chunk
152 // and the preceding character is not supplementary.
154 int32_t i
= ut
->chunkOffset
- 1;
157 UChar c
= ut
->chunkContents
[i
];
158 if (U16_IS_TRAIL(c
) == FALSE
) {
159 if (i
<= ut
->nativeIndexingLimit
) {
160 result
= ut
->chunkNativeStart
+ i
;
163 result
= ut
->pFuncs
->mapOffsetToNative(ut
);
170 // If at the start of text, simply return 0.
171 if (ut
->chunkOffset
==0 && ut
->chunkNativeStart
==0) {
175 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
176 // Keep it simple, use other functions to handle the edges.
178 utext_previous32(ut
);
179 result
= UTEXT_GETNATIVEINDEX(ut
);
186 // utext_current32. Get the UChar32 at the current position.
187 // UText iteration position is always on a code point boundary,
188 // never on the trail half of a surrogate pair.
190 U_CAPI UChar32 U_EXPORT2
191 utext_current32(UText
*ut
) {
193 if (ut
->chunkOffset
==ut
->chunkLength
) {
194 // Current position is just off the end of the chunk.
195 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
196 // Off the end of the text.
201 c
= ut
->chunkContents
[ut
->chunkOffset
];
202 if (U16_IS_LEAD(c
) == FALSE
) {
203 // Normal, non-supplementary case.
208 // Possible supplementary char.
211 UChar32 supplementaryC
= c
;
212 if ((ut
->chunkOffset
+1) < ut
->chunkLength
) {
213 // The trail surrogate is in the same chunk.
214 trail
= ut
->chunkContents
[ut
->chunkOffset
+1];
216 // The trail surrogate is in a different chunk.
217 // Because we must maintain the iteration position, we need to switch forward
218 // into the new chunk, get the trail surrogate, then revert the chunk back to the
220 // An edge case to be careful of: the entire text may end with an unpaired
221 // leading surrogate. The attempt to access the trail will fail, but
222 // the original position before the unpaired lead still needs to be restored.
223 int64_t nativePosition
= ut
->chunkNativeLimit
;
224 int32_t originalOffset
= ut
->chunkOffset
;
225 if (ut
->pFuncs
->access(ut
, nativePosition
, TRUE
)) {
226 trail
= ut
->chunkContents
[ut
->chunkOffset
];
228 UBool r
= ut
->pFuncs
->access(ut
, nativePosition
, FALSE
); // reverse iteration flag loads preceding chunk
230 ut
->chunkOffset
= originalOffset
;
236 if (U16_IS_TRAIL(trail
)) {
237 supplementaryC
= U16_GET_SUPPLEMENTARY(c
, trail
);
239 return supplementaryC
;
244 U_CAPI UChar32 U_EXPORT2
245 utext_char32At(UText
*ut
, int64_t nativeIndex
) {
246 UChar32 c
= U_SENTINEL
;
248 // Fast path the common case.
249 if (nativeIndex
>=ut
->chunkNativeStart
&& nativeIndex
< ut
->chunkNativeStart
+ ut
->nativeIndexingLimit
) {
250 ut
->chunkOffset
= (int32_t)(nativeIndex
- ut
->chunkNativeStart
);
251 c
= ut
->chunkContents
[ut
->chunkOffset
];
252 if (U16_IS_SURROGATE(c
) == FALSE
) {
258 utext_setNativeIndex(ut
, nativeIndex
);
259 if (nativeIndex
>=ut
->chunkNativeStart
&& ut
->chunkOffset
<ut
->chunkLength
) {
260 c
= ut
->chunkContents
[ut
->chunkOffset
];
261 if (U16_IS_SURROGATE(c
)) {
262 // For surrogates, let current32() deal with the complications
263 // of supplementaries that may span chunk boundaries.
264 c
= utext_current32(ut
);
271 U_CAPI UChar32 U_EXPORT2
272 utext_next32(UText
*ut
) {
275 if (ut
->chunkOffset
>= ut
->chunkLength
) {
276 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
281 c
= ut
->chunkContents
[ut
->chunkOffset
++];
282 if (U16_IS_LEAD(c
) == FALSE
) {
283 // Normal case, not supplementary.
284 // (A trail surrogate seen here is just returned as is, as a surrogate value.
285 // It cannot be part of a pair.)
289 if (ut
->chunkOffset
>= ut
->chunkLength
) {
290 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
291 // c is an unpaired lead surrogate at the end of the text.
292 // return it as it is.
296 UChar32 trail
= ut
->chunkContents
[ut
->chunkOffset
];
297 if (U16_IS_TRAIL(trail
) == FALSE
) {
298 // c was an unpaired lead surrogate, not at the end of the text.
299 // return it as it is (unpaired). Iteration position is on the
300 // following character, possibly in the next chunk, where the
301 // trail surrogate would have been if it had existed.
305 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(c
, trail
);
306 ut
->chunkOffset
++; // move iteration position over the trail surrogate.
307 return supplementary
;
311 U_CAPI UChar32 U_EXPORT2
312 utext_previous32(UText
*ut
) {
315 if (ut
->chunkOffset
<= 0) {
316 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
321 c
= ut
->chunkContents
[ut
->chunkOffset
];
322 if (U16_IS_TRAIL(c
) == FALSE
) {
323 // Normal case, not supplementary.
324 // (A lead surrogate seen here is just returned as is, as a surrogate value.
325 // It cannot be part of a pair.)
329 if (ut
->chunkOffset
<= 0) {
330 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
331 // c is an unpaired trail surrogate at the start of the text.
332 // return it as it is.
337 UChar32 lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
338 if (U16_IS_LEAD(lead
) == FALSE
) {
339 // c was an unpaired trail surrogate, not at the end of the text.
340 // return it as it is (unpaired). Iteration position is at c
344 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(lead
, c
);
345 ut
->chunkOffset
--; // move iteration position over the lead surrogate.
346 return supplementary
;
351 U_CAPI UChar32 U_EXPORT2
352 utext_next32From(UText
*ut
, int64_t index
) {
353 UChar32 c
= U_SENTINEL
;
355 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
356 // Desired position is outside of the current chunk.
357 if(!ut
->pFuncs
->access(ut
, index
, TRUE
)) {
358 // no chunk available here
361 } else if (index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
362 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
363 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
365 // Desired position is in chunk, with non-UTF16 indexing.
366 ut
->chunkOffset
= ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
369 c
= ut
->chunkContents
[ut
->chunkOffset
++];
370 if (U16_IS_SURROGATE(c
)) {
371 // Surrogates. Many edge cases. Use other functions that already
372 // deal with the problems.
373 utext_setNativeIndex(ut
, index
);
374 c
= utext_next32(ut
);
380 U_CAPI UChar32 U_EXPORT2
381 utext_previous32From(UText
*ut
, int64_t index
) {
383 // Return the character preceding the specified index.
384 // Leave the iteration position at the start of the character that was returned.
386 UChar32 cPrev
; // The character preceding cCurr, which is what we will return.
388 // Address the chunk containing the position preceding the incoming index.
389 // A tricky edge case:
390 // We try to test the requested native index against the chunkNativeStart to determine
391 // whether the character preceding the one at the index is in the current chunk.
392 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
393 // requested index is on something other than the first position of the first char.
395 if(index
<=ut
->chunkNativeStart
|| index
>ut
->chunkNativeLimit
) {
396 // Requested native index is outside of the current chunk.
397 if(!ut
->pFuncs
->access(ut
, index
, FALSE
)) {
398 // no chunk available here
401 } else if(index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
402 // Direct UTF-16 indexing.
403 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
405 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
406 if (ut
->chunkOffset
==0 && !ut
->pFuncs
->access(ut
, index
, FALSE
)) {
407 // no chunk available here
413 // Simple case with no surrogates.
416 cPrev
= ut
->chunkContents
[ut
->chunkOffset
];
418 if (U16_IS_SURROGATE(cPrev
)) {
419 // Possible supplementary. Many edge cases.
420 // Let other functions do the heavy lifting.
421 utext_setNativeIndex(ut
, index
);
422 cPrev
= utext_previous32(ut
);
428 U_CAPI
int32_t U_EXPORT2
429 utext_extract(UText
*ut
,
430 int64_t start
, int64_t limit
,
431 UChar
*dest
, int32_t destCapacity
,
432 UErrorCode
*status
) {
433 return ut
->pFuncs
->extract(ut
, start
, limit
, dest
, destCapacity
, status
);
438 U_CAPI UBool U_EXPORT2
439 utext_equals(const UText
*a
, const UText
*b
) {
440 if (a
==NULL
|| b
==NULL
||
441 a
->magic
!= UTEXT_MAGIC
||
442 b
->magic
!= UTEXT_MAGIC
) {
443 // Null or invalid arguments don't compare equal to anything.
447 if (a
->pFuncs
!= b
->pFuncs
) {
448 // Different types of text providers.
452 if (a
->context
!= b
->context
) {
453 // Different sources (different strings)
456 if (utext_getNativeIndex(a
) != utext_getNativeIndex(b
)) {
457 // Different current position in the string.
464 U_CAPI UBool U_EXPORT2
465 utext_isWritable(const UText
*ut
)
467 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) != 0;
472 U_CAPI
void U_EXPORT2
473 utext_freeze(UText
*ut
) {
474 // Zero out the WRITABLE flag.
475 ut
->providerProperties
&= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE
));
479 U_CAPI UBool U_EXPORT2
480 utext_hasMetaData(const UText
*ut
)
482 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
)) != 0;
488 U_CAPI
int32_t U_EXPORT2
489 utext_replace(UText
*ut
,
490 int64_t nativeStart
, int64_t nativeLimit
,
491 const UChar
*replacementText
, int32_t replacementLength
,
494 if (U_FAILURE(*status
)) {
497 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
498 *status
= U_NO_WRITE_PERMISSION
;
501 int32_t i
= ut
->pFuncs
->replace(ut
, nativeStart
, nativeLimit
, replacementText
, replacementLength
, status
);
505 U_CAPI
void U_EXPORT2
506 utext_copy(UText
*ut
,
507 int64_t nativeStart
, int64_t nativeLimit
,
512 if (U_FAILURE(*status
)) {
515 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
516 *status
= U_NO_WRITE_PERMISSION
;
519 ut
->pFuncs
->copy(ut
, nativeStart
, nativeLimit
, destIndex
, move
, status
);
524 U_CAPI UText
* U_EXPORT2
525 utext_clone(UText
*dest
, const UText
*src
, UBool deep
, UBool readOnly
, UErrorCode
*status
) {
526 if (U_FAILURE(*status
)) {
529 UText
*result
= src
->pFuncs
->clone(dest
, src
, deep
, status
);
530 if (U_FAILURE(*status
)) {
533 if (result
== NULL
) {
534 *status
= U_MEMORY_ALLOCATION_ERROR
;
538 utext_freeze(result
);
545 //------------------------------------------------------------------------------
547 // UText common functions implementation
549 //------------------------------------------------------------------------------
// UText.flags bit definitions.
// Each enumerator is a distinct single-bit mask, tested and updated with &, |= and &= ~.
enum {
    // Set when ICU has allocated this UText struct on the heap.
    // Clear when the caller provided the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 1,

    // Set when ICU has allocated the extra storage as a separate allocation.
    // Clear when there is no separate allocation: either no extra storage was
    // requested, or it is appended to the end of the main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 2,

    // Set while this UText is open; clear when it is not open.
    UTEXT_OPEN                 = 4
};
570 // Extended form of a UText. The purpose is to aid in computing the total size required
571 // when a provider asks for a UText to be allocated with extra storage.
573 struct ExtendedUText
{
575 max_align_t extension
;
578 static const UText emptyText
= UTEXT_INITIALIZER
;
580 U_CAPI UText
* U_EXPORT2
581 utext_setup(UText
*ut
, int32_t extraSpace
, UErrorCode
*status
) {
582 if (U_FAILURE(*status
)) {
587 // We need to heap-allocate storage for the new UText
588 int32_t spaceRequired
= sizeof(UText
);
589 if (extraSpace
> 0) {
590 spaceRequired
= sizeof(ExtendedUText
) + extraSpace
- sizeof(max_align_t
);
592 ut
= (UText
*)uprv_malloc(spaceRequired
);
594 *status
= U_MEMORY_ALLOCATION_ERROR
;
598 ut
->flags
|= UTEXT_HEAP_ALLOCATED
;
599 if (spaceRequired
>0) {
600 ut
->extraSize
= extraSpace
;
601 ut
->pExtra
= &((ExtendedUText
*)ut
)->extension
;
605 // We have been supplied with an already existing UText.
606 // Verify that it really appears to be a UText.
607 if (ut
->magic
!= UTEXT_MAGIC
) {
608 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
611 // If the ut is already open and there's a provider supplied close
612 // function, call it.
613 if ((ut
->flags
& UTEXT_OPEN
) && ut
->pFuncs
->close
!= NULL
) {
614 ut
->pFuncs
->close(ut
);
616 ut
->flags
&= ~UTEXT_OPEN
;
618 // If extra space was requested by our caller, check whether
619 // sufficient already exists, and allocate new if needed.
620 if (extraSpace
> ut
->extraSize
) {
621 // Need more space. If there is existing separately allocated space,
622 // delete it first, then allocate new space.
623 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
624 uprv_free(ut
->pExtra
);
627 ut
->pExtra
= uprv_malloc(extraSpace
);
628 if (ut
->pExtra
== NULL
) {
629 *status
= U_MEMORY_ALLOCATION_ERROR
;
631 ut
->extraSize
= extraSpace
;
632 ut
->flags
|= UTEXT_EXTRA_HEAP_ALLOCATED
;
636 if (U_SUCCESS(*status
)) {
637 ut
->flags
|= UTEXT_OPEN
;
639 // Initialize all remaining fields of the UText.
642 ut
->chunkContents
= NULL
;
651 ut
->chunkNativeStart
= 0;
652 ut
->chunkNativeLimit
= 0;
653 ut
->nativeIndexingLimit
= 0;
654 ut
->providerProperties
= 0;
659 if (ut
->pExtra
!=NULL
&& ut
->extraSize
>0)
660 uprv_memset(ut
->pExtra
, 0, ut
->extraSize
);
667 U_CAPI UText
* U_EXPORT2
668 utext_close(UText
*ut
) {
670 ut
->magic
!= UTEXT_MAGIC
||
671 (ut
->flags
& UTEXT_OPEN
) == 0)
673 // The supplied ut is not an open UText.
678 // If the provider gave us a close function, call it now.
679 // This will clean up anything allocated specifically by the provider.
680 if (ut
->pFuncs
->close
!= NULL
) {
681 ut
->pFuncs
->close(ut
);
683 ut
->flags
&= ~UTEXT_OPEN
;
685 // If we (the framework) allocated the UText or subsidiary storage,
687 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
688 uprv_free(ut
->pExtra
);
690 ut
->flags
&= ~UTEXT_EXTRA_HEAP_ALLOCATED
;
694 // Zero out function table of the closed UText. This is a defensive move,
695 // intended to cause applications that inadvertently use a closed
696 // utext to crash with null pointer errors.
699 if (ut
->flags
& UTEXT_HEAP_ALLOCATED
) {
700 // This UText was allocated by UText setup. We need to free it.
701 // Clear magic, so we can detect if the user messes up and immediately
702 // tries to reopen another UText using the deleted storage.
714 // invalidateChunk Reset a chunk to have no contents, so that the next call
715 // to access will cause new data to load.
716 // This is needed when copy/move/replace operate directly on the
717 // backing text, potentially putting it out of sync with the
718 // contents in the chunk.
721 invalidateChunk(UText
*ut
) {
723 ut
->chunkNativeLimit
= 0;
724 ut
->chunkNativeStart
= 0;
726 ut
->nativeIndexingLimit
= 0;
730 // pinIndex Do range pinning on a native index parameter.
731 // 64 bit pinning is done in place.
732 // 32 bit truncated result is returned as a convenience for
733 // use in providers that don't need 64 bits.
// Clamp `index` in place to the range [0, limit] and return the clamped value
// truncated to 32 bits.
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    if (index < 0) {
        index = 0;
    } else if (index > limit) {
        index = limit;
    }
    return (int32_t)index;
}
748 // Pointer relocation function,
749 // a utility used by shallow clone.
750 // Adjust a pointer that refers to something within one UText (the source)
751 // to refer to the same relative offset within another UText (the target).
753 static void adjustPointer(UText
*dest
, const void **destPtr
, const UText
*src
) {
754 // convert all pointers to (char *) so that byte address arithmetic will work.
755 char *dptr
= (char *)*destPtr
;
756 char *dUText
= (char *)dest
;
757 char *sUText
= (char *)src
;
759 if (dptr
>= (char *)src
->pExtra
&& dptr
< ((char*)src
->pExtra
)+src
->extraSize
) {
760 // target ptr was to something within the src UText's pExtra storage.
761 // relocate it into the target UText's pExtra region.
762 *destPtr
= ((char *)dest
->pExtra
) + (dptr
- (char *)src
->pExtra
);
763 } else if (dptr
>=sUText
&& dptr
< sUText
+src
->sizeOfStruct
) {
764 // target ptr was pointing to somewhere within the source UText itself.
765 // Move it to the same offset within the target UText.
766 *destPtr
= dUText
+ (dptr
-sUText
);
772 // Clone. This is a generic copy-the-utext-by-value clone function that can be
773 // used as-is with some utext types, and as a helper by other clones.
775 static UText
* U_CALLCONV
776 shallowTextClone(UText
* dest
, const UText
* src
, UErrorCode
* status
) {
777 if (U_FAILURE(*status
)) {
780 int32_t srcExtraSize
= src
->extraSize
;
783 // Use the generic text_setup to allocate storage if required.
785 dest
= utext_setup(dest
, srcExtraSize
, status
);
786 if (U_FAILURE(*status
)) {
791 // flags (how the UText was allocated) and the pointer to the
792 // extra storage must retain the values in the cloned utext that
793 // were set up by utext_setup. Save them separately before
794 // copying the whole struct.
796 void *destExtra
= dest
->pExtra
;
797 int32_t flags
= dest
->flags
;
801 // Copy the whole UText struct by value.
802 // Any "Extra" storage is copied also.
804 int sizeToCopy
= src
->sizeOfStruct
;
805 if (sizeToCopy
> dest
->sizeOfStruct
) {
806 sizeToCopy
= dest
->sizeOfStruct
;
808 uprv_memcpy(dest
, src
, sizeToCopy
);
809 dest
->pExtra
= destExtra
;
811 if (srcExtraSize
> 0) {
812 uprv_memcpy(dest
->pExtra
, src
->pExtra
, srcExtraSize
);
816 // Relocate any pointers in the target that refer to the UText itself
817 // to point to the cloned copy rather than the original source.
819 adjustPointer(dest
, &dest
->context
, src
);
820 adjustPointer(dest
, &dest
->p
, src
);
821 adjustPointer(dest
, &dest
->q
, src
);
822 adjustPointer(dest
, &dest
->r
, src
);
823 adjustPointer(dest
, (const void **)&dest
->chunkContents
, src
);
825 // The newly shallow-cloned UText does _not_ own the underlying storage for the text.
826 // (The source for the clone may or may not have owned the text.)
828 dest
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
838 //------------------------------------------------------------------------------
840 // UText implementation for UTF-8 char * strings (read-only)
841 // Limitation: string length must be <= 0x7fffffff in length.
842 // (length must fit in an int32_t variable)
844 // Use of UText data members:
845 // context pointer to UTF-8 string
846 // utext.b is the input string length (bytes).
847 // utext.c Length scanned so far in string
848 // (for optimizing finding length of zero terminated strings.)
849 // utext.p pointer to the current buffer
850 // utext.q pointer to the other buffer.
852 //------------------------------------------------------------------------------
855 // Must be less than 85 (256/3), because of byte mapping from UChar indexes to native indexes.
856 // Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
858 // The longest illegal byte sequence treated as a single error (and converted to U+FFFD)
859 // is a three-byte sequence (truncated four-byte sequence).
// Number of UChars in the nominal chunk held by each UTF-8 buffer.
enum {
    UTF8_TEXT_CHUNK_SIZE = 32
};
864 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
865 // Each contains the UChar chunk buffer, the to and from native maps, and
868 // because backwards iteration fills the buffers starting at the end and
869 // working towards the front, the filled part of the buffers may not begin
870 // at the start of the available storage for the buffers.
872 // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
873 // the last character added being a supplementary, and thus requiring a surrogate
874 // pair. Doing this is simpler than checking for the edge case.
878 int32_t bufNativeStart
; // Native index of first char in UChar buf
879 int32_t bufNativeLimit
; // Native index following last char in buf.
880 int32_t bufStartIdx
; // First filled position in buf.
881 int32_t bufLimitIdx
; // Limit of filled range in buf.
882 int32_t bufNILimit
; // Limit of native indexing part of buf
883 int32_t toUCharsMapStart
; // Native index corresponding to
885 // Set to bufNativeStart when filling forwards.
886 // Set to computed value when filling backwards.
888 UChar buf
[UTF8_TEXT_CHUNK_SIZE
+4]; // The UChar buffer. Requires one extra position beyond the
889 // the chunk size, to allow for surrogate at the end.
890 // Length must be identical to mapToNative array, below,
891 // because of the way indexing works when the array is
892 // filled backwards during a reverse iteration. Thus,
893 // the additional extra size.
894 uint8_t mapToNative
[UTF8_TEXT_CHUNK_SIZE
+4]; // map UChar index in buf to
895 // native offset from bufNativeStart.
896 // Requires two extra slots,
897 // one for a supplementary starting in the last normal position,
898 // and one for an entry for the buffer limit position.
899 uint8_t mapToUChars
[UTF8_TEXT_CHUNK_SIZE
*3+6]; // Map native offset from bufNativeStart to
900 // corresponding offset in filled part of buf.
909 // Get the length of the string. If we don't already know it,
910 // we'll need to scan for the trailing nul.
912 static int64_t U_CALLCONV
913 utf8TextLength(UText
*ut
) {
915 // Zero terminated string, and we haven't scanned to the end yet.
917 const char *r
= (const char *)ut
->context
+ ut
->c
;
921 if ((r
- (const char *)ut
->context
) < 0x7fffffff) {
922 ut
->b
= (int32_t)(r
- (const char *)ut
->context
);
924 // Actual string was bigger (more than 2 gig) than we
925 // can handle. Clip it to 2 GB.
928 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
938 static UBool U_CALLCONV
939 utf8TextAccess(UText
*ut
, int64_t index
, UBool forward
) {
941 // Apologies to those who are allergic to goto statements.
942 // Consider each goto to a labelled block to be the equivalent of
943 // call the named block as if it were a function();
946 const uint8_t *s8
=(const uint8_t *)ut
->context
;
948 int32_t length
= ut
->b
; // Length of original utf-8
949 int32_t ix
= (int32_t)index
; // Requested index, trimmed to 32 bits.
950 int32_t mapIndex
= 0;
953 } else if (index
> 0x7fffffff) {
954 // Strings with 64 bit lengths not supported by this UTF-8 provider.
958 // Pin requested index to the string length.
962 } else if (ix
>=ut
->c
) {
963 // Zero terminated string, and requested index is beyond
964 // the region that has already been scanned.
965 // Scan up to either the end of the string or to the
966 // requested position, whichever comes first.
967 while (ut
->c
<ix
&& s8
[ut
->c
]!=0) {
970 // TODO: support for null terminated string length > 32 bits.
971 if (s8
[ut
->c
] == 0) {
972 // We just found the actual length of the string.
973 // Trim the requested index back to that.
977 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
983 // Dispatch to the appropriate action for a forward iteration request.
986 if (ix
==ut
->chunkNativeLimit
) {
987 // Check for normal sequential iteration cases first.
989 // Just reached end of string
990 // Don't swap buffers, but do set the
991 // current buffer position.
992 ut
->chunkOffset
= ut
->chunkLength
;
995 // End of current buffer.
996 // check whether other buffer already has what we need.
997 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
998 if (ix
>=altB
->bufNativeStart
&& ix
<altB
->bufNativeLimit
) {
1004 // A random access. Desired index could be in either or neither buf.
1005 // For optimizing the order of testing, first check for the index
1006 // being in the other buffer. This will be the case for uses that
1007 // move back and forth over a fairly limited range
1009 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
1010 if (ix
>=u8b
->bufNativeStart
&& ix
<u8b
->bufNativeLimit
) {
1011 // Requested index is in the other buffer.
1015 // Requested index is end-of-string.
1016 // (this is the case of randomly seeking to the end.
1017 // The case of iterating off the end is handled earlier.)
1018 if (ix
== ut
->chunkNativeLimit
) {
1019 // Current buffer extends up to the end of the string.
1020 // Leave it as the current buffer.
1021 ut
->chunkOffset
= ut
->chunkLength
;
1024 if (ix
== u8b
->bufNativeLimit
) {
1025 // Alternate buffer extends to the end of string.
1026 // Swap it in as the current buffer.
1027 goto swapBuffersAndFail
;
1030 // Neither existing buffer extends to the end of the string.
1031 goto makeStubBuffer
;
1034 if (ix
<ut
->chunkNativeStart
|| ix
>=ut
->chunkNativeLimit
) {
1035 // Requested index is in neither buffer.
1039 // Requested index is in this buffer.
1040 u8b
= (UTF8Buf
*)ut
->p
; // the current buffer
1041 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1042 U_ASSERT(mapIndex
< (int32_t)sizeof(UTF8Buf::mapToUChars
));
1043 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1051 // Dispatch to the appropriate action for a
1052 // Backwards Direction iteration request.
1054 if (ix
==ut
->chunkNativeStart
) {
1055 // Check for normal sequential iteration cases first.
1057 // Just reached the start of string
1058 // Don't swap buffers, but do set the
1059 // current buffer position.
1060 ut
->chunkOffset
= 0;
1063 // Start of current buffer.
1064 // check whether other buffer already has what we need.
1065 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
1066 if (ix
>altB
->bufNativeStart
&& ix
<=altB
->bufNativeLimit
) {
1072 // A random access. Desired index could be in either or neither buf.
1073 // For optimizing the order of testing,
1074 // Most likely case: in the other buffer.
1075 // Second most likely: in neither buffer.
1076 // Unlikely, but must work: in the current buffer.
1077 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
1078 if (ix
>u8b
->bufNativeStart
&& ix
<=u8b
->bufNativeLimit
) {
1079 // Requested index is in the other buffer.
1082 // Requested index is start-of-string.
1083 // (this is the case of randomly seeking to the start.
1084 // The case of iterating off the start is handled earlier.)
1086 if (u8b
->bufNativeStart
==0) {
1087 // Alternate buffer contains the data for the start string.
1088 // Make it be the current buffer.
1089 goto swapBuffersAndFail
;
1091 // Request for data before the start of string,
1092 // neither buffer is usable.
1093 // set up a zero-length buffer.
1094 goto makeStubBuffer
;
1098 if (ix
<=ut
->chunkNativeStart
|| ix
>ut
->chunkNativeLimit
) {
1099 // Requested index is in neither buffer.
1103 // Requested index is in this buffer.
1104 // Set the utf16 buffer index.
1105 u8b
= (UTF8Buf
*)ut
->p
;
1106 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1107 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1108 if (ut
->chunkOffset
==0) {
1109 // This occurs when the first character in the text is
1110 // a multi-byte UTF-8 char, and the requested index is to
1111 // one of the trailing bytes. Because there is no preceding
1112 // character, this access fails. We can't pick up on the
1113 // situation sooner because the requested index is not zero.
1122 // The alternate buffer (ut->q) has the string data that was requested.
1123 // Swap the primary and alternate buffers, and set the
1124 // chunk index into the new primary buffer.
1126 u8b
= (UTF8Buf
*)ut
->q
;
1129 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1130 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1131 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1132 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1133 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1135 // Index into the (now current) chunk
1136 // Use the map to set the chunk index. It's more trouble than it's worth
1137 // to check whether native indexing can be used.
1138 U_ASSERT(ix
>=u8b
->bufNativeStart
);
1139 U_ASSERT(ix
<=u8b
->bufNativeLimit
);
1140 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1141 U_ASSERT(mapIndex
>=0);
1142 U_ASSERT(mapIndex
<(int32_t)sizeof(u8b
->mapToUChars
));
1143 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1150 // We got a request for either the start or end of the string,
1151 // with iteration continuing in the out-of-bounds direction.
1152 // The alternate buffer already contains the data up to the
1154 // Swap the buffers, then return failure, indicating that we couldn't
1155 // make things correct for continuing the iteration in the requested
1156 // direction. The position & buffer are correct should the
1157 // user decide to iterate in the opposite direction.
1158 u8b
= (UTF8Buf
*)ut
->q
;
1161 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1162 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1163 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1164 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1165 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1167 // Index into the (now current) chunk
1168 // For this function (swapBuffersAndFail), the requested index
1169 // will always be at either the start or end of the chunk.
1170 if (ix
==u8b
->bufNativeLimit
) {
1171 ut
->chunkOffset
= ut
->chunkLength
;
1173 ut
->chunkOffset
= 0;
1174 U_ASSERT(ix
== u8b
->bufNativeStart
);
1179 // The user has done a seek/access past the start or end
1180 // of the string. Rather than loading data that is likely
1181 // to never be used, just set up a zero-length buffer at
1183 u8b
= (UTF8Buf
*)ut
->q
;
1184 u8b
->bufNativeStart
= ix
;
1185 u8b
->bufNativeLimit
= ix
;
1186 u8b
->bufStartIdx
= 0;
1187 u8b
->bufLimitIdx
= 0;
1188 u8b
->bufNILimit
= 0;
1189 u8b
->toUCharsMapStart
= ix
;
1190 u8b
->mapToNative
[0] = 0;
1191 u8b
->mapToUChars
[0] = 0;
1192 goto swapBuffersAndFail
;
1198 // Move the incoming index to a code point boundary.
1199 U8_SET_CP_START(s8
, 0, ix
);
1201 // Swap the UText buffers.
1202 // We want to fill what was previously the alternate buffer,
1203 // and make what was the current buffer be the new alternate.
1204 UTF8Buf
*u8b_swap
= (UTF8Buf
*)ut
->q
;
1208 int32_t strLen
= ut
->b
;
1209 UBool nulTerminated
= FALSE
;
1211 strLen
= 0x7fffffff;
1212 nulTerminated
= TRUE
;
1215 UChar
*buf
= u8b_swap
->buf
;
1216 uint8_t *mapToNative
= u8b_swap
->mapToNative
;
1217 uint8_t *mapToUChars
= u8b_swap
->mapToUChars
;
1220 UBool seenNonAscii
= FALSE
;
1223 // Fill the chunk buffer and mapping arrays.
1224 while (destIx
<UTF8_TEXT_CHUNK_SIZE
) {
1226 if (c
>0 && c
<0x80) {
1227 // Special case ASCII range for speed.
1228 // zero is excluded to simplify bounds checking.
1229 buf
[destIx
] = (UChar
)c
;
1230 mapToNative
[destIx
] = (uint8_t)(srcIx
- ix
);
1231 mapToUChars
[srcIx
-ix
] = (uint8_t)destIx
;
1235 // General case, handle everything.
1236 if (seenNonAscii
== FALSE
) {
1237 seenNonAscii
= TRUE
;
1238 u8b_swap
->bufNILimit
= destIx
;
1241 int32_t cIx
= srcIx
;
1242 int32_t dIx
= destIx
;
1243 int32_t dIxSaved
= destIx
;
1244 U8_NEXT_OR_FFFD(s8
, srcIx
, strLen
, c
);
1245 if (c
==0 && nulTerminated
) {
1250 U16_APPEND_UNSAFE(buf
, destIx
, c
);
1252 mapToNative
[dIx
++] = (uint8_t)(cIx
- ix
);
1253 } while (dIx
< destIx
);
1256 mapToUChars
[cIx
++ - ix
] = (uint8_t)dIxSaved
;
1257 } while (cIx
< srcIx
);
1259 if (srcIx
>=strLen
) {
1265 // store Native <--> Chunk Map entries for the end of the buffer.
1266 // There is no actual character here, but the index position is valid.
1267 mapToNative
[destIx
] = (uint8_t)(srcIx
- ix
);
1268 mapToUChars
[srcIx
- ix
] = (uint8_t)destIx
;
1270 // fill in Buffer descriptor
1271 u8b_swap
->bufNativeStart
= ix
;
1272 u8b_swap
->bufNativeLimit
= srcIx
;
1273 u8b_swap
->bufStartIdx
= 0;
1274 u8b_swap
->bufLimitIdx
= destIx
;
1275 if (seenNonAscii
== FALSE
) {
1276 u8b_swap
->bufNILimit
= destIx
;
1278 u8b_swap
->toUCharsMapStart
= u8b_swap
->bufNativeStart
;
1280 // Set UText chunk to refer to this buffer.
1281 ut
->chunkContents
= buf
;
1282 ut
->chunkOffset
= 0;
1283 ut
->chunkLength
= u8b_swap
->bufLimitIdx
;
1284 ut
->chunkNativeStart
= u8b_swap
->bufNativeStart
;
1285 ut
->chunkNativeLimit
= u8b_swap
->bufNativeLimit
;
1286 ut
->nativeIndexingLimit
= u8b_swap
->bufNILimit
;
1288 // For zero terminated strings, keep track of the maximum point
1290 if (nulTerminated
&& srcIx
>ut
->c
) {
1293 // We scanned to the end.
1294 // Remember the actual length.
1296 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1305 // Move the incoming index to a code point boundary.
1306 // Can only do this if the incoming index is somewhere in the interior of the string.
1307 // If index is at the end, there is no character there to look at.
1309 // Note: this function will only move the index back if it is on a trail byte
1310 // and there is a preceding lead byte and the sequence from the lead
1311 // through this trail could be part of a valid UTF-8 sequence
1312 // Otherwise the index remains unchanged.
1313 U8_SET_CP_START(s8
, 0, ix
);
1316 // Swap the UText buffers.
1317 // We want to fill what was previously the alternate buffer,
1318 // and make what was the current buffer be the new alternate.
1319 UTF8Buf
*u8b_swap
= (UTF8Buf
*)ut
->q
;
1323 UChar
*buf
= u8b_swap
->buf
;
1324 uint8_t *mapToNative
= u8b_swap
->mapToNative
;
1325 uint8_t *mapToUChars
= u8b_swap
->mapToUChars
;
1326 int32_t toUCharsMapStart
= ix
- sizeof(UTF8Buf::mapToUChars
) + 1;
1327 // Note that toUCharsMapStart can be negative. Happens when the remaining
1328 // text from current position to the beginning is less than the buffer size.
1329 // + 1 because mapToUChars must have a slot at the end for the bufNativeLimit entry.
1330 int32_t destIx
= UTF8_TEXT_CHUNK_SIZE
+2; // Start in the overflow region
1331 // at end of buffer to leave room
1332 // for a surrogate pair at the
1335 int32_t bufNILimit
= destIx
;
1338 // Map to/from Native Indexes, fill in for the position at the end of
1341 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1342 mapToUChars
[srcIx
- toUCharsMapStart
] = (uint8_t)destIx
;
1344 // Fill the chunk buffer
1345 // Work backwards, filling from the end of the buffer towards the front.
1347 while (destIx
>2 && (srcIx
- toUCharsMapStart
> 5) && (srcIx
> 0)) {
1351 // Get last byte of the UTF-8 character
1354 // Special case ASCII range for speed.
1355 buf
[destIx
] = (UChar
)c
;
1356 U_ASSERT(toUCharsMapStart
<= srcIx
);
1357 mapToUChars
[srcIx
- toUCharsMapStart
] = (uint8_t)destIx
;
1358 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1360 // General case, handle everything non-ASCII.
1362 int32_t sIx
= srcIx
; // ix of last byte of multi-byte u8 char
1364 // Get the full character from the UTF8 string.
1365 // use code derived from the macros in utf8.h
1366 // Leaves srcIx pointing at the first byte of the UTF-8 char.
1368 c
=utf8_prevCharSafeBody(s8
, 0, &srcIx
, c
, -3);
1369 // leaves srcIx at first byte of the multi-byte char.
1371 // Store the character in UTF-16 buffer.
1373 buf
[destIx
] = (UChar
)c
;
1374 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1376 buf
[destIx
] = U16_TRAIL(c
);
1377 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1378 buf
[--destIx
] = U16_LEAD(c
);
1379 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1382 // Fill in the map from native indexes to UChars buf index.
1384 mapToUChars
[sIx
-- - toUCharsMapStart
] = (uint8_t)destIx
;
1385 } while (sIx
>= srcIx
);
1386 U_ASSERT(toUCharsMapStart
<= (srcIx
+1));
1388 // Set native indexing limit to be the current position.
1389 // We are processing a non-ascii, non-native-indexing char now;
1390 // the limit will be here if the rest of the chars to be
1391 // added to this buffer are ascii.
1392 bufNILimit
= destIx
;
1395 u8b_swap
->bufNativeStart
= srcIx
;
1396 u8b_swap
->bufNativeLimit
= ix
;
1397 u8b_swap
->bufStartIdx
= destIx
;
1398 u8b_swap
->bufLimitIdx
= UTF8_TEXT_CHUNK_SIZE
+2;
1399 u8b_swap
->bufNILimit
= bufNILimit
- u8b_swap
->bufStartIdx
;
1400 u8b_swap
->toUCharsMapStart
= toUCharsMapStart
;
1402 ut
->chunkContents
= &buf
[u8b_swap
->bufStartIdx
];
1403 ut
->chunkLength
= u8b_swap
->bufLimitIdx
- u8b_swap
->bufStartIdx
;
1404 ut
->chunkOffset
= ut
->chunkLength
;
1405 ut
->chunkNativeStart
= u8b_swap
->bufNativeStart
;
1406 ut
->chunkNativeLimit
= u8b_swap
->bufNativeLimit
;
1407 ut
->nativeIndexingLimit
= u8b_swap
->bufNILimit
;
1416 // This is a slightly modified copy of u_strFromUTF8,
1417 // Inserts a Replacement Char rather than failing on invalid UTF-8
1418 // Removes unnecessary features.
1421 utext_strFromUTF8(UChar
*dest
,
1422 int32_t destCapacity
,
1423 int32_t *pDestLength
,
1425 int32_t srcLength
, // required. NUL terminated not supported.
1426 UErrorCode
*pErrorCode
1430 UChar
*pDest
= dest
;
1431 UChar
*pDestLimit
= (dest
!=NULL
)?(dest
+destCapacity
):NULL
;
1434 int32_t reqLength
= 0;
1435 uint8_t* pSrc
= (uint8_t*) src
;
1438 while((index
< srcLength
)&&(pDest
<pDestLimit
)){
1443 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -3);
1445 *(pDest
++)=(UChar
)ch
;
1447 *(pDest
++)=U16_LEAD(ch
);
1448 if(pDest
<pDestLimit
){
1449 *(pDest
++)=U16_TRAIL(ch
);
1457 /* do not fill the dest buffer; just count the UChars needed */
1458 while(index
< srcLength
){
1463 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -3);
1464 reqLength
+=U16_LENGTH(ch
);
1468 reqLength
+=(int32_t)(pDest
- dest
);
1471 *pDestLength
= reqLength
;
1474 /* Terminate the buffer */
1475 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
1482 static int32_t U_CALLCONV
1483 utf8TextExtract(UText
*ut
,
1484 int64_t start
, int64_t limit
,
1485 UChar
*dest
, int32_t destCapacity
,
1486 UErrorCode
*pErrorCode
) {
1487 if(U_FAILURE(*pErrorCode
)) {
1490 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1491 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1494 int32_t length
= ut
->b
;
1495 int32_t start32
= pinIndex(start
, length
);
1496 int32_t limit32
= pinIndex(limit
, length
);
1498 if(start32
>limit32
) {
1499 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1504 // adjust the incoming indexes to land on code point boundaries if needed.
1505 // adjust by no more than three, because that is the largest number of trail bytes
1506 // in a well formed UTF8 character.
1507 const uint8_t *buf
= (const uint8_t *)ut
->context
;
1509 if (start32
< ut
->chunkNativeLimit
) {
1510 for (i
=0; i
<3; i
++) {
1511 if (U8_IS_SINGLE(buf
[start32
]) || U8_IS_LEAD(buf
[start32
]) || start32
==0) {
1518 if (limit32
< ut
->chunkNativeLimit
) {
1519 for (i
=0; i
<3; i
++) {
1520 if (U8_IS_SINGLE(buf
[limit32
]) || U8_IS_LEAD(buf
[limit32
]) || limit32
==0) {
1527 // Do the actual extract.
1528 int32_t destLength
=0;
1529 utext_strFromUTF8(dest
, destCapacity
, &destLength
,
1530 (const char *)ut
->context
+start32
, limit32
-start32
,
1532 utf8TextAccess(ut
, limit32
, TRUE
);
1537 // utf8TextMapOffsetToNative
1539 // Map a chunk (UTF-16) offset to a native index.
1540 static int64_t U_CALLCONV
1541 utf8TextMapOffsetToNative(const UText
*ut
) {
1543 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1544 U_ASSERT(ut
->chunkOffset
>ut
->nativeIndexingLimit
&& ut
->chunkOffset
<=ut
->chunkLength
);
1545 int32_t nativeOffset
= u8b
->mapToNative
[ut
->chunkOffset
+ u8b
->bufStartIdx
] + u8b
->toUCharsMapStart
;
1546 U_ASSERT(nativeOffset
>= ut
->chunkNativeStart
&& nativeOffset
<= ut
->chunkNativeLimit
);
1547 return nativeOffset
;
1551 // Map a native index to the corrsponding chunk offset
1553 static int32_t U_CALLCONV
1554 utf8TextMapIndexToUTF16(const UText
*ut
, int64_t index64
) {
1555 U_ASSERT(index64
<= 0x7fffffff);
1556 int32_t index
= (int32_t)index64
;
1557 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1558 U_ASSERT(index
>=ut
->chunkNativeStart
+ut
->nativeIndexingLimit
);
1559 U_ASSERT(index
<=ut
->chunkNativeLimit
);
1560 int32_t mapIndex
= index
- u8b
->toUCharsMapStart
;
1561 U_ASSERT(mapIndex
< (int32_t)sizeof(UTF8Buf::mapToUChars
));
1562 int32_t offset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1563 U_ASSERT(offset
>=0 && offset
<=ut
->chunkLength
);
1567 static UText
* U_CALLCONV
1568 utf8TextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
)
1570 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1571 dest
= shallowTextClone(dest
, src
, status
);
1573 // For deep clones, make a copy of the string.
1574 // The copied storage is owned by the newly created clone.
1576 // TODO: There is an isssue with using utext_nativeLength().
1577 // That function is non-const in cases where the input was NUL terminated
1578 // and the length has not yet been determined.
1579 // This function (clone()) is const.
1580 // There potentially a thread safety issue lurking here.
1582 if (deep
&& U_SUCCESS(*status
)) {
1583 int32_t len
= (int32_t)utext_nativeLength((UText
*)src
);
1584 char *copyStr
= (char *)uprv_malloc(len
+1);
1585 if (copyStr
== NULL
) {
1586 *status
= U_MEMORY_ALLOCATION_ERROR
;
1588 uprv_memcpy(copyStr
, src
->context
, len
+1);
1589 dest
->context
= copyStr
;
1590 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
1597 static void U_CALLCONV
1598 utf8TextClose(UText
*ut
) {
1599 // Most of the work of close is done by the generic UText framework close.
1600 // All that needs to be done here is to delete the UTF8 string if the UText
1601 // owns it. This occurs if the UText was created by cloning.
1602 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
1603 char *s
= (char *)ut
->context
;
1612 static const struct UTextFuncs utf8Funcs
=
1615 0, 0, 0, // Reserved alignment padding
1622 utf8TextMapOffsetToNative
,
1623 utf8TextMapIndexToUTF16
,
1631 static const char gEmptyString
[] = {0};
1633 U_CAPI UText
* U_EXPORT2
1634 utext_openUTF8(UText
*ut
, const char *s
, int64_t length
, UErrorCode
*status
) {
1635 if(U_FAILURE(*status
)) {
1638 if(s
==NULL
&& length
==0) {
1642 if(s
==NULL
|| length
<-1 || length
>INT32_MAX
) {
1643 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1647 ut
= utext_setup(ut
, sizeof(UTF8Buf
) * 2, status
);
1648 if (U_FAILURE(*status
)) {
1652 ut
->pFuncs
= &utf8Funcs
;
1654 ut
->b
= (int32_t)length
;
1655 ut
->c
= (int32_t)length
;
1658 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1661 ut
->q
= (char *)ut
->pExtra
+ sizeof(UTF8Buf
);
1673 //------------------------------------------------------------------------------
1675 // UText implementation wrapper for Replaceable (read/write)
1677 // Use of UText data members:
1678 // context pointer to Replaceable.
1679 // p pointer to Replaceable if it is owned by the UText.
1681 //------------------------------------------------------------------------------
1685 // minimum chunk size for this implementation: 3
1686 // to allow for possible trimming for code point boundaries
1687 enum { REP_TEXT_CHUNK_SIZE
=10 };
1692 * +1 to simplify filling with surrogate pair at the end.
1694 UChar s
[REP_TEXT_CHUNK_SIZE
+1];
1700 static UText
* U_CALLCONV
1701 repTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
1702 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1703 dest
= shallowTextClone(dest
, src
, status
);
1705 // For deep clones, make a copy of the Replaceable.
1706 // The copied Replaceable storage is owned by the newly created UText clone.
1707 // A non-NULL pointer in UText.p is the signal to the close() function to delete
1710 if (deep
&& U_SUCCESS(*status
)) {
1711 const Replaceable
*replSrc
= (const Replaceable
*)src
->context
;
1712 dest
->context
= replSrc
->clone();
1713 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
1715 // with deep clone, the copy is writable, even when the source is not.
1716 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
1722 static void U_CALLCONV
1723 repTextClose(UText
*ut
) {
1724 // Most of the work of close is done by the generic UText framework close.
1725 // All that needs to be done here is delete the Replaceable if the UText
1726 // owns it. This occurs if the UText was created by cloning.
1727 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
1728 Replaceable
*rep
= (Replaceable
*)ut
->context
;
1735 static int64_t U_CALLCONV
1736 repTextLength(UText
*ut
) {
1737 const Replaceable
*replSrc
= (const Replaceable
*)ut
->context
;
1738 int32_t len
= replSrc
->length();
1743 static UBool U_CALLCONV
1744 repTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
1745 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
1746 int32_t length
=rep
->length(); // Full length of the input text (bigger than a chunk)
1748 // clip the requested index to the limits of the text.
1749 int32_t index32
= pinIndex(index
, length
);
1750 U_ASSERT(index
<=INT32_MAX
);
1754 * Compute start/limit boundaries around index, for a segment of text
1756 * To allow for the possibility that our user gave an index to the trailing
1757 * half of a surrogate pair, we must request one extra preceding UChar when
1758 * going in the forward direction. This will ensure that the buffer has the
1759 * entire code point at the specified index.
1763 if (index32
>=ut
->chunkNativeStart
&& index32
<ut
->chunkNativeLimit
) {
1764 // Buffer already contains the requested position.
1765 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
1768 if (index32
>=length
&& ut
->chunkNativeLimit
==length
) {
1769 // Request for end of string, and buffer already extends up to it.
1770 // Can't get the data, but don't change the buffer.
1771 ut
->chunkOffset
= length
- (int32_t)ut
->chunkNativeStart
;
1775 ut
->chunkNativeLimit
= index
+ REP_TEXT_CHUNK_SIZE
- 1;
1776 // Going forward, so we want to have the buffer with stuff at and beyond
1777 // the requested index. The -1 gets us one code point before the
1778 // requested index also, to handle the case of the index being on
1779 // a trail surrogate of a surrogate pair.
1780 if(ut
->chunkNativeLimit
> length
) {
1781 ut
->chunkNativeLimit
= length
;
1783 // unless buffer ran off end, start is index-1.
1784 ut
->chunkNativeStart
= ut
->chunkNativeLimit
- REP_TEXT_CHUNK_SIZE
;
1785 if(ut
->chunkNativeStart
< 0) {
1786 ut
->chunkNativeStart
= 0;
1789 // Reverse iteration. Fill buffer with data preceding the requested index.
1790 if (index32
>ut
->chunkNativeStart
&& index32
<=ut
->chunkNativeLimit
) {
1791 // Requested position already in buffer.
1792 ut
->chunkOffset
= index32
- (int32_t)ut
->chunkNativeStart
;
1795 if (index32
==0 && ut
->chunkNativeStart
==0) {
1796 // Request for start, buffer already begins at start.
1797 // No data, but keep the buffer as is.
1798 ut
->chunkOffset
= 0;
1802 // Figure out the bounds of the chunk to extract for reverse iteration.
1803 // Need to worry about chunk not splitting surrogate pairs, and while still
1804 // containing the data we need.
1805 // Fix by requesting a chunk that includes an extra UChar at the end.
1806 // If this turns out to be a lead surrogate, we can lop it off and still have
1807 // the data we wanted.
1808 ut
->chunkNativeStart
= index32
+ 1 - REP_TEXT_CHUNK_SIZE
;
1809 if (ut
->chunkNativeStart
< 0) {
1810 ut
->chunkNativeStart
= 0;
1813 ut
->chunkNativeLimit
= index32
+ 1;
1814 if (ut
->chunkNativeLimit
> length
) {
1815 ut
->chunkNativeLimit
= length
;
1819 // Extract the new chunk of text from the Replaceable source.
1820 ReplExtra
*ex
= (ReplExtra
*)ut
->pExtra
;
1821 // UnicodeString with its buffer a writable alias to the chunk buffer
1822 UnicodeString
buffer(ex
->s
, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE
/*buffer capacity*/);
1823 rep
->extractBetween((int32_t)ut
->chunkNativeStart
, (int32_t)ut
->chunkNativeLimit
, buffer
);
1825 ut
->chunkContents
= ex
->s
;
1826 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
- ut
->chunkNativeStart
);
1827 ut
->chunkOffset
= (int32_t)(index32
- ut
->chunkNativeStart
);
1829 // Surrogate pairs from the input text must not span chunk boundaries.
1830 // If end of chunk could be the start of a surrogate, trim it off.
1831 if (ut
->chunkNativeLimit
< length
&&
1832 U16_IS_LEAD(ex
->s
[ut
->chunkLength
-1])) {
1834 ut
->chunkNativeLimit
--;
1835 if (ut
->chunkOffset
> ut
->chunkLength
) {
1836 ut
->chunkOffset
= ut
->chunkLength
;
1840 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
1842 if(ut
->chunkNativeStart
>0 && U16_IS_TRAIL(ex
->s
[0])) {
1843 ++(ut
->chunkContents
);
1844 ++(ut
->chunkNativeStart
);
1845 --(ut
->chunkLength
);
1846 --(ut
->chunkOffset
);
1849 // adjust the index/chunkOffset to a code point boundary
1850 U16_SET_CP_START(ut
->chunkContents
, 0, ut
->chunkOffset
);
1852 // Use fast indexing for get/setNativeIndex()
1853 ut
->nativeIndexingLimit
= ut
->chunkLength
;
1860 static int32_t U_CALLCONV
1861 repTextExtract(UText
*ut
,
1862 int64_t start
, int64_t limit
,
1863 UChar
*dest
, int32_t destCapacity
,
1864 UErrorCode
*status
) {
1865 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
1866 int32_t length
=rep
->length();
1868 if(U_FAILURE(*status
)) {
1871 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1872 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1875 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1879 int32_t start32
= pinIndex(start
, length
);
1880 int32_t limit32
= pinIndex(limit
, length
);
1882 // adjust start, limit if they point to trail half of surrogates
1883 if (start32
<length
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
1884 U_IS_SUPPLEMENTARY(rep
->char32At(start32
))){
1887 if (limit32
<length
&& U16_IS_TRAIL(rep
->charAt(limit32
)) &&
1888 U_IS_SUPPLEMENTARY(rep
->char32At(limit32
))){
1892 length
=limit32
-start32
;
1893 if(length
>destCapacity
) {
1894 limit32
= start32
+ destCapacity
;
1896 UnicodeString
buffer(dest
, 0, destCapacity
); // writable alias
1897 rep
->extractBetween(start32
, limit32
, buffer
);
1898 repTextAccess(ut
, limit32
, TRUE
);
1900 return u_terminateUChars(dest
, destCapacity
, length
, status
);
1903 static int32_t U_CALLCONV
1904 repTextReplace(UText
*ut
,
1905 int64_t start
, int64_t limit
,
1906 const UChar
*src
, int32_t length
,
1907 UErrorCode
*status
) {
1908 Replaceable
*rep
=(Replaceable
*)ut
->context
;
1911 if(U_FAILURE(*status
)) {
1914 if(src
==NULL
&& length
!=0) {
1915 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1918 oldLength
=rep
->length(); // will subtract from new length
1920 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1924 int32_t start32
= pinIndex(start
, oldLength
);
1925 int32_t limit32
= pinIndex(limit
, oldLength
);
1927 // Snap start & limit to code point boundaries.
1928 if (start32
<oldLength
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
1929 start32
>0 && U16_IS_LEAD(rep
->charAt(start32
-1)))
1933 if (limit32
<oldLength
&& U16_IS_LEAD(rep
->charAt(limit32
-1)) &&
1934 U16_IS_TRAIL(rep
->charAt(limit32
)))
1939 // Do the actual replace operation using methods of the Replaceable class
1940 UnicodeString
replStr((UBool
)(length
<0), src
, length
); // read-only alias
1941 rep
->handleReplaceBetween(start32
, limit32
, replStr
);
1942 int32_t newLength
= rep
->length();
1943 int32_t lengthDelta
= newLength
- oldLength
;
1945 // Is the UText chunk buffer OK?
1946 if (ut
->chunkNativeLimit
> start32
) {
1947 // this replace operation may have impacted the current chunk.
1948 // invalidate it, which will force a reload on the next access.
1949 invalidateChunk(ut
);
1952 // set the iteration position to the end of the newly inserted replacement text.
1953 int32_t newIndexPos
= limit32
+ lengthDelta
;
1954 repTextAccess(ut
, newIndexPos
, TRUE
);
1960 static void U_CALLCONV
1961 repTextCopy(UText
*ut
,
1962 int64_t start
, int64_t limit
,
1967 Replaceable
*rep
=(Replaceable
*)ut
->context
;
1968 int32_t length
=rep
->length();
1970 if(U_FAILURE(*status
)) {
1973 if (start
>limit
|| (start
<destIndex
&& destIndex
<limit
))
1975 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1979 int32_t start32
= pinIndex(start
, length
);
1980 int32_t limit32
= pinIndex(limit
, length
);
1981 int32_t destIndex32
= pinIndex(destIndex
, length
);
1983 // TODO: snap input parameters to code point boundaries.
1986 // move: copy to destIndex, then replace original with nothing
1987 int32_t segLength
=limit32
-start32
;
1988 rep
->copy(start32
, limit32
, destIndex32
);
1989 if(destIndex32
<start32
) {
1993 rep
->handleReplaceBetween(start32
, limit32
, UnicodeString());
1996 rep
->copy(start32
, limit32
, destIndex32
);
1999 // If the change to the text touched the region in the chunk buffer,
2000 // invalidate the buffer.
2001 int32_t firstAffectedIndex
= destIndex32
;
2002 if (move
&& start32
<firstAffectedIndex
) {
2003 firstAffectedIndex
= start32
;
2005 if (firstAffectedIndex
< ut
->chunkNativeLimit
) {
2006 // changes may have affected range covered by the chunk
2007 invalidateChunk(ut
);
2010 // Put iteration position at the newly inserted (moved) block,
2011 int32_t nativeIterIndex
= destIndex32
+ limit32
- start32
;
2012 if (move
&& destIndex32
>start32
) {
2013 // moved a block of text towards the end of the string.
2014 nativeIterIndex
= destIndex32
;
2017 // Set position, reload chunk if needed.
2018 repTextAccess(ut
, nativeIterIndex
, TRUE
);
2021 static const struct UTextFuncs repFuncs
=
2024 0, 0, 0, // Reserved alignment padding
2031 NULL
, // MapOffsetToNative,
2032 NULL
, // MapIndexToUTF16,
2040 U_CAPI UText
* U_EXPORT2
2041 utext_openReplaceable(UText
*ut
, Replaceable
*rep
, UErrorCode
*status
)
2043 if(U_FAILURE(*status
)) {
2047 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2050 ut
= utext_setup(ut
, sizeof(ReplExtra
), status
);
2051 if(U_FAILURE(*status
)) {
2055 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2056 if(rep
->hasMetaData()) {
2057 ut
->providerProperties
|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
);
2060 ut
->pFuncs
= &repFuncs
;
2074 //------------------------------------------------------------------------------
2076 // UText implementation for UnicodeString (read/write) and
2077 // for const UnicodeString (read only)
2078 // (same implementation, only the flags are different)
2080 // Use of UText data members:
2081 // context pointer to UnicodeString
2082 // p pointer to UnicodeString IF this UText owns the string
2083 // and it must be deleted on close(). NULL otherwise.
2085 //------------------------------------------------------------------------------
2090 static UText
* U_CALLCONV
2091 unistrTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
2092 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2093 dest
= shallowTextClone(dest
, src
, status
);
2095 // For deep clones, make a copy of the UnicodeSring.
2096 // The copied UnicodeString storage is owned by the newly created UText clone.
2097 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2100 if (deep
&& U_SUCCESS(*status
)) {
2101 const UnicodeString
*srcString
= (const UnicodeString
*)src
->context
;
2102 dest
->context
= new UnicodeString(*srcString
);
2103 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2105 // with deep clone, the copy is writable, even when the source is not.
2106 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2111 static void U_CALLCONV
2112 unistrTextClose(UText
*ut
) {
2113 // Most of the work of close is done by the generic UText framework close.
2114 // All that needs to be done here is delete the UnicodeString if the UText
2115 // owns it. This occurs if the UText was created by cloning.
2116 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2117 UnicodeString
*str
= (UnicodeString
*)ut
->context
;
2124 static int64_t U_CALLCONV
2125 unistrTextLength(UText
*t
) {
2126 return ((const UnicodeString
*)t
->context
)->length();
2130 static UBool U_CALLCONV
2131 unistrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2132 int32_t length
= ut
->chunkLength
;
2133 ut
->chunkOffset
= pinIndex(index
, length
);
2135 // Check whether request is at the start or end
2136 UBool retVal
= (forward
&& index
<length
) || (!forward
&& index
>0);
2142 static int32_t U_CALLCONV
2143 unistrTextExtract(UText
*t
,
2144 int64_t start
, int64_t limit
,
2145 UChar
*dest
, int32_t destCapacity
,
2146 UErrorCode
*pErrorCode
) {
2147 const UnicodeString
*us
=(const UnicodeString
*)t
->context
;
2148 int32_t length
=us
->length();
2150 if(U_FAILURE(*pErrorCode
)) {
2153 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
2154 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2156 if(start
<0 || start
>limit
) {
2157 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2161 int32_t start32
= start
<length
? us
->getChar32Start((int32_t)start
) : length
;
2162 int32_t limit32
= limit
<length
? us
->getChar32Start((int32_t)limit
) : length
;
2164 length
=limit32
-start32
;
2165 if (destCapacity
>0 && dest
!=NULL
) {
2166 int32_t trimmedLength
= length
;
2167 if(trimmedLength
>destCapacity
) {
2168 trimmedLength
=destCapacity
;
2170 us
->extract(start32
, trimmedLength
, dest
);
2171 t
->chunkOffset
= start32
+trimmedLength
;
2173 t
->chunkOffset
= start32
;
2175 u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
2179 static int32_t U_CALLCONV
2180 unistrTextReplace(UText
*ut
,
2181 int64_t start
, int64_t limit
,
2182 const UChar
*src
, int32_t length
,
2183 UErrorCode
*pErrorCode
) {
2184 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2187 if(U_FAILURE(*pErrorCode
)) {
2190 if(src
==NULL
&& length
!=0) {
2191 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2194 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2197 oldLength
=us
->length();
2198 int32_t start32
= pinIndex(start
, oldLength
);
2199 int32_t limit32
= pinIndex(limit
, oldLength
);
2200 if (start32
< oldLength
) {
2201 start32
= us
->getChar32Start(start32
);
2203 if (limit32
< oldLength
) {
2204 limit32
= us
->getChar32Start(limit32
);
2208 us
->replace(start32
, limit32
-start32
, src
, length
);
2209 int32_t newLength
= us
->length();
2211 // Update the chunk description.
2212 ut
->chunkContents
= us
->getBuffer();
2213 ut
->chunkLength
= newLength
;
2214 ut
->chunkNativeLimit
= newLength
;
2215 ut
->nativeIndexingLimit
= newLength
;
2217 // Set iteration position to the point just following the newly inserted text.
2218 int32_t lengthDelta
= newLength
- oldLength
;
2219 ut
->chunkOffset
= limit32
+ lengthDelta
;
2224 static void U_CALLCONV
2225 unistrTextCopy(UText
*ut
,
2226 int64_t start
, int64_t limit
,
2229 UErrorCode
*pErrorCode
) {
2230 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2231 int32_t length
=us
->length();
2233 if(U_FAILURE(*pErrorCode
)) {
2236 int32_t start32
= pinIndex(start
, length
);
2237 int32_t limit32
= pinIndex(limit
, length
);
2238 int32_t destIndex32
= pinIndex(destIndex
, length
);
2240 if( start32
>limit32
|| (start32
<destIndex32
&& destIndex32
<limit32
)) {
2241 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2246 // move: copy to destIndex, then remove original
2247 int32_t segLength
=limit32
-start32
;
2248 us
->copy(start32
, limit32
, destIndex32
);
2249 if(destIndex32
<start32
) {
2252 us
->remove(start32
, segLength
);
2255 us
->copy(start32
, limit32
, destIndex32
);
2258 // update chunk description, set iteration position.
2259 ut
->chunkContents
= us
->getBuffer();
2261 // copy operation, string length grows
2262 ut
->chunkLength
+= limit32
-start32
;
2263 ut
->chunkNativeLimit
= ut
->chunkLength
;
2264 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2267 // Iteration position to end of the newly inserted text.
2268 ut
->chunkOffset
= destIndex32
+limit32
-start32
;
2269 if (move
&& destIndex32
>start32
) {
2270 ut
->chunkOffset
= destIndex32
;
2275 static const struct UTextFuncs unistrFuncs
=
2278 0, 0, 0, // Reserved alignment padding
2285 NULL
, // MapOffsetToNative,
2286 NULL
, // MapIndexToUTF16,
2298 U_CAPI UText
* U_EXPORT2
2299 utext_openUnicodeString(UText
*ut
, UnicodeString
*s
, UErrorCode
*status
) {
2300 ut
= utext_openConstUnicodeString(ut
, s
, status
);
2301 if (U_SUCCESS(*status
)) {
2302 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
// Open a read-only UText over a UnicodeString.
//   ut      UText to (re)use; passed through utext_setup().
//   s       the string to wrap; a bogus string is rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   status  ICU error code, in/out.
// On success the chunk fields are taken from s->getBuffer() / s->length() with
// native start 0, so the single chunk spans the whole string. Only the
// STABLE_CHUNKS property is set here; utext_openUnicodeString() adds WRITABLE.
2309 U_CAPI UText
* U_EXPORT2
2310 utext_openConstUnicodeString(UText
*ut
, const UnicodeString
*s
, UErrorCode
*status
) {
2311 if (U_SUCCESS(*status
) && s
->isBogus()) {
2312 // The UnicodeString is bogus, but we still need to detach the UText
2313 // from whatever it was hooked to before, if anything.
2314 utext_openUChars(ut
, NULL
, 0, status
);
2315 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2318 ut
= utext_setup(ut
, 0, status
);
2319 // note: use the standard (writable) function table for UnicodeString.
2320 // The flag settings disable writing, so having the functions in
2321 // the table is harmless.
2322 if (U_SUCCESS(*status
)) {
2323 ut
->pFuncs
= &unistrFuncs
;
2325 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2326 ut
->chunkContents
= s
->getBuffer();
2327 ut
->chunkLength
= s
->length();
2328 ut
->chunkNativeStart
= 0;
2329 ut
->chunkNativeLimit
= ut
->chunkLength
;
2330 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2335 //------------------------------------------------------------------------------
2337 // UText implementation for const UChar * strings
2339 // Use of UText data members:
2340 // context pointer to the const UChar string
2341 // a length. -1 if not yet known.
2343 // TODO: support 64 bit lengths.
2345 //------------------------------------------------------------------------------
// Clone function for const UChar string UTexts.
//   dest    destination UText, handed to shallowTextClone().
//   src     the UText being cloned.
//   deep    if TRUE, the clone also gets its own heap copy of the string.
//   status  ICU error code, in/out; set to U_MEMORY_ALLOCATION_ERROR when the
//           copy cannot be allocated.
2350 static UText
* U_CALLCONV
2351 ucstrTextClone(UText
*dest
, const UText
* src
, UBool deep
, UErrorCode
* status
) {
2352 // First do a generic shallow clone.
2353 dest
= shallowTextClone(dest
, src
, status
);
2355 // For deep clones, make a copy of the string.
2356 // The copied storage is owned by the newly created clone.
2357 // The UTEXT_PROVIDER_OWNS_TEXT flag set below is what marks the copy as owned by the clone.
2360 if (deep
&& U_SUCCESS(*status
)) {
2361 U_ASSERT(utext_nativeLength(dest
) < INT32_MAX
);
2362 int32_t len
= (int32_t)utext_nativeLength(dest
);
2364 // The cloned string IS going to be NUL terminated, whether or not the original was.
// Hence the allocation of len+1 UChars below.
2365 const UChar
*srcStr
= (const UChar
*)src
->context
;
2366 UChar
*copyStr
= (UChar
*)uprv_malloc((len
+1) * sizeof(UChar
));
2367 if (copyStr
== NULL
) {
2368 *status
= U_MEMORY_ALLOCATION_ERROR
;
2371 for (i
=0; i
<len
; i
++) {
2372 copyStr
[i
] = srcStr
[i
];
// The clone now refers to its private copy instead of the source's storage.
2375 dest
->context
= copyStr
;
2376 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
// Close function for const UChar string UTexts.
// Acts only when the UTEXT_PROVIDER_OWNS_TEXT property is set, i.e. when the
// string held in ut->context belongs to this UText (see ucstrTextClone()).
2383 static void U_CALLCONV
2384 ucstrTextClose(UText
*ut
) {
2385 // Most of the work of close is done by the generic UText framework close.
2386 // All that needs to be done here is delete the string if the UText
2387 // owns it. This occurs if the UText was created by cloning.
2388 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2389 UChar
*s
= (UChar
*)ut
->context
;
// Length function for const UChar string UTexts.
// For a NUL-terminated string whose length is not yet known, scans forward from
// chunkNativeLimit to the terminating NUL. It then records the length in ut->a,
// extends the chunk to the whole string and clears the LENGTH_IS_EXPENSIVE property.
2397 static int64_t U_CALLCONV
2398 ucstrTextLength(UText
*ut
) {
2400 // null terminated, we don't yet know the length. Scan for it.
2401 // Access is not convenient for doing this
2402 // because the current iteration position can't be changed.
2403 const UChar
*str
= (const UChar
*)ut
->context
;
2405 if (str
[ut
->chunkNativeLimit
] == 0) {
2408 ut
->chunkNativeLimit
++;
// chunkNativeLimit now indexes the terminating NUL, i.e. it equals the string length.
2410 ut
->a
= ut
->chunkNativeLimit
;
2411 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2412 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2413 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2419 static UBool U_CALLCONV
2420 ucstrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2421 const UChar
*str
= (const UChar
*)ut
->context
;
2423 // pin the requested index to the bounds of the string,
2424 // and set current iteration position.
2427 } else if (index
< ut
->chunkNativeLimit
) {
2428 // The request data is within the chunk as it is known so far.
2429 // Put index on a code point boundary.
2430 U16_SET_CP_START(str
, 0, index
);
2431 } else if (ut
->a
>= 0) {
2432 // We know the length of this string, and the user is requesting something
2433 // at or beyond the length. Pin the requested index to the length.
2436 // Null terminated string, length not yet known, and the requested index
2437 // is beyond where we have scanned so far.
2438 // Scan to 32 UChars beyond the requested index. The strategy here is
2439 // to avoid fully scanning a long string when the caller only wants to
2440 // see a few characters at its beginning.
2441 int32_t scanLimit
= (int32_t)index
+ 32;
2442 if ((index
+ 32)>INT32_MAX
|| (index
+ 32)<0 ) { // note: int64 expression
2443 scanLimit
= INT32_MAX
;
2446 int32_t chunkLimit
= (int32_t)ut
->chunkNativeLimit
;
2447 for (; chunkLimit
<scanLimit
; chunkLimit
++) {
2448 if (str
[chunkLimit
] == 0) {
2449 // We found the end of the string. Remember it, pin the requested index to it,
2450 // and bail out of here.
2452 ut
->chunkLength
= chunkLimit
;
2453 ut
->nativeIndexingLimit
= chunkLimit
;
2454 if (index
>= chunkLimit
) {
2457 U16_SET_CP_START(str
, 0, index
);
2460 ut
->chunkNativeLimit
= chunkLimit
;
2461 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2465 // We scanned through the next batch of UChars without finding the end.
2466 U16_SET_CP_START(str
, 0, index
);
2467 if (chunkLimit
== INT32_MAX
) {
2468 // Scanned to the limit of a 32 bit length.
2469 // Forceably trim the overlength string back so length fits in int32
2470 // TODO: add support for 64 bit strings.
2472 ut
->chunkLength
= chunkLimit
;
2473 ut
->nativeIndexingLimit
= chunkLimit
;
2474 if (index
> chunkLimit
) {
2477 ut
->chunkNativeLimit
= chunkLimit
;
2478 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2480 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2481 // If the current end is on a lead surrogate, back the end up by one.
2482 // It doesn't matter if the end char happens to be an unpaired surrogate,
2483 // and it's simpler not to worry about it.
2484 if (U16_IS_LEAD(str
[chunkLimit
-1])) {
2487 // Null-terminated chunk with end still unknown.
2488 // Update the chunk length to reflect what has been scanned thus far.
2489 // That the full length is still unknown is (still) flagged by
2491 ut
->chunkNativeLimit
= chunkLimit
;
2492 ut
->nativeIndexingLimit
= chunkLimit
;
2493 ut
->chunkLength
= chunkLimit
;
2498 U_ASSERT(index
<=INT32_MAX
);
2499 ut
->chunkOffset
= (int32_t)index
;
2501 // Check whether request is at the start or end
2502 UBool retVal
= (forward
&& index
<ut
->chunkNativeLimit
) || (!forward
&& index
>0);
// Extract function for const UChar string UTexts.
//   ut            the UText.
//   start, limit  native index range to extract; start > limit is an error.
//   dest          output buffer; may be NULL only when destCapacity is 0.
//   destCapacity  capacity of dest in UChars; must not be negative.
//   pErrorCode    ICU error code, in/out; U_ILLEGAL_ARGUMENT_ERROR for bad arguments.
// Side effects: may discover and record the length of a NUL-terminated string
// (ut->a and the chunk limits), and leaves the iteration position just after
// the extracted text. The output is finished with u_terminateUChars().
2508 static int32_t U_CALLCONV
2509 ucstrTextExtract(UText
*ut
,
2510 int64_t start
, int64_t limit
,
2511 UChar
*dest
, int32_t destCapacity
,
2512 UErrorCode
*pErrorCode
)
2514 if(U_FAILURE(*pErrorCode
)) {
2517 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
2518 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2522 //const UChar *s=(const UChar *)ut->context;
2528 // Access the start. Does two things we need:
2529 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2530 // Snaps 'start' to the beginning of a code point.
2531 ucstrTextAccess(ut
, start
, TRUE
);
2532 const UChar
*s
=ut
->chunkContents
;
2533 start32
= ut
->chunkOffset
;
// A negative strLength means the string is NUL-terminated and its end has not been found yet.
2535 int32_t strLength
=(int32_t)ut
->a
;
2536 if (strLength
>= 0) {
2537 limit32
= pinIndex(limit
, strLength
);
2539 limit32
= pinIndex(limit
, INT32_MAX
);
2542 for (si
=start32
; si
<limit32
; si
++) {
2543 if (strLength
<0 && s
[si
]==0) {
2544 // Just hit the end of a null-terminated string.
2545 ut
->a
= si
; // set string length for this UText
2546 ut
->chunkNativeLimit
= si
;
2547 ut
->chunkLength
= si
;
2548 ut
->nativeIndexingLimit
= si
;
2553 U_ASSERT(di
>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
2554 if (di
<destCapacity
) {
2555 // only store if there is space.
2559 // We have filled the destination buffer, and the string length is known.
2560 // Cut the loop short. There is no need to scan string termination.
2561 di
= limit32
- start32
;
2569 // If the limit index points to a lead surrogate of a pair,
2570 // add the corresponding trail surrogate to the destination.
2571 if (si
>0 && U16_IS_LEAD(s
[si
-1]) &&
2572 ((si
<strLength
|| strLength
<0) && U16_IS_TRAIL(s
[si
])))
2574 if (di
<destCapacity
) {
2575 // store only if there is space in the output buffer.
2581 // Put iteration position at the point just following the extracted text
2582 if (si
<= ut
->chunkNativeLimit
) {
2583 ut
->chunkOffset
= si
;
2585 ucstrTextAccess(ut
, si
, TRUE
);
2588 // Add a terminating NUL if space in the buffer permits,
2589 // and set the error status as required.
2590 u_terminateUChars(dest
, destCapacity
, di
, pErrorCode
);
// UTextFuncs dispatch table for const UChar string UTexts.
// utext_openUChars() installs it as ut->pFuncs.
// The MapOffsetToNative and MapIndexToUTF16 slots are NULL.
2594 static const struct UTextFuncs ucstrFuncs
=
2597 0, 0, 0, // Reserved alignment padding
2604 NULL
, // MapOffsetToNative,
2605 NULL
, // MapIndexToUTF16,
// An empty, NUL-terminated UChar string (a single 0 code unit).
// NOTE(review): presumably what utext_openUChars() substitutes for a NULL string
// of length 0 -- confirm against that function.
2614 static const UChar gEmptyUString
[] = {0};
// Open a read-only UText over a const UChar string.
//   ut      UText to (re)use; passed through utext_setup().
//   s       the string. s==NULL together with length==0 is special-cased first;
//           any other NULL s is rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   length  number of UChars; values below -1 or above INT32_MAX are rejected
//           with U_ILLEGAL_ARGUMENT_ERROR.
//   status  ICU error code, in/out.
// The initial chunk starts at native index 0 with the iteration position at 0.
// For a negative length the chunk starts out empty (limit 0). ucstrTextLength()
// and ucstrTextAccess() grow it and clear LENGTH_IS_EXPENSIVE once the end of
// the string has been found.
2616 U_CAPI UText
* U_EXPORT2
2617 utext_openUChars(UText
*ut
, const UChar
*s
, int64_t length
, UErrorCode
*status
) {
2618 if (U_FAILURE(*status
)) {
2621 if(s
==NULL
&& length
==0) {
2624 if (s
==NULL
|| length
< -1 || length
>INT32_MAX
) {
2625 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2628 ut
= utext_setup(ut
, 0, status
);
2629 if (U_SUCCESS(*status
)) {
2630 ut
->pFuncs
= &ucstrFuncs
;
2632 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2634 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2637 ut
->chunkContents
= s
;
2638 ut
->chunkNativeStart
= 0;
2639 ut
->chunkNativeLimit
= length
>=0? length
: 0;
2640 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2641 ut
->chunkOffset
= 0;
2642 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2648 //------------------------------------------------------------------------------
2650 // UText implementation for text from ICU CharacterIterators
2652 // Use of UText data members:
2653 // context pointer to the CharacterIterator
2654 // a length of the full text.
2655 // p pointer to buffer 1
2656 // b start index of local buffer 1 contents
2657 // q pointer to buffer 2
2658 // c start index of local buffer 2 contents
2659 // r pointer to the character iterator if the UText owns it.
2662 //------------------------------------------------------------------------------
2663 #define CIBufSize 16
// Close function for CharacterIterator UTexts.
// Works on the iterator pointer kept in ut->r, which charIterTextClone() sets
// to flag that the UText owns its CharacterIterator.
2666 static void U_CALLCONV
2667 charIterTextClose(UText
*ut
) {
2668 // Most of the work of close is done by the generic UText framework close.
2669 // All that needs to be done here is delete the CharacterIterator if the UText
2670 // owns it. This occurs if the UText was created by cloning.
2671 CharacterIterator
*ci
= (CharacterIterator
*)ut
->r
;
2676 static int64_t U_CALLCONV
2677 charIterTextLength(UText
*ut
) {
2678 return (int32_t)ut
->a
;
2681 static UBool U_CALLCONV
2682 charIterTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2683 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
2685 int32_t clippedIndex
= (int32_t)index
;
2686 if (clippedIndex
<0) {
2688 } else if (clippedIndex
>=ut
->a
) {
2689 clippedIndex
=(int32_t)ut
->a
;
2691 int32_t neededIndex
= clippedIndex
;
2692 if (!forward
&& neededIndex
>0) {
2693 // reverse iteration, want the position just before what was asked for.
2695 } else if (forward
&& neededIndex
==ut
->a
&& neededIndex
>0) {
2696 // Forward iteration, don't ask for something past the end of the text.
2700 // Find the native index of the start of the buffer containing what we want.
2701 neededIndex
-= neededIndex
% CIBufSize
;
2704 UBool needChunkSetup
= TRUE
;
2706 if (ut
->chunkNativeStart
== neededIndex
) {
2707 // The buffer we want is already the current chunk.
2708 needChunkSetup
= FALSE
;
2709 } else if (ut
->b
== neededIndex
) {
2710 // The first buffer (buffer p) has what we need.
2711 buf
= (UChar
*)ut
->p
;
2712 } else if (ut
->c
== neededIndex
) {
2713 // The second buffer (buffer q) has what we need.
2714 buf
= (UChar
*)ut
->q
;
2716 // Neither buffer already has what we need.
2717 // Load new data from the character iterator.
2718 // Use the buf that is not the current buffer.
2719 buf
= (UChar
*)ut
->p
;
2720 if (ut
->p
== ut
->chunkContents
) {
2721 buf
= (UChar
*)ut
->q
;
2723 ci
->setIndex(neededIndex
);
2724 for (i
=0; i
<CIBufSize
; i
++) {
2725 buf
[i
] = ci
->nextPostInc();
2726 if (i
+neededIndex
> ut
->a
) {
2732 // We have a buffer with the data we need.
2733 // Set it up as the current chunk, if it wasn't already.
2734 if (needChunkSetup
) {
2735 ut
->chunkContents
= buf
;
2736 ut
->chunkLength
= CIBufSize
;
2737 ut
->chunkNativeStart
= neededIndex
;
2738 ut
->chunkNativeLimit
= neededIndex
+ CIBufSize
;
2739 if (ut
->chunkNativeLimit
> ut
->a
) {
2740 ut
->chunkNativeLimit
= ut
->a
;
2741 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
)-(int32_t)(ut
->chunkNativeStart
);
2743 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2744 U_ASSERT(ut
->chunkOffset
>=0 && ut
->chunkOffset
<=CIBufSize
);
2746 ut
->chunkOffset
= clippedIndex
- (int32_t)ut
->chunkNativeStart
;
2747 UBool success
= (forward
? ut
->chunkOffset
<ut
->chunkLength
: ut
->chunkOffset
>0);
2751 static UText
* U_CALLCONV
2752 charIterTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
* status
) {
2753 if (U_FAILURE(*status
)) {
2758 // There is no CharacterIterator API for cloning the underlying text storage.
2759 *status
= U_UNSUPPORTED_ERROR
;
2762 CharacterIterator
*srcCI
=(CharacterIterator
*)src
->context
;
2763 srcCI
= srcCI
->clone();
2764 dest
= utext_openCharacterIterator(dest
, srcCI
, status
);
2765 if (U_FAILURE(*status
)) {
2768 // cast off const on getNativeIndex.
2769 // For CharacterIterator based UTexts, this is safe, the operation is const.
2770 int64_t ix
= utext_getNativeIndex((UText
*)src
);
2771 utext_setNativeIndex(dest
, ix
);
2772 dest
->r
= srcCI
; // flags that this UText owns the CharacterIterator
// Extract function for CharacterIterator UTexts.
//   ut            the UText; ut->context is the CharacterIterator, ut->a the text length.
//   start, limit  native index range to extract, pinned to the text length;
//                 start > limit is an error.
//   dest          output buffer; may be NULL only when destCapacity is 0.
//   destCapacity  capacity of dest in UChars; must not be negative.
//   status        ICU error code, in/out (used below; its declaration line is not
//                 part of this chunk).
// Copies whole code points. Afterwards charIterTextAccess() is called for
// copyLimit, the index just past the last code point stored (see the copy
// loop), and the output is finished with u_terminateUChars().
2777 static int32_t U_CALLCONV
2778 charIterTextExtract(UText
*ut
,
2779 int64_t start
, int64_t limit
,
2780 UChar
*dest
, int32_t destCapacity
,
2783 if(U_FAILURE(*status
)) {
2786 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
2787 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2790 int32_t length
= (int32_t)ut
->a
;
2791 int32_t start32
= pinIndex(start
, length
);
2792 int32_t limit32
= pinIndex(limit
, length
);
2797 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
2798 ci
->setIndex32(start32
); // Moves ix to lead of surrogate pair, if needed.
2799 srci
= ci
->getIndex();
2801 while (srci
<limit32
) {
2802 UChar32 c
= ci
->next32PostInc();
2803 int32_t len
= U16_LENGTH(c
);
2804 U_ASSERT(desti
+len
>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
// Store the code point only if all of its code units fit in the output buffer.
2805 if (desti
+len
<= destCapacity
) {
2806 U16_APPEND_UNSAFE(dest
, desti
, c
);
2807 copyLimit
= srci
+len
;
2810 *status
= U_BUFFER_OVERFLOW_ERROR
;
2815 charIterTextAccess(ut
, copyLimit
, TRUE
);
2817 u_terminateUChars(dest
, destCapacity
, desti
, status
);
// UTextFuncs dispatch table for CharacterIterator UTexts.
// utext_openCharacterIterator() installs it as ut->pFuncs.
// The MapOffsetToNative and MapIndexToUTF16 slots are NULL.
2821 static const struct UTextFuncs charIterFuncs
=
2824 0, 0, 0, // Reserved alignment padding
2828 charIterTextExtract
,
2831 NULL
, // MapOffsetToNative,
2832 NULL
, // MapIndexToUTF16,
2841 U_CAPI UText
* U_EXPORT2
2842 utext_openCharacterIterator(UText
*ut
, CharacterIterator
*ci
, UErrorCode
*status
) {
2843 if (U_FAILURE(*status
)) {
2847 if (ci
->startIndex() > 0) {
2848 // No support for CharacterIterators that do not start indexing from zero.
2849 *status
= U_UNSUPPORTED_ERROR
;
2853 // Extra space in UText for 2 buffers of CIBufSize UChars each.
2854 int32_t extraSpace
= 2 * CIBufSize
* sizeof(UChar
);
2855 ut
= utext_setup(ut
, extraSpace
, status
);
2856 if (U_SUCCESS(*status
)) {
2857 ut
->pFuncs
= &charIterFuncs
;
2859 ut
->providerProperties
= 0;
2860 ut
->a
= ci
->endIndex(); // Length of text
2861 ut
->p
= ut
->pExtra
; // First buffer
2862 ut
->b
= -1; // Native index of first buffer contents
2863 ut
->q
= (UChar
*)ut
->pExtra
+CIBufSize
; // Second buffer
2864 ut
->c
= -1; // Native index of second buffer contents
2866 // Initialize current chunk contents to be empty.
2867 // First access will fault something in.
2868 // Note: The initial nativeStart and chunkOffset must sum to zero
2869 // so that getNativeIndex() will correctly compute to zero
2870 // if no call to Access() has ever been made. They can't be both
2871 // zero without Access() thinking that the chunk is valid.
2872 ut
->chunkContents
= (UChar
*)ut
->p
;
2873 ut
->chunkNativeStart
= -1;
2874 ut
->chunkOffset
= 1;
2875 ut
->chunkNativeLimit
= 0;
2876 ut
->chunkLength
= 0;
2877 ut
->nativeIndexingLimit
= ut
->chunkOffset
; // enables native indexing