2 *******************************************************************************
4 * Copyright (C) 2005-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2005apr12
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/chariter.h"
21 #include "unicode/utext.h"
28 #define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
// utext_access: thin dispatch wrapper around the text provider's access function.
//   ut       the UText whose provider function table (ut->pFuncs) is used.
//   index    native index, handed to the provider unchanged.
//   forward  direction flag, handed to the provider unchanged. Other callers in this
//            file pass TRUE when continuing forwards and FALSE to load the preceding chunk.
//   Returns whatever the provider's access function returns; no checks are done here.
32 utext_access(UText
*ut
, int64_t index
, UBool forward
) {
33 return ut
->pFuncs
->access(ut
, index
, forward
);
38 U_DRAFT UBool U_EXPORT2
39 utext_moveIndex32(UText
*ut
, int32_t delta
) {
43 if(ut
->chunkOffset
>=ut
->chunkLength
&& !utext_access(ut
, ut
->chunkNativeLimit
, TRUE
)) {
46 c
= ut
->chunkContents
[ut
->chunkOffset
];
47 if (U16_IS_SURROGATE(c
)) {
49 if (c
== U_SENTINEL
) {
59 if(ut
->chunkOffset
<=0 && !utext_access(ut
, ut
->chunkNativeStart
, FALSE
)) {
62 c
= ut
->chunkContents
[ut
->chunkOffset
-1];
63 if (U16_IS_SURROGATE(c
)) {
64 c
= utext_previous32(ut
);
65 if (c
== U_SENTINEL
) {
// utext_nativeLength: return the length reported by the text provider.
//   ut   the UText whose provider function table (ut->pFuncs) is used.
//   The result is exactly the value returned by the provider's nativeLength function;
//   nothing is cached or adjusted in this wrapper.
78 U_DRAFT
int64_t U_EXPORT2
79 utext_nativeLength(UText
*ut
) {
80 return ut
->pFuncs
->nativeLength(ut
);
// utext_isLengthExpensive: test the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE property bit.
//   ut   the UText to query; only ut->providerProperties is read.
//   r is nonzero exactly when bit number UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
//   (turned into a mask by I32_FLAG) is set in providerProperties.
84 U_DRAFT UBool U_EXPORT2
85 utext_isLengthExpensive(const UText
*ut
) {
86 UBool r
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
)) != 0;
// utext_getNativeIndex: compute the native index of the current iteration position.
//   ut   the UText to query; it is not modified here.
//   Two ways of producing the result are visible:
//     - chunkOffset <= nativeIndexingLimit: the native index is simply
//       chunkNativeStart + chunkOffset (no provider call);
//     - otherwise the provider's mapOffsetToNative function supplies the value.
91 U_DRAFT
int64_t U_EXPORT2
92 utext_getNativeIndex(const UText
*ut
) {
// Fast path: the offset lies in the part of the chunk that can be indexed directly.
93 if(ut
->chunkOffset
<= ut
->nativeIndexingLimit
) {
94 return ut
->chunkNativeStart
+ut
->chunkOffset
;
// Slow path: let the provider translate the chunk offset.
96 return ut
->pFuncs
->mapOffsetToNative(ut
);
101 U_DRAFT
void U_EXPORT2
102 utext_setNativeIndex(UText
*ut
, int64_t index
) {
103 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
104 // The desired position is outside of the current chunk.
105 // Access the new position. Assume a forward iteration from here,
106 // which will also be optimum for a single random access.
107 // Reverse iterations may suffer slightly.
108 ut
->pFuncs
->access(ut
, index
, TRUE
);
109 } else if((int32_t)(index
- ut
->chunkNativeStart
) <= ut
->nativeIndexingLimit
) {
111 ut
->chunkOffset
=(int32_t)(index
-ut
->chunkNativeStart
);
113 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
115 // The convention is that the index must always be on a code point boundary.
116 // Adjust the index position if it is in the middle of a surrogate pair.
117 if (ut
->chunkOffset
<ut
->chunkLength
) {
118 UChar c
= ut
->chunkContents
[ut
->chunkOffset
];
119 if (UTF16_IS_TRAIL(c
)) {
120 if (ut
->chunkOffset
==0) {
121 ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
);
123 if (ut
->chunkOffset
>0) {
124 UChar lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
125 if (UTF16_IS_LEAD(lead
)) {
135 U_DRAFT
int64_t U_EXPORT2
136 utext_getPreviousNativeIndex(UText
*ut
) {
138 // Fast-path the common case.
139 // Common means current position is not at the beginning of a chunk
140 // and the preceding character is not supplementary.
142 int32_t i
= ut
->chunkOffset
- 1;
145 UChar c
= ut
->chunkContents
[i
];
146 if (U16_IS_TRAIL(c
) == FALSE
) {
147 if (i
<= ut
->nativeIndexingLimit
) {
148 result
= ut
->chunkNativeStart
+ i
;
151 result
= ut
->pFuncs
->mapOffsetToNative(ut
);
158 // If at the start of text, simply return 0.
159 if (ut
->chunkOffset
==0 && ut
->chunkNativeStart
==0) {
163 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
164 // Keep it simple, use other functions to handle the edges.
166 utext_previous32(ut
);
167 result
= UTEXT_GETNATIVEINDEX(ut
);
174 // utext_current32. Get the UChar32 at the current position.
175 // UText iteration position is always on a code point boundary,
176 // never on the trail half of a surrogate pair.
178 U_DRAFT UChar32 U_EXPORT2
179 utext_current32(UText
*ut
) {
181 if (ut
->chunkOffset
==ut
->chunkLength
) {
182 // Current position is just off the end of the chunk.
183 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
184 // Off the end of the text.
189 c
= ut
->chunkContents
[ut
->chunkOffset
];
190 if (U16_IS_LEAD(c
) == FALSE
) {
191 // Normal, non-supplementary case.
196 // Possible supplementary char.
199 UChar32 supplementaryC
= c
;
200 if ((ut
->chunkOffset
+1) < ut
->chunkLength
) {
201 // The trail surrogate is in the same chunk.
202 trail
= ut
->chunkContents
[ut
->chunkOffset
+1];
204 // The trail surrogate is in a different chunk.
205 // Because we must maintain the iteration position, we need to switch forward
206 // into the new chunk, get the trail surrogate, then revert the chunk back to the
208 // An edge case to be careful of: the entire text may end with an unpaired
209 // leading surrogate. The attempt to access the trail will fail, but
210 // the original position before the unpaired lead still needs to be restored.
211 int64_t nativePosition
= ut
->chunkNativeLimit
;
212 int32_t originalOffset
= ut
->chunkOffset
;
213 if (ut
->pFuncs
->access(ut
, nativePosition
, TRUE
)) {
214 trail
= ut
->chunkContents
[ut
->chunkOffset
];
216 UBool r
= ut
->pFuncs
->access(ut
, nativePosition
, FALSE
); // reverse iteration flag loads preceding chunk
218 ut
->chunkOffset
= originalOffset
;
224 if (U16_IS_TRAIL(trail
)) {
225 supplementaryC
= U16_GET_SUPPLEMENTARY(c
, trail
);
227 return supplementaryC
;
232 U_DRAFT UChar32 U_EXPORT2
233 utext_char32At(UText
*ut
, int64_t nativeIndex
) {
234 UChar32 c
= U_SENTINEL
;
236 // Fast path the common case.
237 if (nativeIndex
>=ut
->chunkNativeStart
&& nativeIndex
< ut
->chunkNativeStart
+ ut
->nativeIndexingLimit
) {
238 ut
->chunkOffset
= (int32_t)(nativeIndex
- ut
->chunkNativeStart
);
239 c
= ut
->chunkContents
[ut
->chunkOffset
];
240 if (U16_IS_SURROGATE(c
) == FALSE
) {
246 utext_setNativeIndex(ut
, nativeIndex
);
247 if (nativeIndex
>=ut
->chunkNativeStart
&& ut
->chunkOffset
<ut
->chunkLength
) {
248 c
= ut
->chunkContents
[ut
->chunkOffset
];
249 if (U16_IS_SURROGATE(c
)) {
250 // For surrogates, let current32() deal with the complications
251 // of supplementaries that may span chunk boundaries.
252 c
= utext_current32(ut
);
259 U_DRAFT UChar32 U_EXPORT2
260 utext_next32(UText
*ut
) {
263 if (ut
->chunkOffset
>= ut
->chunkLength
) {
264 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
269 c
= ut
->chunkContents
[ut
->chunkOffset
++];
270 if (U16_IS_LEAD(c
) == FALSE
) {
271 // Normal case, not supplementary.
272 // (A trail surrogate seen here is just returned as is, as a surrogate value.
273 // It cannot be part of a pair.)
277 if (ut
->chunkOffset
>= ut
->chunkLength
) {
278 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
279 // c is an unpaired lead surrogate at the end of the text.
280 // return it as it is.
284 UChar32 trail
= ut
->chunkContents
[ut
->chunkOffset
];
285 if (U16_IS_TRAIL(trail
) == FALSE
) {
286 // c was an unpaired lead surrogate, not at the end of the text.
287 // return it as it is (unpaired). Iteration position is on the
288 // following character, possibly in the next chunk, where the
289 // trail surrogate would have been if it had existed.
293 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(c
, trail
);
294 ut
->chunkOffset
++; // move iteration position over the trail surrogate.
295 return supplementary
;
299 U_DRAFT UChar32 U_EXPORT2
300 utext_previous32(UText
*ut
) {
303 if (ut
->chunkOffset
<= 0) {
304 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
309 c
= ut
->chunkContents
[ut
->chunkOffset
];
310 if (U16_IS_TRAIL(c
) == FALSE
) {
311 // Normal case, not supplementary.
312 // (A lead surrogate seen here is just returned as is, as a surrogate value.
313 // It cannot be part of a pair.)
317 if (ut
->chunkOffset
<= 0) {
318 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
319 // c is an unpaired trail surrogate at the start of the text.
320 // return it as it is.
325 UChar32 lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
326 if (U16_IS_LEAD(lead
) == FALSE
) {
327 // c was an unpaired trail surrogate, not at the start of the text.
328 // return it as it is (unpaired). Iteration position is at c
332 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(lead
, c
);
333 ut
->chunkOffset
--; // move iteration position over the lead surrogate.
334 return supplementary
;
339 U_DRAFT UChar32 U_EXPORT2
340 utext_next32From(UText
*ut
, int64_t index
) {
341 UChar32 c
= U_SENTINEL
;
343 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
344 // Desired position is outside of the current chunk.
345 if(!ut
->pFuncs
->access(ut
, index
, TRUE
)) {
346 // no chunk available here
349 } else if (index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
350 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
351 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
353 // Desired position is in chunk, with non-UTF16 indexing.
354 ut
->chunkOffset
= ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
357 c
= ut
->chunkContents
[ut
->chunkOffset
++];
358 if (U16_IS_SURROGATE(c
)) {
359 // Surrogates. Many edge cases. Use other functions that already
360 // deal with the problems.
361 utext_setNativeIndex(ut
, index
);
362 c
= utext_next32(ut
);
368 U_DRAFT UChar32 U_EXPORT2
369 utext_previous32From(UText
*ut
, int64_t index
) {
371 // Return the character preceding the specified index.
372 // Leave the iteration position at the start of the character that was returned.
374 UChar32 cPrev
; // The character preceding cCurr, which is what we will return.
376 // Address the chunk containing the position preceding the incoming index
377 // A tricky edge case:
378 // We try to test the requested native index against the chunkNativeStart to determine
379 // whether the character preceding the one at the index is in the current chunk.
380 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
381 // requested index is on something other than the first position of the first char.
383 if(index
<=ut
->chunkNativeStart
|| index
>ut
->chunkNativeLimit
) {
384 // Requested native index is outside of the current chunk.
385 if(!ut
->pFuncs
->access(ut
, index
, FALSE
)) {
386 // no chunk available here
389 } else if(index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
390 // Direct UTF-16 indexing.
391 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
393 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
394 if (ut
->chunkOffset
==0 && !ut
->pFuncs
->access(ut
, index
, FALSE
)) {
395 // no chunk available here
401 // Simple case with no surrogates.
404 cPrev
= ut
->chunkContents
[ut
->chunkOffset
];
406 if (U16_IS_SURROGATE(cPrev
)) {
407 // Possible supplementary. Many edge cases.
408 // Let other functions do the heavy lifting.
409 utext_setNativeIndex(ut
, index
);
410 cPrev
= utext_previous32(ut
);
// utext_extract: thin dispatch wrapper around the text provider's extract function.
//   ut            the UText whose provider function table (ut->pFuncs) is used.
//   start, limit  native index range, handed to the provider unchanged.
//   dest          destination UChar buffer, handed to the provider unchanged.
//   destCapacity  capacity of dest, handed to the provider unchanged.
//   status        ICU error code pointer, handed to the provider unchanged;
//                 it is not inspected in this wrapper.
//   Returns whatever the provider's extract function returns.
416 U_DRAFT
int32_t U_EXPORT2
417 utext_extract(UText
*ut
,
418 int64_t start
, int64_t limit
,
419 UChar
*dest
, int32_t destCapacity
,
420 UErrorCode
*status
) {
421 return ut
->pFuncs
->extract(ut
, start
, limit
, dest
, destCapacity
, status
);
426 U_DRAFT UBool U_EXPORT2
427 utext_equals(const UText
*a
, const UText
*b
) {
428 if (a
==NULL
|| b
==NULL
||
429 a
->magic
!= UTEXT_MAGIC
||
430 b
->magic
!= UTEXT_MAGIC
) {
431 // Null or invalid arguments don't compare equal to anything.
435 if (a
->pFuncs
!= b
->pFuncs
) {
436 // Different types of text providers.
440 if (a
->context
!= b
->context
) {
441 // Different sources (different strings)
444 if (utext_getNativeIndex(a
) != utext_getNativeIndex(b
)) {
445 // Different current position in the string.
// utext_isWritable: test the UTEXT_PROVIDER_WRITABLE property bit.
//   ut   the UText to query; only ut->providerProperties is read.
//   b is nonzero exactly when bit number UTEXT_PROVIDER_WRITABLE
//   (turned into a mask by I32_FLAG) is set in providerProperties.
452 U_DRAFT UBool U_EXPORT2
453 utext_isWritable(const UText
*ut
)
455 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) != 0;
// utext_freeze: mark a UText as not writable.
//   ut   the UText to modify; only ut->providerProperties is changed.
//   Clears the UTEXT_PROVIDER_WRITABLE bit and leaves every other property bit alone.
//   utext_replace and utext_copy in this file test that bit and set
//   U_NO_WRITE_PERMISSION when it is clear.
460 U_DRAFT
void U_EXPORT2
461 utext_freeze(UText
*ut
) {
462 // Zero out the WRITABLE flag.
463 ut
->providerProperties
&= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE
));
// utext_hasMetaData: test the UTEXT_PROVIDER_HAS_META_DATA property bit.
//   ut   the UText to query; only ut->providerProperties is read.
//   b is nonzero exactly when bit number UTEXT_PROVIDER_HAS_META_DATA
//   (turned into a mask by I32_FLAG) is set in providerProperties.
467 U_DRAFT UBool U_EXPORT2
468 utext_hasMetaData(const UText
*ut
)
470 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
)) != 0;
476 U_DRAFT
int32_t U_EXPORT2
477 utext_replace(UText
*ut
,
478 int64_t nativeStart
, int64_t nativeLimit
,
479 const UChar
*replacementText
, int32_t replacementLength
,
482 if (U_FAILURE(*status
)) {
485 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
486 *status
= U_NO_WRITE_PERMISSION
;
489 int32_t i
= ut
->pFuncs
->replace(ut
, nativeStart
, nativeLimit
, replacementText
, replacementLength
, status
);
493 U_DRAFT
void U_EXPORT2
494 utext_copy(UText
*ut
,
495 int64_t nativeStart
, int64_t nativeLimit
,
500 if (U_FAILURE(*status
)) {
503 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
504 *status
= U_NO_WRITE_PERMISSION
;
507 ut
->pFuncs
->copy(ut
, nativeStart
, nativeLimit
, destIndex
, move
, status
);
512 U_DRAFT UText
* U_EXPORT2
513 utext_clone(UText
*dest
, const UText
*src
, UBool deep
, UBool readOnly
, UErrorCode
*status
) {
515 result
= src
->pFuncs
->clone(dest
, src
, deep
, status
);
517 utext_freeze(result
);
524 //------------------------------------------------------------------------------
526 // UText common functions implementation
528 //------------------------------------------------------------------------------
531 // UText.flags bit definitions
534 UTEXT_HEAP_ALLOCATED
= 1, // 1 if ICU has allocated this UText struct on the heap.
535 // 0 if caller provided storage for the UText.
537 UTEXT_EXTRA_HEAP_ALLOCATED
= 2, // 1 if ICU has allocated extra storage as a separate
539 // 0 if there is no separate allocation. Either no extra
540 // storage was requested, or it is appended to the end
541 // of the main UText storage.
543 UTEXT_OPEN
= 4 // 1 if this UText is currently open
544 // 0 if this UText is not open.
549 // Extended form of a UText. The purpose is to aid in computing the total size required
550 // when a provider asks for a UText to be allocated with extra storage.
552 struct ExtendedUText
{
554 UAlignedMemory extension
;
557 static const UText emptyText
= UTEXT_INITIALIZER
;
559 U_DRAFT UText
* U_EXPORT2
560 utext_setup(UText
*ut
, int32_t extraSpace
, UErrorCode
*status
) {
561 if (U_FAILURE(*status
)) {
566 // We need to heap-allocate storage for the new UText
567 int32_t spaceRequired
= sizeof(UText
);
568 if (extraSpace
> 0) {
569 spaceRequired
= sizeof(ExtendedUText
) + extraSpace
- sizeof(UAlignedMemory
);
571 ut
= (UText
*)uprv_malloc(spaceRequired
);
573 *status
= U_MEMORY_ALLOCATION_ERROR
;
576 ut
->flags
|= UTEXT_HEAP_ALLOCATED
;
577 if (spaceRequired
>0) {
578 ut
->extraSize
= extraSpace
;
579 ut
->pExtra
= &((ExtendedUText
*)ut
)->extension
;
580 uprv_memset(ut
->pExtra
, 0, extraSpace
); // Purify whines about copying untouched extra [buffer]
581 // space when cloning, so init it now.
585 // We have been supplied with an already existing UText.
586 // Verify that it really appears to be a UText.
587 if (ut
->magic
!= UTEXT_MAGIC
) {
588 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
591 // If the ut is already open and there's a provider supplied close
592 // function, call it.
593 if ((ut
->flags
& UTEXT_OPEN
) && ut
->pFuncs
->close
!= NULL
) {
594 ut
->pFuncs
->close(ut
);
596 ut
->flags
&= ~UTEXT_OPEN
;
598 // If extra space was requested by our caller, check whether
599 // sufficient already exists, and allocate new if needed.
600 if (extraSpace
> ut
->extraSize
) {
601 // Need more space. If there is existing separately allocated space,
602 // delete it first, then allocate new space.
603 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
604 uprv_free(ut
->pExtra
);
607 ut
->pExtra
= uprv_malloc(extraSpace
);
608 if (ut
->pExtra
== NULL
) {
609 *status
= U_MEMORY_ALLOCATION_ERROR
;
611 ut
->extraSize
= extraSpace
;
612 ut
->flags
|= UTEXT_EXTRA_HEAP_ALLOCATED
;
613 uprv_memset(ut
->pExtra
, 0, extraSpace
);
617 if (U_SUCCESS(*status
)) {
618 ut
->flags
|= UTEXT_OPEN
;
620 // Initialize all remaining fields of the UText.
623 ut
->chunkContents
= NULL
;
632 ut
->chunkNativeStart
= 0;
633 ut
->chunkNativeLimit
= 0;
634 ut
->nativeIndexingLimit
= 0;
635 ut
->providerProperties
= 0;
645 U_DRAFT UText
* U_EXPORT2
646 utext_close(UText
*ut
) {
648 ut
->magic
!= UTEXT_MAGIC
||
649 (ut
->flags
& UTEXT_OPEN
) == 0)
651 // The supplied ut is not an open UText.
656 // If the provider gave us a close function, call it now.
657 // This will clean up anything allocated specifically by the provider.
658 if (ut
->pFuncs
->close
!= NULL
) {
659 ut
->pFuncs
->close(ut
);
661 ut
->flags
&= ~UTEXT_OPEN
;
663 // If we (the framework) allocated the UText or subsidiary storage,
665 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
666 uprv_free(ut
->pExtra
);
668 ut
->flags
&= ~UTEXT_EXTRA_HEAP_ALLOCATED
;
672 // Zero out function table of the closed UText. This is a defensive move,
673 // intended to cause applications that inadvertently use a closed
674 // utext to crash with null pointer errors.
677 if (ut
->flags
& UTEXT_HEAP_ALLOCATED
) {
678 // This UText was allocated by UText setup. We need to free it.
679 // Clear magic, so we can detect if the user messes up and immediately
680 // tries to reopen another UText using the deleted storage.
692 // invalidateChunk Reset a chunk to have no contents, so that the next call
693 // to access will cause new data to load.
694 // This is needed when copy/move/replace operate directly on the
695 // backing text, potentially putting it out of sync with the
696 // contents in the chunk.
699 invalidateChunk(UText
*ut
) {
701 ut
->chunkNativeLimit
= 0;
702 ut
->chunkNativeStart
= 0;
704 ut
->nativeIndexingLimit
= 0;
708 // pinIndex Do range pinning on a native index parameter.
709 // 64 bit pinning is done in place.
710 // 32 bit truncated result is returned as a convenience for
711 // use in providers that don't need 64 bits.
713 pinIndex(int64_t &index
, int64_t limit
) {
716 } else if (index
> limit
) {
719 return (int32_t)index
;
726 // Pointer relocation function,
727 // a utility used by shallow clone.
728 // Adjust a pointer that refers to something within one UText (the source)
729 // to refer to the same relative offset within another UText (the target)
// adjustPointer: relocate one pointer that was copied from src into dest.
//   dest     the UText that now holds the copied pointer.
//   destPtr  address of the pointer to fix up; it is read, and overwritten when
//            one of the two range tests below matches.
//   src      the UText the pointer value originally came from.
//   The pointer is compared against two address ranges belonging to src:
//     1. [src->pExtra, src->pExtra + src->extraSize)  -> rebased onto dest->pExtra
//     2. [src, src + src->sizeOfStruct)               -> rebased onto dest
//   The byte offset from the start of the matching range is preserved.
//   The extra-storage range is tested first.
731 static void adjustPointer(UText
*dest
, const void **destPtr
, const UText
*src
) {
732 // convert all pointers to (char *) so that byte address arithmetic will work.
733 char *dptr
= (char *)*destPtr
;
734 char *dUText
= (char *)dest
;
735 char *sUText
= (char *)src
;
// Range 1: the pointer refers into the source's extra storage.
737 if (dptr
>= (char *)src
->pExtra
&& dptr
< ((char*)src
->pExtra
)+src
->extraSize
) {
738 // target ptr was to something within the src UText's pExtra storage.
739 // relocate it into the target UText's pExtra region.
740 *destPtr
= ((char *)dest
->pExtra
) + (dptr
- (char *)src
->pExtra
);
// Range 2: the pointer refers into the source UText struct itself.
741 } else if (dptr
>=sUText
&& dptr
< sUText
+src
->sizeOfStruct
) {
742 // target ptr was pointing to somewhere within the source UText itself.
743 // Move it to the same offset within the target UText.
744 *destPtr
= dUText
+ (dptr
-sUText
);
750 // Clone. This is a generic copy-the-utext-by-value clone function that can be
751 // used as-is with some utext types, and as a helper by other clones.
753 static UText
* U_CALLCONV
754 shallowTextClone(UText
* dest
, const UText
* src
, UErrorCode
* status
) {
755 if (U_FAILURE(*status
)) {
758 int32_t srcExtraSize
= src
->extraSize
;
761 // Use the generic text_setup to allocate storage if required.
763 dest
= utext_setup(dest
, srcExtraSize
, status
);
764 if (U_FAILURE(*status
)) {
769 // flags (how the UText was allocated) and the pointer to the
770 // extra storage must retain the values in the cloned utext that
771 // were set up by utext_setup. Save them separately before
772 // copying the whole struct.
774 void *destExtra
= dest
->pExtra
;
775 int32_t flags
= dest
->flags
;
779 // Copy the whole UText struct by value.
780 // Any "Extra" storage is copied also.
782 int sizeToCopy
= src
->sizeOfStruct
;
783 if (sizeToCopy
> dest
->sizeOfStruct
) {
784 sizeToCopy
= dest
->sizeOfStruct
;
786 uprv_memcpy(dest
, src
, sizeToCopy
);
787 dest
->pExtra
= destExtra
;
789 if (srcExtraSize
> 0) {
790 uprv_memcpy(dest
->pExtra
, src
->pExtra
, srcExtraSize
);
794 // Relocate any pointers in the target that refer to the UText itself
795 // to point to the cloned copy rather than the original source.
797 adjustPointer(dest
, &dest
->context
, src
);
798 adjustPointer(dest
, &dest
->p
, src
);
799 adjustPointer(dest
, &dest
->q
, src
);
800 adjustPointer(dest
, &dest
->r
, src
);
810 //------------------------------------------------------------------------------
812 // UText implementation for UTF-8 char * strings (read-only)
813 // Limitation: string length must be <= 0x7fffffff in length.
814 // (length must fit in an int32_t variable)
816 // Use of UText data members:
817 // context pointer to UTF-8 string
818 // utext.b is the input string length (bytes).
819 // utext.c Length scanned so far in string
820 // (for optimizing finding length of zero terminated strings.)
821 // utext.p pointer to the current buffer
822 // utext.q pointer to the other buffer.
824 //------------------------------------------------------------------------------
827 // Must be less than 85, because of byte mapping from UChar indexes to native indexes.
828 // Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
831 enum { UTF8_TEXT_CHUNK_SIZE
=32 };
834 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
835 // Each contains the UChar chunk buffer, the to and from native maps, and
838 // because backwards iteration fills the buffers starting at the end and
839 // working towards the front, the filled part of the buffers may not begin
840 // at the start of the available storage for the buffers.
842 // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
843 // the last character added being a supplementary, and thus requiring a surrogate
844 // pair. Doing this is simpler than checking for the edge case.
848 int32_t bufNativeStart
; // Native index of first char in UChar buf
849 int32_t bufNativeLimit
; // Native index following last char in buf.
850 int32_t bufStartIdx
; // First filled position in buf.
851 int32_t bufLimitIdx
; // Limit of filled range in buf.
852 int32_t bufNILimit
; // Limit of native indexing part of buf
853 int32_t toUCharsMapStart
; // Native index corresponding to
855 // Set to bufNativeStart when filling forwards.
856 // Set to computed value when filling backwards.
858 UChar buf
[UTF8_TEXT_CHUNK_SIZE
+4]; // The UChar buffer. Requires one extra position beyond the
859 // chunk size, to allow for surrogate at the end.
860 // Length must be identical to mapToNative array, below,
861 // because of the way indexing works when the array is
862 // filled backwards during a reverse iteration. Thus,
863 // the additional extra size.
864 uint8_t mapToNative
[UTF8_TEXT_CHUNK_SIZE
+4]; // map UChar index in buf to
865 // native offset from bufNativeStart.
866 // Requires two extra slots,
867 // one for a supplementary starting in the last normal position,
868 // and one for an entry for the buffer limit position.
869 uint8_t mapToUChars
[UTF8_TEXT_CHUNK_SIZE
*3+6]; // Map native offset from bufNativeStart to
870 // corresponding offset in filled part of buf.
879 // Get the length of the string. If we don't already know it,
880 // we'll need to scan for the trailing nul.
882 static int64_t U_CALLCONV
883 utf8TextLength(UText
*ut
) {
885 // Zero terminated string, and we haven't scanned to the end yet.
887 const char *r
= (const char *)ut
->context
+ ut
->c
;
891 if ((r
- (const char *)ut
->context
) < 0x7fffffff) {
892 ut
->b
= (int32_t)(r
- (const char *)ut
->context
);
894 // Actual string was bigger (more than 2 gig) than we
895 // can handle. Clip it to 2 GB.
898 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
908 static UBool U_CALLCONV
909 utf8TextAccess(UText
*ut
, int64_t index
, UBool forward
) {
911 // Apologies to those who are allergic to goto statements.
912 // Consider each goto to a labelled block to be the equivalent of
913 // calling the named block as if it were a function();
916 const uint8_t *s8
=(const uint8_t *)ut
->context
;
918 int32_t length
= ut
->b
; // Length of original utf-8
919 int32_t ix
= (int32_t)index
; // Requested index, trimmed to 32 bits.
920 int32_t mapIndex
= 0;
923 } else if (index
> 0x7fffffff) {
924 // Strings with 64 bit lengths not supported by this UTF-8 provider.
928 // Pin requested index to the string length.
932 } else if (ix
>ut
->c
) {
933 // Zero terminated string, and requested index is beyond
934 // the region that has already been scanned.
935 // Scan up to either the end of the string or to the
936 // requested position, whichever comes first.
937 while (ut
->c
<ix
&& s8
[ut
->c
]!=0) {
940 // TODO: support for null terminated string length > 32 bits.
941 if (s8
[ut
->c
] == 0) {
942 // We just found the actual length of the string.
943 // Trim the requested index back to that.
947 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
953 // Dispatch to the appropriate action for a forward iteration request.
956 if (ix
==ut
->chunkNativeLimit
) {
957 // Check for normal sequential iteration cases first.
959 // Just reached end of string
960 // Don't swap buffers, but do set the
961 // current buffer position.
962 ut
->chunkOffset
= ut
->chunkLength
;
965 // End of current buffer.
966 // check whether other buffer already has what we need.
967 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
968 if (ix
>=altB
->bufNativeStart
&& ix
<altB
->bufNativeLimit
) {
974 // A random access. Desired index could be in either or neither buf.
975 // For optimizing the order of testing, first check for the index
976 // being in the other buffer. This will be the case for uses that
977 // move back and forth over a fairly limited range
979 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
980 if (ix
>=u8b
->bufNativeStart
&& ix
<u8b
->bufNativeLimit
) {
981 // Requested index is in the other buffer.
985 // Requested index is end-of-string.
986 // (this is the case of randomly seeking to the end.
987 // The case of iterating off the end is handled earlier.)
988 if (ix
== ut
->chunkNativeLimit
) {
989 // Current buffer extends up to the end of the string.
990 // Leave it as the current buffer.
991 ut
->chunkOffset
= ut
->chunkLength
;
994 if (ix
== u8b
->bufNativeLimit
) {
995 // Alternate buffer extends to the end of string.
996 // Swap it in as the current buffer.
997 goto swapBuffersAndFail
;
1000 // Neither existing buffer extends to the end of the string.
1001 goto makeStubBuffer
;
1004 if (ix
<ut
->chunkNativeStart
|| ix
>=ut
->chunkNativeLimit
) {
1005 // Requested index is in neither buffer.
1009 // Requested index is in this buffer.
1010 u8b
= (UTF8Buf
*)ut
->p
; // the current buffer
1011 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1012 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1020 // Dispatch to the appropriate action for a
1021 // Backwards Direction iteration request.
1023 if (ix
==ut
->chunkNativeStart
) {
1024 // Check for normal sequential iteration cases first.
1026 // Just reached the start of string
1027 // Don't swap buffers, but do set the
1028 // current buffer position.
1029 ut
->chunkOffset
= 0;
1032 // Start of current buffer.
1033 // check whether other buffer already has what we need.
1034 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
1035 if (ix
>altB
->bufNativeStart
&& ix
<=altB
->bufNativeLimit
) {
1041 // A random access. Desired index could be in either or neither buf.
1042 // For optimizing the order of testing,
1043 // Most likely case: in the other buffer.
1044 // Second most likely: in neither buffer.
1045 // Unlikely, but must work: in the current buffer.
1046 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
1047 if (ix
>u8b
->bufNativeStart
&& ix
<=u8b
->bufNativeLimit
) {
1048 // Requested index is in the other buffer.
1051 // Requested index is start-of-string.
1052 // (this is the case of randomly seeking to the start.
1053 // The case of iterating off the start is handled earlier.)
1055 if (u8b
->bufNativeStart
==0) {
1056 // Alternate buffer contains the data for the start string.
1057 // Make it be the current buffer.
1058 goto swapBuffersAndFail
;
1060 // Request for data before the start of string,
1061 // neither buffer is usable.
1062 // set up a zero-length buffer.
1063 goto makeStubBuffer
;
1067 if (ix
<=ut
->chunkNativeStart
|| ix
>ut
->chunkNativeLimit
) {
1068 // Requested index is in neither buffer.
1072 // Requested index is in this buffer.
1073 // Set the utf16 buffer index.
1074 u8b
= (UTF8Buf
*)ut
->p
;
1075 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1076 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1077 if (ut
->chunkOffset
==0) {
1078 // This occurs when the first character in the text is
1079 // a multi-byte UTF-8 char, and the requested index is to
1080 // one of the trailing bytes. Because there is no preceding
1081 // character, this access fails. We can't pick up on the
1082 // situation sooner because the requested index is not zero.
1091 // The alternate buffer (ut->q) has the string data that was requested.
1092 // Swap the primary and alternate buffers, and set the
1093 // chunk index into the new primary buffer.
1095 u8b
= (UTF8Buf
*)ut
->q
;
1098 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1099 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1100 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1101 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1102 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1104 // Index into the (now current) chunk
1105 // Use the map to set the chunk index. It's more trouble than it's worth
1106 // to check whether native indexing can be used.
1107 U_ASSERT(ix
>=u8b
->bufNativeStart
);
1108 U_ASSERT(ix
<=u8b
->bufNativeLimit
);
1109 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1110 U_ASSERT(mapIndex
>=0);
1111 U_ASSERT(mapIndex
<(int32_t)sizeof(u8b
->mapToUChars
));
1112 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1119 // We got a request for either the start or end of the string,
1120 // with iteration continuing in the out-of-bounds direction.
1121 // The alternate buffer already contains the data up to the
1123 // Swap the buffers, then return failure, indicating that we couldn't
1124 // make things correct for continuing the iteration in the requested
1125 // direction. The position & buffer are correct should the
1126 // user decide to iterate in the opposite direction.
1127 u8b
= (UTF8Buf
*)ut
->q
;
1130 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1131 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1132 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1133 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1134 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1136 // Index into the (now current) chunk
1137 // For this function (swapBuffersAndFail), the requested index
1138 // will always be at either the start or end of the chunk.
1139 if (ix
==u8b
->bufNativeLimit
) {
1140 ut
->chunkOffset
= ut
->chunkLength
;
1142 ut
->chunkOffset
= 0;
1143 U_ASSERT(ix
== u8b
->bufNativeStart
);
1148 // The user has done a seek/access past the start or end
1149 // of the string. Rather than loading data that is likely
1150 // to never be used, just set up a zero-length buffer at
1152 u8b
= (UTF8Buf
*)ut
->q
;
1153 u8b
->bufNativeStart
= ix
;
1154 u8b
->bufNativeLimit
= ix
;
1155 u8b
->bufStartIdx
= 0;
1156 u8b
->bufLimitIdx
= 0;
1157 u8b
->bufNILimit
= 0;
1158 u8b
->toUCharsMapStart
= ix
;
1159 u8b
->mapToNative
[0] = 0;
1160 u8b
->mapToUChars
[0] = 0;
1161 goto swapBuffersAndFail
;
1167 // Move the incoming index to a code point boundary.
1168 U8_SET_CP_START(s8
, 0, ix
);
1170 // Swap the UText buffers.
1171 // We want to fill what was previously the alternate buffer,
1172 // and make what was the current buffer be the new alternate.
1173 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->q
;
1177 int32_t strLen
= ut
->b
;
1178 UBool nulTerminated
= FALSE
;
1180 strLen
= 0x7fffffff;
1181 nulTerminated
= TRUE
;
1184 UChar
*buf
= u8b
->buf
;
1185 uint8_t *mapToNative
= u8b
->mapToNative
;
1186 uint8_t *mapToUChars
= u8b
->mapToUChars
;
1189 UBool seenNonAscii
= FALSE
;
1192 // Fill the chunk buffer and mapping arrays.
1193 while (destIx
<UTF8_TEXT_CHUNK_SIZE
) {
1195 if (c
>0 && c
<0x80) {
1196 // Special case ASCII range for speed.
1197 // zero is excluded to simplify bounds checking.
1199 mapToNative
[destIx
] = srcIx
- ix
;
1200 mapToUChars
[srcIx
-ix
] = destIx
;
1204 // General case, handle everything.
1205 if (seenNonAscii
== FALSE
) {
1206 seenNonAscii
= TRUE
;
1207 u8b
->bufNILimit
= destIx
;
1210 int32_t cIx
= srcIx
;
1211 int32_t dIx
= destIx
;
1212 int32_t dIxSaved
= destIx
;
1213 U8_NEXT(s8
, srcIx
, strLen
, c
);
1214 if (c
==0 && nulTerminated
) {
1219 // Illegal UTF-8. Replace with sub character.
1223 U16_APPEND_UNSAFE(buf
, destIx
, c
);
1225 mapToNative
[dIx
++] = cIx
- ix
;
1226 } while (dIx
< destIx
);
1229 mapToUChars
[cIx
++ - ix
] = dIxSaved
;
1230 } while (cIx
< srcIx
);
1232 if (srcIx
>=strLen
) {
1238 // store Native <--> Chunk Map entries for the end of the buffer.
1239 // There is no actual character here, but the index position is valid.
1240 mapToNative
[destIx
] = srcIx
- ix
;
1241 mapToUChars
[srcIx
- ix
] = destIx
;
1243 // fill in Buffer descriptor
1244 u8b
->bufNativeStart
= ix
;
1245 u8b
->bufNativeLimit
= srcIx
;
1246 u8b
->bufStartIdx
= 0;
1247 u8b
->bufLimitIdx
= destIx
;
1248 if (seenNonAscii
== FALSE
) {
1249 u8b
->bufNILimit
= destIx
;
1251 u8b
->toUCharsMapStart
= u8b
->bufNativeStart
;
1253 // Set UText chunk to refer to this buffer.
1254 ut
->chunkContents
= buf
;
1255 ut
->chunkOffset
= 0;
1256 ut
->chunkLength
= u8b
->bufLimitIdx
;
1257 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1258 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1259 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1261 // For zero terminated strings, keep track of the maximum point
1263 if (nulTerminated
&& srcIx
>ut
->c
) {
1266 // We scanned to the end.
1267 // Remember the actual length.
1269 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1278 // Move the incoming index to a code point boundary.
1279 // Can only do this if the incoming index is somewhere in the interior of the string.
1280 // If index is at the end, there is no character there to look at.
1282 U8_SET_CP_START(s8
, 0, ix
);
1285 // Swap the UText buffers.
1286 // We want to fill what was previously the alternate buffer,
1287 // and make what was the current buffer be the new alternate.
1288 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->q
;
1292 UChar
*buf
= u8b
->buf
;
1293 uint8_t *mapToNative
= u8b
->mapToNative
;
1294 uint8_t *mapToUChars
= u8b
->mapToUChars
;
1295 int32_t toUCharsMapStart
= ix
- (UTF8_TEXT_CHUNK_SIZE
*3 + 1);
1296 int32_t destIx
= UTF8_TEXT_CHUNK_SIZE
+2; // Start in the overflow region
1297 // at end of buffer to leave room
1298 // for a surrogate pair at the
1301 int32_t bufNILimit
= destIx
;
1304 // Map to/from Native Indexes, fill in for the position at the end of
1307 mapToNative
[destIx
] = srcIx
- toUCharsMapStart
;
1308 mapToUChars
[srcIx
- toUCharsMapStart
] = destIx
;
1310 // Fill the chunk buffer
1311 // Work backwards, filling from the end of the buffer towards the front.
1313 while (destIx
>2 && (srcIx
- toUCharsMapStart
> 5) && (srcIx
> 0)) {
1317 // Get last byte of the UTF-8 character
1320 // Special case ASCII range for speed.
1322 mapToUChars
[srcIx
- toUCharsMapStart
] = destIx
;
1323 mapToNative
[destIx
] = srcIx
- toUCharsMapStart
;
1325 // General case, handle everything non-ASCII.
1327 int32_t sIx
= srcIx
; // ix of last byte of multi-byte u8 char
1329 // Get the full character from the UTF8 string.
1330 // use code derived from the macros in utf8.h
1331 // Leaves srcIx pointing at the first byte of the UTF-8 char.
1334 c
=utf8_prevCharSafeBody(s8
, 0, &srcIx
, c
, -1);
1335 // leaves srcIx at first byte of the multi-byte char.
1340 // Store the character in UTF-16 buffer.
1343 mapToNative
[destIx
] = srcIx
- toUCharsMapStart
;
1345 buf
[destIx
] = U16_TRAIL(c
);
1346 mapToNative
[destIx
] = srcIx
- toUCharsMapStart
;
1347 buf
[--destIx
] = U16_LEAD(c
);
1348 mapToNative
[destIx
] = srcIx
- toUCharsMapStart
;
1351 // Fill in the map from native indexes to UChars buf index.
1353 mapToUChars
[sIx
-- - toUCharsMapStart
] = destIx
;
1354 } while (sIx
>= srcIx
);
1356 // Set native indexing limit to be the current position.
1357 // We are processing a non-ascii, non-native-indexing char now;
1358 // the limit will be here if the rest of the chars to be
1359 // added to this buffer are ascii.
1360 bufNILimit
= destIx
;
1363 u8b
->bufNativeStart
= srcIx
;
1364 u8b
->bufNativeLimit
= ix
;
1365 u8b
->bufStartIdx
= destIx
;
1366 u8b
->bufLimitIdx
= UTF8_TEXT_CHUNK_SIZE
+2;
1367 u8b
->bufNILimit
= bufNILimit
- u8b
->bufStartIdx
;
1368 u8b
->toUCharsMapStart
= toUCharsMapStart
;
1370 ut
->chunkContents
= &buf
[u8b
->bufStartIdx
];
1371 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1372 ut
->chunkOffset
= ut
->chunkLength
;
1373 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1374 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1375 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1384 // This is a slightly modified copy of u_strFromUTF8,
1385 // Inserts a Replacement Char rather than failing on invalid UTF-8
1386 // Removes unnecessary features.
1389 utext_strFromUTF8(UChar
*dest
,
1390 int32_t destCapacity
,
1391 int32_t *pDestLength
,
1393 int32_t srcLength
, // required. NUL terminated not supported.
1394 UErrorCode
*pErrorCode
1398 UChar
*pDest
= dest
;
1399 UChar
*pDestLimit
= dest
+destCapacity
;
1402 int32_t reqLength
= 0;
1403 uint8_t* pSrc
= (uint8_t*) src
;
1406 while((index
< srcLength
)&&(pDest
<pDestLimit
)){
1411 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -1);
1416 *(pDest
++)=(UChar
)ch
;
1418 *(pDest
++)=UTF16_LEAD(ch
);
1419 if(pDest
<pDestLimit
){
1420 *(pDest
++)=UTF16_TRAIL(ch
);
1428 /* do not fill the dest buffer; just count the UChars needed */
1429 while(index
< srcLength
){
1434 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -1);
1438 reqLength
+=UTF_CHAR_LENGTH(ch
);
1442 reqLength
+=(int32_t)(pDest
- dest
);
1445 *pDestLength
= reqLength
;
1448 /* Terminate the buffer */
1449 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
1456 static int32_t U_CALLCONV
1457 utf8TextExtract(UText
*ut
,
1458 int64_t start
, int64_t limit
,
1459 UChar
*dest
, int32_t destCapacity
,
1460 UErrorCode
*pErrorCode
) {
1461 if(U_FAILURE(*pErrorCode
)) {
1464 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1465 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1468 int32_t length
= ut
->b
;
1469 int32_t start32
= pinIndex(start
, length
);
1470 int32_t limit32
= pinIndex(limit
, length
);
1472 if(start32
>limit32
) {
1473 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1478 // adjust the incoming indexes to land on code point boundaries if needed.
1479 // adjust by no more than three, because that is the largest number of trail bytes
1480 // in a well formed UTF8 character.
1481 const uint8_t *buf
= (const uint8_t *)ut
->context
;
1483 if (start32
< ut
->chunkNativeLimit
) {
1484 for (i
=0; i
<3; i
++) {
1485 if (U8_IS_LEAD(buf
[start32
]) || start32
==0) {
1492 if (limit32
< ut
->chunkNativeLimit
) {
1493 for (i
=0; i
<3; i
++) {
1494 if (U8_IS_LEAD(buf
[limit32
]) || limit32
==0) {
1501 // Do the actual extract.
1502 int32_t destLength
=0;
1503 utext_strFromUTF8(dest
, destCapacity
, &destLength
,
1504 (const char *)ut
->context
+start32
, limit32
-start32
,
1510 // utf8TextMapOffsetToNative
1512 // Map a chunk (UTF-16) offset to a native index.
1513 static int64_t U_CALLCONV
1514 utf8TextMapOffsetToNative(const UText
*ut
) {
1516 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1517 U_ASSERT(ut
->chunkOffset
>ut
->nativeIndexingLimit
&& ut
->chunkOffset
<=ut
->chunkLength
);
1518 int32_t nativeOffset
= u8b
->mapToNative
[ut
->chunkOffset
+ u8b
->bufStartIdx
] + u8b
->toUCharsMapStart
;
1519 U_ASSERT(nativeOffset
>= ut
->chunkNativeStart
&& nativeOffset
<= ut
->chunkNativeLimit
);
1520 return nativeOffset
;
1524 // Map a native index to the corresponding chunk offset
1526 static int32_t U_CALLCONV
1527 utf8TextMapIndexToUTF16(const UText
*ut
, int64_t index64
) {
1528 U_ASSERT(index64
<= 0x7fffffff);
1529 int32_t index
= (int32_t)index64
;
1530 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1531 U_ASSERT(index
>=ut
->chunkNativeStart
+ut
->nativeIndexingLimit
);
1532 U_ASSERT(index
<=ut
->chunkNativeLimit
);
1533 int32_t mapIndex
= index
- u8b
->toUCharsMapStart
;
1534 int32_t offset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1535 U_ASSERT(offset
>=0 && offset
<=ut
->chunkLength
);
1539 static UText
* U_CALLCONV
1540 utf8TextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
)
1542 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1543 dest
= shallowTextClone(dest
, src
, status
);
1545 // For deep clones, make a copy of the string.
1546 // The copied storage is owned by the newly created clone.
1548 // TODO: There is an issue with using utext_nativeLength().
1549 // That function is non-const in cases where the input was NUL terminated
1550 // and the length has not yet been determined.
1551 // This function (clone()) is const.
1552 // There is potentially a thread safety issue lurking here.
1554 if (deep
&& U_SUCCESS(*status
)) {
1555 int32_t len
= (int32_t)utext_nativeLength((UText
*)src
);
1556 char *copyStr
= (char *)uprv_malloc(len
+1);
1557 if (copyStr
== NULL
) {
1558 *status
= U_MEMORY_ALLOCATION_ERROR
;
1560 uprv_memcpy(copyStr
, src
->context
, len
+1);
1561 dest
->context
= copyStr
;
1562 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
1569 static void U_CALLCONV
1570 utf8TextClose(UText
*ut
) {
1571 // Most of the work of close is done by the generic UText framework close.
1572 // All that needs to be done here is to delete the UTF8 string if the UText
1573 // owns it. This occurs if the UText was created by cloning.
1574 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
1575 char *s
= (char *)ut
->context
;
1584 static struct UTextFuncs utf8Funcs
=
1587 0, 0, 0, // Reserved alignment padding
1594 utf8TextMapOffsetToNative
,
1595 utf8TextMapIndexToUTF16
,
1603 U_DRAFT UText
* U_EXPORT2
1604 utext_openUTF8(UText
*ut
, const char *s
, int64_t length
, UErrorCode
*status
) {
1605 if(U_FAILURE(*status
)) {
1608 if(s
==NULL
|| length
<-1 || length
>INT32_MAX
) {
1609 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1613 ut
= utext_setup(ut
, sizeof(UTF8Buf
) * 2, status
);
1614 if (U_FAILURE(*status
)) {
1618 ut
->pFuncs
= &utf8Funcs
;
1620 ut
->b
= (int32_t)length
;
1621 ut
->c
= (int32_t)length
;
1624 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1627 ut
->q
= (char *)ut
->pExtra
+ sizeof(UTF8Buf
);
1639 //------------------------------------------------------------------------------
1641 // UText implementation wrapper for Replaceable (read/write)
1643 // Use of UText data members:
1644 // context pointer to Replaceable.
1645 // p pointer to Replaceable if it is owned by the UText.
1647 //------------------------------------------------------------------------------
1651 // minimum chunk size for this implementation: 3
1652 // to allow for possible trimming for code point boundaries
1653 enum { REP_TEXT_CHUNK_SIZE
=10 };
1658 * +1 to simplify filling with surrogate pair at the end.
1660 UChar s
[REP_TEXT_CHUNK_SIZE
+1];
1666 static UText
* U_CALLCONV
1667 repTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
1668 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1669 dest
= shallowTextClone(dest
, src
, status
);
1671 // For deep clones, make a copy of the Replaceable.
1672 // The copied Replaceable storage is owned by the newly created UText clone.
1673 // A non-NULL pointer in UText.p is the signal to the close() function to delete
1676 if (deep
&& U_SUCCESS(*status
)) {
1677 const Replaceable
*replSrc
= (const Replaceable
*)src
->context
;
1678 dest
->context
= replSrc
->clone();
1679 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
1681 // with deep clone, the copy is writable, even when the source is not.
1682 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
1688 static void U_CALLCONV
1689 repTextClose(UText
*ut
) {
1690 // Most of the work of close is done by the generic UText framework close.
1691 // All that needs to be done here is delete the Replaceable if the UText
1692 // owns it. This occurs if the UText was created by cloning.
1693 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
1694 Replaceable
*rep
= (Replaceable
*)ut
->context
;
1701 static int64_t U_CALLCONV
1702 repTextLength(UText
*ut
) {
1703 const Replaceable
*replSrc
= (const Replaceable
*)ut
->context
;
1704 int32_t len
= replSrc
->length();
1709 static UBool U_CALLCONV
1710 repTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
1711 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
1712 int32_t length
=rep
->length(); // Full length of the input text (bigger than a chunk)
1714 // clip the requested index to the limits of the text.
1715 int32_t index32
= pinIndex(index
, length
);
1716 U_ASSERT(index
<=INT32_MAX
);
1720 * Compute start/limit boundaries around index, for a segment of text
1722 * To allow for the possibility that our user gave an index to the trailing
1723 * half of a surrogate pair, we must request one extra preceding UChar when
1724 * going in the forward direction. This will ensure that the buffer has the
1725 * entire code point at the specified index.
1729 if (index32
>=ut
->chunkNativeStart
&& index32
<ut
->chunkNativeLimit
) {
1730 // Buffer already contains the requested position.
1731 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
1734 if (index32
>=length
&& ut
->chunkNativeLimit
==length
) {
1735 // Request for end of string, and buffer already extends up to it.
1736 // Can't get the data, but don't change the buffer.
1737 ut
->chunkOffset
= length
- (int32_t)ut
->chunkNativeStart
;
1741 ut
->chunkNativeLimit
= index
+ REP_TEXT_CHUNK_SIZE
- 1;
1742 // Going forward, so we want to have the buffer with stuff at and beyond
1743 // the requested index. The -1 gets us one code point before the
1744 // requested index also, to handle the case of the index being on
1745 // a trail surrogate of a surrogate pair.
1746 if(ut
->chunkNativeLimit
> length
) {
1747 ut
->chunkNativeLimit
= length
;
1749 // unless buffer ran off end, start is index-1.
1750 ut
->chunkNativeStart
= ut
->chunkNativeLimit
- REP_TEXT_CHUNK_SIZE
;
1751 if(ut
->chunkNativeStart
< 0) {
1752 ut
->chunkNativeStart
= 0;
1755 // Reverse iteration. Fill buffer with data preceding the requested index.
1756 if (index32
>ut
->chunkNativeStart
&& index32
<=ut
->chunkNativeLimit
) {
1757 // Requested position already in buffer.
1758 ut
->chunkOffset
= index32
- (int32_t)ut
->chunkNativeStart
;
1761 if (index32
==0 && ut
->chunkNativeStart
==0) {
1762 // Request for start, buffer already begins at start.
1763 // No data, but keep the buffer as is.
1764 ut
->chunkOffset
= 0;
1768 // Figure out the bounds of the chunk to extract for reverse iteration.
1769 // Need to worry about chunk not splitting surrogate pairs, and while still
1770 // containing the data we need.
1771 // Fix by requesting a chunk that includes an extra UChar at the end.
1772 // If this turns out to be a lead surrogate, we can lop it off and still have
1773 // the data we wanted.
1774 ut
->chunkNativeStart
= index32
+ 1 - REP_TEXT_CHUNK_SIZE
;
1775 if (ut
->chunkNativeStart
< 0) {
1776 ut
->chunkNativeStart
= 0;
1779 ut
->chunkNativeLimit
= index32
+ 1;
1780 if (ut
->chunkNativeLimit
> length
) {
1781 ut
->chunkNativeLimit
= length
;
1785 // Extract the new chunk of text from the Replaceable source.
1786 ReplExtra
*ex
= (ReplExtra
*)ut
->pExtra
;
1787 // UnicodeString with its buffer a writable alias to the chunk buffer
1788 UnicodeString
buffer(ex
->s
, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE
/*buffer capacity*/);
1789 rep
->extractBetween((int32_t)ut
->chunkNativeStart
, (int32_t)ut
->chunkNativeLimit
, buffer
);
1791 ut
->chunkContents
= ex
->s
;
1792 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
- ut
->chunkNativeStart
);
1793 ut
->chunkOffset
= (int32_t)(index32
- ut
->chunkNativeStart
);
1795 // Surrogate pairs from the input text must not span chunk boundaries.
1796 // If end of chunk could be the start of a surrogate, trim it off.
1797 if (ut
->chunkNativeLimit
< length
&&
1798 U16_IS_LEAD(ex
->s
[ut
->chunkLength
-1])) {
1800 ut
->chunkNativeLimit
--;
1801 if (ut
->chunkOffset
> ut
->chunkLength
) {
1802 ut
->chunkOffset
= ut
->chunkLength
;
1806 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
1808 if(ut
->chunkNativeStart
>0 && U16_IS_TRAIL(ex
->s
[0])) {
1809 ++(ut
->chunkContents
);
1810 ++(ut
->chunkNativeStart
);
1811 --(ut
->chunkLength
);
1812 --(ut
->chunkOffset
);
1815 // adjust the index/chunkOffset to a code point boundary
1816 U16_SET_CP_START(ut
->chunkContents
, 0, ut
->chunkOffset
);
1818 // Use fast indexing for get/setNativeIndex()
1819 ut
->nativeIndexingLimit
= ut
->chunkLength
;
1826 static int32_t U_CALLCONV
1827 repTextExtract(UText
*ut
,
1828 int64_t start
, int64_t limit
,
1829 UChar
*dest
, int32_t destCapacity
,
1830 UErrorCode
*status
) {
1831 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
1832 int32_t length
=rep
->length();
1834 if(U_FAILURE(*status
)) {
1837 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1838 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1841 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1845 int32_t start32
= pinIndex(start
, length
);
1846 int32_t limit32
= pinIndex(limit
, length
);
1848 // adjust start, limit if they point to trail half of surrogates
1849 if (start32
<length
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
1850 U_IS_SUPPLEMENTARY(rep
->char32At(start32
))){
1853 if (limit32
<length
&& U16_IS_TRAIL(rep
->charAt(limit32
)) &&
1854 U_IS_SUPPLEMENTARY(rep
->char32At(limit32
))){
1858 length
=limit32
-start32
;
1859 if(length
>destCapacity
) {
1860 limit32
= start32
+ destCapacity
;
1862 UnicodeString
buffer(dest
, 0, destCapacity
); // writable alias
1863 rep
->extractBetween(start32
, limit32
, buffer
);
1864 return u_terminateUChars(dest
, destCapacity
, length
, status
);
1867 static int32_t U_CALLCONV
1868 repTextReplace(UText
*ut
,
1869 int64_t start
, int64_t limit
,
1870 const UChar
*src
, int32_t length
,
1871 UErrorCode
*status
) {
1872 Replaceable
*rep
=(Replaceable
*)ut
->context
;
1875 if(U_FAILURE(*status
)) {
1878 if(src
==NULL
&& length
!=0) {
1879 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1882 oldLength
=rep
->length(); // will subtract from new length
1884 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1888 int32_t start32
= pinIndex(start
, oldLength
);
1889 int32_t limit32
= pinIndex(limit
, oldLength
);
1891 // Snap start & limit to code point boundaries.
1892 if (start32
<oldLength
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
1893 start32
>0 && U16_IS_LEAD(rep
->charAt(start32
-1)))
1897 if (limit32
<oldLength
&& U16_IS_LEAD(rep
->charAt(limit32
-1)) &&
1898 U16_IS_TRAIL(rep
->charAt(limit32
)))
1903 // Do the actual replace operation using methods of the Replaceable class
1904 UnicodeString
replStr((UBool
)(length
<0), src
, length
); // read-only alias
1905 rep
->handleReplaceBetween(start32
, limit32
, replStr
);
1906 int32_t newLength
= rep
->length();
1907 int32_t lengthDelta
= newLength
- oldLength
;
1909 // Is the UText chunk buffer OK?
1910 if (ut
->chunkNativeLimit
> start32
) {
1911 // this replace operation may have impacted the current chunk.
1912 // invalidate it, which will force a reload on the next access.
1913 invalidateChunk(ut
);
1916 // set the iteration position to the end of the newly inserted replacement text.
1917 int32_t newIndexPos
= limit32
+ lengthDelta
;
1918 repTextAccess(ut
, newIndexPos
, TRUE
);
1924 static void U_CALLCONV
1925 repTextCopy(UText
*ut
,
1926 int64_t start
, int64_t limit
,
1931 Replaceable
*rep
=(Replaceable
*)ut
->context
;
1932 int32_t length
=rep
->length();
1934 if(U_FAILURE(*status
)) {
1937 if (start
>limit
|| (start
<destIndex
&& destIndex
<limit
))
1939 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
1943 int32_t start32
= pinIndex(start
, length
);
1944 int32_t limit32
= pinIndex(limit
, length
);
1945 int32_t destIndex32
= pinIndex(destIndex
, length
);
1947 // TODO: snap input parameters to code point boundaries.
1950 // move: copy to destIndex, then replace original with nothing
1951 int32_t segLength
=limit32
-start32
;
1952 rep
->copy(start32
, limit32
, destIndex32
);
1953 if(destIndex32
<start32
) {
1957 rep
->handleReplaceBetween(start32
, limit32
, UnicodeString());
1960 rep
->copy(start32
, limit32
, destIndex32
);
1963 // If the change to the text touched the region in the chunk buffer,
1964 // invalidate the buffer.
1965 int32_t firstAffectedIndex
= destIndex32
;
1966 if (move
&& start32
<firstAffectedIndex
) {
1967 firstAffectedIndex
= start32
;
1969 if (firstAffectedIndex
< ut
->chunkNativeLimit
) {
1970 // changes may have affected range covered by the chunk
1971 invalidateChunk(ut
);
1974 // Put iteration position at the newly inserted (moved) block,
1975 int32_t nativeIterIndex
= destIndex32
+ limit32
- start32
;
1976 if (move
&& destIndex32
>start32
) {
1977 // moved a block of text towards the end of the string.
1978 nativeIterIndex
= destIndex32
;
1981 // Set position, reload chunk if needed.
1982 repTextAccess(ut
, nativeIterIndex
, TRUE
);
1985 static struct UTextFuncs repFuncs
=
1988 0, 0, 0, // Reserved alignment padding
1995 NULL
, // MapOffsetToNative,
1996 NULL
, // MapIndexToUTF16,
2004 U_DRAFT UText
* U_EXPORT2
2005 utext_openReplaceable(UText
*ut
, Replaceable
*rep
, UErrorCode
*status
)
2007 if(U_FAILURE(*status
)) {
2011 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2014 ut
= utext_setup(ut
, sizeof(ReplExtra
), status
);
2016 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2017 if(rep
->hasMetaData()) {
2018 ut
->providerProperties
|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
);
2021 ut
->pFuncs
= &repFuncs
;
2035 //------------------------------------------------------------------------------
2037 // UText implementation for UnicodeString (read/write) and
2038 // for const UnicodeString (read only)
2039 // (same implementation, only the flags are different)
2041 // Use of UText data members:
2042 // context pointer to UnicodeString
2043 // p pointer to UnicodeString IF this UText owns the string
2044 // and it must be deleted on close(). NULL otherwise.
2046 //------------------------------------------------------------------------------
2051 static UText
* U_CALLCONV
2052 unistrTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
2053 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2054 dest
= shallowTextClone(dest
, src
, status
);
2056 // For deep clones, make a copy of the UnicodeString.
2057 // The copied UnicodeString storage is owned by the newly created UText clone.
2058 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2061 if (deep
&& U_SUCCESS(*status
)) {
2062 const UnicodeString
*srcString
= (const UnicodeString
*)src
->context
;
2063 dest
->context
= new UnicodeString(*srcString
);
2064 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2066 // with deep clone, the copy is writable, even when the source is not.
2067 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2072 static void U_CALLCONV
2073 unistrTextClose(UText
*ut
) {
2074 // Most of the work of close is done by the generic UText framework close.
2075 // All that needs to be done here is delete the UnicodeString if the UText
2076 // owns it. This occurs if the UText was created by cloning.
2077 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2078 UnicodeString
*str
= (UnicodeString
*)ut
->context
;
2085 static int64_t U_CALLCONV
2086 unistrTextLength(UText
*t
) {
2087 return ((const UnicodeString
*)t
->context
)->length();
2091 static UBool U_CALLCONV
2092 unistrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2093 int32_t length
= ut
->chunkLength
;
2094 ut
->chunkOffset
= pinIndex(index
, length
);
2096 // Check whether request is at the start or end
2097 UBool retVal
= (forward
&& index
<length
) || (!forward
&& index
>0);
2103 static int32_t U_CALLCONV
2104 unistrTextExtract(UText
*t
,
2105 int64_t start
, int64_t limit
,
2106 UChar
*dest
, int32_t destCapacity
,
2107 UErrorCode
*pErrorCode
) {
2108 const UnicodeString
*us
=(const UnicodeString
*)t
->context
;
2109 int32_t length
=us
->length();
2111 if(U_FAILURE(*pErrorCode
)) {
2114 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
2115 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2117 if(start
<0 || start
>limit
) {
2118 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2122 int32_t start32
= start
<length
? us
->getChar32Start((int32_t)start
) : length
;
2123 int32_t limit32
= limit
<length
? us
->getChar32Start((int32_t)limit
) : length
;
2125 length
=limit32
-start32
;
2126 if (destCapacity
>0 && dest
!=NULL
) {
2127 int32_t trimmedLength
= length
;
2128 if(trimmedLength
>destCapacity
) {
2129 trimmedLength
=destCapacity
;
2131 us
->extract(start32
, trimmedLength
, dest
);
2133 u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
2137 static int32_t U_CALLCONV
2138 unistrTextReplace(UText
*ut
,
2139 int64_t start
, int64_t limit
,
2140 const UChar
*src
, int32_t length
,
2141 UErrorCode
*pErrorCode
) {
2142 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2145 if(U_FAILURE(*pErrorCode
)) {
2148 if(src
==NULL
&& length
!=0) {
2149 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2152 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2155 oldLength
=us
->length();
2156 int32_t start32
= pinIndex(start
, oldLength
);
2157 int32_t limit32
= pinIndex(limit
, oldLength
);
2158 if (start32
< oldLength
) {
2159 start32
= us
->getChar32Start(start32
);
2161 if (limit32
< oldLength
) {
2162 limit32
= us
->getChar32Start(limit32
);
2166 us
->replace(start32
, limit32
-start32
, src
, length
);
2167 int32_t newLength
= us
->length();
2169 // Update the chunk description.
2170 ut
->chunkContents
= us
->getBuffer();
2171 ut
->chunkLength
= newLength
;
2172 ut
->chunkNativeLimit
= newLength
;
2173 ut
->nativeIndexingLimit
= newLength
;
2175 // Set iteration position to the point just following the newly inserted text.
2176 int32_t lengthDelta
= newLength
- oldLength
;
2177 ut
->chunkOffset
= limit32
+ lengthDelta
;
2182 static void U_CALLCONV
2183 unistrTextCopy(UText
*ut
,
2184 int64_t start
, int64_t limit
,
2187 UErrorCode
*pErrorCode
) {
2188 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2189 int32_t length
=us
->length();
2191 if(U_FAILURE(*pErrorCode
)) {
2194 int32_t start32
= pinIndex(start
, length
);
2195 int32_t limit32
= pinIndex(limit
, length
);
2196 int32_t destIndex32
= pinIndex(destIndex
, length
);
2198 if( start32
>limit32
|| (start32
<destIndex32
&& destIndex32
<limit32
)) {
2199 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2204 // move: copy to destIndex, then replace original with nothing
2205 int32_t segLength
=limit32
-start32
;
2206 us
->copy(start32
, limit32
, destIndex32
);
2207 if(destIndex32
<start32
) {
2210 us
->replace(start32
, segLength
, NULL
, 0);
2213 us
->copy(start32
, limit32
, destIndex32
);
2216 // update chunk description, set iteration position.
2217 ut
->chunkContents
= us
->getBuffer();
2219 // copy operation, string length grows
2220 ut
->chunkLength
+= limit32
-start32
;
2221 ut
->chunkNativeLimit
= ut
->chunkLength
;
2222 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2225 // Iteration position to end of the newly inserted text.
2226 ut
->chunkOffset
= destIndex32
+limit32
-start32
;
2227 if (move
&& destIndex32
>start32
) {
2228 ut
->chunkOffset
= destIndex32
;
2233 static struct UTextFuncs unistrFuncs
=
2236 0, 0, 0, // Reserved alignment padding
2243 NULL
, // MapOffsetToNative,
2244 NULL
, // MapIndexToUTF16,
2256 U_DRAFT UText
* U_EXPORT2
2257 utext_openUnicodeString(UText
*ut
, UnicodeString
*s
, UErrorCode
*status
) {
2258 // TODO: use openConstUnicodeString, then add in the differences.
2260 ut
= utext_setup(ut
, 0, status
);
2261 if (U_SUCCESS(*status
)) {
2262 ut
->pFuncs
= &unistrFuncs
;
2264 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
)|
2265 I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2267 ut
->chunkContents
= s
->getBuffer();
2268 ut
->chunkLength
= s
->length();
2269 ut
->chunkNativeStart
= 0;
2270 ut
->chunkNativeLimit
= ut
->chunkLength
;
2271 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2278 U_DRAFT UText
* U_EXPORT2
2279 utext_openConstUnicodeString(UText
*ut
, const UnicodeString
*s
, UErrorCode
*status
) {
2280 ut
= utext_setup(ut
, 0, status
);
2281 // note: use the standard (writable) function table for UnicodeString.
2282 // The flag settings disable writing, so having the functions in
2283 // the table is harmless.
2284 if (U_SUCCESS(*status
)) {
2285 ut
->pFuncs
= &unistrFuncs
;
2287 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2288 ut
->chunkContents
= s
->getBuffer();
2289 ut
->chunkLength
= s
->length();
2290 ut
->chunkNativeStart
= 0;
2291 ut
->chunkNativeLimit
= ut
->chunkLength
;
2292 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2297 //------------------------------------------------------------------------------
2299 // UText implementation for const UChar * strings
2301 // Use of UText data members:
2302 // context pointer to the const UChar string
2303 // a length. -1 if not yet known.
2305 // TODO: support 64 bit lengths.
2307 //------------------------------------------------------------------------------
2312 static UText
* U_CALLCONV
2313 ucstrTextClone(UText
*dest
, const UText
* src
, UBool deep
, UErrorCode
* status
) {
2314 // First do a generic shallow clone.
2315 dest
= shallowTextClone(dest
, src
, status
);
2317 // For deep clones, make a copy of the string.
2318 // The copied storage is owned by the newly created clone.
2319 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2322 if (deep
&& U_SUCCESS(*status
)) {
2323 U_ASSERT(utext_nativeLength(dest
) < INT32_MAX
);
2324 int32_t len
= (int32_t)utext_nativeLength(dest
);
2326 // The cloned string IS going to be NUL terminated, whether or not the original was.
2327 const UChar
*srcStr
= (const UChar
*)src
->context
;
2328 UChar
*copyStr
= (UChar
*)uprv_malloc((len
+1) * sizeof(UChar
));
2329 if (copyStr
== NULL
) {
2330 *status
= U_MEMORY_ALLOCATION_ERROR
;
2333 for (i
=0; i
<len
; i
++) {
2334 copyStr
[i
] = srcStr
[i
];
2337 dest
->context
= copyStr
;
2338 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2345 static void U_CALLCONV
2346 ucstrTextClose(UText
*ut
) {
2347 // Most of the work of close is done by the generic UText framework close.
2348 // All that needs to be done here is delete the string if the UText
2349 // owns it. This occurs if the UText was created by cloning.
2350 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2351 UChar
*s
= (UChar
*)ut
->context
;
2359 static int64_t U_CALLCONV
2360 ucstrTextLength(UText
*ut
) {
2362 // null terminated, we don't yet know the length. Scan for it.
2363 // Access is not convenient for doing this
2364 // because the current iteration position can't be changed.
2365 const UChar
*str
= (const UChar
*)ut
->context
;
2367 if (str
[ut
->chunkNativeLimit
] == 0) {
2370 ut
->chunkNativeLimit
++;
2372 ut
->a
= ut
->chunkNativeLimit
;
2373 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2374 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2375 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
// ucstrTextAccess: access function for UTexts that wrap a UChar string.
//   ut       the UText; ut->context is the string, and ut->a >= 0 means the
//            string length is already known
//   index    requested native index; it is adjusted locally (pinned, moved to
//            a code point start with U16_SET_CP_START) and the adjusted value
//            becomes ut->chunkOffset
//   forward  intended iteration direction; used when computing retVal
// For a NUL terminated string of unknown length, the scan for the terminator is
// extended on demand, to at most 32 UChars past the requested index, and the
// chunk limit / length fields of the UText are updated to match.
2381 static UBool U_CALLCONV
2382 ucstrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2383 const UChar
*str
= (const UChar
*)ut
->context
;
2385 // pin the requested index to the bounds of the string,
2386 // and set current iteration position.
2389 } else if (index
< ut
->chunkNativeLimit
) {
2390 // The requested data is within the chunk as it is known so far.
2391 // Put index on a code point boundary.
2392 U16_SET_CP_START(str
, 0, index
);
2393 } else if (ut
->a
>= 0) {
2394 // We know the length of this string, and the user is requesting something
2395 // at or beyond the length. Pin the requested index to the length.
2398 // Null terminated string, length not yet known, and the requested index
2399 // is beyond where we have scanned so far.
2400 // Scan to 32 UChars beyond the requested index. The strategy here is
2401 // to avoid fully scanning a long string when the caller only wants to
2402 // see a few characters at its beginning.
2403 int32_t scanLimit
= (int32_t)index
+ 32;
2404 if ((index
+ 32)>INT32_MAX
|| (index
+ 32)<0 ) { // note: int64 expression
2405 scanLimit
= INT32_MAX
;
// Resume scanning where the previous scan stopped.
2408 int32_t chunkLimit
= (int32_t)ut
->chunkNativeLimit
;
2409 for (; chunkLimit
<scanLimit
; chunkLimit
++) {
2410 if (str
[chunkLimit
] == 0) {
2411 // We found the end of the string. Remember it, pin the requested index to it,
2412 // and bail out of here.
2414 ut
->chunkLength
= chunkLimit
;
2415 ut
->nativeIndexingLimit
= chunkLimit
;
2416 if (index
>= chunkLimit
) {
2419 U16_SET_CP_START(str
, 0, index
);
2422 ut
->chunkNativeLimit
= chunkLimit
;
2423 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2427 // We scanned through the next batch of UChars without finding the end.
2428 U16_SET_CP_START(str
, 0, index
);
2429 if (chunkLimit
== INT32_MAX
) {
2430 // Scanned to the limit of a 32 bit length.
2431 // Forcibly trim the overlength string back so length fits in int32
2432 // TODO: add support for 64 bit strings.
2434 ut
->chunkLength
= chunkLimit
;
2435 ut
->nativeIndexingLimit
= chunkLimit
;
2436 if (index
> chunkLimit
) {
2439 ut
->chunkNativeLimit
= chunkLimit
;
2440 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2442 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2443 // If the current end is on a lead surrogate, back the end up by one.
2444 // It doesn't matter if the end char happens to be an unpaired surrogate,
2445 // and it's simpler not to worry about it.
2446 if (U16_IS_LEAD(str
[chunkLimit
-1])) {
2449 ut
->chunkNativeLimit
= chunkLimit
;
2454 U_ASSERT(index
<=INT32_MAX
);
2455 ut
->chunkOffset
= (int32_t)index
;
2457 // Check whether request is at the start or end
2458 UBool retVal
= (forward
&& index
<ut
->chunkNativeLimit
) || (!forward
&& index
>0);
// ucstrTextExtract: extract function for UTexts that wrap a UChar string.
//   ut            the UText; ut->context is the string, ut->a its length
//                 (negative while the length is still unknown)
//   start, limit  native index range to extract; limit is passed through
//                 pinIndex() with the string length, or INT32_MAX when the
//                 length is not known yet
//   dest          output buffer; may be NULL only when destCapacity is 0
//   destCapacity  capacity of dest in UChars; must not be negative
//   pErrorCode    U_ILLEGAL_ARGUMENT_ERROR is set for a negative capacity, a
//                 NULL dest with positive capacity, or start > limit; it is
//                 also handed to u_terminateUChars() at the end
// Side effects visible here: if the terminating NUL of a string of unknown
// length is reached, the length fields of the UText are updated; the iteration
// position (ut->chunkOffset) is left just after the extracted text.
2464 static int32_t U_CALLCONV
2465 ucstrTextExtract(UText
*ut
,
2466 int64_t start
, int64_t limit
,
2467 UChar
*dest
, int32_t destCapacity
,
2468 UErrorCode
*pErrorCode
)
2470 if(U_FAILURE(*pErrorCode
)) {
2473 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
2474 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2478 const UChar
*s
=(const UChar
*)ut
->context
;
2484 // Access the start. Does two things we need:
2485 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2486 // Snaps 'start' to the beginning of a code point.
// NOTE(review): ucstrTextAccess() takes its index by value, so the local
// 'start' is not modified by this call; the pinned/snapped position ends up
// in ut->chunkOffset. Confirm whether start32 should be taken from there.
2487 ucstrTextAccess(ut
, start
, TRUE
);
2488 U_ASSERT(start
<= INT32_MAX
);
2489 start32
= (int32_t)start
;
2491 int32_t strLength
=(int32_t)ut
->a
;
2492 if (strLength
>= 0) {
2493 limit32
= pinIndex(limit
, strLength
);
2495 limit32
= pinIndex(limit
, INT32_MAX
);
2499 for (si
=start32
; si
<limit32
; si
++) {
2500 if (strLength
<0 && s
[si
]==0) {
2501 // Just hit the end of a null-terminated string.
2502 ut
->a
= si
; // set string length for this UText
2503 ut
->chunkNativeLimit
= si
;
2504 ut
->chunkLength
= si
;
2505 ut
->nativeIndexingLimit
= si
;
2509 if (di
<destCapacity
) {
2510 // only store if there is space.
2514 // We have filled the destination buffer, and the string length is known.
2515 // Cut the loop short. There is no need to scan string termination.
2524 // If the limit index points to a lead surrogate of a pair,
2525 // add the corresponding trail surrogate to the destination.
2526 if (si
>0 && U16_IS_LEAD(s
[si
-1]) &&
2527 ((si
<strLength
|| strLength
<0) && U16_IS_TRAIL(s
[si
])))
2529 if (di
<destCapacity
) {
2530 // store only if there is space in the output buffer.
2531 dest
[di
++] = s
[si
++];
2535 // Put iteration position at the point just following the extracted text
2536 ut
->chunkOffset
= si
;
2538 // Add a terminating NUL if space in the buffer permits,
2539 // and set the error status as required.
2540 u_terminateUChars(dest
, destCapacity
, di
, pErrorCode
);
// Function table for the UChar string UText provider.
// utext_openUChars() installs it as ut->pFuncs.
// The two mapping entries are NULL for this provider.
2544 static struct UTextFuncs ucstrFuncs
=
2547 0, 0, 0, // Reserved alignment padding
2554 NULL
, // MapOffsetToNative,
2555 NULL
, // MapIndexToUTF16,
// utext_openUChars: set up a UText over a UChar string.
//   ut      passed to utext_setup() (with no extra space requested); the
//           result of utext_setup() replaces it
//   s       the string; it becomes the chunk contents directly
//   length  length of the string in UChars; values below -1 or above
//           INT32_MAX are rejected with U_ILLEGAL_ARGUMENT_ERROR.
//           A negative length starts the chunk out empty (limit 0).
//   status  error code, checked on entry and after utext_setup()
// On success the UText uses the ucstrFuncs function table, is marked as having
// stable chunks, and its single chunk starts at native index 0 with the
// iteration position at offset 0.
2565 U_DRAFT UText
* U_EXPORT2
2566 utext_openUChars(UText
*ut
, const UChar
*s
, int64_t length
, UErrorCode
*status
) {
2567 if (U_FAILURE(*status
)) {
2570 if (length
< -1 || length
>INT32_MAX
) {
2571 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2574 ut
= utext_setup(ut
, 0, status
);
2575 if (U_SUCCESS(*status
)) {
2576 ut
->pFuncs
= &ucstrFuncs
;
2578 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2580 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2583 ut
->chunkContents
= s
;
2584 ut
->chunkNativeStart
= 0;
2585 ut
->chunkNativeLimit
= length
>=0? length
: 0;
2586 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2587 ut
->chunkOffset
= 0;
2588 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2594 //------------------------------------------------------------------------------
2596 // UText implementation for text from ICU CharacterIterators
2598 // Use of UText data members:
2599 // context pointer to the CharacterIterator
2600 // a length of the full text.
2601 // p pointer to buffer 1
2602 // b start index of local buffer 1 contents
2603 // q pointer to buffer 2
2604 // c start index of local buffer 2 contents
2605 // r pointer to the character iterator if the UText owns it.
2608 //------------------------------------------------------------------------------
// Capacity, in UChars, of each of the two local text buffers (UText.p and UText.q).
2609 #define CIBufSize 16
// charIterTextClose: close function for CharacterIterator based UTexts.
//   ut   the UText being closed; ut->r holds the CharacterIterator when the
//        UText owns it (charIterTextClone() stores its cloned iterator there).
2612 static void U_CALLCONV
2613 charIterTextClose(UText
*ut
) {
2614 // Most of the work of close is done by the generic UText framework close.
2615 // All that needs to be done here is delete the CharacterIterator if the UText
2616 // owns it. This occurs if the UText was created by cloning.
2617 CharacterIterator
*ci
= (CharacterIterator
*)ut
->r
;
// charIterTextLength: length function for CharacterIterator based UTexts.
// Returns the length stored in ut->a (set from the iterator's endIndex() when
// the UText is opened), narrowed to int32_t before being returned as int64_t.
2622 static int64_t U_CALLCONV
2623 charIterTextLength(UText
*ut
) {
2624 return (int32_t)ut
->a
;
// charIterTextAccess: access function for CharacterIterator based UTexts.
//   ut       the UText; ut->context is the CharacterIterator, ut->a the text
//            length, ut->p / ut->q the two local buffers and ut->b / ut->c the
//            native start indexes of their contents
//   index    requested native index; it is clipped against the text length
//   forward  intended iteration direction; selects which buffer is needed and
//            how the success result is computed
// The text is handled in buffers of CIBufSize UChars whose start indexes are
// multiples of CIBufSize. The needed buffer is either the current chunk, one
// of the two local buffers, or is freshly loaded from the iterator into the
// local buffer that is not the current chunk.
2627 static UBool U_CALLCONV
2628 charIterTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2629 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
2631 int32_t clippedIndex
= (int32_t)index
;
2632 if (clippedIndex
<0) {
2634 } else if (clippedIndex
>=ut
->a
) {
2635 clippedIndex
=(int32_t)ut
->a
;
2637 int32_t neededIndex
= clippedIndex
;
2638 if (!forward
&& neededIndex
>0) {
2639 // reverse iteration, want the position just before what was asked for.
2641 } else if (forward
&& neededIndex
==ut
->a
&& neededIndex
>0) {
2642 // Forward iteration, don't ask for something past the end of the text.
2646 // Find the native index of the start of the buffer containing what we want.
2647 neededIndex
-= neededIndex
% CIBufSize
;
2650 UBool needChunkSetup
= TRUE
;
2652 if (ut
->chunkNativeStart
== neededIndex
) {
2653 // The buffer we want is already the current chunk.
2654 needChunkSetup
= FALSE
;
2655 } else if (ut
->b
== neededIndex
) {
2656 // The first buffer (buffer p) has what we need.
2657 buf
= (UChar
*)ut
->p
;
2658 } else if (ut
->c
== neededIndex
) {
2659 // The second buffer (buffer q) has what we need.
2660 buf
= (UChar
*)ut
->q
;
2662 // Neither buffer already has what we need.
2663 // Load new data from the character iterator.
2664 // Use the buf that is not the current buffer.
2665 buf
= (UChar
*)ut
->p
;
2666 if (ut
->p
== ut
->chunkContents
) {
2667 buf
= (UChar
*)ut
->q
;
2669 ci
->setIndex(neededIndex
);
2670 for (i
=0; i
<CIBufSize
; i
++) {
2671 buf
[i
] = ci
->nextPostInc();
2672 if (i
+neededIndex
> ut
->a
) {
2678 // We have a buffer with the data we need.
2679 // Set it up as the current chunk, if it wasn't already.
2680 if (needChunkSetup
) {
2681 ut
->chunkContents
= buf
;
2682 ut
->chunkLength
= CIBufSize
;
2683 ut
->chunkNativeStart
= neededIndex
;
2684 ut
->chunkNativeLimit
= neededIndex
+ CIBufSize
;
// The last buffer of the text may be only partly filled; trim the chunk to the text length.
2685 if (ut
->chunkNativeLimit
> ut
->a
) {
2686 ut
->chunkNativeLimit
= ut
->a
;
2687 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
)-(int32_t)(ut
->chunkNativeStart
);
2689 ut
->nativeIndexingLimit
= ut
->chunkLength
;
// NOTE(review): this assertion looks at the chunkOffset left over from before
// this call; the offset for the new position is assigned just below.
2690 U_ASSERT(ut
->chunkOffset
>=0 && ut
->chunkOffset
<=CIBufSize
);
2692 ut
->chunkOffset
= clippedIndex
- (int32_t)ut
->chunkNativeStart
;
2693 UBool success
= (forward
? ut
->chunkOffset
<ut
->chunkLength
: ut
->chunkOffset
>0);
// charIterTextClone: clone function for CharacterIterator based UTexts.
//   dest    passed to utext_openCharacterIterator(), whose result replaces it
//   src     the UText being cloned; src->context is its CharacterIterator
//   deep    deep-clone request flag
//   status  error code; U_UNSUPPORTED_ERROR is used where the underlying text
//           storage would have to be cloned
// The clone gets its own copy of the iterator (srcCI->clone()), is positioned
// at the source's current native index, and records the iterator copy in
// dest->r to mark it as owned by the new UText.
2697 static UText
* U_CALLCONV
2698 charIterTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
* status
) {
2699 if (U_FAILURE(*status
)) {
2704 // There is no CharacterIterator API for cloning the underlying text storage.
2705 *status
= U_UNSUPPORTED_ERROR
;
2708 CharacterIterator
*srcCI
=(CharacterIterator
*)src
->context
;
2709 srcCI
= srcCI
->clone();
2710 dest
= utext_openCharacterIterator(dest
, srcCI
, status
);
2711 // cast off const on getNativeIndex.
2712 // For CharacterIterator based UTexts, this is safe, the operation is const.
2713 int64_t ix
= utext_getNativeIndex((UText
*)src
);
2714 utext_setNativeIndex(dest
, ix
);
2715 dest
->r
= srcCI
; // flags that this UText owns the CharacterIterator
// charIterTextExtract: extract function for CharacterIterator based UTexts.
//   ut            the UText; ut->context is the CharacterIterator, ut->a the
//                 text length
//   start, limit  native index range to extract; both are passed through
//                 pinIndex() with the text length
//   dest          output buffer; may be NULL only when destCapacity is 0
//   destCapacity  capacity of dest in UChars; must not be negative
//   status        U_ILLEGAL_ARGUMENT_ERROR for a negative capacity, a NULL dest
//                 with positive capacity, or start > limit;
//                 U_BUFFER_OVERFLOW_ERROR when a code point does not fit
// The text is read one code point at a time from the iterator; a code point is
// stored only if all of its UChars fit in the remaining capacity.
2720 static int32_t U_CALLCONV
2721 charIterTextExtract(UText
*ut
,
2722 int64_t start
, int64_t limit
,
2723 UChar
*dest
, int32_t destCapacity
,
2726 if(U_FAILURE(*status
)) {
2729 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
2730 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2733 int32_t length
= (int32_t)ut
->a
;
2734 int32_t start32
= pinIndex(start
, length
);
2735 int32_t limit32
= pinIndex(limit
, length
);
2739 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
2740 ci
->setIndex32(start32
); // Moves ix to lead of surrogate pair, if needed.
2741 srci
= ci
->getIndex();
2742 while (srci
<limit32
) {
2743 UChar32 c
= ci
->next32PostInc();
2744 int32_t len
= U16_LENGTH(c
);
2745 if (desti
+len
<= destCapacity
) {
2746 U16_APPEND_UNSAFE(dest
, desti
, c
);
2749 *status
= U_BUFFER_OVERFLOW_ERROR
;
2754 u_terminateUChars(dest
, destCapacity
, desti
, status
);
// Function table for the CharacterIterator UText provider.
// utext_openCharacterIterator() installs it as ut->pFuncs.
// The two mapping entries are NULL for this provider.
2758 static struct UTextFuncs charIterFuncs
=
2761 0, 0, 0, // Reserved alignment padding
2765 charIterTextExtract
,
2768 NULL
, // MapOffsetToNative,
2769 NULL
, // MapIndexToUTF16,
2778 U_DRAFT UText
* U_EXPORT2
2779 utext_openCharacterIterator(UText
*ut
, CharacterIterator
*ci
, UErrorCode
*status
) {
2780 if (U_FAILURE(*status
)) {
2784 if (ci
->startIndex() > 0) {
2785 // No support for CharacterIterators that do not start indexing from zero.
2786 *status
= U_UNSUPPORTED_ERROR
;
2790 // Extra space in UText for 2 buffers of CIBufSize UChars each.
2791 int32_t extraSpace
= 2 * CIBufSize
* sizeof(UChar
);
2792 ut
= utext_setup(ut
, extraSpace
, status
);
2793 if (U_SUCCESS(*status
)) {
2794 ut
->pFuncs
= &charIterFuncs
;
2796 ut
->providerProperties
= 0;
2797 ut
->a
= ci
->endIndex(); // Length of text
2798 ut
->p
= ut
->pExtra
; // First buffer
2799 ut
->b
= -1; // Native index of first buffer contents
2800 ut
->q
= (UChar
*)ut
->pExtra
+CIBufSize
; // Second buffer
2801 ut
->c
= -1; // Native index of second buffer contents
2803 // Initialize current chunk contents to be empty.
2804 // First access will fault something in.
2805 // Note: The initial nativeStart and chunkOffset must sum to zero
2806 // so that getNativeIndex() will correctly compute to zero
2807 // if no call to Access() has ever been made. They can't be both
2808 // zero without Access() thinking that the chunk is valid.
2809 ut
->chunkContents
= (UChar
*)ut
->p
;
2810 ut
->chunkNativeStart
= -1;
2811 ut
->chunkOffset
= 1;
2812 ut
->chunkNativeLimit
= 0;
2813 ut
->chunkLength
= 0;
2814 ut
->nativeIndexingLimit
= ut
->chunkOffset
; // enables native indexing