2 *******************************************************************************
4 * Copyright (C) 2005-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2005apr12
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/chariter.h"
21 #include "unicode/utext.h"
// I32_FLAG(bitIndex): an int32_t value with only bit number bitIndex set.
// Used in this file to test and clear individual bits of UText.providerProperties.
30 #define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
// utext_access  Thin wrapper around the text provider's access function.
//    ut       the UText whose provider function table (ut->pFuncs) is used.
//    index    native index that is passed through to the provider.
//    forward  direction flag that is passed through to the provider.
//    Returns whatever the provider's access function returns, unchanged.
34 utext_access(UText
*ut
, int64_t index
, UBool forward
) {
35 return ut
->pFuncs
->access(ut
, index
, forward
);
40 U_CAPI UBool U_EXPORT2
41 utext_moveIndex32(UText
*ut
, int32_t delta
) {
45 if(ut
->chunkOffset
>=ut
->chunkLength
&& !utext_access(ut
, ut
->chunkNativeLimit
, TRUE
)) {
48 c
= ut
->chunkContents
[ut
->chunkOffset
];
49 if (U16_IS_SURROGATE(c
)) {
51 if (c
== U_SENTINEL
) {
61 if(ut
->chunkOffset
<=0 && !utext_access(ut
, ut
->chunkNativeStart
, FALSE
)) {
64 c
= ut
->chunkContents
[ut
->chunkOffset
-1];
65 if (U16_IS_SURROGATE(c
)) {
66 c
= utext_previous32(ut
);
67 if (c
== U_SENTINEL
) {
// utext_nativeLength  Return the length of the text as reported by the provider.
//    The call is forwarded unchanged to ut->pFuncs->nativeLength(); no caching or
//    adjustment is done at this level.
80 U_CAPI
int64_t U_EXPORT2
81 utext_nativeLength(UText
*ut
) {
82 return ut
->pFuncs
->nativeLength(ut
);
// utext_isLengthExpensive  Test the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE property bit.
//    r is TRUE exactly when that bit is set in ut->providerProperties.
//    (The statement that returns r is not visible in this excerpt.)
86 U_CAPI UBool U_EXPORT2
87 utext_isLengthExpensive(const UText
*ut
) {
88 UBool r
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
)) != 0;
// utext_getNativeIndex  Return the native index of the current iteration position.
//    Two paths:
//      - chunkOffset <= nativeIndexingLimit: the native index is computed directly
//        as chunkNativeStart + chunkOffset.
//      - otherwise: the provider's mapOffsetToNative() is asked for the native index.
93 U_CAPI
int64_t U_EXPORT2
94 utext_getNativeIndex(const UText
*ut
) {
// Direct case: offsets in this range are simply added to the chunk's native start.
95 if(ut
->chunkOffset
<= ut
->nativeIndexingLimit
) {
96 return ut
->chunkNativeStart
+ut
->chunkOffset
;
// Beyond nativeIndexingLimit the provider does the offset-to-native mapping.
98 return ut
->pFuncs
->mapOffsetToNative(ut
);
103 U_CAPI
void U_EXPORT2
104 utext_setNativeIndex(UText
*ut
, int64_t index
) {
105 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
106 // The desired position is outside of the current chunk.
107 // Access the new position. Assume a forward iteration from here,
108 // which will also be optimum for a single random access.
109 // Reverse iterations may suffer slightly.
110 ut
->pFuncs
->access(ut
, index
, TRUE
);
111 } else if((int32_t)(index
- ut
->chunkNativeStart
) <= ut
->nativeIndexingLimit
) {
113 ut
->chunkOffset
=(int32_t)(index
-ut
->chunkNativeStart
);
115 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
117 // The convention is that the index must always be on a code point boundary.
118 // Adjust the index position if it is in the middle of a surrogate pair.
119 if (ut
->chunkOffset
<ut
->chunkLength
) {
120 UChar c
= ut
->chunkContents
[ut
->chunkOffset
];
121 if (UTF16_IS_TRAIL(c
)) {
122 if (ut
->chunkOffset
==0) {
123 ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
);
125 if (ut
->chunkOffset
>0) {
126 UChar lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
127 if (UTF16_IS_LEAD(lead
)) {
137 U_CAPI
int64_t U_EXPORT2
138 utext_getPreviousNativeIndex(UText
*ut
) {
140 // Fast-path the common case.
141 // Common means current position is not at the beginning of a chunk
142 // and the preceding character is not supplementary.
144 int32_t i
= ut
->chunkOffset
- 1;
147 UChar c
= ut
->chunkContents
[i
];
148 if (U16_IS_TRAIL(c
) == FALSE
) {
149 if (i
<= ut
->nativeIndexingLimit
) {
150 result
= ut
->chunkNativeStart
+ i
;
153 result
= ut
->pFuncs
->mapOffsetToNative(ut
);
160 // If at the start of text, simply return 0.
161 if (ut
->chunkOffset
==0 && ut
->chunkNativeStart
==0) {
165 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
166 // Keep it simple, use other functions to handle the edges.
168 utext_previous32(ut
);
169 result
= UTEXT_GETNATIVEINDEX(ut
);
176 // utext_current32. Get the UChar32 at the current position.
177 // UText iteration position is always on a code point boundary,
178 // never on the trail half of a surrogate pair.
180 U_CAPI UChar32 U_EXPORT2
181 utext_current32(UText
*ut
) {
183 if (ut
->chunkOffset
==ut
->chunkLength
) {
184 // Current position is just off the end of the chunk.
185 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
186 // Off the end of the text.
191 c
= ut
->chunkContents
[ut
->chunkOffset
];
192 if (U16_IS_LEAD(c
) == FALSE
) {
193 // Normal, non-supplementary case.
198 // Possible supplementary char.
201 UChar32 supplementaryC
= c
;
202 if ((ut
->chunkOffset
+1) < ut
->chunkLength
) {
203 // The trail surrogate is in the same chunk.
204 trail
= ut
->chunkContents
[ut
->chunkOffset
+1];
206 // The trail surrogate is in a different chunk.
207 // Because we must maintain the iteration position, we need to switch forward
208 // into the new chunk, get the trail surrogate, then revert the chunk back to the
210 // An edge case to be careful of: the entire text may end with an unpaired
211 // leading surrogate. The attempt to access the trail will fail, but
212 // the original position before the unpaired lead still needs to be restored.
213 int64_t nativePosition
= ut
->chunkNativeLimit
;
214 int32_t originalOffset
= ut
->chunkOffset
;
215 if (ut
->pFuncs
->access(ut
, nativePosition
, TRUE
)) {
216 trail
= ut
->chunkContents
[ut
->chunkOffset
];
218 UBool r
= ut
->pFuncs
->access(ut
, nativePosition
, FALSE
); // reverse iteration flag loads preceding chunk
220 ut
->chunkOffset
= originalOffset
;
226 if (U16_IS_TRAIL(trail
)) {
227 supplementaryC
= U16_GET_SUPPLEMENTARY(c
, trail
);
229 return supplementaryC
;
234 U_CAPI UChar32 U_EXPORT2
235 utext_char32At(UText
*ut
, int64_t nativeIndex
) {
236 UChar32 c
= U_SENTINEL
;
238 // Fast path the common case.
239 if (nativeIndex
>=ut
->chunkNativeStart
&& nativeIndex
< ut
->chunkNativeStart
+ ut
->nativeIndexingLimit
) {
240 ut
->chunkOffset
= (int32_t)(nativeIndex
- ut
->chunkNativeStart
);
241 c
= ut
->chunkContents
[ut
->chunkOffset
];
242 if (U16_IS_SURROGATE(c
) == FALSE
) {
248 utext_setNativeIndex(ut
, nativeIndex
);
249 if (nativeIndex
>=ut
->chunkNativeStart
&& ut
->chunkOffset
<ut
->chunkLength
) {
250 c
= ut
->chunkContents
[ut
->chunkOffset
];
251 if (U16_IS_SURROGATE(c
)) {
252 // For surrogates, let current32() deal with the complications
253 // of supplementaries that may span chunk boundaries.
254 c
= utext_current32(ut
);
261 U_CAPI UChar32 U_EXPORT2
262 utext_next32(UText
*ut
) {
265 if (ut
->chunkOffset
>= ut
->chunkLength
) {
266 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
271 c
= ut
->chunkContents
[ut
->chunkOffset
++];
272 if (U16_IS_LEAD(c
) == FALSE
) {
273 // Normal case, not supplementary.
274 // (A trail surrogate seen here is just returned as is, as a surrogate value.
275 // It cannot be part of a pair.)
279 if (ut
->chunkOffset
>= ut
->chunkLength
) {
280 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeLimit
, TRUE
) == FALSE
) {
281 // c is an unpaired lead surrogate at the end of the text.
282 // return it as it is.
286 UChar32 trail
= ut
->chunkContents
[ut
->chunkOffset
];
287 if (U16_IS_TRAIL(trail
) == FALSE
) {
288 // c was an unpaired lead surrogate, not at the end of the text.
289 // return it as it is (unpaired). Iteration position is on the
290 // following character, possibly in the next chunk, where the
291 // trail surrogate would have been if it had existed.
295 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(c
, trail
);
296 ut
->chunkOffset
++; // move iteration position over the trail surrogate.
297 return supplementary
;
301 U_CAPI UChar32 U_EXPORT2
302 utext_previous32(UText
*ut
) {
305 if (ut
->chunkOffset
<= 0) {
306 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
311 c
= ut
->chunkContents
[ut
->chunkOffset
];
312 if (U16_IS_TRAIL(c
) == FALSE
) {
313 // Normal case, not supplementary.
314 // (A lead surrogate seen here is just returned as is, as a surrogate value.
315 // It cannot be part of a pair.)
319 if (ut
->chunkOffset
<= 0) {
320 if (ut
->pFuncs
->access(ut
, ut
->chunkNativeStart
, FALSE
) == FALSE
) {
321 // c is an unpaired trail surrogate at the start of the text.
322 // return it as it is.
327 UChar32 lead
= ut
->chunkContents
[ut
->chunkOffset
-1];
328 if (U16_IS_LEAD(lead
) == FALSE
) {
329 // c was an unpaired trail surrogate, not at the start of the text.
330 // return it as it is (unpaired). Iteration position is at c
334 UChar32 supplementary
= U16_GET_SUPPLEMENTARY(lead
, c
);
335 ut
->chunkOffset
--; // move iteration position over the lead surrogate.
336 return supplementary
;
341 U_CAPI UChar32 U_EXPORT2
342 utext_next32From(UText
*ut
, int64_t index
) {
343 UChar32 c
= U_SENTINEL
;
345 if(index
<ut
->chunkNativeStart
|| index
>=ut
->chunkNativeLimit
) {
346 // Desired position is outside of the current chunk.
347 if(!ut
->pFuncs
->access(ut
, index
, TRUE
)) {
348 // no chunk available here
351 } else if (index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
352 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
353 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
355 // Desired position is in chunk, with non-UTF16 indexing.
356 ut
->chunkOffset
= ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
359 c
= ut
->chunkContents
[ut
->chunkOffset
++];
360 if (U16_IS_SURROGATE(c
)) {
361 // Surrogates. Many edge cases. Use other functions that already
362 // deal with the problems.
363 utext_setNativeIndex(ut
, index
);
364 c
= utext_next32(ut
);
370 U_CAPI UChar32 U_EXPORT2
371 utext_previous32From(UText
*ut
, int64_t index
) {
373 // Return the character preceding the specified index.
374 // Leave the iteration position at the start of the character that was returned.
376 UChar32 cPrev
; // The character preceding cCurr, which is what we will return.
378 // Address the chunk containing the position preceding the incoming index
379 // A tricky edge case:
380 // We try to test the requested native index against the chunkNativeStart to determine
381 // whether the character preceding the one at the index is in the current chunk.
382 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
383 // requested index is on something other than the first position of the first char.
385 if(index
<=ut
->chunkNativeStart
|| index
>ut
->chunkNativeLimit
) {
386 // Requested native index is outside of the current chunk.
387 if(!ut
->pFuncs
->access(ut
, index
, FALSE
)) {
388 // no chunk available here
391 } else if(index
- ut
->chunkNativeStart
<= (int64_t)ut
->nativeIndexingLimit
) {
392 // Direct UTF-16 indexing.
393 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
395 ut
->chunkOffset
=ut
->pFuncs
->mapNativeIndexToUTF16(ut
, index
);
396 if (ut
->chunkOffset
==0 && !ut
->pFuncs
->access(ut
, index
, FALSE
)) {
397 // no chunk available here
403 // Simple case with no surrogates.
406 cPrev
= ut
->chunkContents
[ut
->chunkOffset
];
408 if (U16_IS_SURROGATE(cPrev
)) {
409 // Possible supplementary. Many edge cases.
410 // Let other functions do the heavy lifting.
411 utext_setNativeIndex(ut
, index
);
412 cPrev
= utext_previous32(ut
);
// utext_extract  Extract text through the provider's extract function.
//    All arguments (start, limit, dest, destCapacity, status) are passed through
//    unchanged to ut->pFuncs->extract(), and its int32_t result is returned as is.
//    No argument checking is done at this level.
418 U_CAPI
int32_t U_EXPORT2
419 utext_extract(UText
*ut
,
420 int64_t start
, int64_t limit
,
421 UChar
*dest
, int32_t destCapacity
,
422 UErrorCode
*status
) {
423 return ut
->pFuncs
->extract(ut
, start
, limit
, dest
, destCapacity
, status
);
// utext_equals  Compare two UText objects.
//    The visible checks run in this order:
//      1. either pointer NULL, or either magic field != UTEXT_MAGIC
//      2. different provider function tables (pFuncs compared by pointer)
//      3. different context pointers (compared by pointer, not by text content)
//      4. different current native index
//    NOTE(review): the return statements are not visible in this excerpt;
//    presumably each failed check returns FALSE and falling through returns TRUE - confirm.
428 U_CAPI UBool U_EXPORT2
429 utext_equals(const UText
*a
, const UText
*b
) {
430 if (a
==NULL
|| b
==NULL
||
431 a
->magic
!= UTEXT_MAGIC
||
432 b
->magic
!= UTEXT_MAGIC
) {
433 // Null or invalid arguments don't compare equal to anything.
437 if (a
->pFuncs
!= b
->pFuncs
) {
438 // Different types of text providers.
442 if (a
->context
!= b
->context
) {
443 // Different sources (different strings)
446 if (utext_getNativeIndex(a
) != utext_getNativeIndex(b
)) {
447 // Different current position in the string.
454 U_CAPI
int32_t U_EXPORT2
455 utext_compare(UText
*s1
, int32_t length1
,
456 UText
*s2
, int32_t length2
) {
457 UChar32 c1
= 0, c2
= 0;
459 if(length1
<0 && length2
<0) {
460 /* strcmp style, go until end of string */
462 c1
= UTEXT_NEXT32(s1
);
463 c2
= UTEXT_NEXT32(s2
);
466 } else if(c1
== U_SENTINEL
) {
473 } else if (length2
< 0) {
477 /* memcmp/UnicodeString style, both length-specified */
478 while((length1
> 0 || length1
== INT32_MIN
) && (length2
> 0 || length2
== INT32_MIN
)) {
479 c1
= UTEXT_NEXT32(s1
);
480 c2
= UTEXT_NEXT32(s2
);
484 } else if(c1
== U_SENTINEL
) {
488 if (length1
!= INT32_MIN
) {
491 if (length2
!= INT32_MIN
) {
496 if(length1
<= 0 && length1
!= INT32_MIN
) {
502 } else if(length2
<= 0 && length2
!= INT32_MIN
) {
511 return (int32_t)c1
-(int32_t)c2
;
514 U_CAPI
int32_t U_EXPORT2
515 utext_compareNativeLimit(UText
*s1
, int64_t limit1
,
516 UText
*s2
, int64_t limit2
) {
519 if(limit1
<0 && limit2
<0) {
520 /* strcmp style, go until end of string */
522 c1
= UTEXT_NEXT32(s1
);
523 c2
= UTEXT_NEXT32(s2
);
525 return (int32_t)c1
-(int32_t)c2
;
526 } else if(c1
== U_SENTINEL
) {
531 /* memcmp/UnicodeString style, both length-specified */
532 int64_t index1
= (limit1
>= 0 ? UTEXT_GETNATIVEINDEX(s1
) : 0);
533 int64_t index2
= (limit2
>= 0 ? UTEXT_GETNATIVEINDEX(s2
) : 0);
535 while((limit1
< 0 || index1
< limit1
) && (limit2
< 0 || index2
< limit2
)) {
536 c1
= UTEXT_NEXT32(s1
);
537 c2
= UTEXT_NEXT32(s2
);
540 return (int32_t)c1
-(int32_t)c2
;
541 } else if(c1
== U_SENTINEL
) {
546 index1
= UTEXT_GETNATIVEINDEX(s1
);
549 index2
= UTEXT_GETNATIVEINDEX(s2
);
553 if(limit1
>= 0 && index1
>= limit1
) {
554 if(index2
>= limit2
) {
560 if(index1
>= limit1
) {
569 U_CAPI
int32_t U_EXPORT2
570 utext_caseCompare(UText
*s1
, int32_t length1
,
571 UText
*s2
, int32_t length2
,
572 uint32_t options
, UErrorCode
*pErrorCode
) {
573 const UCaseProps
*csp
;
575 /* case folding variables */
579 /* case folding buffers, only use current-level start/limit */
580 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
581 int32_t foldOffset1
, foldOffset2
, foldLength1
, foldLength2
;
583 /* current code points */
585 uint8_t cLength1
, cLength2
;
587 /* argument checking */
588 if(U_FAILURE(*pErrorCode
)) {
591 if(s1
==NULL
|| s2
==NULL
) {
592 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
596 csp
=ucase_getSingleton();
598 /* for variable-length strings */
607 foldOffset1
= foldOffset2
= foldLength1
= foldLength2
= 0;
609 /* comparison loop */
610 while((foldOffset1
< foldLength1
|| length1
> 0 || length1
== INT32_MIN
) &&
611 (foldOffset2
< foldLength2
|| length2
> 0 || length2
== INT32_MIN
)) {
612 if(foldOffset1
< foldLength1
) {
613 U16_NEXT_UNSAFE(fold1
, foldOffset1
, c1
);
616 c1
= UTEXT_NEXT32(s1
);
617 if (c1
!= U_SENTINEL
) {
618 cLength1
= U16_LENGTH(c1
);
620 length
= ucase_toFullFolding(csp
, c1
, &p
, options
);
622 if(length
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle 0-length folded-case strings
623 u_memcpy(fold1
, p
, length
);
625 foldLength1
= length
;
626 U16_NEXT_UNSAFE(fold1
, foldOffset1
, c1
);
633 if(length1
!= INT32_MIN
) {
638 if(foldOffset2
< foldLength2
) {
639 U16_NEXT_UNSAFE(fold2
, foldOffset2
, c2
);
642 c2
= UTEXT_NEXT32(s2
);
643 if (c2
!= U_SENTINEL
) {
644 cLength2
= U16_LENGTH(c2
);
646 length
= ucase_toFullFolding(csp
, c2
, &p
, options
);
648 if(length
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle 0-length folded-case strings
649 u_memcpy(fold2
, p
, length
);
651 foldLength2
= length
;
652 U16_NEXT_UNSAFE(fold2
, foldOffset2
, c2
);
657 } else if(c1
== U_SENTINEL
) {
658 return 0; // end of both strings at once
661 if(length2
!= INT32_MIN
) {
667 return (int32_t)c1
-(int32_t)c2
;
671 /* By now at least one of the strings is out of characters */
672 length1
+= foldLength1
- foldOffset1
;
673 length2
+= foldLength2
- foldOffset2
;
675 if(length1
<= 0 && length1
!= INT32_MIN
) {
690 U_CAPI
int32_t U_EXPORT2
691 utext_caseCompareNativeLimit(UText
*s1
, int64_t limit1
,
692 UText
*s2
, int64_t limit2
,
693 uint32_t options
, UErrorCode
*pErrorCode
) {
694 const UCaseProps
*csp
;
696 /* case folding variables */
700 /* case folding buffers, only use current-level start/limit */
701 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
702 int32_t foldOffset1
, foldOffset2
, foldLength1
, foldLength2
;
704 /* current code points */
707 /* native indexes into s1 and s2 */
708 int64_t index1
, index2
;
710 /* argument checking */
711 if(U_FAILURE(*pErrorCode
)) {
714 if(s1
==NULL
|| s2
==NULL
) {
715 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
719 csp
=ucase_getSingleton();
722 index1
= (limit1
>= 0 ? UTEXT_GETNATIVEINDEX(s1
) : 0);
723 index2
= (limit2
>= 0 ? UTEXT_GETNATIVEINDEX(s2
) : 0);
725 foldOffset1
= foldOffset2
= foldLength1
= foldLength2
= 0;
727 /* comparison loop */
728 while((foldOffset1
< foldLength1
|| limit1
< 0 || index1
< limit1
) &&
729 (foldOffset2
< foldLength2
|| limit2
< 0 || index2
< limit2
)) {
730 if(foldOffset1
< foldLength1
) {
731 U16_NEXT_UNSAFE(fold1
, foldOffset1
, c1
);
733 c1
= UTEXT_NEXT32(s1
);
734 if (c1
!= U_SENTINEL
) {
735 length
= ucase_toFullFolding(csp
, c1
, &p
, options
);
737 if(length
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle 0-length folded-case strings
738 u_memcpy(fold1
, p
, length
);
740 foldLength1
= length
;
741 U16_NEXT_UNSAFE(fold1
, foldOffset1
, c1
);
749 index1
= UTEXT_GETNATIVEINDEX(s1
);
753 if(foldOffset2
< foldLength2
) {
754 U16_NEXT_UNSAFE(fold2
, foldOffset2
, c2
);
756 c2
= UTEXT_NEXT32(s2
);
757 if (c2
!= U_SENTINEL
) {
758 length
= ucase_toFullFolding(csp
, c2
, &p
, options
);
760 if(length
<= UCASE_MAX_STRING_LENGTH
) { // !!!: Does not correctly handle 0-length folded-case strings
761 u_memcpy(fold2
, p
, length
);
763 foldLength2
= length
;
764 U16_NEXT_UNSAFE(fold2
, foldOffset2
, c2
);
769 } else if(c1
== U_SENTINEL
) {
774 index2
= UTEXT_GETNATIVEINDEX(s2
);
779 return (int32_t)c1
-(int32_t)c2
;
783 /* By now at least one of the strings is out of characters */
784 index1
-= foldLength1
- foldOffset1
;
785 index2
-= foldLength2
- foldOffset2
;
787 if(limit1
>= 0 && index1
>= limit1
) {
788 if(index2
>= limit2
) {
794 if(index1
>= limit1
) {
// utext_isWritable  Test the UTEXT_PROVIDER_WRITABLE property bit.
//    b is TRUE exactly when that bit is set in ut->providerProperties.
//    (The statement that returns b is not visible in this excerpt.)
803 U_CAPI UBool U_EXPORT2
804 utext_isWritable(const UText
*ut
)
806 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) != 0;
// utext_freeze  Make a UText read-only.
//    Clears the UTEXT_PROVIDER_WRITABLE bit in ut->providerProperties; all other
//    property bits are left as they were. ut is dereferenced unconditionally,
//    so it must not be NULL.
811 U_CAPI
void U_EXPORT2
812 utext_freeze(UText
*ut
) {
813 // Zero out the WRITABLE flag.
814 ut
->providerProperties
&= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE
));
// utext_hasMetaData  Test the UTEXT_PROVIDER_HAS_META_DATA property bit.
//    b is TRUE exactly when that bit is set in ut->providerProperties.
//    (The statement that returns b is not visible in this excerpt.)
818 U_CAPI UBool U_EXPORT2
819 utext_hasMetaData(const UText
*ut
)
821 UBool b
= (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
)) != 0;
// utext_replace  Replace a range of the text through the provider's replace function.
//    Visible guards, in order:
//      - *status already indicates a failure
//      - the UTEXT_PROVIDER_WRITABLE bit is not set in ut->providerProperties,
//        in which case *status is set to U_NO_WRITE_PERMISSION
//    Otherwise nativeStart, nativeLimit, replacementText, replacementLength and
//    status are passed unchanged to ut->pFuncs->replace().
//    NOTE(review): the return statements are not visible in this excerpt;
//    presumably both guards return early and i is the function result - confirm.
827 U_CAPI
int32_t U_EXPORT2
828 utext_replace(UText
*ut
,
829 int64_t nativeStart
, int64_t nativeLimit
,
830 const UChar
*replacementText
, int32_t replacementLength
,
833 if (U_FAILURE(*status
)) {
// Refuse to modify text that is not flagged as writable.
836 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
837 *status
= U_NO_WRITE_PERMISSION
;
840 int32_t i
= ut
->pFuncs
->replace(ut
, nativeStart
, nativeLimit
, replacementText
, replacementLength
, status
);
// utext_copy  Copy or move a range of the text through the provider's copy function.
//    Visible guards, in order:
//      - *status already indicates a failure
//      - the UTEXT_PROVIDER_WRITABLE bit is not set in ut->providerProperties,
//        in which case *status is set to U_NO_WRITE_PERMISSION
//    Otherwise nativeStart, nativeLimit, destIndex, move and status are passed
//    unchanged to ut->pFuncs->copy().
//    NOTE(review): part of the parameter list (destIndex, move, status) and the
//    early returns are not visible in this excerpt - confirm against the full file.
844 U_CAPI
void U_EXPORT2
845 utext_copy(UText
*ut
,
846 int64_t nativeStart
, int64_t nativeLimit
,
851 if (U_FAILURE(*status
)) {
// Refuse to modify text that is not flagged as writable.
854 if ((ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_WRITABLE
)) == 0) {
855 *status
= U_NO_WRITE_PERMISSION
;
858 ut
->pFuncs
->copy(ut
, nativeStart
, nativeLimit
, destIndex
, move
, status
);
// utext_clone  Clone a UText through the source provider's clone function.
//    dest, src, deep and status are passed unchanged to src->pFuncs->clone().
//    The readOnly parameter is not passed to the provider; the only other visible
//    action is a call to utext_freeze() on the clone.
//    NOTE(review): the condition guarding utext_freeze() is not visible in this
//    excerpt (presumably "if (readOnly)"). utext_freeze() dereferences its argument
//    unconditionally, so confirm that result cannot be NULL at that point when the
//    provider's clone fails.
863 U_CAPI UText
* U_EXPORT2
864 utext_clone(UText
*dest
, const UText
*src
, UBool deep
, UBool readOnly
, UErrorCode
*status
) {
866 result
= src
->pFuncs
->clone(dest
, src
, deep
, status
);
868 utext_freeze(result
);
875 //------------------------------------------------------------------------------
877 // UText common functions implementation
879 //------------------------------------------------------------------------------
882 // UText.flags bit definitions
885 UTEXT_HEAP_ALLOCATED
= 1, // 1 if ICU has allocated this UText struct on the heap.
886 // 0 if caller provided storage for the UText.
888 UTEXT_EXTRA_HEAP_ALLOCATED
= 2, // 1 if ICU has allocated extra storage as a separate
890 // 0 if there is no separate allocation. Either no extra
891 // storage was requested, or it is appended to the end
892 // of the main UText storage.
894 UTEXT_OPEN
= 4 // 1 if this UText is currently open
895 // 0 if this UText is not open.
900 // Extended form of a UText. The purpose is to aid in computing the total size required
901 // when a provider asks for a UText to be allocated with extra storage.
903 struct ExtendedUText
{
905 UAlignedMemory extension
;
908 static const UText emptyText
= UTEXT_INITIALIZER
;
910 U_CAPI UText
* U_EXPORT2
911 utext_setup(UText
*ut
, int32_t extraSpace
, UErrorCode
*status
) {
912 if (U_FAILURE(*status
)) {
917 // We need to heap-allocate storage for the new UText
918 int32_t spaceRequired
= sizeof(UText
);
919 if (extraSpace
> 0) {
920 spaceRequired
= sizeof(ExtendedUText
) + extraSpace
- sizeof(UAlignedMemory
);
922 ut
= (UText
*)uprv_malloc(spaceRequired
);
924 *status
= U_MEMORY_ALLOCATION_ERROR
;
928 ut
->flags
|= UTEXT_HEAP_ALLOCATED
;
929 if (spaceRequired
>0) {
930 ut
->extraSize
= extraSpace
;
931 ut
->pExtra
= &((ExtendedUText
*)ut
)->extension
;
935 // We have been supplied with an already existing UText.
936 // Verify that it really appears to be a UText.
937 if (ut
->magic
!= UTEXT_MAGIC
) {
938 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
941 // If the ut is already open and there's a provider supplied close
942 // function, call it.
943 if ((ut
->flags
& UTEXT_OPEN
) && ut
->pFuncs
->close
!= NULL
) {
944 ut
->pFuncs
->close(ut
);
946 ut
->flags
&= ~UTEXT_OPEN
;
948 // If extra space was requested by our caller, check whether
949 // sufficient already exists, and allocate new if needed.
950 if (extraSpace
> ut
->extraSize
) {
951 // Need more space. If there is existing separately allocated space,
952 // delete it first, then allocate new space.
953 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
954 uprv_free(ut
->pExtra
);
957 ut
->pExtra
= uprv_malloc(extraSpace
);
958 if (ut
->pExtra
== NULL
) {
959 *status
= U_MEMORY_ALLOCATION_ERROR
;
961 ut
->extraSize
= extraSpace
;
962 ut
->flags
|= UTEXT_EXTRA_HEAP_ALLOCATED
;
966 if (U_SUCCESS(*status
)) {
967 ut
->flags
|= UTEXT_OPEN
;
969 // Initialize all remaining fields of the UText.
972 ut
->chunkContents
= NULL
;
981 ut
->chunkNativeStart
= 0;
982 ut
->chunkNativeLimit
= 0;
983 ut
->nativeIndexingLimit
= 0;
984 ut
->providerProperties
= 0;
989 if (ut
->pExtra
!=NULL
&& ut
->extraSize
>0)
990 uprv_memset(ut
->pExtra
, 0, ut
->extraSize
);
997 U_CAPI UText
* U_EXPORT2
998 utext_close(UText
*ut
) {
1000 ut
->magic
!= UTEXT_MAGIC
||
1001 (ut
->flags
& UTEXT_OPEN
) == 0)
1003 // The supplied ut is not an open UText.
1008 // If the provider gave us a close function, call it now.
1009 // This will clean up anything allocated specifically by the provider.
1010 if (ut
->pFuncs
->close
!= NULL
) {
1011 ut
->pFuncs
->close(ut
);
1013 ut
->flags
&= ~UTEXT_OPEN
;
1015 // If we (the framework) allocated the UText or subsidiary storage,
1017 if (ut
->flags
& UTEXT_EXTRA_HEAP_ALLOCATED
) {
1018 uprv_free(ut
->pExtra
);
1020 ut
->flags
&= ~UTEXT_EXTRA_HEAP_ALLOCATED
;
1024 // Zero out function table of the closed UText. This is a defensive move,
1025 // intended to cause applications that inadvertently use a closed
1026 // utext to crash with null pointer errors.
1029 if (ut
->flags
& UTEXT_HEAP_ALLOCATED
) {
1030 // This UText was allocated by UText setup. We need to free it.
1031 // Clear magic, so we can detect if the user messes up and immediately
1032 // tries to reopen another UText using the deleted storage.
1044 // invalidateChunk Reset a chunk to have no contents, so that the next call
1045 // to access will cause new data to load.
1046 // This is needed when copy/move/replace operate directly on the
1047 // backing text, potentially putting it out of sync with the
1048 // contents in the chunk.
1051 invalidateChunk(UText
*ut
) {
// Zero the chunk length, both native bounds, the offset within the chunk and the
// native-indexing limit. No visible line touches the chunkContents pointer itself.
1052 ut
->chunkLength
= 0;
1053 ut
->chunkNativeLimit
= 0;
1054 ut
->chunkNativeStart
= 0;
1055 ut
->chunkOffset
= 0;
1056 ut
->nativeIndexingLimit
= 0;
1060 // pinIndex Do range pinning on a native index parameter.
1061 // 64 bit pinning is done in place.
1062 // 32 bit truncated result is returned as a convenience for
1063 // use in providers that don't need 64 bits.
1065 pinIndex(int64_t &index
, int64_t limit
) {
1068 } else if (index
> limit
) {
1071 return (int32_t)index
;
1078 // Pointer relocation function,
1079 // a utility used by shallow clone.
1080 // Adjust a pointer that refers to something within one UText (the source)
1081 // to refer to the same relative offset within another UText (the target)
// adjustPointer
//    dest     the UText that *destPtr should end up pointing into.
//    destPtr  address of the pointer to relocate; updated in place.
//    src      the UText that *destPtr may currently point into.
//    Two ranges are recognized:
//      [src->pExtra, src->pExtra + src->extraSize)  -> same byte offset from dest->pExtra
//      [src, src + src->sizeOfStruct)               -> same byte offset from dest
//    A pointer that falls in neither range is left unmodified by the visible code.
1083 static void adjustPointer(UText
*dest
, const void **destPtr
, const UText
*src
) {
1084 // convert all pointers to (char *) so that byte address arithmetic will work.
1085 char *dptr
= (char *)*destPtr
;
1086 char *dUText
= (char *)dest
;
1087 char *sUText
= (char *)src
;
1089 if (dptr
>= (char *)src
->pExtra
&& dptr
< ((char*)src
->pExtra
)+src
->extraSize
) {
1090 // target ptr was to something within the src UText's pExtra storage.
1091 // relocate it into the target UText's pExtra region.
1092 *destPtr
= ((char *)dest
->pExtra
) + (dptr
- (char *)src
->pExtra
);
1093 } else if (dptr
>=sUText
&& dptr
< sUText
+src
->sizeOfStruct
) {
1094 // target ptr was pointing to somewhere within the source UText itself.
1095 // Move it to the same offset within the target UText.
1096 *destPtr
= dUText
+ (dptr
-sUText
);
1102 // Clone. This is a generic copy-the-utext-by-value clone function that can be
1103 // used as-is with some utext types, and as a helper by other clones.
// shallowTextClone
//    dest    UText to clone into; it is passed to utext_setup(), whose result
//            replaces dest for the rest of the function.
//    src     UText to copy from.
//    status  ICU error code; checked on entry and again after utext_setup().
//    Steps visible here: set up dest with src->extraSize bytes of extra storage,
//    copy the struct by value, restore dest's own pExtra and flags, copy the extra
//    storage, then relocate context, p, q, r and chunkContents with adjustPointer().
//    (The return statements are not visible in this excerpt.)
1105 static UText
* U_CALLCONV
1106 shallowTextClone(UText
* dest
, const UText
* src
, UErrorCode
* status
) {
1107 if (U_FAILURE(*status
)) {
1110 int32_t srcExtraSize
= src
->extraSize
;
1113 // Use the generic utext_setup to allocate storage if required.
1115 dest
= utext_setup(dest
, srcExtraSize
, status
);
1116 if (U_FAILURE(*status
)) {
1121 // flags (how the UText was allocated) and the pointer to the
1122 // extra storage must retain the values in the cloned utext that
1123 // were set up by utext_setup. Save them separately before
1124 // copying the whole struct.
1126 void *destExtra
= dest
->pExtra
;
1127 int32_t flags
= dest
->flags
;
1131 // Copy the whole UText struct by value.
1132 // Any "Extra" storage is copied also.
// Copy no more than the smaller of the two sizeOfStruct values, so the
// struct copy cannot write past the end of dest.
1134 int sizeToCopy
= src
->sizeOfStruct
;
1135 if (sizeToCopy
> dest
->sizeOfStruct
) {
1136 sizeToCopy
= dest
->sizeOfStruct
;
1138 uprv_memcpy(dest
, src
, sizeToCopy
);
// The struct copy overwrote these two fields with src's values; put back dest's own.
1139 dest
->pExtra
= destExtra
;
1140 dest
->flags
= flags
;
1141 if (srcExtraSize
> 0) {
1142 uprv_memcpy(dest
->pExtra
, src
->pExtra
, srcExtraSize
);
1146 // Relocate any pointers in the target that refer to the UText itself
1147 // to point to the cloned copy rather than the original source.
1149 adjustPointer(dest
, &dest
->context
, src
);
1150 adjustPointer(dest
, &dest
->p
, src
);
1151 adjustPointer(dest
, &dest
->q
, src
);
1152 adjustPointer(dest
, &dest
->r
, src
);
1153 adjustPointer(dest
, (const void **)&dest
->chunkContents
, src
);
1163 //------------------------------------------------------------------------------
1165 // UText implementation for UTF-8 char * strings (read-only)
1166 // Limitation: string length must be <= 0x7fffffff in length.
1167 // (length must fit in an int32_t variable)
1169 // Use of UText data members:
1170 // context pointer to UTF-8 string
1171 // utext.b is the input string length (bytes).
1172 // utext.c Length scanned so far in string
1173 // (for optimizing finding length of zero terminated strings.)
1174 // utext.p pointer to the current buffer
1175 // utext.q pointer to the other buffer.
1177 //------------------------------------------------------------------------------
1180 // Must be less than 85, because of byte mapping from UChar indexes to native indexes.
1181 // Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
1184 enum { UTF8_TEXT_CHUNK_SIZE
=32 };
1187 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
1188 // Each contains the UChar chunk buffer, the to and from native maps, and
1191 // because backwards iteration fills the buffers starting at the end and
1192 // working towards the front, the filled part of the buffers may not begin
1193 // at the start of the available storage for the buffers.
1195 // Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
1196 // the last character added being a supplementary, and thus requiring a surrogate
1197 // pair. Doing this is simpler than checking for the edge case.
1201 int32_t bufNativeStart
; // Native index of first char in UChar buf
1202 int32_t bufNativeLimit
; // Native index following last char in buf.
1203 int32_t bufStartIdx
; // First filled position in buf.
1204 int32_t bufLimitIdx
; // Limit of filled range in buf.
1205 int32_t bufNILimit
; // Limit of native indexing part of buf
1206 int32_t toUCharsMapStart
; // Native index corresponding to
1208 // Set to bufNativeStart when filling forwards.
1209 // Set to computed value when filling backwards.
1211 UChar buf
[UTF8_TEXT_CHUNK_SIZE
+4]; // The UChar buffer. Requires one extra position beyond the
1212 // chunk size, to allow for surrogate at the end.
1213 // Length must be identical to mapToNative array, below,
1214 // because of the way indexing works when the array is
1215 // filled backwards during a reverse iteration. Thus,
1216 // the additional extra size.
1217 uint8_t mapToNative
[UTF8_TEXT_CHUNK_SIZE
+4]; // map UChar index in buf to
1218 // native offset from bufNativeStart.
1219 // Requires two extra slots,
1220 // one for a supplementary starting in the last normal position,
1221 // and one for an entry for the buffer limit position.
1222 uint8_t mapToUChars
[UTF8_TEXT_CHUNK_SIZE
*3+6]; // Map native offset from bufNativeStart to
1223 // corresponding offset in filled part of buf.
1232 // Get the length of the string. If we don't already know it,
1233 // we'll need to scan for the trailing nul.
1235 static int64_t U_CALLCONV
1236 utf8TextLength(UText
*ut
) {
1238 // Zero terminated string, and we haven't scanned to the end yet.
1240 const char *r
= (const char *)ut
->context
+ ut
->c
;
1244 if ((r
- (const char *)ut
->context
) < 0x7fffffff) {
1245 ut
->b
= (int32_t)(r
- (const char *)ut
->context
);
1247 // Actual string was bigger (more than 2 gig) than we
1248 // can handle. Clip it to 2 GB.
1251 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1261 static UBool U_CALLCONV
1262 utf8TextAccess(UText
*ut
, int64_t index
, UBool forward
) {
1264 // Apologies to those who are allergic to goto statements.
1265 // Consider each goto to a labelled block to be the equivalent of
1266 // calling the named block as if it were a function();
1269 const uint8_t *s8
=(const uint8_t *)ut
->context
;
1270 UTF8Buf
*u8b
= NULL
;
1271 int32_t length
= ut
->b
; // Length of original utf-8
1272 int32_t ix
= (int32_t)index
; // Requested index, trimmed to 32 bits.
1273 int32_t mapIndex
= 0;
1276 } else if (index
> 0x7fffffff) {
1277 // Strings with 64 bit lengths not supported by this UTF-8 provider.
1281 // Pin requested index to the string length.
1285 } else if (ix
>=ut
->c
) {
1286 // Zero terminated string, and requested index is beyond
1287 // the region that has already been scanned.
1288 // Scan up to either the end of the string or to the
1289 // requested position, whichever comes first.
1290 while (ut
->c
<ix
&& s8
[ut
->c
]!=0) {
1293 // TODO: support for null terminated string length > 32 bits.
1294 if (s8
[ut
->c
] == 0) {
1295 // We just found the actual length of the string.
1296 // Trim the requested index back to that.
1300 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1306 // Dispatch to the appropriate action for a forward iteration request.
1309 if (ix
==ut
->chunkNativeLimit
) {
1310 // Check for normal sequential iteration cases first.
1312 // Just reached end of string
1313 // Don't swap buffers, but do set the
1314 // current buffer position.
1315 ut
->chunkOffset
= ut
->chunkLength
;
1318 // End of current buffer.
1319 // check whether other buffer already has what we need.
1320 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
1321 if (ix
>=altB
->bufNativeStart
&& ix
<altB
->bufNativeLimit
) {
1327 // A random access. Desired index could be in either or neither buf.
1328 // For optimizing the order of testing, first check for the index
1329 // being in the other buffer. This will be the case for uses that
1330 // move back and forth over a fairly limited range
1332 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
1333 if (ix
>=u8b
->bufNativeStart
&& ix
<u8b
->bufNativeLimit
) {
1334 // Requested index is in the other buffer.
1338 // Requested index is end-of-string.
1339 // (this is the case of randomly seeking to the end.
1340 // The case of iterating off the end is handled earlier.)
1341 if (ix
== ut
->chunkNativeLimit
) {
1342 // Current buffer extends up to the end of the string.
1343 // Leave it as the current buffer.
1344 ut
->chunkOffset
= ut
->chunkLength
;
1347 if (ix
== u8b
->bufNativeLimit
) {
1348 // Alternate buffer extends to the end of string.
1349 // Swap it in as the current buffer.
1350 goto swapBuffersAndFail
;
1353 // Neither existing buffer extends to the end of the string.
1354 goto makeStubBuffer
;
1357 if (ix
<ut
->chunkNativeStart
|| ix
>=ut
->chunkNativeLimit
) {
1358 // Requested index is in neither buffer.
1362 // Requested index is in this buffer.
1363 u8b
= (UTF8Buf
*)ut
->p
; // the current buffer
1364 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1365 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1373 // Dispatch to the appropriate action for a
1374 // Backwards Direction iteration request.
1376 if (ix
==ut
->chunkNativeStart
) {
1377 // Check for normal sequential iteration cases first.
1379 // Just reached the start of string
1380 // Don't swap buffers, but do set the
1381 // current buffer position.
1382 ut
->chunkOffset
= 0;
1385 // Start of current buffer.
1386 // check whether other buffer already has what we need.
1387 UTF8Buf
*altB
= (UTF8Buf
*)ut
->q
;
1388 if (ix
>altB
->bufNativeStart
&& ix
<=altB
->bufNativeLimit
) {
1394 // A random access. Desired index could be in either or neither buf.
1395 // For optimizing the order of testing,
1396 // Most likely case: in the other buffer.
1397 // Second most likely: in neither buffer.
1398 // Unlikely, but must work: in the current buffer.
1399 u8b
= (UTF8Buf
*)ut
->q
; // the alternate buffer
1400 if (ix
>u8b
->bufNativeStart
&& ix
<=u8b
->bufNativeLimit
) {
1401 // Requested index is in the other buffer.
1404 // Requested index is start-of-string.
1405 // (this is the case of randomly seeking to the start.
1406 // The case of iterating off the start is handled earlier.)
1408 if (u8b
->bufNativeStart
==0) {
1409 // Alternate buffer contains the data for the start string.
1410 // Make it be the current buffer.
1411 goto swapBuffersAndFail
;
1413 // Request for data before the start of string,
1414 // neither buffer is usable.
1415 // set up a zero-length buffer.
1416 goto makeStubBuffer
;
1420 if (ix
<=ut
->chunkNativeStart
|| ix
>ut
->chunkNativeLimit
) {
1421 // Requested index is in neither buffer.
1425 // Requested index is in this buffer.
1426 // Set the utf16 buffer index.
1427 u8b
= (UTF8Buf
*)ut
->p
;
1428 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1429 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1430 if (ut
->chunkOffset
==0) {
1431 // This occurs when the first character in the text is
1432 // a multi-byte UTF-8 char, and the requested index is to
1433 // one of the trailing bytes. Because there is no preceding
1434 // character, this access fails. We can't pick up on the
1435 // situation sooner because the requested index is not zero.
1444 // The alternate buffer (ut->q) has the string data that was requested.
1445 // Swap the primary and alternate buffers, and set the
1446 // chunk index into the new primary buffer.
1448 u8b
= (UTF8Buf
*)ut
->q
;
1451 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1452 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1453 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1454 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1455 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1457 // Index into the (now current) chunk
1458 // Use the map to set the chunk index. It's more trouble than it's worth
1459 // to check whether native indexing can be used.
1460 U_ASSERT(ix
>=u8b
->bufNativeStart
);
1461 U_ASSERT(ix
<=u8b
->bufNativeLimit
);
1462 mapIndex
= ix
- u8b
->toUCharsMapStart
;
1463 U_ASSERT(mapIndex
>=0);
1464 U_ASSERT(mapIndex
<(int32_t)sizeof(u8b
->mapToUChars
));
1465 ut
->chunkOffset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1472 // We got a request for either the start or end of the string,
1473 // with iteration continuing in the out-of-bounds direction.
1474 // The alternate buffer already contains the data up to the
1476 // Swap the buffers, then return failure, indicating that we couldn't
1477 // make things correct for continuing the iteration in the requested
1478 // direction. The position & buffer are correct should the
1479 // user decide to iterate in the opposite direction.
1480 u8b
= (UTF8Buf
*)ut
->q
;
1483 ut
->chunkContents
= &u8b
->buf
[u8b
->bufStartIdx
];
1484 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1485 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1486 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1487 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1489 // Index into the (now current) chunk
1490 // For this function (swapBuffersAndFail), the requested index
1491 // will always be at either the start or end of the chunk.
1492 if (ix
==u8b
->bufNativeLimit
) {
1493 ut
->chunkOffset
= ut
->chunkLength
;
1495 ut
->chunkOffset
= 0;
1496 U_ASSERT(ix
== u8b
->bufNativeStart
);
1501 // The user has done a seek/access past the start or end
1502 // of the string. Rather than loading data that is likely
1503 // to never be used, just set up a zero-length buffer at
1505 u8b
= (UTF8Buf
*)ut
->q
;
1506 u8b
->bufNativeStart
= ix
;
1507 u8b
->bufNativeLimit
= ix
;
1508 u8b
->bufStartIdx
= 0;
1509 u8b
->bufLimitIdx
= 0;
1510 u8b
->bufNILimit
= 0;
1511 u8b
->toUCharsMapStart
= ix
;
1512 u8b
->mapToNative
[0] = 0;
1513 u8b
->mapToUChars
[0] = 0;
1514 goto swapBuffersAndFail
;
1520 // Move the incoming index to a code point boundary.
1521 U8_SET_CP_START(s8
, 0, ix
);
1523 // Swap the UText buffers.
1524 // We want to fill what was previously the alternate buffer,
1525 // and make what was the current buffer be the new alternate.
1526 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->q
;
1530 int32_t strLen
= ut
->b
;
1531 UBool nulTerminated
= FALSE
;
1533 strLen
= 0x7fffffff;
1534 nulTerminated
= TRUE
;
1537 UChar
*buf
= u8b
->buf
;
1538 uint8_t *mapToNative
= u8b
->mapToNative
;
1539 uint8_t *mapToUChars
= u8b
->mapToUChars
;
1542 UBool seenNonAscii
= FALSE
;
1545 // Fill the chunk buffer and mapping arrays.
1546 while (destIx
<UTF8_TEXT_CHUNK_SIZE
) {
1548 if (c
>0 && c
<0x80) {
1549 // Special case ASCII range for speed.
1550 // zero is excluded to simplify bounds checking.
1551 buf
[destIx
] = (UChar
)c
;
1552 mapToNative
[destIx
] = (uint8_t)(srcIx
- ix
);
1553 mapToUChars
[srcIx
-ix
] = (uint8_t)destIx
;
1557 // General case, handle everything.
1558 if (seenNonAscii
== FALSE
) {
1559 seenNonAscii
= TRUE
;
1560 u8b
->bufNILimit
= destIx
;
1563 int32_t cIx
= srcIx
;
1564 int32_t dIx
= destIx
;
1565 int32_t dIxSaved
= destIx
;
1566 U8_NEXT(s8
, srcIx
, strLen
, c
);
1567 if (c
==0 && nulTerminated
) {
1572 // Illegal UTF-8. Replace with sub character.
1576 U16_APPEND_UNSAFE(buf
, destIx
, c
);
1578 mapToNative
[dIx
++] = (uint8_t)(cIx
- ix
);
1579 } while (dIx
< destIx
);
1582 mapToUChars
[cIx
++ - ix
] = (uint8_t)dIxSaved
;
1583 } while (cIx
< srcIx
);
1585 if (srcIx
>=strLen
) {
1591 // store Native <--> Chunk Map entries for the end of the buffer.
1592 // There is no actual character here, but the index position is valid.
1593 mapToNative
[destIx
] = (uint8_t)(srcIx
- ix
);
1594 mapToUChars
[srcIx
- ix
] = (uint8_t)destIx
;
1596 // fill in Buffer descriptor
1597 u8b
->bufNativeStart
= ix
;
1598 u8b
->bufNativeLimit
= srcIx
;
1599 u8b
->bufStartIdx
= 0;
1600 u8b
->bufLimitIdx
= destIx
;
1601 if (seenNonAscii
== FALSE
) {
1602 u8b
->bufNILimit
= destIx
;
1604 u8b
->toUCharsMapStart
= u8b
->bufNativeStart
;
1606 // Set UText chunk to refer to this buffer.
1607 ut
->chunkContents
= buf
;
1608 ut
->chunkOffset
= 0;
1609 ut
->chunkLength
= u8b
->bufLimitIdx
;
1610 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1611 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1612 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1614 // For zero terminated strings, keep track of the maximum point
1616 if (nulTerminated
&& srcIx
>ut
->c
) {
1619 // We scanned to the end.
1620 // Remember the actual length.
1622 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1631 // Move the incoming index to a code point boundary.
1632 // Can only do this if the incoming index is somewhere in the interior of the string.
1633 // If index is at the end, there is no character there to look at.
1635 U8_SET_CP_START(s8
, 0, ix
);
1638 // Swap the UText buffers.
1639 // We want to fill what was previously the alternate buffer,
1640 // and make what was the current buffer be the new alternate.
1641 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->q
;
1645 UChar
*buf
= u8b
->buf
;
1646 uint8_t *mapToNative
= u8b
->mapToNative
;
1647 uint8_t *mapToUChars
= u8b
->mapToUChars
;
1648 int32_t toUCharsMapStart
= ix
- (UTF8_TEXT_CHUNK_SIZE
*3 + 1);
1649 int32_t destIx
= UTF8_TEXT_CHUNK_SIZE
+2; // Start in the overflow region
1650 // at end of buffer to leave room
1651 // for a surrogate pair at the
1654 int32_t bufNILimit
= destIx
;
1657 // Map to/from Native Indexes, fill in for the position at the end of
1660 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1661 mapToUChars
[srcIx
- toUCharsMapStart
] = (uint8_t)destIx
;
1663 // Fill the chunk buffer
1664 // Work backwards, filling from the end of the buffer towards the front.
1666 while (destIx
>2 && (srcIx
- toUCharsMapStart
> 5) && (srcIx
> 0)) {
1670 // Get last byte of the UTF-8 character
1673 // Special case ASCII range for speed.
1674 buf
[destIx
] = (UChar
)c
;
1675 mapToUChars
[srcIx
- toUCharsMapStart
] = (uint8_t)destIx
;
1676 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1678 // General case, handle everything non-ASCII.
1680 int32_t sIx
= srcIx
; // ix of last byte of multi-byte u8 char
1682 // Get the full character from the UTF8 string.
1683 // use code derived from the macros in utf8.h
1684 // Leaves srcIx pointing at the first byte of the UTF-8 char.
1687 c
=utf8_prevCharSafeBody(s8
, 0, &srcIx
, c
, -1);
1688 // leaves srcIx at first byte of the multi-byte char.
1693 // Store the character in UTF-16 buffer.
1695 buf
[destIx
] = (UChar
)c
;
1696 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1698 buf
[destIx
] = U16_TRAIL(c
);
1699 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1700 buf
[--destIx
] = U16_LEAD(c
);
1701 mapToNative
[destIx
] = (uint8_t)(srcIx
- toUCharsMapStart
);
1704 // Fill in the map from native indexes to UChars buf index.
1706 mapToUChars
[sIx
-- - toUCharsMapStart
] = (uint8_t)destIx
;
1707 } while (sIx
>= srcIx
);
1709 // Set native indexing limit to be the current position.
1710 // We are processing a non-ascii, non-native-indexing char now;
1711 // the limit will be here if the rest of the chars to be
1712 // added to this buffer are ascii.
1713 bufNILimit
= destIx
;
1716 u8b
->bufNativeStart
= srcIx
;
1717 u8b
->bufNativeLimit
= ix
;
1718 u8b
->bufStartIdx
= destIx
;
1719 u8b
->bufLimitIdx
= UTF8_TEXT_CHUNK_SIZE
+2;
1720 u8b
->bufNILimit
= bufNILimit
- u8b
->bufStartIdx
;
1721 u8b
->toUCharsMapStart
= toUCharsMapStart
;
1723 ut
->chunkContents
= &buf
[u8b
->bufStartIdx
];
1724 ut
->chunkLength
= u8b
->bufLimitIdx
- u8b
->bufStartIdx
;
1725 ut
->chunkOffset
= ut
->chunkLength
;
1726 ut
->chunkNativeStart
= u8b
->bufNativeStart
;
1727 ut
->chunkNativeLimit
= u8b
->bufNativeLimit
;
1728 ut
->nativeIndexingLimit
= u8b
->bufNILimit
;
1737 // This is a slightly modified copy of u_strFromUTF8,
1738 // Inserts a Replacement Char rather than failing on invalid UTF-8
1739 // Removes unnecessary features.
1742 utext_strFromUTF8(UChar
*dest
,
1743 int32_t destCapacity
,
1744 int32_t *pDestLength
,
1746 int32_t srcLength
, // required. NUL terminated not supported.
1747 UErrorCode
*pErrorCode
1751 UChar
*pDest
= dest
;
1752 UChar
*pDestLimit
= dest
+destCapacity
;
1755 int32_t reqLength
= 0;
1756 uint8_t* pSrc
= (uint8_t*) src
;
1759 while((index
< srcLength
)&&(pDest
<pDestLimit
)){
1764 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -1);
1769 *(pDest
++)=(UChar
)ch
;
1771 *(pDest
++)=UTF16_LEAD(ch
);
1772 if(pDest
<pDestLimit
){
1773 *(pDest
++)=UTF16_TRAIL(ch
);
1781 /* do not fill the dest buffer, just count the UChars needed */
1782 while(index
< srcLength
){
1787 ch
=utf8_nextCharSafeBody(pSrc
, &index
, srcLength
, ch
, -1);
1791 reqLength
+=U16_LENGTH(ch
);
1795 reqLength
+=(int32_t)(pDest
- dest
);
1798 *pDestLength
= reqLength
;
1801 /* Terminate the buffer */
1802 u_terminateUChars(dest
,destCapacity
,reqLength
,pErrorCode
);
1809 static int32_t U_CALLCONV
1810 utf8TextExtract(UText
*ut
,
1811 int64_t start
, int64_t limit
,
1812 UChar
*dest
, int32_t destCapacity
,
1813 UErrorCode
*pErrorCode
) {
1814 if(U_FAILURE(*pErrorCode
)) {
1817 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1818 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1821 int32_t length
= ut
->b
;
1822 int32_t start32
= pinIndex(start
, length
);
1823 int32_t limit32
= pinIndex(limit
, length
);
1825 if(start32
>limit32
) {
1826 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1831 // adjust the incoming indexes to land on code point boundaries if needed.
1832 // adjust by no more than three, because that is the largest number of trail bytes
1833 // in a well formed UTF8 character.
1834 const uint8_t *buf
= (const uint8_t *)ut
->context
;
1836 if (start32
< ut
->chunkNativeLimit
) {
1837 for (i
=0; i
<3; i
++) {
1838 if (U8_IS_SINGLE(buf
[start32
]) || U8_IS_LEAD(buf
[start32
]) || start32
==0) {
1845 if (limit32
< ut
->chunkNativeLimit
) {
1846 for (i
=0; i
<3; i
++) {
1847 if (U8_IS_SINGLE(buf
[limit32
]) || U8_IS_LEAD(buf
[limit32
]) || limit32
==0) {
1854 // Do the actual extract.
1855 int32_t destLength
=0;
1856 utext_strFromUTF8(dest
, destCapacity
, &destLength
,
1857 (const char *)ut
->context
+start32
, limit32
-start32
,
1859 utf8TextAccess(ut
, limit32
, TRUE
);
1864 // utf8TextMapOffsetToNative
1866 // Map a chunk (UTF-16) offset to a native index.
1867 static int64_t U_CALLCONV
1868 utf8TextMapOffsetToNative(const UText
*ut
) {
1870 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1871 U_ASSERT(ut
->chunkOffset
>ut
->nativeIndexingLimit
&& ut
->chunkOffset
<=ut
->chunkLength
);
1872 int32_t nativeOffset
= u8b
->mapToNative
[ut
->chunkOffset
+ u8b
->bufStartIdx
] + u8b
->toUCharsMapStart
;
1873 U_ASSERT(nativeOffset
>= ut
->chunkNativeStart
&& nativeOffset
<= ut
->chunkNativeLimit
);
1874 return nativeOffset
;
1878 // Map a native index to the corresponding chunk offset
1880 static int32_t U_CALLCONV
1881 utf8TextMapIndexToUTF16(const UText
*ut
, int64_t index64
) {
1882 U_ASSERT(index64
<= 0x7fffffff);
1883 int32_t index
= (int32_t)index64
;
1884 UTF8Buf
*u8b
= (UTF8Buf
*)ut
->p
;
1885 U_ASSERT(index
>=ut
->chunkNativeStart
+ut
->nativeIndexingLimit
);
1886 U_ASSERT(index
<=ut
->chunkNativeLimit
);
1887 int32_t mapIndex
= index
- u8b
->toUCharsMapStart
;
1888 int32_t offset
= u8b
->mapToUChars
[mapIndex
] - u8b
->bufStartIdx
;
1889 U_ASSERT(offset
>=0 && offset
<=ut
->chunkLength
);
1893 static UText
* U_CALLCONV
1894 utf8TextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
)
1896 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1897 dest
= shallowTextClone(dest
, src
, status
);
1899 // For deep clones, make a copy of the string.
1900 // The copied storage is owned by the newly created clone.
1902 // TODO: There is an issue with using utext_nativeLength().
1903 // That function is non-const in cases where the input was NUL terminated
1904 // and the length has not yet been determined.
1905 // This function (clone()) is const.
1906 // There is potentially a thread safety issue lurking here.
1908 if (deep
&& U_SUCCESS(*status
)) {
1909 int32_t len
= (int32_t)utext_nativeLength((UText
*)src
);
1910 char *copyStr
= (char *)uprv_malloc(len
+1);
1911 if (copyStr
== NULL
) {
1912 *status
= U_MEMORY_ALLOCATION_ERROR
;
1914 uprv_memcpy(copyStr
, src
->context
, len
+1);
1915 dest
->context
= copyStr
;
1916 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
1923 static void U_CALLCONV
1924 utf8TextClose(UText
*ut
) {
1925 // Most of the work of close is done by the generic UText framework close.
1926 // All that needs to be done here is to delete the UTF8 string if the UText
1927 // owns it. This occurs if the UText was created by cloning.
1928 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
1929 char *s
= (char *)ut
->context
;
1938 static const struct UTextFuncs utf8Funcs
=
1941 0, 0, 0, // Reserved alignment padding
1948 utf8TextMapOffsetToNative
,
1949 utf8TextMapIndexToUTF16
,
1957 static const char gEmptyString
[] = {0};
1959 U_CAPI UText
* U_EXPORT2
1960 utext_openUTF8(UText
*ut
, const char *s
, int64_t length
, UErrorCode
*status
) {
1961 if(U_FAILURE(*status
)) {
1964 if(s
==NULL
&& length
==0) {
1968 if(s
==NULL
|| length
<-1 || length
>INT32_MAX
) {
1969 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
1973 ut
= utext_setup(ut
, sizeof(UTF8Buf
) * 2, status
);
1974 if (U_FAILURE(*status
)) {
1978 ut
->pFuncs
= &utf8Funcs
;
1980 ut
->b
= (int32_t)length
;
1981 ut
->c
= (int32_t)length
;
1984 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
1987 ut
->q
= (char *)ut
->pExtra
+ sizeof(UTF8Buf
);
1999 //------------------------------------------------------------------------------
2001 // UText implementation wrapper for Replaceable (read/write)
2003 // Use of UText data members:
2004 // context pointer to Replaceable.
2005 // p pointer to Replaceable if it is owned by the UText.
2007 //------------------------------------------------------------------------------
2011 // minimum chunk size for this implementation: 3
2012 // to allow for possible trimming for code point boundaries
2013 enum { REP_TEXT_CHUNK_SIZE
=10 };
2018 * +1 to simplify filling with surrogate pair at the end.
2020 UChar s
[REP_TEXT_CHUNK_SIZE
+1];
2026 static UText
* U_CALLCONV
2027 repTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
2028 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2029 dest
= shallowTextClone(dest
, src
, status
);
2031 // For deep clones, make a copy of the Replaceable.
2032 // The copied Replaceable storage is owned by the newly created UText clone.
2033 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2036 if (deep
&& U_SUCCESS(*status
)) {
2037 const Replaceable
*replSrc
= (const Replaceable
*)src
->context
;
2038 dest
->context
= replSrc
->clone();
2039 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2041 // with deep clone, the copy is writable, even when the source is not.
2042 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2048 static void U_CALLCONV
2049 repTextClose(UText
*ut
) {
2050 // Most of the work of close is done by the generic UText framework close.
2051 // All that needs to be done here is delete the Replaceable if the UText
2052 // owns it. This occurs if the UText was created by cloning.
2053 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2054 Replaceable
*rep
= (Replaceable
*)ut
->context
;
2061 static int64_t U_CALLCONV
2062 repTextLength(UText
*ut
) {
2063 const Replaceable
*replSrc
= (const Replaceable
*)ut
->context
;
2064 int32_t len
= replSrc
->length();
2069 static UBool U_CALLCONV
2070 repTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2071 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
2072 int32_t length
=rep
->length(); // Full length of the input text (bigger than a chunk)
2074 // clip the requested index to the limits of the text.
2075 int32_t index32
= pinIndex(index
, length
);
2076 U_ASSERT(index
<=INT32_MAX
);
2080 * Compute start/limit boundaries around index, for a segment of text
2082 * To allow for the possibility that our user gave an index to the trailing
2083 * half of a surrogate pair, we must request one extra preceding UChar when
2084 * going in the forward direction. This will ensure that the buffer has the
2085 * entire code point at the specified index.
2089 if (index32
>=ut
->chunkNativeStart
&& index32
<ut
->chunkNativeLimit
) {
2090 // Buffer already contains the requested position.
2091 ut
->chunkOffset
= (int32_t)(index
- ut
->chunkNativeStart
);
2094 if (index32
>=length
&& ut
->chunkNativeLimit
==length
) {
2095 // Request for end of string, and buffer already extends up to it.
2096 // Can't get the data, but don't change the buffer.
2097 ut
->chunkOffset
= length
- (int32_t)ut
->chunkNativeStart
;
2101 ut
->chunkNativeLimit
= index
+ REP_TEXT_CHUNK_SIZE
- 1;
2102 // Going forward, so we want to have the buffer with stuff at and beyond
2103 // the requested index. The -1 gets us one code point before the
2104 // requested index also, to handle the case of the index being on
2105 // a trail surrogate of a surrogate pair.
2106 if(ut
->chunkNativeLimit
> length
) {
2107 ut
->chunkNativeLimit
= length
;
2109 // unless buffer ran off end, start is index-1.
2110 ut
->chunkNativeStart
= ut
->chunkNativeLimit
- REP_TEXT_CHUNK_SIZE
;
2111 if(ut
->chunkNativeStart
< 0) {
2112 ut
->chunkNativeStart
= 0;
2115 // Reverse iteration. Fill buffer with data preceding the requested index.
2116 if (index32
>ut
->chunkNativeStart
&& index32
<=ut
->chunkNativeLimit
) {
2117 // Requested position already in buffer.
2118 ut
->chunkOffset
= index32
- (int32_t)ut
->chunkNativeStart
;
2121 if (index32
==0 && ut
->chunkNativeStart
==0) {
2122 // Request for start, buffer already begins at start.
2123 // No data, but keep the buffer as is.
2124 ut
->chunkOffset
= 0;
2128 // Figure out the bounds of the chunk to extract for reverse iteration.
2129 // Need to worry about chunk not splitting surrogate pairs, and while still
2130 // containing the data we need.
2131 // Fix by requesting a chunk that includes an extra UChar at the end.
2132 // If this turns out to be a lead surrogate, we can lop it off and still have
2133 // the data we wanted.
2134 ut
->chunkNativeStart
= index32
+ 1 - REP_TEXT_CHUNK_SIZE
;
2135 if (ut
->chunkNativeStart
< 0) {
2136 ut
->chunkNativeStart
= 0;
2139 ut
->chunkNativeLimit
= index32
+ 1;
2140 if (ut
->chunkNativeLimit
> length
) {
2141 ut
->chunkNativeLimit
= length
;
2145 // Extract the new chunk of text from the Replaceable source.
2146 ReplExtra
*ex
= (ReplExtra
*)ut
->pExtra
;
2147 // UnicodeString with its buffer a writable alias to the chunk buffer
2148 UnicodeString
buffer(ex
->s
, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE
/*buffer capacity*/);
2149 rep
->extractBetween((int32_t)ut
->chunkNativeStart
, (int32_t)ut
->chunkNativeLimit
, buffer
);
2151 ut
->chunkContents
= ex
->s
;
2152 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
- ut
->chunkNativeStart
);
2153 ut
->chunkOffset
= (int32_t)(index32
- ut
->chunkNativeStart
);
2155 // Surrogate pairs from the input text must not span chunk boundaries.
2156 // If end of chunk could be the start of a surrogate, trim it off.
2157 if (ut
->chunkNativeLimit
< length
&&
2158 U16_IS_LEAD(ex
->s
[ut
->chunkLength
-1])) {
2160 ut
->chunkNativeLimit
--;
2161 if (ut
->chunkOffset
> ut
->chunkLength
) {
2162 ut
->chunkOffset
= ut
->chunkLength
;
2166 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
2168 if(ut
->chunkNativeStart
>0 && U16_IS_TRAIL(ex
->s
[0])) {
2169 ++(ut
->chunkContents
);
2170 ++(ut
->chunkNativeStart
);
2171 --(ut
->chunkLength
);
2172 --(ut
->chunkOffset
);
2175 // adjust the index/chunkOffset to a code point boundary
2176 U16_SET_CP_START(ut
->chunkContents
, 0, ut
->chunkOffset
);
2178 // Use fast indexing for get/setNativeIndex()
2179 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2186 static int32_t U_CALLCONV
2187 repTextExtract(UText
*ut
,
2188 int64_t start
, int64_t limit
,
2189 UChar
*dest
, int32_t destCapacity
,
2190 UErrorCode
*status
) {
2191 const Replaceable
*rep
=(const Replaceable
*)ut
->context
;
2192 int32_t length
=rep
->length();
2194 if(U_FAILURE(*status
)) {
2197 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
2198 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2201 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
2205 int32_t start32
= pinIndex(start
, length
);
2206 int32_t limit32
= pinIndex(limit
, length
);
2208 // adjust start, limit if they point to trail half of surrogates
2209 if (start32
<length
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
2210 U_IS_SUPPLEMENTARY(rep
->char32At(start32
))){
2213 if (limit32
<length
&& U16_IS_TRAIL(rep
->charAt(limit32
)) &&
2214 U_IS_SUPPLEMENTARY(rep
->char32At(limit32
))){
2218 length
=limit32
-start32
;
2219 if(length
>destCapacity
) {
2220 limit32
= start32
+ destCapacity
;
2222 UnicodeString
buffer(dest
, 0, destCapacity
); // writable alias
2223 rep
->extractBetween(start32
, limit32
, buffer
);
2224 repTextAccess(ut
, limit32
, TRUE
);
2226 return u_terminateUChars(dest
, destCapacity
, length
, status
);
2229 static int32_t U_CALLCONV
2230 repTextReplace(UText
*ut
,
2231 int64_t start
, int64_t limit
,
2232 const UChar
*src
, int32_t length
,
2233 UErrorCode
*status
) {
2234 Replaceable
*rep
=(Replaceable
*)ut
->context
;
2237 if(U_FAILURE(*status
)) {
2240 if(src
==NULL
&& length
!=0) {
2241 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2244 oldLength
=rep
->length(); // will subtract from new length
2246 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
2250 int32_t start32
= pinIndex(start
, oldLength
);
2251 int32_t limit32
= pinIndex(limit
, oldLength
);
2253 // Snap start & limit to code point boundaries.
2254 if (start32
<oldLength
&& U16_IS_TRAIL(rep
->charAt(start32
)) &&
2255 start32
>0 && U16_IS_LEAD(rep
->charAt(start32
-1)))
2259 if (limit32
<oldLength
&& U16_IS_LEAD(rep
->charAt(limit32
-1)) &&
2260 U16_IS_TRAIL(rep
->charAt(limit32
)))
2265 // Do the actual replace operation using methods of the Replaceable class
2266 UnicodeString
replStr((UBool
)(length
<0), src
, length
); // read-only alias
2267 rep
->handleReplaceBetween(start32
, limit32
, replStr
);
2268 int32_t newLength
= rep
->length();
2269 int32_t lengthDelta
= newLength
- oldLength
;
2271 // Is the UText chunk buffer OK?
2272 if (ut
->chunkNativeLimit
> start32
) {
2273 // this replace operation may have impacted the current chunk.
2274 // invalidate it, which will force a reload on the next access.
2275 invalidateChunk(ut
);
2278 // set the iteration position to the end of the newly inserted replacement text.
2279 int32_t newIndexPos
= limit32
+ lengthDelta
;
2280 repTextAccess(ut
, newIndexPos
, TRUE
);
2286 static void U_CALLCONV
2287 repTextCopy(UText
*ut
,
2288 int64_t start
, int64_t limit
,
2293 Replaceable
*rep
=(Replaceable
*)ut
->context
;
2294 int32_t length
=rep
->length();
2296 if(U_FAILURE(*status
)) {
2299 if (start
>limit
|| (start
<destIndex
&& destIndex
<limit
))
2301 *status
=U_INDEX_OUTOFBOUNDS_ERROR
;
2305 int32_t start32
= pinIndex(start
, length
);
2306 int32_t limit32
= pinIndex(limit
, length
);
2307 int32_t destIndex32
= pinIndex(destIndex
, length
);
2309 // TODO: snap input parameters to code point boundaries.
2312 // move: copy to destIndex, then replace original with nothing
2313 int32_t segLength
=limit32
-start32
;
2314 rep
->copy(start32
, limit32
, destIndex32
);
2315 if(destIndex32
<start32
) {
2319 rep
->handleReplaceBetween(start32
, limit32
, UnicodeString());
2322 rep
->copy(start32
, limit32
, destIndex32
);
2325 // If the change to the text touched the region in the chunk buffer,
2326 // invalidate the buffer.
2327 int32_t firstAffectedIndex
= destIndex32
;
2328 if (move
&& start32
<firstAffectedIndex
) {
2329 firstAffectedIndex
= start32
;
2331 if (firstAffectedIndex
< ut
->chunkNativeLimit
) {
2332 // changes may have affected range covered by the chunk
2333 invalidateChunk(ut
);
2336 // Put iteration position at the newly inserted (moved) block,
2337 int32_t nativeIterIndex
= destIndex32
+ limit32
- start32
;
2338 if (move
&& destIndex32
>start32
) {
2339 // moved a block of text towards the end of the string.
2340 nativeIterIndex
= destIndex32
;
2343 // Set position, reload chunk if needed.
2344 repTextAccess(ut
, nativeIterIndex
, TRUE
);
2347 static const struct UTextFuncs repFuncs
=
2350 0, 0, 0, // Reserved alignment padding
2357 NULL
, // MapOffsetToNative,
2358 NULL
, // MapIndexToUTF16,
2366 U_CAPI UText
* U_EXPORT2
2367 utext_openReplaceable(UText
*ut
, Replaceable
*rep
, UErrorCode
*status
)
2369 if(U_FAILURE(*status
)) {
2373 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
2376 ut
= utext_setup(ut
, sizeof(ReplExtra
), status
);
2378 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2379 if(rep
->hasMetaData()) {
2380 ut
->providerProperties
|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA
);
2383 ut
->pFuncs
= &repFuncs
;
2397 //------------------------------------------------------------------------------
2399 // UText implementation for UnicodeString (read/write) and
2400 // for const UnicodeString (read only)
2401 // (same implementation, only the flags are different)
2403 // Use of UText data members:
2404 // context pointer to UnicodeString
2405 // p pointer to UnicodeString IF this UText owns the string
2406 // and it must be deleted on close(). NULL otherwise.
2408 //------------------------------------------------------------------------------
2413 static UText
* U_CALLCONV
2414 unistrTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
*status
) {
2415 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2416 dest
= shallowTextClone(dest
, src
, status
);
2418 // For deep clones, make a copy of the UnicodeString.
2419 // The copied UnicodeString storage is owned by the newly created UText clone.
2420 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2423 if (deep
&& U_SUCCESS(*status
)) {
2424 const UnicodeString
*srcString
= (const UnicodeString
*)src
->context
;
2425 dest
->context
= new UnicodeString(*srcString
);
2426 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2428 // with deep clone, the copy is writable, even when the source is not.
2429 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2434 static void U_CALLCONV
2435 unistrTextClose(UText
*ut
) {
2436 // Most of the work of close is done by the generic UText framework close.
2437 // All that needs to be done here is delete the UnicodeString if the UText
2438 // owns it. This occurs if the UText was created by cloning.
2439 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2440 UnicodeString
*str
= (UnicodeString
*)ut
->context
;
2447 static int64_t U_CALLCONV
2448 unistrTextLength(UText
*t
) {
2449 return ((const UnicodeString
*)t
->context
)->length();
2453 static UBool U_CALLCONV
2454 unistrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2455 int32_t length
= ut
->chunkLength
;
2456 ut
->chunkOffset
= pinIndex(index
, length
);
2458 // Check whether request is at the start or end
2459 UBool retVal
= (forward
&& index
<length
) || (!forward
&& index
>0);
2465 static int32_t U_CALLCONV
2466 unistrTextExtract(UText
*t
,
2467 int64_t start
, int64_t limit
,
2468 UChar
*dest
, int32_t destCapacity
,
2469 UErrorCode
*pErrorCode
) {
2470 const UnicodeString
*us
=(const UnicodeString
*)t
->context
;
2471 int32_t length
=us
->length();
2473 if(U_FAILURE(*pErrorCode
)) {
2476 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
2477 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2479 if(start
<0 || start
>limit
) {
2480 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2484 int32_t start32
= start
<length
? us
->getChar32Start((int32_t)start
) : length
;
2485 int32_t limit32
= limit
<length
? us
->getChar32Start((int32_t)limit
) : length
;
2487 length
=limit32
-start32
;
2488 if (destCapacity
>0 && dest
!=NULL
) {
2489 int32_t trimmedLength
= length
;
2490 if(trimmedLength
>destCapacity
) {
2491 trimmedLength
=destCapacity
;
2493 us
->extract(start32
, trimmedLength
, dest
);
2494 t
->chunkOffset
= start32
+trimmedLength
;
2496 t
->chunkOffset
= start32
;
2498 u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
2502 static int32_t U_CALLCONV
2503 unistrTextReplace(UText
*ut
,
2504 int64_t start
, int64_t limit
,
2505 const UChar
*src
, int32_t length
,
2506 UErrorCode
*pErrorCode
) {
2507 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2510 if(U_FAILURE(*pErrorCode
)) {
2513 if(src
==NULL
&& length
!=0) {
2514 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2517 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2520 oldLength
=us
->length();
2521 int32_t start32
= pinIndex(start
, oldLength
);
2522 int32_t limit32
= pinIndex(limit
, oldLength
);
2523 if (start32
< oldLength
) {
2524 start32
= us
->getChar32Start(start32
);
2526 if (limit32
< oldLength
) {
2527 limit32
= us
->getChar32Start(limit32
);
2531 us
->replace(start32
, limit32
-start32
, src
, length
);
2532 int32_t newLength
= us
->length();
2534 // Update the chunk description.
2535 ut
->chunkContents
= us
->getBuffer();
2536 ut
->chunkLength
= newLength
;
2537 ut
->chunkNativeLimit
= newLength
;
2538 ut
->nativeIndexingLimit
= newLength
;
2540 // Set iteration position to the point just following the newly inserted text.
2541 int32_t lengthDelta
= newLength
- oldLength
;
2542 ut
->chunkOffset
= limit32
+ lengthDelta
;
2547 static void U_CALLCONV
2548 unistrTextCopy(UText
*ut
,
2549 int64_t start
, int64_t limit
,
2552 UErrorCode
*pErrorCode
) {
2553 UnicodeString
*us
=(UnicodeString
*)ut
->context
;
2554 int32_t length
=us
->length();
2556 if(U_FAILURE(*pErrorCode
)) {
2559 int32_t start32
= pinIndex(start
, length
);
2560 int32_t limit32
= pinIndex(limit
, length
);
2561 int32_t destIndex32
= pinIndex(destIndex
, length
);
2563 if( start32
>limit32
|| (start32
<destIndex32
&& destIndex32
<limit32
)) {
2564 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2569 // move: copy to destIndex, then replace original with nothing
2570 int32_t segLength
=limit32
-start32
;
2571 us
->copy(start32
, limit32
, destIndex32
);
2572 if(destIndex32
<start32
) {
2575 us
->replace(start32
, segLength
, NULL
, 0);
2578 us
->copy(start32
, limit32
, destIndex32
);
2581 // update chunk description, set iteration position.
2582 ut
->chunkContents
= us
->getBuffer();
2584 // copy operation, string length grows
2585 ut
->chunkLength
+= limit32
-start32
;
2586 ut
->chunkNativeLimit
= ut
->chunkLength
;
2587 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2590 // Iteration position to end of the newly inserted text.
2591 ut
->chunkOffset
= destIndex32
+limit32
-start32
;
2592 if (move
&& destIndex32
>start32
) {
2593 ut
->chunkOffset
= destIndex32
;
// Function table for UnicodeString based UTexts.
//   utext_openUnicodeString() and utext_openConstUnicodeString() store the
//   address of this table in UText.pFuncs.
//   The two offset-mapping entries are left NULL.
2598 static const struct UTextFuncs unistrFuncs
=
2601 0, 0, 0, // Reserved alignment padding
2608 NULL
, // MapOffsetToNative,
2609 NULL
, // MapIndexToUTF16,
2621 U_CAPI UText
* U_EXPORT2
2622 utext_openUnicodeString(UText
*ut
, UnicodeString
*s
, UErrorCode
*status
) {
2623 // TODO: use openConstUnicodeString, then add in the differences.
2625 ut
= utext_setup(ut
, 0, status
);
2626 if (U_SUCCESS(*status
)) {
2627 ut
->pFuncs
= &unistrFuncs
;
2629 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
)|
2630 I32_FLAG(UTEXT_PROVIDER_WRITABLE
);
2632 ut
->chunkContents
= s
->getBuffer();
2633 ut
->chunkLength
= s
->length();
2634 ut
->chunkNativeStart
= 0;
2635 ut
->chunkNativeLimit
= ut
->chunkLength
;
2636 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2643 U_CAPI UText
* U_EXPORT2
2644 utext_openConstUnicodeString(UText
*ut
, const UnicodeString
*s
, UErrorCode
*status
) {
2645 ut
= utext_setup(ut
, 0, status
);
2646 // note: use the standard (writable) function table for UnicodeString.
2647 // The flag settings disable writing, so having the functions in
2648 // the table is harmless.
2649 if (U_SUCCESS(*status
)) {
2650 ut
->pFuncs
= &unistrFuncs
;
2652 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2653 ut
->chunkContents
= s
->getBuffer();
2654 ut
->chunkLength
= s
->length();
2655 ut
->chunkNativeStart
= 0;
2656 ut
->chunkNativeLimit
= ut
->chunkLength
;
2657 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2662 //------------------------------------------------------------------------------
2664 // UText implementation for const UChar * strings
2666 // Use of UText data members:
2667 // context pointer to the const UChar string
2668 // a length. -1 if not yet known.
2670 // TODO: support 64 bit lengths.
2672 //------------------------------------------------------------------------------
2677 static UText
* U_CALLCONV
2678 ucstrTextClone(UText
*dest
, const UText
* src
, UBool deep
, UErrorCode
* status
) {
2679 // First do a generic shallow clone.
2680 dest
= shallowTextClone(dest
, src
, status
);
2682 // For deep clones, make a copy of the string.
2683 // The copied storage is owned by the newly created clone.
2684 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2687 if (deep
&& U_SUCCESS(*status
)) {
2688 U_ASSERT(utext_nativeLength(dest
) < INT32_MAX
);
2689 int32_t len
= (int32_t)utext_nativeLength(dest
);
2691 // The cloned string IS going to be NUL terminated, whether or not the original was.
2692 const UChar
*srcStr
= (const UChar
*)src
->context
;
2693 UChar
*copyStr
= (UChar
*)uprv_malloc((len
+1) * sizeof(UChar
));
2694 if (copyStr
== NULL
) {
2695 *status
= U_MEMORY_ALLOCATION_ERROR
;
2698 for (i
=0; i
<len
; i
++) {
2699 copyStr
[i
] = srcStr
[i
];
2702 dest
->context
= copyStr
;
2703 dest
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
);
2710 static void U_CALLCONV
2711 ucstrTextClose(UText
*ut
) {
2712 // Most of the work of close is done by the generic UText framework close.
2713 // All that needs to be done here is delete the string if the UText
2714 // owns it. This occurs if the UText was created by cloning.
2715 if (ut
->providerProperties
& I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT
)) {
2716 UChar
*s
= (UChar
*)ut
->context
;
2724 static int64_t U_CALLCONV
2725 ucstrTextLength(UText
*ut
) {
2727 // null terminated, we don't yet know the length. Scan for it.
2728 // Access is not convenient for doing this
2729 // because the current interation postion can't be changed.
2730 const UChar
*str
= (const UChar
*)ut
->context
;
2732 if (str
[ut
->chunkNativeLimit
] == 0) {
2735 ut
->chunkNativeLimit
++;
2737 ut
->a
= ut
->chunkNativeLimit
;
2738 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2739 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2740 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2746 static UBool U_CALLCONV
2747 ucstrTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
2748 const UChar
*str
= (const UChar
*)ut
->context
;
2750 // pin the requested index to the bounds of the string,
2751 // and set current iteration position.
2754 } else if (index
< ut
->chunkNativeLimit
) {
2755 // The request data is within the chunk as it is known so far.
2756 // Put index on a code point boundary.
2757 U16_SET_CP_START(str
, 0, index
);
2758 } else if (ut
->a
>= 0) {
2759 // We know the length of this string, and the user is requesting something
2760 // at or beyond the length. Pin the requested index to the length.
2763 // Null terminated string, length not yet known, and the requested index
2764 // is beyond where we have scanned so far.
2765 // Scan to 32 UChars beyond the requested index. The strategy here is
2766 // to avoid fully scanning a long string when the caller only wants to
2767 // see a few characters at its beginning.
2768 int32_t scanLimit
= (int32_t)index
+ 32;
2769 if ((index
+ 32)>INT32_MAX
|| (index
+ 32)<0 ) { // note: int64 expression
2770 scanLimit
= INT32_MAX
;
2773 int32_t chunkLimit
= (int32_t)ut
->chunkNativeLimit
;
2774 for (; chunkLimit
<scanLimit
; chunkLimit
++) {
2775 if (str
[chunkLimit
] == 0) {
2776 // We found the end of the string. Remember it, pin the requested index to it,
2777 // and bail out of here.
2779 ut
->chunkLength
= chunkLimit
;
2780 ut
->nativeIndexingLimit
= chunkLimit
;
2781 if (index
>= chunkLimit
) {
2784 U16_SET_CP_START(str
, 0, index
);
2787 ut
->chunkNativeLimit
= chunkLimit
;
2788 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2792 // We scanned through the next batch of UChars without finding the end.
2793 U16_SET_CP_START(str
, 0, index
);
2794 if (chunkLimit
== INT32_MAX
) {
2795 // Scanned to the limit of a 32 bit length.
2796 // Forceably trim the overlength string back so length fits in int32
2797 // TODO: add support for 64 bit strings.
2799 ut
->chunkLength
= chunkLimit
;
2800 ut
->nativeIndexingLimit
= chunkLimit
;
2801 if (index
> chunkLimit
) {
2804 ut
->chunkNativeLimit
= chunkLimit
;
2805 ut
->providerProperties
&= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2807 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2808 // If the current end is on a lead surrogate, back the end up by one.
2809 // It doesn't matter if the end char happens to be an unpaired surrogate,
2810 // and it's simpler not to worry about it.
2811 if (U16_IS_LEAD(str
[chunkLimit
-1])) {
2814 // Null-terminated chunk with end still unknown.
2815 // Update the chunk length to reflect what has been scanned thus far.
2816 // That the full length is still unknown is (still) flagged by
2818 ut
->chunkNativeLimit
= chunkLimit
;
2819 ut
->nativeIndexingLimit
= chunkLimit
;
2820 ut
->chunkLength
= chunkLimit
;
2825 U_ASSERT(index
<=INT32_MAX
);
2826 ut
->chunkOffset
= (int32_t)index
;
2828 // Check whether request is at the start or end
2829 UBool retVal
= (forward
&& index
<ut
->chunkNativeLimit
) || (!forward
&& index
>0);
2835 static int32_t U_CALLCONV
2836 ucstrTextExtract(UText
*ut
,
2837 int64_t start
, int64_t limit
,
2838 UChar
*dest
, int32_t destCapacity
,
2839 UErrorCode
*pErrorCode
)
2841 if(U_FAILURE(*pErrorCode
)) {
2844 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
2845 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2849 //const UChar *s=(const UChar *)ut->context;
2855 // Access the start. Does two things we need:
2856 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2857 // Snaps 'start' to the beginning of a code point.
2858 ucstrTextAccess(ut
, start
, TRUE
);
2859 const UChar
*s
=ut
->chunkContents
;
2860 start32
= ut
->chunkOffset
;
2862 int32_t strLength
=(int32_t)ut
->a
;
2863 if (strLength
>= 0) {
2864 limit32
= pinIndex(limit
, strLength
);
2866 limit32
= pinIndex(limit
, INT32_MAX
);
2870 for (si
=start32
; si
<limit32
; si
++) {
2871 if (strLength
<0 && s
[si
]==0) {
2872 // Just hit the end of a null-terminated string.
2873 ut
->a
= si
; // set string length for this UText
2874 ut
->chunkNativeLimit
= si
;
2875 ut
->chunkLength
= si
;
2876 ut
->nativeIndexingLimit
= si
;
2880 if (di
<destCapacity
) {
2881 // only store if there is space.
2885 // We have filled the destination buffer, and the string length is known.
2886 // Cut the loop short. There is no need to scan string termination.
2887 di
= limit32
- start32
;
2895 // If the limit index points to a lead surrogate of a pair,
2896 // add the corresponding trail surrogate to the destination.
2897 if (si
>0 && U16_IS_LEAD(s
[si
-1]) &&
2898 ((si
<strLength
|| strLength
<0) && U16_IS_TRAIL(s
[si
])))
2900 if (di
<destCapacity
) {
2901 // store only if there is space in the output buffer.
2902 dest
[di
++] = s
[si
++];
2906 // Put iteration position at the point just following the extracted text
2907 ut
->chunkOffset
= uprv_min(strLength
, start32
+ destCapacity
);
2909 // Add a terminating NUL if space in the buffer permits,
2910 // and set the error status as required.
2911 u_terminateUChars(dest
, destCapacity
, di
, pErrorCode
);
// Function table for const UChar * based UTexts.
//   utext_openUChars() stores the address of this table in UText.pFuncs.
//   The two offset-mapping entries are left NULL.
2915 static const struct UTextFuncs ucstrFuncs
=
2918 0, 0, 0, // Reserved alignment padding
2925 NULL
, // MapOffsetToNative,
2926 NULL
, // MapIndexToUTF16,
// An empty, NUL terminated UChar string.
//   NOTE(review): presumably what utext_openUChars() substitutes for a NULL
//   string of length 0 - the assignment is not visible here, confirm.
2935 static const UChar gEmptyUString
[] = {0};
2937 U_CAPI UText
* U_EXPORT2
2938 utext_openUChars(UText
*ut
, const UChar
*s
, int64_t length
, UErrorCode
*status
) {
2939 if (U_FAILURE(*status
)) {
2942 if(s
==NULL
&& length
==0) {
2945 if (s
==NULL
|| length
< -1 || length
>INT32_MAX
) {
2946 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
2949 ut
= utext_setup(ut
, 0, status
);
2950 if (U_SUCCESS(*status
)) {
2951 ut
->pFuncs
= &ucstrFuncs
;
2953 ut
->providerProperties
= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS
);
2955 ut
->providerProperties
|= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE
);
2958 ut
->chunkContents
= s
;
2959 ut
->chunkNativeStart
= 0;
2960 ut
->chunkNativeLimit
= length
>=0? length
: 0;
2961 ut
->chunkLength
= (int32_t)ut
->chunkNativeLimit
;
2962 ut
->chunkOffset
= 0;
2963 ut
->nativeIndexingLimit
= ut
->chunkLength
;
2969 //------------------------------------------------------------------------------
2971 // UText implementation for text from ICU CharacterIterators
2973 // Use of UText data members:
2974 // context pointer to the CharacterIterator
2975 // a length of the full text.
2976 // p pointer to buffer 1
2977 // b start index of local buffer 1 contents
2978 // q pointer to buffer 2
2979 // c start index of local buffer 2 contents
2980 // r pointer to the character iterator if the UText owns it.
2983 //------------------------------------------------------------------------------
// Size, in UChars, of each of the two local chunk buffers of a
//   CharacterIterator based UText.
2984 #define CIBufSize 16
2987 static void U_CALLCONV
2988 charIterTextClose(UText
*ut
) {
2989 // Most of the work of close is done by the generic UText framework close.
2990 // All that needs to be done here is delete the CharacterIterator if the UText
2991 // owns it. This occurs if the UText was created by cloning.
2992 CharacterIterator
*ci
= (CharacterIterator
*)ut
->r
;
2997 static int64_t U_CALLCONV
2998 charIterTextLength(UText
*ut
) {
2999 return (int32_t)ut
->a
;
3002 static UBool U_CALLCONV
3003 charIterTextAccess(UText
*ut
, int64_t index
, UBool forward
) {
3004 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
3006 int32_t clippedIndex
= (int32_t)index
;
3007 if (clippedIndex
<0) {
3009 } else if (clippedIndex
>=ut
->a
) {
3010 clippedIndex
=(int32_t)ut
->a
;
3012 int32_t neededIndex
= clippedIndex
;
3013 if (!forward
&& neededIndex
>0) {
3014 // reverse iteration, want the position just before what was asked for.
3016 } else if (forward
&& neededIndex
==ut
->a
&& neededIndex
>0) {
3017 // Forward iteration, don't ask for something past the end of the text.
3021 // Find the native index of the start of the buffer containing what we want.
3022 neededIndex
-= neededIndex
% CIBufSize
;
3025 UBool needChunkSetup
= TRUE
;
3027 if (ut
->chunkNativeStart
== neededIndex
) {
3028 // The buffer we want is already the current chunk.
3029 needChunkSetup
= FALSE
;
3030 } else if (ut
->b
== neededIndex
) {
3031 // The first buffer (buffer p) has what we need.
3032 buf
= (UChar
*)ut
->p
;
3033 } else if (ut
->c
== neededIndex
) {
3034 // The second buffer (buffer q) has what we need.
3035 buf
= (UChar
*)ut
->q
;
3037 // Neither buffer already has what we need.
3038 // Load new data from the character iterator.
3039 // Use the buf that is not the current buffer.
3040 buf
= (UChar
*)ut
->p
;
3041 if (ut
->p
== ut
->chunkContents
) {
3042 buf
= (UChar
*)ut
->q
;
3044 ci
->setIndex(neededIndex
);
3045 for (i
=0; i
<CIBufSize
; i
++) {
3046 buf
[i
] = ci
->nextPostInc();
3047 if (i
+neededIndex
> ut
->a
) {
3053 // We have a buffer with the data we need.
3054 // Set it up as the current chunk, if it wasn't already.
3055 if (needChunkSetup
) {
3056 ut
->chunkContents
= buf
;
3057 ut
->chunkLength
= CIBufSize
;
3058 ut
->chunkNativeStart
= neededIndex
;
3059 ut
->chunkNativeLimit
= neededIndex
+ CIBufSize
;
3060 if (ut
->chunkNativeLimit
> ut
->a
) {
3061 ut
->chunkNativeLimit
= ut
->a
;
3062 ut
->chunkLength
= (int32_t)(ut
->chunkNativeLimit
)-(int32_t)(ut
->chunkNativeStart
);
3064 ut
->nativeIndexingLimit
= ut
->chunkLength
;
3065 U_ASSERT(ut
->chunkOffset
>=0 && ut
->chunkOffset
<=CIBufSize
);
3067 ut
->chunkOffset
= clippedIndex
- (int32_t)ut
->chunkNativeStart
;
3068 UBool success
= (forward
? ut
->chunkOffset
<ut
->chunkLength
: ut
->chunkOffset
>0);
3072 static UText
* U_CALLCONV
3073 charIterTextClone(UText
*dest
, const UText
*src
, UBool deep
, UErrorCode
* status
) {
3074 if (U_FAILURE(*status
)) {
3079 // There is no CharacterIterator API for cloning the underlying text storage.
3080 *status
= U_UNSUPPORTED_ERROR
;
3083 CharacterIterator
*srcCI
=(CharacterIterator
*)src
->context
;
3084 srcCI
= srcCI
->clone();
3085 dest
= utext_openCharacterIterator(dest
, srcCI
, status
);
3086 // cast off const on getNativeIndex.
3087 // For CharacterIterator based UTexts, this is safe, the operation is const.
3088 int64_t ix
= utext_getNativeIndex((UText
*)src
);
3089 utext_setNativeIndex(dest
, ix
);
3090 dest
->r
= srcCI
; // flags that this UText owns the CharacterIterator
3095 static int32_t U_CALLCONV
3096 charIterTextExtract(UText
*ut
,
3097 int64_t start
, int64_t limit
,
3098 UChar
*dest
, int32_t destCapacity
,
3101 if(U_FAILURE(*status
)) {
3104 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) || start
>limit
) {
3105 *status
=U_ILLEGAL_ARGUMENT_ERROR
;
3108 int32_t length
= (int32_t)ut
->a
;
3109 int32_t start32
= pinIndex(start
, length
);
3110 int32_t limit32
= pinIndex(limit
, length
);
3115 CharacterIterator
*ci
= (CharacterIterator
*)ut
->context
;
3116 ci
->setIndex32(start32
); // Moves ix to lead of surrogate pair, if needed.
3117 srci
= ci
->getIndex();
3119 while (srci
<limit32
) {
3120 UChar32 c
= ci
->next32PostInc();
3121 int32_t len
= U16_LENGTH(c
);
3122 if (desti
+len
<= destCapacity
) {
3123 U16_APPEND_UNSAFE(dest
, desti
, c
);
3124 copyLimit
= srci
+len
;
3127 *status
= U_BUFFER_OVERFLOW_ERROR
;
3132 charIterTextAccess(ut
, copyLimit
, TRUE
);
3134 u_terminateUChars(dest
, destCapacity
, desti
, status
);
// Function table for CharacterIterator based UTexts.
//   utext_openCharacterIterator() stores the address of this table in UText.pFuncs.
//   The two offset-mapping entries are left NULL.
3138 static const struct UTextFuncs charIterFuncs
=
3141 0, 0, 0, // Reserved alignment padding
3145 charIterTextExtract
,
3148 NULL
, // MapOffsetToNative,
3149 NULL
, // MapIndexToUTF16,
3158 U_CAPI UText
* U_EXPORT2
3159 utext_openCharacterIterator(UText
*ut
, CharacterIterator
*ci
, UErrorCode
*status
) {
3160 if (U_FAILURE(*status
)) {
3164 if (ci
->startIndex() > 0) {
3165 // No support for CharacterIterators that do not start indexing from zero.
3166 *status
= U_UNSUPPORTED_ERROR
;
3170 // Extra space in UText for 2 buffers of CIBufSize UChars each.
3171 int32_t extraSpace
= 2 * CIBufSize
* sizeof(UChar
);
3172 ut
= utext_setup(ut
, extraSpace
, status
);
3173 if (U_SUCCESS(*status
)) {
3174 ut
->pFuncs
= &charIterFuncs
;
3176 ut
->providerProperties
= 0;
3177 ut
->a
= ci
->endIndex(); // Length of text
3178 ut
->p
= ut
->pExtra
; // First buffer
3179 ut
->b
= -1; // Native index of first buffer contents
3180 ut
->q
= (UChar
*)ut
->pExtra
+CIBufSize
; // Second buffer
3181 ut
->c
= -1; // Native index of second buffer contents
3183 // Initialize current chunk contents to be empty.
3184 // First access will fault something in.
3185 // Note: The initial nativeStart and chunkOffset must sum to zero
3186 // so that getNativeIndex() will correctly compute to zero
3187 // if no call to Access() has ever been made. They can't be both
3188 // zero without Access() thinking that the chunk is valid.
3189 ut
->chunkContents
= (UChar
*)ut
->p
;
3190 ut
->chunkNativeStart
= -1;
3191 ut
->chunkOffset
= 1;
3192 ut
->chunkNativeLimit
= 0;
3193 ut
->chunkLength
= 0;
3194 ut
->nativeIndexingLimit
= ut
->chunkOffset
; // enables native indexing