]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/utext.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / utext.cpp
CommitLineData
73c04bcf
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2005-2006, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: utext.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2005apr12
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/ustring.h"
19#include "unicode/unistr.h"
20#include "unicode/chariter.h"
21#include "unicode/utext.h"
22#include "ustr_imp.h"
23#include "cmemory.h"
24#include "cstring.h"
25#include "uassert.h"
26
27
// Produce an int32_t mask in which only bit number bitIndex is set.
#define I32_FLAG(bitIndex) (((int32_t)1) << (bitIndex))
29
30
31static UBool
32utext_access(UText *ut, int64_t index, UBool forward) {
33 return ut->pFuncs->access(ut, index, forward);
34}
35
36
37
38U_DRAFT UBool U_EXPORT2
39utext_moveIndex32(UText *ut, int32_t delta) {
40 UChar32 c;
41 if (delta > 0) {
42 do {
43 if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
44 return FALSE;
45 }
46 c = ut->chunkContents[ut->chunkOffset];
47 if (U16_IS_SURROGATE(c)) {
48 c = utext_next32(ut);
49 if (c == U_SENTINEL) {
50 return FALSE;
51 }
52 } else {
53 ut->chunkOffset++;
54 }
55 } while(--delta>0);
56
57 } else if (delta<0) {
58 do {
59 if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
60 return FALSE;
61 }
62 c = ut->chunkContents[ut->chunkOffset-1];
63 if (U16_IS_SURROGATE(c)) {
64 c = utext_previous32(ut);
65 if (c == U_SENTINEL) {
66 return FALSE;
67 }
68 } else {
69 ut->chunkOffset--;
70 }
71 } while(++delta<0);
72 }
73
74 return TRUE;
75}
76
77
78U_DRAFT int64_t U_EXPORT2
79utext_nativeLength(UText *ut) {
80 return ut->pFuncs->nativeLength(ut);
81}
82
83
84U_DRAFT UBool U_EXPORT2
85utext_isLengthExpensive(const UText *ut) {
86 UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
87 return r;
88}
89
90
91U_DRAFT int64_t U_EXPORT2
92utext_getNativeIndex(const UText *ut) {
93 if(ut->chunkOffset <= ut->nativeIndexingLimit) {
94 return ut->chunkNativeStart+ut->chunkOffset;
95 } else {
96 return ut->pFuncs->mapOffsetToNative(ut);
97 }
98}
99
100
101U_DRAFT void U_EXPORT2
102utext_setNativeIndex(UText *ut, int64_t index) {
103 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
104 // The desired position is outside of the current chunk.
105 // Access the new position. Assume a forward iteration from here,
106 // which will also be optimimum for a single random access.
107 // Reverse iterations may suffer slightly.
108 ut->pFuncs->access(ut, index, TRUE);
109 } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
110 // utf-16 indexing.
111 ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
112 } else {
113 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
114 }
115 // The convention is that the index must always be on a code point boundary.
116 // Adjust the index position if it is in the middle of a surrogate pair.
117 if (ut->chunkOffset<ut->chunkLength) {
118 UChar c= ut->chunkContents[ut->chunkOffset];
119 if (UTF16_IS_TRAIL(c)) {
120 if (ut->chunkOffset==0) {
121 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
122 }
123 if (ut->chunkOffset>0) {
124 UChar lead = ut->chunkContents[ut->chunkOffset-1];
125 if (UTF16_IS_LEAD(lead)) {
126 ut->chunkOffset--;
127 }
128 }
129 }
130 }
131}
132
133
134
135U_DRAFT int64_t U_EXPORT2
136utext_getPreviousNativeIndex(UText *ut) {
137 //
138 // Fast-path the common case.
139 // Common means current position is not at the beginning of a chunk
140 // and the preceding character is not supplementary.
141 //
142 int32_t i = ut->chunkOffset - 1;
143 int64_t result;
144 if (i >= 0) {
145 UChar c = ut->chunkContents[i];
146 if (U16_IS_TRAIL(c) == FALSE) {
147 if (i <= ut->nativeIndexingLimit) {
148 result = ut->chunkNativeStart + i;
149 } else {
150 ut->chunkOffset = i;
151 result = ut->pFuncs->mapOffsetToNative(ut);
152 ut->chunkOffset++;
153 }
154 return result;
155 }
156 }
157
158 // If at the start of text, simply return 0.
159 if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
160 return 0;
161 }
162
163 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
164 // Keep it simple, use other functions to handle the edges.
165 //
166 utext_previous32(ut);
167 result = UTEXT_GETNATIVEINDEX(ut);
168 utext_next32(ut);
169 return result;
170}
171
172
173//
174// utext_current32. Get the UChar32 at the current position.
175// UText iteration position is always on a code point boundary,
176// never on the trail half of a surrogate pair.
177//
178U_DRAFT UChar32 U_EXPORT2
179utext_current32(UText *ut) {
180 UChar32 c;
181 if (ut->chunkOffset==ut->chunkLength) {
182 // Current position is just off the end of the chunk.
183 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
184 // Off the end of the text.
185 return U_SENTINEL;
186 }
187 }
188
189 c = ut->chunkContents[ut->chunkOffset];
190 if (U16_IS_LEAD(c) == FALSE) {
191 // Normal, non-supplementary case.
192 return c;
193 }
194
195 //
196 // Possible supplementary char.
197 //
198 UChar32 trail = 0;
199 UChar32 supplementaryC = c;
200 if ((ut->chunkOffset+1) < ut->chunkLength) {
201 // The trail surrogate is in the same chunk.
202 trail = ut->chunkContents[ut->chunkOffset+1];
203 } else {
204 // The trail surrogate is in a different chunk.
205 // Because we must maintain the iteration position, we need to switch forward
206 // into the new chunk, get the trail surrogate, then revert the chunk back to the
207 // original one.
208 // An edge case to be careful of: the entire text may end with an unpaired
209 // leading surrogate. The attempt to access the trail will fail, but
210 // the original position before the unpaired lead still needs to be restored.
211 int64_t nativePosition = ut->chunkNativeLimit;
212 int32_t originalOffset = ut->chunkOffset;
213 if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
214 trail = ut->chunkContents[ut->chunkOffset];
215 }
216 UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
217 U_ASSERT(r==TRUE);
218 ut->chunkOffset = originalOffset;
219 if(!r) {
220 return U_SENTINEL;
221 }
222 }
223
224 if (U16_IS_TRAIL(trail)) {
225 supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
226 }
227 return supplementaryC;
228
229}
230
231
232U_DRAFT UChar32 U_EXPORT2
233utext_char32At(UText *ut, int64_t nativeIndex) {
234 UChar32 c = U_SENTINEL;
235
236 // Fast path the common case.
237 if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
238 ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
239 c = ut->chunkContents[ut->chunkOffset];
240 if (U16_IS_SURROGATE(c) == FALSE) {
241 return c;
242 }
243 }
244
245
246 utext_setNativeIndex(ut, nativeIndex);
247 if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
248 c = ut->chunkContents[ut->chunkOffset];
249 if (U16_IS_SURROGATE(c)) {
250 // For surrogates, let current32() deal with the complications
251 // of supplementaries that may span chunk boundaries.
252 c = utext_current32(ut);
253 }
254 }
255 return c;
256}
257
258
259U_DRAFT UChar32 U_EXPORT2
260utext_next32(UText *ut) {
261 UChar32 c;
262
263 if (ut->chunkOffset >= ut->chunkLength) {
264 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
265 return U_SENTINEL;
266 }
267 }
268
269 c = ut->chunkContents[ut->chunkOffset++];
270 if (U16_IS_LEAD(c) == FALSE) {
271 // Normal case, not supplementary.
272 // (A trail surrogate seen here is just returned as is, as a surrogate value.
273 // It cannot be part of a pair.)
274 return c;
275 }
276
277 if (ut->chunkOffset >= ut->chunkLength) {
278 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
279 // c is an unpaired lead surrogate at the end of the text.
280 // return it as it is.
281 return c;
282 }
283 }
284 UChar32 trail = ut->chunkContents[ut->chunkOffset];
285 if (U16_IS_TRAIL(trail) == FALSE) {
286 // c was an unpaired lead surrogate, not at the end of the text.
287 // return it as it is (unpaired). Iteration position is on the
288 // following character, possibly in the next chunk, where the
289 // trail surrogate would have been if it had existed.
290 return c;
291 }
292
293 UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
294 ut->chunkOffset++; // move iteration position over the trail surrogate.
295 return supplementary;
296 }
297
298
299U_DRAFT UChar32 U_EXPORT2
300utext_previous32(UText *ut) {
301 UChar32 c;
302
303 if (ut->chunkOffset <= 0) {
304 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
305 return U_SENTINEL;
306 }
307 }
308 ut->chunkOffset--;
309 c = ut->chunkContents[ut->chunkOffset];
310 if (U16_IS_TRAIL(c) == FALSE) {
311 // Normal case, not supplementary.
312 // (A lead surrogate seen here is just returned as is, as a surrogate value.
313 // It cannot be part of a pair.)
314 return c;
315 }
316
317 if (ut->chunkOffset <= 0) {
318 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
319 // c is an unpaired trail surrogate at the start of the text.
320 // return it as it is.
321 return c;
322 }
323 }
324
325 UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
326 if (U16_IS_LEAD(lead) == FALSE) {
327 // c was an unpaired trail surrogate, not at the end of the text.
328 // return it as it is (unpaired). Iteration position is at c
329 return c;
330 }
331
332 UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
333 ut->chunkOffset--; // move iteration position over the lead surrogate.
334 return supplementary;
335}
336
337
338
339U_DRAFT UChar32 U_EXPORT2
340utext_next32From(UText *ut, int64_t index) {
341 UChar32 c = U_SENTINEL;
342
343 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
344 // Desired position is outside of the current chunk.
345 if(!ut->pFuncs->access(ut, index, TRUE)) {
346 // no chunk available here
347 return U_SENTINEL;
348 }
349 } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
350 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
351 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
352 } else {
353 // Desired position is in chunk, with non-UTF16 indexing.
354 ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
355 }
356
357 c = ut->chunkContents[ut->chunkOffset++];
358 if (U16_IS_SURROGATE(c)) {
359 // Surrogates. Many edge cases. Use other functions that already
360 // deal with the problems.
361 utext_setNativeIndex(ut, index);
362 c = utext_next32(ut);
363 }
364 return c;
365}
366
367
368U_DRAFT UChar32 U_EXPORT2
369utext_previous32From(UText *ut, int64_t index) {
370 //
371 // Return the character preceding the specified index.
372 // Leave the iteration position at the start of the character that was returned.
373 //
374 UChar32 cPrev; // The character preceding cCurr, which is what we will return.
375
376 // Address the chunk containg the position preceding the incoming index
377 // A tricky edge case:
378 // We try to test the requested native index against the chunkNativeStart to determine
379 // whether the character preceding the one at the index is in the current chunk.
380 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
381 // requested index is on something other than the first position of the first char.
382 //
383 if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
384 // Requested native index is outside of the current chunk.
385 if(!ut->pFuncs->access(ut, index, FALSE)) {
386 // no chunk available here
387 return U_SENTINEL;
388 }
389 } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
390 // Direct UTF-16 indexing.
391 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
392 } else {
393 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
394 if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
395 // no chunk available here
396 return U_SENTINEL;
397 }
398 }
399
400 //
401 // Simple case with no surrogates.
402 //
403 ut->chunkOffset--;
404 cPrev = ut->chunkContents[ut->chunkOffset];
405
406 if (U16_IS_SURROGATE(cPrev)) {
407 // Possible supplementary. Many edge cases.
408 // Let other functions do the heavy lifting.
409 utext_setNativeIndex(ut, index);
410 cPrev = utext_previous32(ut);
411 }
412 return cPrev;
413}
414
415
416U_DRAFT int32_t U_EXPORT2
417utext_extract(UText *ut,
418 int64_t start, int64_t limit,
419 UChar *dest, int32_t destCapacity,
420 UErrorCode *status) {
421 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
422 }
423
424
425
426U_DRAFT UBool U_EXPORT2
427utext_equals(const UText *a, const UText *b) {
428 if (a==NULL || b==NULL ||
429 a->magic != UTEXT_MAGIC ||
430 b->magic != UTEXT_MAGIC) {
431 // Null or invalid arguments don't compare equal to anything.
432 return FALSE;
433 }
434
435 if (a->pFuncs != b->pFuncs) {
436 // Different types of text providers.
437 return FALSE;
438 }
439
440 if (a->context != b->context) {
441 // Different sources (different strings)
442 return FALSE;
443 }
444 if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
445 // Different current position in the string.
446 return FALSE;
447 }
448
449 return TRUE;
450}
451
452U_DRAFT UBool U_EXPORT2
453utext_isWritable(const UText *ut)
454{
455 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
456 return b;
457}
458
459
460U_DRAFT void U_EXPORT2
461utext_freeze(UText *ut) {
462 // Zero out the WRITABLE flag.
463 ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
464}
465
466
467U_DRAFT UBool U_EXPORT2
468utext_hasMetaData(const UText *ut)
469{
470 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
471 return b;
472}
473
474
475
476U_DRAFT int32_t U_EXPORT2
477utext_replace(UText *ut,
478 int64_t nativeStart, int64_t nativeLimit,
479 const UChar *replacementText, int32_t replacementLength,
480 UErrorCode *status)
481{
482 if (U_FAILURE(*status)) {
483 return 0;
484 }
485 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
486 *status = U_NO_WRITE_PERMISSION;
487 return 0;
488 }
489 int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
490 return i;
491}
492
493U_DRAFT void U_EXPORT2
494utext_copy(UText *ut,
495 int64_t nativeStart, int64_t nativeLimit,
496 int64_t destIndex,
497 UBool move,
498 UErrorCode *status)
499{
500 if (U_FAILURE(*status)) {
501 return;
502 }
503 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
504 *status = U_NO_WRITE_PERMISSION;
505 return;
506 }
507 ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
508}
509
510
511
512U_DRAFT UText * U_EXPORT2
513utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
514 UText *result;
515 result = src->pFuncs->clone(dest, src, deep, status);
516 if (readOnly) {
517 utext_freeze(result);
518 }
519 return result;
520}
521
522
523
524//------------------------------------------------------------------------------
525//
526// UText common functions implementation
527//
528//------------------------------------------------------------------------------
529
//
//  Bit masks for the UText.flags field.
//
enum {
    // Set when ICU allocated the UText struct itself on the heap;
    // clear when the caller supplied the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 1 << 0,

    // Set when ICU allocated the extra storage as a separate heap block;
    // clear when there is no separate allocation, either because no extra
    // storage was requested or because it is appended to the end of the
    // main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 1 << 1,

    // Set while this UText is open; clear when it is not open.
    UTEXT_OPEN                 = 1 << 2
};
546
547
//
//  Extended form of a UText.  The purpose is to aid in computing the total size required
//    when a provider asks for a UText to be allocated with extra storage.
//    utext_setup() sizes a combined heap allocation from this struct and points
//    pExtra at the 'extension' member.

struct ExtendedUText {
    UText               ut;          // The UText proper; must come first.
    UAlignedMemory      extension;   // Marks where the extra storage begins.
};
556
// Template value copied into every UText that utext_setup() allocates on the heap.
static const UText emptyText = UTEXT_INITIALIZER;
558
559U_DRAFT UText * U_EXPORT2
560utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
561 if (U_FAILURE(*status)) {
562 return ut;
563 }
564
565 if (ut == NULL) {
566 // We need to heap-allocate storage for the new UText
567 int32_t spaceRequired = sizeof(UText);
568 if (extraSpace > 0) {
569 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
570 }
571 ut = (UText *)uprv_malloc(spaceRequired);
572 if (ut == NULL) {
573 *status = U_MEMORY_ALLOCATION_ERROR;
574 } else {
575 *ut = emptyText;
576 ut->flags |= UTEXT_HEAP_ALLOCATED;
577 if (spaceRequired>0) {
578 ut->extraSize = extraSpace;
579 ut->pExtra = &((ExtendedUText *)ut)->extension;
580 uprv_memset(ut->pExtra, 0, extraSpace); // Purify whines about copying untouched extra [buffer]
581 // space when cloning, so init it now.
582 }
583 }
584 } else {
585 // We have been supplied with an already existing UText.
586 // Verify that it really appears to be a UText.
587 if (ut->magic != UTEXT_MAGIC) {
588 *status = U_ILLEGAL_ARGUMENT_ERROR;
589 return ut;
590 }
591 // If the ut is already open and there's a provider supplied close
592 // function, call it.
593 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
594 ut->pFuncs->close(ut);
595 }
596 ut->flags &= ~UTEXT_OPEN;
597
598 // If extra space was requested by our caller, check whether
599 // sufficient already exists, and allocate new if needed.
600 if (extraSpace > ut->extraSize) {
601 // Need more space. If there is existing separately allocated space,
602 // delete it first, then allocate new space.
603 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
604 uprv_free(ut->pExtra);
605 ut->extraSize = 0;
606 }
607 ut->pExtra = uprv_malloc(extraSpace);
608 if (ut->pExtra == NULL) {
609 *status = U_MEMORY_ALLOCATION_ERROR;
610 } else {
611 ut->extraSize = extraSpace;
612 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
613 uprv_memset(ut->pExtra, 0, extraSpace);
614 }
615 }
616 }
617 if (U_SUCCESS(*status)) {
618 ut->flags |= UTEXT_OPEN;
619
620 // Initialize all remaining fields of the UText.
621 //
622 ut->context = NULL;
623 ut->chunkContents = NULL;
624 ut->p = NULL;
625 ut->q = NULL;
626 ut->r = NULL;
627 ut->a = 0;
628 ut->b = 0;
629 ut->c = 0;
630 ut->chunkOffset = 0;
631 ut->chunkLength = 0;
632 ut->chunkNativeStart = 0;
633 ut->chunkNativeLimit = 0;
634 ut->nativeIndexingLimit = 0;
635 ut->providerProperties = 0;
636 ut->privA = 0;
637 ut->privB = 0;
638 ut->privC = 0;
639 ut->privP = NULL;
640 }
641 return ut;
642}
643
644
645U_DRAFT UText * U_EXPORT2
646utext_close(UText *ut) {
647 if (ut==NULL ||
648 ut->magic != UTEXT_MAGIC ||
649 (ut->flags & UTEXT_OPEN) == 0)
650 {
651 // The supplied ut is not an open UText.
652 // Do nothing.
653 return ut;
654 }
655
656 // If the provider gave us a close function, call it now.
657 // This will clean up anything allocated specifically by the provider.
658 if (ut->pFuncs->close != NULL) {
659 ut->pFuncs->close(ut);
660 }
661 ut->flags &= ~UTEXT_OPEN;
662
663 // If we (the framework) allocated the UText or subsidiary storage,
664 // delete it.
665 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
666 uprv_free(ut->pExtra);
667 ut->pExtra = NULL;
668 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
669 ut->extraSize = 0;
670 }
671
672 // Zero out function table of the closed UText. This is a defensive move,
673 // inteded to cause applications that inadvertantly use a closed
674 // utext to crash with null pointer errors.
675 ut->pFuncs = NULL;
676
677 if (ut->flags & UTEXT_HEAP_ALLOCATED) {
678 // This UText was allocated by UText setup. We need to free it.
679 // Clear magic, so we can detect if the user messes up and immediately
680 // tries to reopen another UText using the deleted storage.
681 ut->magic = 0;
682 uprv_free(ut);
683 ut = NULL;
684 }
685 return ut;
686}
687
688
689
690
691//
692// invalidateChunk Reset a chunk to have no contents, so that the next call
693// to access will cause new data to load.
694// This is needed when copy/move/replace operate directly on the
695// backing text, potentially putting it out of sync with the
696// contents in the chunk.
697//
698static void
699invalidateChunk(UText *ut) {
700 ut->chunkLength = 0;
701 ut->chunkNativeLimit = 0;
702 ut->chunkNativeStart = 0;
703 ut->chunkOffset = 0;
704 ut->nativeIndexingLimit = 0;
705}
706
//
// pinIndex    Pin a native index parameter to its valid range.
//             A negative index becomes 0; otherwise an index above limit
//             becomes limit. The 64 bit index is updated in place.
//             The pinned value, truncated to 32 bits, is returned as a
//             convenience for providers that don't need 64 bits.
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    index = (index < 0) ? 0
                        : ((index > limit) ? limit : index);
    return (int32_t)index;
}
721
722
723U_CDECL_BEGIN
724
725//
726// Pointer relocation function,
727// a utility used by shallow clone.
728// Adjust a pointer that refers to something within one UText (the source)
729// to refer to the same relative offset within a another UText (the target)
730//
731static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
732 // convert all pointers to (char *) so that byte address arithmetic will work.
733 char *dptr = (char *)*destPtr;
734 char *dUText = (char *)dest;
735 char *sUText = (char *)src;
736
737 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
738 // target ptr was to something within the src UText's pExtra storage.
739 // relocate it into the target UText's pExtra region.
740 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
741 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
742 // target ptr was pointing to somewhere within the source UText itself.
743 // Move it to the same offset within the target UText.
744 *destPtr = dUText + (dptr-sUText);
745 }
746}
747
748
749//
750// Clone. This is a generic copy-the-utext-by-value clone function that can be
751// used as-is with some utext types, and as a helper by other clones.
752//
753static UText * U_CALLCONV
754shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
755 if (U_FAILURE(*status)) {
756 return NULL;
757 }
758 int32_t srcExtraSize = src->extraSize;
759
760 //
761 // Use the generic text_setup to allocate storage if required.
762 //
763 dest = utext_setup(dest, srcExtraSize, status);
764 if (U_FAILURE(*status)) {
765 return dest;
766 }
767
768 //
769 // flags (how the UText was allocated) and the pointer to the
770 // extra storage must retain the values in the cloned utext that
771 // were set up by utext_setup. Save them separately before
772 // copying the whole struct.
773 //
774 void *destExtra = dest->pExtra;
775 int32_t flags = dest->flags;
776
777
778 //
779 // Copy the whole UText struct by value.
780 // Any "Extra" storage is copied also.
781 //
782 int sizeToCopy = src->sizeOfStruct;
783 if (sizeToCopy > dest->sizeOfStruct) {
784 sizeToCopy = dest->sizeOfStruct;
785 }
786 uprv_memcpy(dest, src, sizeToCopy);
787 dest->pExtra = destExtra;
788 dest->flags = flags;
789 if (srcExtraSize > 0) {
790 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
791 }
792
793 //
794 // Relocate any pointers in the target that refer to the UText itself
795 // to point to the cloned copy rather than the original source.
796 //
797 adjustPointer(dest, &dest->context, src);
798 adjustPointer(dest, &dest->p, src);
799 adjustPointer(dest, &dest->q, src);
800 adjustPointer(dest, &dest->r, src);
801
802 return dest;
803}
804
805
806U_CDECL_END
807
808
809
810//------------------------------------------------------------------------------
811//
812// UText implementation for UTF-8 char * strings (read-only)
813// Limitation: string length must be <= 0x7fffffff in length.
814// (length must fit in an int32_t variable)
815//
816// Use of UText data members:
817// context pointer to UTF-8 string
818// utext.b is the input string length (bytes).
819// utext.c Length scanned so far in string
820// (for optimizing finding length of zero terminated strings.)
821// utext.p pointer to the current buffer
822// utext.q pointer to the other buffer.
823//
824//------------------------------------------------------------------------------
825
// Chunk size.
//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
//     to two UChars.)
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };
832
//
// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
//          Each contains the UChar chunk buffer, the to and from native maps, and
//          header info.
//
//     Because backwards iteration fills the buffers starting at the end and
//     working towards the front, the filled part of the buffers may not begin
//     at the start of the available storage for the buffers.
//
//     Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
//     the last character added being a supplementary, and thus requiring a surrogate
//     pair.  Doing this is simpler than checking for the edge case.
//

struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires one extra position beyond
                                                     //   the chunk size, to allow for surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // Map UChar index in buf to
                                                     //   native offset from bufNativeStart.
                                                     //   Requires two extra slots,
                                                     //   one for a supplementary starting in the last normal position,
                                                     //   and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;                                 // NOTE(review): presumably padding so the struct size is a
                                                     //   multiple of 4 bytes; not referenced in the visible code - confirm.
};
873
874U_CDECL_BEGIN
875
876//
877// utf8TextLength
878//
879// Get the length of the string. If we don't already know it,
880// we'll need to scan for the trailing nul.
881//
882static int64_t U_CALLCONV
883utf8TextLength(UText *ut) {
884 if (ut->b < 0) {
885 // Zero terminated string, and we haven't scanned to the end yet.
886 // Scan it now.
887 const char *r = (const char *)ut->context + ut->c;
888 while (*r != 0) {
889 r++;
890 }
891 if ((r - (const char *)ut->context) < 0x7fffffff) {
892 ut->b = (int32_t)(r - (const char *)ut->context);
893 } else {
894 // Actual string was bigger (more than 2 gig) than we
895 // can handle. Clip it to 2 GB.
896 ut->b = 0x7fffffff;
897 }
898 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
899 }
900 return ut->b;
901}
902
903
904
905
906
907
908static UBool U_CALLCONV
909utf8TextAccess(UText *ut, int64_t index, UBool forward) {
910 //
911 // Apologies to those who are allergic to goto statements.
912 // Consider each goto to a labelled block to be the equivalent of
913 // call the named block as if it were a function();
914 // return;
915 //
916 const uint8_t *s8=(const uint8_t *)ut->context;
917 UTF8Buf *u8b = NULL;
918 int32_t length = ut->b; // Length of original utf-8
919 int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
920 int32_t mapIndex = 0;
921 if (index<0) {
922 ix=0;
923 } else if (index > 0x7fffffff) {
924 // Strings with 64 bit lengths not supported by this UTF-8 provider.
925 ix = 0x7fffffff;
926 }
927
928 // Pin requested index to the string length.
929 if (ix>length) {
930 if (length>=0) {
931 ix=length;
932 } else if (ix>ut->c) {
933 // Zero terminated string, and requested index is beyond
934 // the region that has already been scanned.
935 // Scan up to either the end of the string or to the
936 // requested position, whichever comes first.
937 while (ut->c<ix && s8[ut->c]!=0) {
938 ut->c++;
939 }
940 // TODO: support for null terminated string length > 32 bits.
941 if (s8[ut->c] == 0) {
942 // We just found the actual length of the string.
943 // Trim the requested index back to that.
944 ix = ut->c;
945 ut->b = ut->c;
946 length = ut->c;
947 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
948 }
949 }
950 }
951
952 //
953 // Dispatch to the appropriate action for a forward iteration request.
954 //
955 if (forward) {
956 if (ix==ut->chunkNativeLimit) {
957 // Check for normal sequential iteration cases first.
958 if (ix==length) {
959 // Just reached end of string
960 // Don't swap buffers, but do set the
961 // current buffer position.
962 ut->chunkOffset = ut->chunkLength;
963 return FALSE;
964 } else {
965 // End of current buffer.
966 // check whether other buffer already has what we need.
967 UTF8Buf *altB = (UTF8Buf *)ut->q;
968 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
969 goto swapBuffers;
970 }
971 }
972 }
973
974 // A random access. Desired index could be in either or niether buf.
975 // For optimizing the order of testing, first check for the index
976 // being in the other buffer. This will be the case for uses that
977 // move back and forth over a fairly limited range
978 {
979 u8b = (UTF8Buf *)ut->q; // the alternate buffer
980 if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
981 // Requested index is in the other buffer.
982 goto swapBuffers;
983 }
984 if (ix == length) {
985 // Requested index is end-of-string.
986 // (this is the case of randomly seeking to the end.
987 // The case of iterating off the end is handled earlier.)
988 if (ix == ut->chunkNativeLimit) {
989 // Current buffer extends up to the end of the string.
990 // Leave it as the current buffer.
991 ut->chunkOffset = ut->chunkLength;
992 return FALSE;
993 }
994 if (ix == u8b->bufNativeLimit) {
995 // Alternate buffer extends to the end of string.
996 // Swap it in as the current buffer.
997 goto swapBuffersAndFail;
998 }
999
1000 // Neither existing buffer extends to the end of the string.
1001 goto makeStubBuffer;
1002 }
1003
1004 if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
1005 // Requested index is in neither buffer.
1006 goto fillForward;
1007 }
1008
1009 // Requested index is in this buffer.
1010 u8b = (UTF8Buf *)ut->p; // the current buffer
1011 mapIndex = ix - u8b->toUCharsMapStart;
1012 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1013 return TRUE;
1014
1015 }
1016 }
1017
1018
1019 //
1020 // Dispatch to the appropriate action for a
1021 // Backwards Diretion iteration request.
1022 //
1023 if (ix==ut->chunkNativeStart) {
1024 // Check for normal sequential iteration cases first.
1025 if (ix==0) {
1026 // Just reached the start of string
1027 // Don't swap buffers, but do set the
1028 // current buffer position.
1029 ut->chunkOffset = 0;
1030 return FALSE;
1031 } else {
1032 // Start of current buffer.
1033 // check whether other buffer already has what we need.
1034 UTF8Buf *altB = (UTF8Buf *)ut->q;
1035 if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
1036 goto swapBuffers;
1037 }
1038 }
1039 }
1040
1041 // A random access. Desired index could be in either or niether buf.
1042 // For optimizing the order of testing,
1043 // Most likely case: in the other buffer.
1044 // Second most likely: in neither buffer.
1045 // Unlikely, but must work: in the current buffer.
1046 u8b = (UTF8Buf *)ut->q; // the alternate buffer
1047 if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
1048 // Requested index is in the other buffer.
1049 goto swapBuffers;
1050 }
1051 // Requested index is start-of-string.
1052 // (this is the case of randomly seeking to the start.
1053 // The case of iterating off the start is handled earlier.)
1054 if (ix==0) {
1055 if (u8b->bufNativeStart==0) {
1056 // Alternate buffer contains the data for the start string.
1057 // Make it be the current buffer.
1058 goto swapBuffersAndFail;
1059 } else {
1060 // Request for data before the start of string,
1061 // neither buffer is usable.
1062 // set up a zero-length buffer.
1063 goto makeStubBuffer;
1064 }
1065 }
1066
1067 if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
1068 // Requested index is in neither buffer.
1069 goto fillReverse;
1070 }
1071
1072 // Requested index is in this buffer.
1073 // Set the utf16 buffer index.
1074 u8b = (UTF8Buf *)ut->p;
1075 mapIndex = ix - u8b->toUCharsMapStart;
1076 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1077 if (ut->chunkOffset==0) {
1078 // This occurs when the first character in the text is
1079 // a multi-byte UTF-8 char, and the requested index is to
1080 // one of the trailing bytes. Because there is no preceding ,
1081 // character, this access fails. We can't pick up on the
1082 // situation sooner because the requested index is not zero.
1083 return FALSE;
1084 } else {
1085 return TRUE;
1086 }
1087
1088
1089
1090swapBuffers:
1091 // The alternate buffer (ut->q) has the string data that was requested.
1092 // Swap the primary and alternate buffers, and set the
1093 // chunk index into the new primary buffer.
1094 {
1095 u8b = (UTF8Buf *)ut->q;
1096 ut->q = ut->p;
1097 ut->p = u8b;
1098 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1099 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1100 ut->chunkNativeStart = u8b->bufNativeStart;
1101 ut->chunkNativeLimit = u8b->bufNativeLimit;
1102 ut->nativeIndexingLimit = u8b->bufNILimit;
1103
1104 // Index into the (now current) chunk
1105 // Use the map to set the chunk index. It's more trouble than it's worth
1106 // to check whether native indexing can be used.
1107 U_ASSERT(ix>=u8b->bufNativeStart);
1108 U_ASSERT(ix<=u8b->bufNativeLimit);
1109 mapIndex = ix - u8b->toUCharsMapStart;
1110 U_ASSERT(mapIndex>=0);
1111 U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
1112 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1113
1114 return TRUE;
1115 }
1116
1117
1118 swapBuffersAndFail:
1119 // We got a request for either the start or end of the string,
1120 // with iteration continuing in the out-of-bounds direction.
1121 // The alternate buffer already contains the data up to the
1122 // start/end.
1123 // Swap the buffers, then return failure, indicating that we couldn't
1124 // make things correct for continuing the iteration in the requested
1125 // direction. The position & buffer are correct should the
1126 // user decide to iterate in the opposite direction.
1127 u8b = (UTF8Buf *)ut->q;
1128 ut->q = ut->p;
1129 ut->p = u8b;
1130 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
1131 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1132 ut->chunkNativeStart = u8b->bufNativeStart;
1133 ut->chunkNativeLimit = u8b->bufNativeLimit;
1134 ut->nativeIndexingLimit = u8b->bufNILimit;
1135
1136 // Index into the (now current) chunk
1137 // For this function (swapBuffersAndFail), the requested index
1138 // will always be at either the start or end of the chunk.
1139 if (ix==u8b->bufNativeLimit) {
1140 ut->chunkOffset = ut->chunkLength;
1141 } else {
1142 ut->chunkOffset = 0;
1143 U_ASSERT(ix == u8b->bufNativeStart);
1144 }
1145 return FALSE;
1146
1147makeStubBuffer:
1148 // The user has done a seek/access past the start or end
1149 // of the string. Rather than loading data that is likely
1150 // to never be used, just set up a zero-length buffer at
1151 // the position.
1152 u8b = (UTF8Buf *)ut->q;
1153 u8b->bufNativeStart = ix;
1154 u8b->bufNativeLimit = ix;
1155 u8b->bufStartIdx = 0;
1156 u8b->bufLimitIdx = 0;
1157 u8b->bufNILimit = 0;
1158 u8b->toUCharsMapStart = ix;
1159 u8b->mapToNative[0] = 0;
1160 u8b->mapToUChars[0] = 0;
1161 goto swapBuffersAndFail;
1162
1163
1164
1165fillForward:
1166 {
1167 // Move the incoming index to a code point boundary.
1168 U8_SET_CP_START(s8, 0, ix);
1169
1170 // Swap the UText buffers.
1171 // We want to fill what was previously the alternate buffer,
1172 // and make what was the current buffer be the new alternate.
1173 UTF8Buf *u8b = (UTF8Buf *)ut->q;
1174 ut->q = ut->p;
1175 ut->p = u8b;
1176
1177 int32_t strLen = ut->b;
1178 UBool nulTerminated = FALSE;
1179 if (strLen < 0) {
1180 strLen = 0x7fffffff;
1181 nulTerminated = TRUE;
1182 }
1183
1184 UChar *buf = u8b->buf;
1185 uint8_t *mapToNative = u8b->mapToNative;
1186 uint8_t *mapToUChars = u8b->mapToUChars;
1187 int32_t destIx = 0;
1188 int32_t srcIx = ix;
1189 UBool seenNonAscii = FALSE;
1190 UChar32 c;
1191
1192 // Fill the chunk buffer and mapping arrays.
1193 while (destIx<UTF8_TEXT_CHUNK_SIZE) {
1194 c = s8[srcIx];
1195 if (c>0 && c<0x80) {
1196 // Special case ASCII range for speed.
1197 // zero is excluded to simplify bounds checking.
1198 buf[destIx] = c;
1199 mapToNative[destIx] = srcIx - ix;
1200 mapToUChars[srcIx-ix] = destIx;
1201 srcIx++;
1202 destIx++;
1203 } else {
1204 // General case, handle everything.
1205 if (seenNonAscii == FALSE) {
1206 seenNonAscii = TRUE;
1207 u8b->bufNILimit = destIx;
1208 }
1209
1210 int32_t cIx = srcIx;
1211 int32_t dIx = destIx;
1212 int32_t dIxSaved = destIx;
1213 U8_NEXT(s8, srcIx, strLen, c);
1214 if (c==0 && nulTerminated) {
1215 srcIx--;
1216 break;
1217 }
1218 if (c<0) {
1219 // Illegal UTF-8. Replace with sub character.
1220 c = 0x0fffd;
1221 }
1222
1223 U16_APPEND_UNSAFE(buf, destIx, c);
1224 do {
1225 mapToNative[dIx++] = cIx - ix;
1226 } while (dIx < destIx);
1227
1228 do {
1229 mapToUChars[cIx++ - ix] = dIxSaved;
1230 } while (cIx < srcIx);
1231 }
1232 if (srcIx>=strLen) {
1233 break;
1234 }
1235
1236 }
1237
1238 // store Native <--> Chunk Map entries for the end of the buffer.
1239 // There is no actual character here, but the index position is valid.
1240 mapToNative[destIx] = srcIx - ix;
1241 mapToUChars[srcIx - ix] = destIx;
1242
1243 // fill in Buffer descriptor
1244 u8b->bufNativeStart = ix;
1245 u8b->bufNativeLimit = srcIx;
1246 u8b->bufStartIdx = 0;
1247 u8b->bufLimitIdx = destIx;
1248 if (seenNonAscii == FALSE) {
1249 u8b->bufNILimit = destIx;
1250 }
1251 u8b->toUCharsMapStart = u8b->bufNativeStart;
1252
1253 // Set UText chunk to refer to this buffer.
1254 ut->chunkContents = buf;
1255 ut->chunkOffset = 0;
1256 ut->chunkLength = u8b->bufLimitIdx;
1257 ut->chunkNativeStart = u8b->bufNativeStart;
1258 ut->chunkNativeLimit = u8b->bufNativeLimit;
1259 ut->nativeIndexingLimit = u8b->bufNILimit;
1260
1261 // For zero terminated strings, keep track of the maximum point
1262 // scanned so far.
1263 if (nulTerminated && srcIx>ut->c) {
1264 ut->c = srcIx;
1265 if (c==0) {
1266 // We scanned to the end.
1267 // Remember the actual length.
1268 ut->b = srcIx;
1269 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1270 }
1271 }
1272 return TRUE;
1273 }
1274
1275
1276fillReverse:
1277 {
1278 // Move the incoming index to a code point boundary.
1279 // Can only do this if the incoming index is somewhere in the interior of the string.
1280 // If index is at the end, there is no character there to look at.
1281 if (ix != ut->b) {
1282 U8_SET_CP_START(s8, 0, ix);
1283 }
1284
1285 // Swap the UText buffers.
1286 // We want to fill what was previously the alternate buffer,
1287 // and make what was the current buffer be the new alternate.
1288 UTF8Buf *u8b = (UTF8Buf *)ut->q;
1289 ut->q = ut->p;
1290 ut->p = u8b;
1291
1292 UChar *buf = u8b->buf;
1293 uint8_t *mapToNative = u8b->mapToNative;
1294 uint8_t *mapToUChars = u8b->mapToUChars;
1295 int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
1296 int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
1297 // at end of buffer to leave room
1298 // for a surrogate pair at the
1299 // buffer start.
1300 int32_t srcIx = ix;
1301 int32_t bufNILimit = destIx;
1302 UChar32 c;
1303
1304 // Map to/from Native Indexes, fill in for the position at the end of
1305 // the buffer.
1306 //
1307 mapToNative[destIx] = srcIx - toUCharsMapStart;
1308 mapToUChars[srcIx - toUCharsMapStart] = destIx;
1309
1310 // Fill the chunk buffer
1311 // Work backwards, filling from the end of the buffer towards the front.
1312 //
1313 while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
1314 srcIx--;
1315 destIx--;
1316
1317 // Get last byte of the UTF-8 character
1318 c = s8[srcIx];
1319 if (c<0x80) {
1320 // Special case ASCII range for speed.
1321 buf[destIx] = c;
1322 mapToUChars[srcIx - toUCharsMapStart] = destIx;
1323 mapToNative[destIx] = srcIx - toUCharsMapStart;
1324 } else {
1325 // General case, handle everything non-ASCII.
1326
1327 int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
1328
1329 // Get the full character from the UTF8 string.
1330 // use code derived from tbe macros in utf.8
1331 // Leaves srcIx pointing at the first byte of the UTF-8 char.
1332 //
1333 if (c<=0xbf) {
1334 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -1);
1335 // leaves srcIx at first byte of the multi-byte char.
1336 } else {
1337 c=0x0fffd;
1338 }
1339
1340 // Store the character in UTF-16 buffer.
1341 if (c<0x10000) {
1342 buf[destIx] = c;
1343 mapToNative[destIx] = srcIx - toUCharsMapStart;
1344 } else {
1345 buf[destIx] = U16_TRAIL(c);
1346 mapToNative[destIx] = srcIx - toUCharsMapStart;
1347 buf[--destIx] = U16_LEAD(c);
1348 mapToNative[destIx] = srcIx - toUCharsMapStart;
1349 }
1350
1351 // Fill in the map from native indexes to UChars buf index.
1352 do {
1353 mapToUChars[sIx-- - toUCharsMapStart] = destIx;
1354 } while (sIx >= srcIx);
1355
1356 // Set native indexing limit to be the current position.
1357 // We are processing a non-ascii, non-native-indexing char now;
1358 // the limit will be here if the rest of the chars to be
1359 // added to this buffer are ascii.
1360 bufNILimit = destIx;
1361 }
1362 }
1363 u8b->bufNativeStart = srcIx;
1364 u8b->bufNativeLimit = ix;
1365 u8b->bufStartIdx = destIx;
1366 u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
1367 u8b->bufNILimit = bufNILimit - u8b->bufStartIdx;
1368 u8b->toUCharsMapStart = toUCharsMapStart;
1369
1370 ut->chunkContents = &buf[u8b->bufStartIdx];
1371 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
1372 ut->chunkOffset = ut->chunkLength;
1373 ut->chunkNativeStart = u8b->bufNativeStart;
1374 ut->chunkNativeLimit = u8b->bufNativeLimit;
1375 ut->nativeIndexingLimit = u8b->bufNILimit;
1376 return TRUE;
1377 }
1378
1379}
1380
1381
1382
1383//
1384// This is a slightly modified copy of u_strFromUTF8,
1385// Inserts a Replacement Char rather than failing on invalid UTF-8
1386// Removes unnecessary features.
1387//
1388static UChar*
1389utext_strFromUTF8(UChar *dest,
1390 int32_t destCapacity,
1391 int32_t *pDestLength,
1392 const char* src,
1393 int32_t srcLength, // required. NUL terminated not supported.
1394 UErrorCode *pErrorCode
1395 )
1396{
1397
1398 UChar *pDest = dest;
1399 UChar *pDestLimit = dest+destCapacity;
1400 UChar32 ch=0;
1401 int32_t index = 0;
1402 int32_t reqLength = 0;
1403 uint8_t* pSrc = (uint8_t*) src;
1404
1405
1406 while((index < srcLength)&&(pDest<pDestLimit)){
1407 ch = pSrc[index++];
1408 if(ch <=0x7f){
1409 *pDest++=(UChar)ch;
1410 }else{
1411 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1412 if(ch<0){
1413 ch = 0xfffd;
1414 }
1415 if(ch<=0xFFFF){
1416 *(pDest++)=(UChar)ch;
1417 }else{
1418 *(pDest++)=UTF16_LEAD(ch);
1419 if(pDest<pDestLimit){
1420 *(pDest++)=UTF16_TRAIL(ch);
1421 }else{
1422 reqLength++;
1423 break;
1424 }
1425 }
1426 }
1427 }
1428 /* donot fill the dest buffer just count the UChars needed */
1429 while(index < srcLength){
1430 ch = pSrc[index++];
1431 if(ch <= 0x7f){
1432 reqLength++;
1433 }else{
1434 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
1435 if(ch<0){
1436 ch = 0xfffd;
1437 }
1438 reqLength+=UTF_CHAR_LENGTH(ch);
1439 }
1440 }
1441
1442 reqLength+=(int32_t)(pDest - dest);
1443
1444 if(pDestLength){
1445 *pDestLength = reqLength;
1446 }
1447
1448 /* Terminate the buffer */
1449 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
1450
1451 return dest;
1452}
1453
1454
1455
1456static int32_t U_CALLCONV
1457utf8TextExtract(UText *ut,
1458 int64_t start, int64_t limit,
1459 UChar *dest, int32_t destCapacity,
1460 UErrorCode *pErrorCode) {
1461 if(U_FAILURE(*pErrorCode)) {
1462 return 0;
1463 }
1464 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1465 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1466 return 0;
1467 }
1468 int32_t length = ut->b;
1469 int32_t start32 = pinIndex(start, length);
1470 int32_t limit32 = pinIndex(limit, length);
1471
1472 if(start32>limit32) {
1473 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1474 return 0;
1475 }
1476
1477
1478 // adjust the incoming indexes to land on code point boundaries if needed.
1479 // adjust by no more than three, because that is the largest number of trail bytes
1480 // in a well formed UTF8 character.
1481 const uint8_t *buf = (const uint8_t *)ut->context;
1482 int i;
1483 if (start32 < ut->chunkNativeLimit) {
1484 for (i=0; i<3; i++) {
1485 if (U8_IS_LEAD(buf[start32]) || start32==0) {
1486 break;
1487 }
1488 start32--;
1489 }
1490 }
1491
1492 if (limit32 < ut->chunkNativeLimit) {
1493 for (i=0; i<3; i++) {
1494 if (U8_IS_LEAD(buf[limit32]) || limit32==0) {
1495 break;
1496 }
1497 limit32--;
1498 }
1499 }
1500
1501 // Do the actual extract.
1502 int32_t destLength=0;
1503 utext_strFromUTF8(dest, destCapacity, &destLength,
1504 (const char *)ut->context+start32, limit32-start32,
1505 pErrorCode);
1506 return destLength;
1507}
1508
1509//
1510// utf8TextMapOffsetToNative
1511//
1512// Map a chunk (UTF-16) offset to a native index.
1513static int64_t U_CALLCONV
1514utf8TextMapOffsetToNative(const UText *ut) {
1515 //
1516 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1517 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
1518 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
1519 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
1520 return nativeOffset;
1521}
1522
1523//
1524// Map a native index to the corrsponding chunk offset
1525//
1526static int32_t U_CALLCONV
1527utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
1528 U_ASSERT(index64 <= 0x7fffffff);
1529 int32_t index = (int32_t)index64;
1530 UTF8Buf *u8b = (UTF8Buf *)ut->p;
1531 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
1532 U_ASSERT(index<=ut->chunkNativeLimit);
1533 int32_t mapIndex = index - u8b->toUCharsMapStart;
1534 int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
1535 U_ASSERT(offset>=0 && offset<=ut->chunkLength);
1536 return offset;
1537}
1538
1539static UText * U_CALLCONV
1540utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
1541{
1542 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1543 dest = shallowTextClone(dest, src, status);
1544
1545 // For deep clones, make a copy of the string.
1546 // The copied storage is owned by the newly created clone.
1547 //
1548 // TODO: There is an isssue with using utext_nativeLength().
1549 // That function is non-const in cases where the input was NUL terminated
1550 // and the length has not yet been determined.
1551 // This function (clone()) is const.
1552 // There potentially a thread safety issue lurking here.
1553 //
1554 if (deep && U_SUCCESS(*status)) {
1555 int32_t len = (int32_t)utext_nativeLength((UText *)src);
1556 char *copyStr = (char *)uprv_malloc(len+1);
1557 if (copyStr == NULL) {
1558 *status = U_MEMORY_ALLOCATION_ERROR;
1559 } else {
1560 uprv_memcpy(copyStr, src->context, len+1);
1561 dest->context = copyStr;
1562 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1563 }
1564 }
1565 return dest;
1566}
1567
1568
1569static void U_CALLCONV
1570utf8TextClose(UText *ut) {
1571 // Most of the work of close is done by the generic UText framework close.
1572 // All that needs to be done here is to delete the UTF8 string if the UText
1573 // owns it. This occurs if the UText was created by cloning.
1574 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1575 char *s = (char *)ut->context;
1576 uprv_free(s);
1577 ut->context = NULL;
1578 }
1579}
1580
1581U_CDECL_END
1582
1583
1584static struct UTextFuncs utf8Funcs =
1585{
1586 sizeof(UTextFuncs),
1587 0, 0, 0, // Reserved alignment padding
1588 utf8TextClone,
1589 utf8TextLength,
1590 utf8TextAccess,
1591 utf8TextExtract,
1592 NULL, /* replace*/
1593 NULL, /* copy */
1594 utf8TextMapOffsetToNative,
1595 utf8TextMapIndexToUTF16,
1596 utf8TextClose,
1597 NULL, // spare 1
1598 NULL, // spare 2
1599 NULL // spare 3
1600};
1601
1602
1603U_DRAFT UText * U_EXPORT2
1604utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
1605 if(U_FAILURE(*status)) {
1606 return NULL;
1607 }
1608 if(s==NULL || length<-1 || length>INT32_MAX) {
1609 *status=U_ILLEGAL_ARGUMENT_ERROR;
1610 return NULL;
1611 }
1612
1613 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
1614 if (U_FAILURE(*status)) {
1615 return ut;
1616 }
1617
1618 ut->pFuncs = &utf8Funcs;
1619 ut->context = s;
1620 ut->b = (int32_t)length;
1621 ut->c = (int32_t)length;
1622 if (ut->c < 0) {
1623 ut->c = 0;
1624 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
1625 }
1626 ut->p = ut->pExtra;
1627 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
1628 return ut;
1629
1630}
1631
1632
1633
1634
1635
1636
1637
1638
1639//------------------------------------------------------------------------------
1640//
1641// UText implementation wrapper for Replaceable (read/write)
1642//
1643// Use of UText data members:
1644// context pointer to Replaceable.
1645// p pointer to Replaceable if it is owned by the UText.
1646//
1647//------------------------------------------------------------------------------
1648
1649
1650
1651// minimum chunk size for this implementation: 3
1652// to allow for possible trimming for code point boundaries
1653enum { REP_TEXT_CHUNK_SIZE=10 };
1654
1655struct ReplExtra {
1656 /*
1657 * Chunk UChars.
1658 * +1 to simplify filling with surrogate pair at the end.
1659 */
1660 UChar s[REP_TEXT_CHUNK_SIZE+1];
1661};
1662
1663
1664U_CDECL_BEGIN
1665
1666static UText * U_CALLCONV
1667repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
1668 // First do a generic shallow clone. Does everything needed for the UText struct itself.
1669 dest = shallowTextClone(dest, src, status);
1670
1671 // For deep clones, make a copy of the Replaceable.
1672 // The copied Replaceable storage is owned by the newly created UText clone.
1673 // A non-NULL pointer in UText.p is the signal to the close() function to delete
1674 // it.
1675 //
1676 if (deep && U_SUCCESS(*status)) {
1677 const Replaceable *replSrc = (const Replaceable *)src->context;
1678 dest->context = replSrc->clone();
1679 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
1680
1681 // with deep clone, the copy is writable, even when the source is not.
1682 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
1683 }
1684 return dest;
1685}
1686
1687
1688static void U_CALLCONV
1689repTextClose(UText *ut) {
1690 // Most of the work of close is done by the generic UText framework close.
1691 // All that needs to be done here is delete the Replaceable if the UText
1692 // owns it. This occurs if the UText was created by cloning.
1693 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
1694 Replaceable *rep = (Replaceable *)ut->context;
1695 delete rep;
1696 ut->context = NULL;
1697 }
1698}
1699
1700
1701static int64_t U_CALLCONV
1702repTextLength(UText *ut) {
1703 const Replaceable *replSrc = (const Replaceable *)ut->context;
1704 int32_t len = replSrc->length();
1705 return len;
1706}
1707
1708
1709static UBool U_CALLCONV
1710repTextAccess(UText *ut, int64_t index, UBool forward) {
1711 const Replaceable *rep=(const Replaceable *)ut->context;
1712 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
1713
1714 // clip the requested index to the limits of the text.
1715 int32_t index32 = pinIndex(index, length);
1716 U_ASSERT(index<=INT32_MAX);
1717
1718
1719 /*
1720 * Compute start/limit boundaries around index, for a segment of text
1721 * to be extracted.
1722 * To allow for the possibility that our user gave an index to the trailing
1723 * half of a surrogate pair, we must request one extra preceding UChar when
1724 * going in the forward direction. This will ensure that the buffer has the
1725 * entire code point at the specified index.
1726 */
1727 if(forward) {
1728
1729 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
1730 // Buffer already contains the requested position.
1731 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
1732 return TRUE;
1733 }
1734 if (index32>=length && ut->chunkNativeLimit==length) {
1735 // Request for end of string, and buffer already extends up to it.
1736 // Can't get the data, but don't change the buffer.
1737 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
1738 return FALSE;
1739 }
1740
1741 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
1742 // Going forward, so we want to have the buffer with stuff at and beyond
1743 // the requested index. The -1 gets us one code point before the
1744 // requested index also, to handle the case of the index being on
1745 // a trail surrogate of a surrogate pair.
1746 if(ut->chunkNativeLimit > length) {
1747 ut->chunkNativeLimit = length;
1748 }
1749 // unless buffer ran off end, start is index-1.
1750 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
1751 if(ut->chunkNativeStart < 0) {
1752 ut->chunkNativeStart = 0;
1753 }
1754 } else {
1755 // Reverse iteration. Fill buffer with data preceding the requested index.
1756 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
1757 // Requested position already in buffer.
1758 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
1759 return TRUE;
1760 }
1761 if (index32==0 && ut->chunkNativeStart==0) {
1762 // Request for start, buffer already begins at start.
1763 // No data, but keep the buffer as is.
1764 ut->chunkOffset = 0;
1765 return FALSE;
1766 }
1767
1768 // Figure out the bounds of the chunk to extract for reverse iteration.
1769 // Need to worry about chunk not splitting surrogate pairs, and while still
1770 // containing the data we need.
1771 // Fix by requesting a chunk that includes an extra UChar at the end.
1772 // If this turns out to be a lead surrogate, we can lop it off and still have
1773 // the data we wanted.
1774 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
1775 if (ut->chunkNativeStart < 0) {
1776 ut->chunkNativeStart = 0;
1777 }
1778
1779 ut->chunkNativeLimit = index32 + 1;
1780 if (ut->chunkNativeLimit > length) {
1781 ut->chunkNativeLimit = length;
1782 }
1783 }
1784
1785 // Extract the new chunk of text from the Replaceable source.
1786 ReplExtra *ex = (ReplExtra *)ut->pExtra;
1787 // UnicodeString with its buffer a writable alias to the chunk buffer
1788 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
1789 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
1790
1791 ut->chunkContents = ex->s;
1792 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
1793 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
1794
1795 // Surrogate pairs from the input text must not span chunk boundaries.
1796 // If end of chunk could be the start of a surrogate, trim it off.
1797 if (ut->chunkNativeLimit < length &&
1798 U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
1799 ut->chunkLength--;
1800 ut->chunkNativeLimit--;
1801 if (ut->chunkOffset > ut->chunkLength) {
1802 ut->chunkOffset = ut->chunkLength;
1803 }
1804 }
1805
1806 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
1807 // trim it off.
1808 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
1809 ++(ut->chunkContents);
1810 ++(ut->chunkNativeStart);
1811 --(ut->chunkLength);
1812 --(ut->chunkOffset);
1813 }
1814
1815 // adjust the index/chunkOffset to a code point boundary
1816 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
1817
1818 // Use fast indexing for get/setNativeIndex()
1819 ut->nativeIndexingLimit = ut->chunkLength;
1820
1821 return TRUE;
1822}
1823
1824
1825
1826static int32_t U_CALLCONV
1827repTextExtract(UText *ut,
1828 int64_t start, int64_t limit,
1829 UChar *dest, int32_t destCapacity,
1830 UErrorCode *status) {
1831 const Replaceable *rep=(const Replaceable *)ut->context;
1832 int32_t length=rep->length();
1833
1834 if(U_FAILURE(*status)) {
1835 return 0;
1836 }
1837 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1838 *status=U_ILLEGAL_ARGUMENT_ERROR;
1839 }
1840 if(start>limit) {
1841 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1842 return 0;
1843 }
1844
1845 int32_t start32 = pinIndex(start, length);
1846 int32_t limit32 = pinIndex(limit, length);
1847
1848 // adjust start, limit if they point to trail half of surrogates
1849 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
1850 U_IS_SUPPLEMENTARY(rep->char32At(start32))){
1851 start32--;
1852 }
1853 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
1854 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
1855 limit32--;
1856 }
1857
1858 length=limit32-start32;
1859 if(length>destCapacity) {
1860 limit32 = start32 + destCapacity;
1861 }
1862 UnicodeString buffer(dest, 0, destCapacity); // writable alias
1863 rep->extractBetween(start32, limit32, buffer);
1864 return u_terminateUChars(dest, destCapacity, length, status);
1865}
1866
1867static int32_t U_CALLCONV
1868repTextReplace(UText *ut,
1869 int64_t start, int64_t limit,
1870 const UChar *src, int32_t length,
1871 UErrorCode *status) {
1872 Replaceable *rep=(Replaceable *)ut->context;
1873 int32_t oldLength;
1874
1875 if(U_FAILURE(*status)) {
1876 return 0;
1877 }
1878 if(src==NULL && length!=0) {
1879 *status=U_ILLEGAL_ARGUMENT_ERROR;
1880 return 0;
1881 }
1882 oldLength=rep->length(); // will subtract from new length
1883 if(start>limit ) {
1884 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1885 return 0;
1886 }
1887
1888 int32_t start32 = pinIndex(start, oldLength);
1889 int32_t limit32 = pinIndex(limit, oldLength);
1890
1891 // Snap start & limit to code point boundaries.
1892 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
1893 start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
1894 {
1895 start32--;
1896 }
1897 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
1898 U16_IS_TRAIL(rep->charAt(limit32)))
1899 {
1900 limit32++;
1901 }
1902
1903 // Do the actual replace operation using methods of the Replaceable class
1904 UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
1905 rep->handleReplaceBetween(start32, limit32, replStr);
1906 int32_t newLength = rep->length();
1907 int32_t lengthDelta = newLength - oldLength;
1908
1909 // Is the UText chunk buffer OK?
1910 if (ut->chunkNativeLimit > start32) {
1911 // this replace operation may have impacted the current chunk.
1912 // invalidate it, which will force a reload on the next access.
1913 invalidateChunk(ut);
1914 }
1915
1916 // set the iteration position to the end of the newly inserted replacement text.
1917 int32_t newIndexPos = limit32 + lengthDelta;
1918 repTextAccess(ut, newIndexPos, TRUE);
1919
1920 return lengthDelta;
1921}
1922
1923
1924static void U_CALLCONV
1925repTextCopy(UText *ut,
1926 int64_t start, int64_t limit,
1927 int64_t destIndex,
1928 UBool move,
1929 UErrorCode *status)
1930{
1931 Replaceable *rep=(Replaceable *)ut->context;
1932 int32_t length=rep->length();
1933
1934 if(U_FAILURE(*status)) {
1935 return;
1936 }
1937 if (start>limit || (start<destIndex && destIndex<limit))
1938 {
1939 *status=U_INDEX_OUTOFBOUNDS_ERROR;
1940 return;
1941 }
1942
1943 int32_t start32 = pinIndex(start, length);
1944 int32_t limit32 = pinIndex(limit, length);
1945 int32_t destIndex32 = pinIndex(destIndex, length);
1946
1947 // TODO: snap input parameters to code point boundaries.
1948
1949 if(move) {
1950 // move: copy to destIndex, then replace original with nothing
1951 int32_t segLength=limit32-start32;
1952 rep->copy(start32, limit32, destIndex32);
1953 if(destIndex32<start32) {
1954 start32+=segLength;
1955 limit32+=segLength;
1956 }
1957 rep->handleReplaceBetween(start32, limit32, UnicodeString());
1958 } else {
1959 // copy
1960 rep->copy(start32, limit32, destIndex32);
1961 }
1962
1963 // If the change to the text touched the region in the chunk buffer,
1964 // invalidate the buffer.
1965 int32_t firstAffectedIndex = destIndex32;
1966 if (move && start32<firstAffectedIndex) {
1967 firstAffectedIndex = start32;
1968 }
1969 if (firstAffectedIndex < ut->chunkNativeLimit) {
1970 // changes may have affected range covered by the chunk
1971 invalidateChunk(ut);
1972 }
1973
1974 // Put iteration position at the newly inserted (moved) block,
1975 int32_t nativeIterIndex = destIndex32 + limit32 - start32;
1976 if (move && destIndex32>start32) {
1977 // moved a block of text towards the end of the string.
1978 nativeIterIndex = destIndex32;
1979 }
1980
1981 // Set position, reload chunk if needed.
1982 repTextAccess(ut, nativeIterIndex, TRUE);
1983}
1984
1985static struct UTextFuncs repFuncs =
1986{
1987 sizeof(UTextFuncs),
1988 0, 0, 0, // Reserved alignment padding
1989 repTextClone,
1990 repTextLength,
1991 repTextAccess,
1992 repTextExtract,
1993 repTextReplace,
1994 repTextCopy,
1995 NULL, // MapOffsetToNative,
1996 NULL, // MapIndexToUTF16,
1997 repTextClose,
1998 NULL, // spare 1
1999 NULL, // spare 2
2000 NULL // spare 3
2001};
2002
2003
2004U_DRAFT UText * U_EXPORT2
2005utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
2006{
2007 if(U_FAILURE(*status)) {
2008 return NULL;
2009 }
2010 if(rep==NULL) {
2011 *status=U_ILLEGAL_ARGUMENT_ERROR;
2012 return NULL;
2013 }
2014 ut = utext_setup(ut, sizeof(ReplExtra), status);
2015
2016 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2017 if(rep->hasMetaData()) {
2018 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
2019 }
2020
2021 ut->pFuncs = &repFuncs;
2022 ut->context = rep;
2023 return ut;
2024}
2025
2026U_CDECL_END
2027
2028
2029
2030
2031
2032
2033
2034
2035//------------------------------------------------------------------------------
2036//
2037// UText implementation for UnicodeString (read/write) and
2038// for const UnicodeString (read only)
2039// (same implementation, only the flags are different)
2040//
2041// Use of UText data members:
2042// context pointer to UnicodeString
2043// p pointer to UnicodeString IF this UText owns the string
2044// and it must be deleted on close(). NULL otherwise.
2045//
2046//------------------------------------------------------------------------------
2047
2048U_CDECL_BEGIN
2049
2050
2051static UText * U_CALLCONV
2052unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
2053 // First do a generic shallow clone. Does everything needed for the UText struct itself.
2054 dest = shallowTextClone(dest, src, status);
2055
2056 // For deep clones, make a copy of the UnicodeSring.
2057 // The copied UnicodeString storage is owned by the newly created UText clone.
2058 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2059 // the UText.
2060 //
2061 if (deep && U_SUCCESS(*status)) {
2062 const UnicodeString *srcString = (const UnicodeString *)src->context;
2063 dest->context = new UnicodeString(*srcString);
2064 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2065
2066 // with deep clone, the copy is writable, even when the source is not.
2067 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2068 }
2069 return dest;
2070}
2071
2072static void U_CALLCONV
2073unistrTextClose(UText *ut) {
2074 // Most of the work of close is done by the generic UText framework close.
2075 // All that needs to be done here is delete the UnicodeString if the UText
2076 // owns it. This occurs if the UText was created by cloning.
2077 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2078 UnicodeString *str = (UnicodeString *)ut->context;
2079 delete str;
2080 ut->context = NULL;
2081 }
2082}
2083
2084
2085static int64_t U_CALLCONV
2086unistrTextLength(UText *t) {
2087 return ((const UnicodeString *)t->context)->length();
2088}
2089
2090
2091static UBool U_CALLCONV
2092unistrTextAccess(UText *ut, int64_t index, UBool forward) {
2093 int32_t length = ut->chunkLength;
2094 ut->chunkOffset = pinIndex(index, length);
2095
2096 // Check whether request is at the start or end
2097 UBool retVal = (forward && index<length) || (!forward && index>0);
2098 return retVal;
2099}
2100
2101
2102
2103static int32_t U_CALLCONV
2104unistrTextExtract(UText *t,
2105 int64_t start, int64_t limit,
2106 UChar *dest, int32_t destCapacity,
2107 UErrorCode *pErrorCode) {
2108 const UnicodeString *us=(const UnicodeString *)t->context;
2109 int32_t length=us->length();
2110
2111 if(U_FAILURE(*pErrorCode)) {
2112 return 0;
2113 }
2114 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
2115 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2116 }
2117 if(start<0 || start>limit) {
2118 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2119 return 0;
2120 }
2121
2122 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
2123 int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
2124
2125 length=limit32-start32;
2126 if (destCapacity>0 && dest!=NULL) {
2127 int32_t trimmedLength = length;
2128 if(trimmedLength>destCapacity) {
2129 trimmedLength=destCapacity;
2130 }
2131 us->extract(start32, trimmedLength, dest);
2132 }
2133 u_terminateUChars(dest, destCapacity, length, pErrorCode);
2134 return length;
2135}
2136
2137static int32_t U_CALLCONV
2138unistrTextReplace(UText *ut,
2139 int64_t start, int64_t limit,
2140 const UChar *src, int32_t length,
2141 UErrorCode *pErrorCode) {
2142 UnicodeString *us=(UnicodeString *)ut->context;
2143 int32_t oldLength;
2144
2145 if(U_FAILURE(*pErrorCode)) {
2146 return 0;
2147 }
2148 if(src==NULL && length!=0) {
2149 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2150 }
2151 if(start>limit) {
2152 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2153 return 0;
2154 }
2155 oldLength=us->length();
2156 int32_t start32 = pinIndex(start, oldLength);
2157 int32_t limit32 = pinIndex(limit, oldLength);
2158 if (start32 < oldLength) {
2159 start32 = us->getChar32Start(start32);
2160 }
2161 if (limit32 < oldLength) {
2162 limit32 = us->getChar32Start(limit32);
2163 }
2164
2165 // replace
2166 us->replace(start32, limit32-start32, src, length);
2167 int32_t newLength = us->length();
2168
2169 // Update the chunk description.
2170 ut->chunkContents = us->getBuffer();
2171 ut->chunkLength = newLength;
2172 ut->chunkNativeLimit = newLength;
2173 ut->nativeIndexingLimit = newLength;
2174
2175 // Set iteration position to the point just following the newly inserted text.
2176 int32_t lengthDelta = newLength - oldLength;
2177 ut->chunkOffset = limit32 + lengthDelta;
2178
2179 return lengthDelta;
2180}
2181
2182static void U_CALLCONV
2183unistrTextCopy(UText *ut,
2184 int64_t start, int64_t limit,
2185 int64_t destIndex,
2186 UBool move,
2187 UErrorCode *pErrorCode) {
2188 UnicodeString *us=(UnicodeString *)ut->context;
2189 int32_t length=us->length();
2190
2191 if(U_FAILURE(*pErrorCode)) {
2192 return;
2193 }
2194 int32_t start32 = pinIndex(start, length);
2195 int32_t limit32 = pinIndex(limit, length);
2196 int32_t destIndex32 = pinIndex(destIndex, length);
2197
2198 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
2199 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2200 return;
2201 }
2202
2203 if(move) {
2204 // move: copy to destIndex, then replace original with nothing
2205 int32_t segLength=limit32-start32;
2206 us->copy(start32, limit32, destIndex32);
2207 if(destIndex32<start32) {
2208 start32+=segLength;
2209 }
2210 us->replace(start32, segLength, NULL, 0);
2211 } else {
2212 // copy
2213 us->copy(start32, limit32, destIndex32);
2214 }
2215
2216 // update chunk description, set iteration position.
2217 ut->chunkContents = us->getBuffer();
2218 if (move==FALSE) {
2219 // copy operation, string length grows
2220 ut->chunkLength += limit32-start32;
2221 ut->chunkNativeLimit = ut->chunkLength;
2222 ut->nativeIndexingLimit = ut->chunkLength;
2223 }
2224
2225 // Iteration position to end of the newly inserted text.
2226 ut->chunkOffset = destIndex32+limit32-start32;
2227 if (move && destIndex32>start32) {
2228 ut->chunkOffset = destIndex32;
2229 }
2230
2231}
2232
2233static struct UTextFuncs unistrFuncs =
2234{
2235 sizeof(UTextFuncs),
2236 0, 0, 0, // Reserved alignment padding
2237 unistrTextClone,
2238 unistrTextLength,
2239 unistrTextAccess,
2240 unistrTextExtract,
2241 unistrTextReplace,
2242 unistrTextCopy,
2243 NULL, // MapOffsetToNative,
2244 NULL, // MapIndexToUTF16,
2245 unistrTextClose,
2246 NULL, // spare 1
2247 NULL, // spare 2
2248 NULL // spare 3
2249};
2250
2251
2252
2253U_CDECL_END
2254
2255
2256U_DRAFT UText * U_EXPORT2
2257utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
2258 // TODO: use openConstUnicodeString, then add in the differences.
2259 //
2260 ut = utext_setup(ut, 0, status);
2261 if (U_SUCCESS(*status)) {
2262 ut->pFuncs = &unistrFuncs;
2263 ut->context = s;
2264 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
2265 I32_FLAG(UTEXT_PROVIDER_WRITABLE);
2266
2267 ut->chunkContents = s->getBuffer();
2268 ut->chunkLength = s->length();
2269 ut->chunkNativeStart = 0;
2270 ut->chunkNativeLimit = ut->chunkLength;
2271 ut->nativeIndexingLimit = ut->chunkLength;
2272 }
2273 return ut;
2274}
2275
2276
2277
2278U_DRAFT UText * U_EXPORT2
2279utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
2280 ut = utext_setup(ut, 0, status);
2281 // note: use the standard (writable) function table for UnicodeString.
2282 // The flag settings disable writing, so having the functions in
2283 // the table is harmless.
2284 if (U_SUCCESS(*status)) {
2285 ut->pFuncs = &unistrFuncs;
2286 ut->context = s;
2287 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2288 ut->chunkContents = s->getBuffer();
2289 ut->chunkLength = s->length();
2290 ut->chunkNativeStart = 0;
2291 ut->chunkNativeLimit = ut->chunkLength;
2292 ut->nativeIndexingLimit = ut->chunkLength;
2293 }
2294 return ut;
2295}
2296
2297//------------------------------------------------------------------------------
2298//
2299// UText implementation for const UChar * strings
2300//
2301// Use of UText data members:
//      context    pointer to the const UChar string
2303// a length. -1 if not yet known.
2304//
2305// TODO: support 64 bit lengths.
2306//
2307//------------------------------------------------------------------------------
2308
2309U_CDECL_BEGIN
2310
2311
2312static UText * U_CALLCONV
2313ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
2314 // First do a generic shallow clone.
2315 dest = shallowTextClone(dest, src, status);
2316
2317 // For deep clones, make a copy of the string.
2318 // The copied storage is owned by the newly created clone.
2319 // A non-NULL pointer in UText.p is the signal to the close() function to delete
2320 // it.
2321 //
2322 if (deep && U_SUCCESS(*status)) {
2323 U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
2324 int32_t len = (int32_t)utext_nativeLength(dest);
2325
2326 // The cloned string IS going to be NUL terminated, whether or not the original was.
2327 const UChar *srcStr = (const UChar *)src->context;
2328 UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
2329 if (copyStr == NULL) {
2330 *status = U_MEMORY_ALLOCATION_ERROR;
2331 } else {
2332 int64_t i;
2333 for (i=0; i<len; i++) {
2334 copyStr[i] = srcStr[i];
2335 }
2336 copyStr[len] = 0;
2337 dest->context = copyStr;
2338 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
2339 }
2340 }
2341 return dest;
2342}
2343
2344
2345static void U_CALLCONV
2346ucstrTextClose(UText *ut) {
2347 // Most of the work of close is done by the generic UText framework close.
2348 // All that needs to be done here is delete the string if the UText
2349 // owns it. This occurs if the UText was created by cloning.
2350 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
2351 UChar *s = (UChar *)ut->context;
2352 uprv_free(s);
2353 ut->context = NULL;
2354 }
2355}
2356
2357
2358
2359static int64_t U_CALLCONV
2360ucstrTextLength(UText *ut) {
2361 if (ut->a < 0) {
2362 // null terminated, we don't yet know the length. Scan for it.
2363 // Access is not convenient for doing this
2364 // because the current interation postion can't be changed.
2365 const UChar *str = (const UChar *)ut->context;
2366 for (;;) {
2367 if (str[ut->chunkNativeLimit] == 0) {
2368 break;
2369 }
2370 ut->chunkNativeLimit++;
2371 }
2372 ut->a = ut->chunkNativeLimit;
2373 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2374 ut->nativeIndexingLimit = ut->chunkLength;
2375 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2376 }
2377 return ut->a;
2378}
2379
2380
2381static UBool U_CALLCONV
2382ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
2383 const UChar *str = (const UChar *)ut->context;
2384
2385 // pin the requested index to the bounds of the string,
2386 // and set current iteration position.
2387 if (index<0) {
2388 index = 0;
2389 } else if (index < ut->chunkNativeLimit) {
2390 // The request data is within the chunk as it is known so far.
2391 // Put index on a code point boundary.
2392 U16_SET_CP_START(str, 0, index);
2393 } else if (ut->a >= 0) {
2394 // We know the length of this string, and the user is requesting something
2395 // at or beyond the length. Pin the requested index to the length.
2396 index = ut->a;
2397 } else {
2398 // Null terminated string, length not yet known, and the requested index
2399 // is beyond where we have scanned so far.
2400 // Scan to 32 UChars beyond the requested index. The strategy here is
2401 // to avoid fully scanning a long string when the caller only wants to
2402 // see a few characters at its beginning.
2403 int32_t scanLimit = (int32_t)index + 32;
2404 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
2405 scanLimit = INT32_MAX;
2406 }
2407
2408 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
2409 for (; chunkLimit<scanLimit; chunkLimit++) {
2410 if (str[chunkLimit] == 0) {
2411 // We found the end of the string. Remember it, pin the requested index to it,
2412 // and bail out of here.
2413 ut->a = chunkLimit;
2414 ut->chunkLength = chunkLimit;
2415 ut->nativeIndexingLimit = chunkLimit;
2416 if (index >= chunkLimit) {
2417 index = chunkLimit;
2418 } else {
2419 U16_SET_CP_START(str, 0, index);
2420 }
2421
2422 ut->chunkNativeLimit = chunkLimit;
2423 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2424 goto breakout;
2425 }
2426 }
2427 // We scanned through the next batch of UChars without finding the end.
2428 U16_SET_CP_START(str, 0, index);
2429 if (chunkLimit == INT32_MAX) {
2430 // Scanned to the limit of a 32 bit length.
2431 // Forceably trim the overlength string back so length fits in int32
2432 // TODO: add support for 64 bit strings.
2433 ut->a = chunkLimit;
2434 ut->chunkLength = chunkLimit;
2435 ut->nativeIndexingLimit = chunkLimit;
2436 if (index > chunkLimit) {
2437 index = chunkLimit;
2438 }
2439 ut->chunkNativeLimit = chunkLimit;
2440 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2441 } else {
2442 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
2443 // If the current end is on a lead surrogate, back the end up by one.
2444 // It doesn't matter if the end char happens to be an unpaired surrogate,
2445 // and it's simpler not to worry about it.
2446 if (U16_IS_LEAD(str[chunkLimit-1])) {
2447 --chunkLimit;
2448 }
2449 ut->chunkNativeLimit = chunkLimit;
2450 }
2451
2452 }
2453breakout:
2454 U_ASSERT(index<=INT32_MAX);
2455 ut->chunkOffset = (int32_t)index;
2456
2457 // Check whether request is at the start or end
2458 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
2459 return retVal;
2460}
2461
2462
2463
2464static int32_t U_CALLCONV
2465ucstrTextExtract(UText *ut,
2466 int64_t start, int64_t limit,
2467 UChar *dest, int32_t destCapacity,
2468 UErrorCode *pErrorCode)
2469{
2470 if(U_FAILURE(*pErrorCode)) {
2471 return 0;
2472 }
2473 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2474 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2475 return 0;
2476 }
2477
2478 const UChar *s=(const UChar *)ut->context;
2479 int32_t si, di;
2480
2481 int32_t start32;
2482 int32_t limit32;
2483
2484 // Access the start. Does two things we need:
2485 // Pins 'start' to the length of the string, if it came in out-of-bounds.
2486 // Snaps 'start' to the beginning of a code point.
2487 ucstrTextAccess(ut, start, TRUE);
2488 U_ASSERT(start <= INT32_MAX);
2489 start32 = (int32_t)start;
2490
2491 int32_t strLength=(int32_t)ut->a;
2492 if (strLength >= 0) {
2493 limit32 = pinIndex(limit, strLength);
2494 } else {
2495 limit32 = pinIndex(limit, INT32_MAX);
2496 }
2497
2498 di = 0;
2499 for (si=start32; si<limit32; si++) {
2500 if (strLength<0 && s[si]==0) {
2501 // Just hit the end of a null-terminated string.
2502 ut->a = si; // set string length for this UText
2503 ut->chunkNativeLimit = si;
2504 ut->chunkLength = si;
2505 ut->nativeIndexingLimit = si;
2506 strLength = si;
2507 break;
2508 }
2509 if (di<destCapacity) {
2510 // only store if there is space.
2511 dest[di] = s[si];
2512 } else {
2513 if (strLength>=0) {
2514 // We have filled the destination buffer, and the string length is known.
2515 // Cut the loop short. There is no need to scan string termination.
2516 di = strLength;
2517 si = limit32;
2518 break;
2519 }
2520 }
2521 di++;
2522 }
2523
2524 // If the limit index points to a lead surrogate of a pair,
2525 // add the corresponding trail surrogate to the destination.
2526 if (si>0 && U16_IS_LEAD(s[si-1]) &&
2527 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
2528 {
2529 if (di<destCapacity) {
2530 // store only if there is space in the output buffer.
2531 dest[di++] = s[si++];
2532 }
2533 }
2534
2535 // Put iteration position at the point just following the extracted text
2536 ut->chunkOffset = si;
2537
2538 // Add a terminating NUL if space in the buffer permits,
2539 // and set the error status as required.
2540 u_terminateUChars(dest, destCapacity, di, pErrorCode);
2541 return di;
2542}
2543
2544static struct UTextFuncs ucstrFuncs =
2545{
2546 sizeof(UTextFuncs),
2547 0, 0, 0, // Reserved alignment padding
2548 ucstrTextClone,
2549 ucstrTextLength,
2550 ucstrTextAccess,
2551 ucstrTextExtract,
2552 NULL, // Replace
2553 NULL, // Copy
2554 NULL, // MapOffsetToNative,
2555 NULL, // MapIndexToUTF16,
2556 ucstrTextClose,
2557 NULL, // spare 1
2558 NULL, // spare 2
2559 NULL, // spare 3
2560};
2561
2562U_CDECL_END
2563
2564
2565U_DRAFT UText * U_EXPORT2
2566utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
2567 if (U_FAILURE(*status)) {
2568 return NULL;
2569 }
2570 if (length < -1 || length>INT32_MAX) {
2571 *status = U_ILLEGAL_ARGUMENT_ERROR;
2572 return NULL;
2573 }
2574 ut = utext_setup(ut, 0, status);
2575 if (U_SUCCESS(*status)) {
2576 ut->pFuncs = &ucstrFuncs;
2577 ut->context = s;
2578 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
2579 if (length==-1) {
2580 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
2581 }
2582 ut->a = length;
2583 ut->chunkContents = s;
2584 ut->chunkNativeStart = 0;
2585 ut->chunkNativeLimit = length>=0? length : 0;
2586 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
2587 ut->chunkOffset = 0;
2588 ut->nativeIndexingLimit = ut->chunkLength;
2589 }
2590 return ut;
2591}
2592
2593
2594//------------------------------------------------------------------------------
2595//
2596// UText implementation for text from ICU CharacterIterators
2597//
2598// Use of UText data members:
2599// context pointer to the CharacterIterator
2600// a length of the full text.
2601// p pointer to buffer 1
2602// b start index of local buffer 1 contents
2603// q pointer to buffer 2
2604// c start index of local buffer 2 contents
2605// r pointer to the character iterator if the UText owns it.
2606// Null otherwise.
2607//
2608//------------------------------------------------------------------------------
2609#define CIBufSize 16
2610
2611U_CDECL_BEGIN
2612static void U_CALLCONV
2613charIterTextClose(UText *ut) {
2614 // Most of the work of close is done by the generic UText framework close.
2615 // All that needs to be done here is delete the CharacterIterator if the UText
2616 // owns it. This occurs if the UText was created by cloning.
2617 CharacterIterator *ci = (CharacterIterator *)ut->r;
2618 delete ci;
2619 ut->r = NULL;
2620}
2621
2622static int64_t U_CALLCONV
2623charIterTextLength(UText *ut) {
2624 return (int32_t)ut->a;
2625}
2626
2627static UBool U_CALLCONV
2628charIterTextAccess(UText *ut, int64_t index, UBool forward) {
2629 CharacterIterator *ci = (CharacterIterator *)ut->context;
2630
2631 int32_t clippedIndex = (int32_t)index;
2632 if (clippedIndex<0) {
2633 clippedIndex=0;
2634 } else if (clippedIndex>=ut->a) {
2635 clippedIndex=(int32_t)ut->a;
2636 }
2637 int32_t neededIndex = clippedIndex;
2638 if (!forward && neededIndex>0) {
2639 // reverse iteration, want the position just before what was asked for.
2640 neededIndex--;
2641 } else if (forward && neededIndex==ut->a && neededIndex>0) {
2642 // Forward iteration, don't ask for something past the end of the text.
2643 neededIndex--;
2644 }
2645
2646 // Find the native index of the start of the buffer containing what we want.
2647 neededIndex -= neededIndex % CIBufSize;
2648
2649 UChar *buf = NULL;
2650 UBool needChunkSetup = TRUE;
2651 int i;
2652 if (ut->chunkNativeStart == neededIndex) {
2653 // The buffer we want is already the current chunk.
2654 needChunkSetup = FALSE;
2655 } else if (ut->b == neededIndex) {
2656 // The first buffer (buffer p) has what we need.
2657 buf = (UChar *)ut->p;
2658 } else if (ut->c == neededIndex) {
2659 // The second buffer (buffer q) has what we need.
2660 buf = (UChar *)ut->q;
2661 } else {
2662 // Neither buffer already has what we need.
2663 // Load new data from the character iterator.
2664 // Use the buf that is not the current buffer.
2665 buf = (UChar *)ut->p;
2666 if (ut->p == ut->chunkContents) {
2667 buf = (UChar *)ut->q;
2668 }
2669 ci->setIndex(neededIndex);
2670 for (i=0; i<CIBufSize; i++) {
2671 buf[i] = ci->nextPostInc();
2672 if (i+neededIndex > ut->a) {
2673 break;
2674 }
2675 }
2676 }
2677
2678 // We have a buffer with the data we need.
2679 // Set it up as the current chunk, if it wasn't already.
2680 if (needChunkSetup) {
2681 ut->chunkContents = buf;
2682 ut->chunkLength = CIBufSize;
2683 ut->chunkNativeStart = neededIndex;
2684 ut->chunkNativeLimit = neededIndex + CIBufSize;
2685 if (ut->chunkNativeLimit > ut->a) {
2686 ut->chunkNativeLimit = ut->a;
2687 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
2688 }
2689 ut->nativeIndexingLimit = ut->chunkLength;
2690 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
2691 }
2692 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
2693 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
2694 return success;
2695}
2696
2697static UText * U_CALLCONV
2698charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
2699 if (U_FAILURE(*status)) {
2700 return NULL;
2701 }
2702
2703 if (deep) {
2704 // There is no CharacterIterator API for cloning the underlying text storage.
2705 *status = U_UNSUPPORTED_ERROR;
2706 return NULL;
2707 } else {
2708 CharacterIterator *srcCI =(CharacterIterator *)src->context;
2709 srcCI = srcCI->clone();
2710 dest = utext_openCharacterIterator(dest, srcCI, status);
2711 // cast off const on getNativeIndex.
2712 // For CharacterIterator based UTexts, this is safe, the operation is const.
2713 int64_t ix = utext_getNativeIndex((UText *)src);
2714 utext_setNativeIndex(dest, ix);
2715 dest->r = srcCI; // flags that this UText owns the CharacterIterator
2716 }
2717 return dest;
2718}
2719
2720static int32_t U_CALLCONV
2721charIterTextExtract(UText *ut,
2722 int64_t start, int64_t limit,
2723 UChar *dest, int32_t destCapacity,
2724 UErrorCode *status)
2725{
2726 if(U_FAILURE(*status)) {
2727 return 0;
2728 }
2729 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
2730 *status=U_ILLEGAL_ARGUMENT_ERROR;
2731 return 0;
2732 }
2733 int32_t length = (int32_t)ut->a;
2734 int32_t start32 = pinIndex(start, length);
2735 int32_t limit32 = pinIndex(limit, length);
2736 int32_t desti = 0;
2737 int32_t srci;
2738
2739 CharacterIterator *ci = (CharacterIterator *)ut->context;
2740 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
2741 srci = ci->getIndex();
2742 while (srci<limit32) {
2743 UChar32 c = ci->next32PostInc();
2744 int32_t len = U16_LENGTH(c);
2745 if (desti+len <= destCapacity) {
2746 U16_APPEND_UNSAFE(dest, desti, c);
2747 } else {
2748 desti += len;
2749 *status = U_BUFFER_OVERFLOW_ERROR;
2750 }
2751 srci += len;
2752 }
2753
2754 u_terminateUChars(dest, destCapacity, desti, status);
2755 return desti;
2756}
2757
2758static struct UTextFuncs charIterFuncs =
2759{
2760 sizeof(UTextFuncs),
2761 0, 0, 0, // Reserved alignment padding
2762 charIterTextClone,
2763 charIterTextLength,
2764 charIterTextAccess,
2765 charIterTextExtract,
2766 NULL, // Replace
2767 NULL, // Copy
2768 NULL, // MapOffsetToNative,
2769 NULL, // MapIndexToUTF16,
2770 charIterTextClose,
2771 NULL, // spare 1
2772 NULL, // spare 2
2773 NULL // spare 3
2774};
2775U_CDECL_END
2776
2777
2778U_DRAFT UText * U_EXPORT2
2779utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
2780 if (U_FAILURE(*status)) {
2781 return NULL;
2782 }
2783
2784 if (ci->startIndex() > 0) {
2785 // No support for CharacterIterators that do not start indexing from zero.
2786 *status = U_UNSUPPORTED_ERROR;
2787 return NULL;
2788 }
2789
2790 // Extra space in UText for 2 buffers of CIBufSize UChars each.
2791 int32_t extraSpace = 2 * CIBufSize * sizeof(UChar);
2792 ut = utext_setup(ut, extraSpace, status);
2793 if (U_SUCCESS(*status)) {
2794 ut->pFuncs = &charIterFuncs;
2795 ut->context = ci;
2796 ut->providerProperties = 0;
2797 ut->a = ci->endIndex(); // Length of text
2798 ut->p = ut->pExtra; // First buffer
2799 ut->b = -1; // Native index of first buffer contents
2800 ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer
2801 ut->c = -1; // Native index of second buffer contents
2802
2803 // Initialize current chunk contents to be empty.
2804 // First access will fault something in.
2805 // Note: The initial nativeStart and chunkOffset must sum to zero
2806 // so that getNativeIndex() will correctly compute to zero
2807 // if no call to Access() has ever been made. They can't be both
2808 // zero without Access() thinking that the chunk is valid.
2809 ut->chunkContents = (UChar *)ut->p;
2810 ut->chunkNativeStart = -1;
2811 ut->chunkOffset = 1;
2812 ut->chunkNativeLimit = 0;
2813 ut->chunkLength = 0;
2814 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
2815 }
2816 return ut;
2817}
2818
2819
2820