]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/uiter.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / uiter.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2002-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uiter.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jan18
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/ustring.h"
19 #include "unicode/chariter.h"
20 #include "unicode/rep.h"
21 #include "unicode/uiter.h"
22 #include "cstring.h"
23
/* Nonzero if the integer n is even, i.e. its lowest bit is clear. */
#define IS_EVEN(n) (((n)&1)==0)
/*
 * Nonzero if the address held in p is even.
 * The argument is parenthesized so that a pointer expression such as q+1
 * is evaluated as a whole before it is converted to size_t.
 */
#define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
26
27 U_CDECL_BEGIN
28
29 /* No-Op UCharIterator implementation for illegal input --------------------- */
30
31 static int32_t U_CALLCONV
32 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
33 return 0;
34 }
35
36 static int32_t U_CALLCONV
37 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
38 return 0;
39 }
40
41 static UBool U_CALLCONV
42 noopHasNext(UCharIterator * /*iter*/) {
43 return FALSE;
44 }
45
46 static UChar32 U_CALLCONV
47 noopCurrent(UCharIterator * /*iter*/) {
48 return U_SENTINEL;
49 }
50
51 static uint32_t U_CALLCONV
52 noopGetState(const UCharIterator * /*iter*/) {
53 return 0;
54 }
55
56 static void U_CALLCONV
57 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode * /*pErrorCode*/) {
58 }
59
60 static const UCharIterator noopIterator={
61 0, 0, 0, 0, 0, 0,
62 noopGetIndex,
63 noopMove,
64 noopHasNext,
65 noopHasNext,
66 noopCurrent,
67 noopCurrent,
68 noopCurrent,
69 NULL,
70 noopGetState,
71 noopSetState
72 };
73
74 /* UCharIterator implementation for simple strings -------------------------- */
75
76 /*
77 * This is an implementation of a code unit (UChar) iterator
78 * for UChar * strings.
79 *
80 * The UCharIterator.context field holds a pointer to the string.
81 */
82
83 static int32_t U_CALLCONV
84 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
85 switch(origin) {
86 case UITER_ZERO:
87 return 0;
88 case UITER_START:
89 return iter->start;
90 case UITER_CURRENT:
91 return iter->index;
92 case UITER_LIMIT:
93 return iter->limit;
94 case UITER_LENGTH:
95 return iter->length;
96 default:
97 /* not a valid origin */
98 /* Should never get here! */
99 return -1;
100 }
101 }
102
103 static int32_t U_CALLCONV
104 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
105 int32_t pos;
106
107 switch(origin) {
108 case UITER_ZERO:
109 pos=delta;
110 break;
111 case UITER_START:
112 pos=iter->start+delta;
113 break;
114 case UITER_CURRENT:
115 pos=iter->index+delta;
116 break;
117 case UITER_LIMIT:
118 pos=iter->limit+delta;
119 break;
120 case UITER_LENGTH:
121 pos=iter->length+delta;
122 break;
123 default:
124 return -1; /* Error */
125 }
126
127 if(pos<iter->start) {
128 pos=iter->start;
129 } else if(pos>iter->limit) {
130 pos=iter->limit;
131 }
132
133 return iter->index=pos;
134 }
135
136 static UBool U_CALLCONV
137 stringIteratorHasNext(UCharIterator *iter) {
138 return iter->index<iter->limit;
139 }
140
141 static UBool U_CALLCONV
142 stringIteratorHasPrevious(UCharIterator *iter) {
143 return iter->index>iter->start;
144 }
145
146 static UChar32 U_CALLCONV
147 stringIteratorCurrent(UCharIterator *iter) {
148 if(iter->index<iter->limit) {
149 return ((const UChar *)(iter->context))[iter->index];
150 } else {
151 return U_SENTINEL;
152 }
153 }
154
155 static UChar32 U_CALLCONV
156 stringIteratorNext(UCharIterator *iter) {
157 if(iter->index<iter->limit) {
158 return ((const UChar *)(iter->context))[iter->index++];
159 } else {
160 return U_SENTINEL;
161 }
162 }
163
164 static UChar32 U_CALLCONV
165 stringIteratorPrevious(UCharIterator *iter) {
166 if(iter->index>iter->start) {
167 return ((const UChar *)(iter->context))[--iter->index];
168 } else {
169 return U_SENTINEL;
170 }
171 }
172
173 static uint32_t U_CALLCONV
174 stringIteratorGetState(const UCharIterator *iter) {
175 return (uint32_t)iter->index;
176 }
177
178 static void U_CALLCONV
179 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
180 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
181 /* do nothing */
182 } else if(iter==NULL) {
183 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
184 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
185 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
186 } else {
187 iter->index=(int32_t)state;
188 }
189 }
190
191 static const UCharIterator stringIterator={
192 0, 0, 0, 0, 0, 0,
193 stringIteratorGetIndex,
194 stringIteratorMove,
195 stringIteratorHasNext,
196 stringIteratorHasPrevious,
197 stringIteratorCurrent,
198 stringIteratorNext,
199 stringIteratorPrevious,
200 NULL,
201 stringIteratorGetState,
202 stringIteratorSetState
203 };
204
205 U_CAPI void U_EXPORT2
206 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
207 if(iter!=0) {
208 if(s!=0 && length>=-1) {
209 *iter=stringIterator;
210 iter->context=s;
211 if(length>=0) {
212 iter->length=length;
213 } else {
214 iter->length=u_strlen(s);
215 }
216 iter->limit=iter->length;
217 } else {
218 *iter=noopIterator;
219 }
220 }
221 }
222
223 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
224
225 /*
226 * This is an implementation of a code unit (UChar) iterator
227 * for UTF-16BE strings, i.e., strings in byte-vectors where
228 * each UChar is stored as a big-endian pair of bytes.
229 *
230 * The UCharIterator.context field holds a pointer to the string.
231 * Everything works just like with a normal UChar iterator (uiter_setString),
232 * except that UChars are assembled from byte pairs.
233 */
234
235 /* internal helper function */
236 static inline UChar32
237 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
238 const uint8_t *p=(const uint8_t *)iter->context;
239 return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
240 }
241
242 static UChar32 U_CALLCONV
243 utf16BEIteratorCurrent(UCharIterator *iter) {
244 int32_t index;
245
246 if((index=iter->index)<iter->limit) {
247 return utf16BEIteratorGet(iter, index);
248 } else {
249 return U_SENTINEL;
250 }
251 }
252
253 static UChar32 U_CALLCONV
254 utf16BEIteratorNext(UCharIterator *iter) {
255 int32_t index;
256
257 if((index=iter->index)<iter->limit) {
258 iter->index=index+1;
259 return utf16BEIteratorGet(iter, index);
260 } else {
261 return U_SENTINEL;
262 }
263 }
264
265 static UChar32 U_CALLCONV
266 utf16BEIteratorPrevious(UCharIterator *iter) {
267 int32_t index;
268
269 if((index=iter->index)>iter->start) {
270 iter->index=--index;
271 return utf16BEIteratorGet(iter, index);
272 } else {
273 return U_SENTINEL;
274 }
275 }
276
277 static const UCharIterator utf16BEIterator={
278 0, 0, 0, 0, 0, 0,
279 stringIteratorGetIndex,
280 stringIteratorMove,
281 stringIteratorHasNext,
282 stringIteratorHasPrevious,
283 utf16BEIteratorCurrent,
284 utf16BEIteratorNext,
285 utf16BEIteratorPrevious,
286 NULL,
287 stringIteratorGetState,
288 stringIteratorSetState
289 };
290
291 /*
292 * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
293 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
294 * offset from s.
295 */
296 static int32_t
297 utf16BE_strlen(const char *s) {
298 if(IS_POINTER_EVEN(s)) {
299 /*
300 * even-aligned, call u_strlen(s)
301 * we are probably on a little-endian machine, but searching for UChar NUL
302 * does not care about endianness
303 */
304 return u_strlen((const UChar *)s);
305 } else {
306 /* odd-aligned, search for pair of 0 bytes */
307 const char *p=s;
308
309 while(!(*p==0 && p[1]==0)) {
310 p+=2;
311 }
312 return (int32_t)((p-s)/2);
313 }
314 }
315
316 U_CAPI void U_EXPORT2
317 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
318 if(iter!=NULL) {
319 /* allow only even-length strings (the input length counts bytes) */
320 if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
321 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
322 length>>=1;
323
324 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
325 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
326 uiter_setString(iter, (const UChar *)s, length);
327 return;
328 }
329
330 *iter=utf16BEIterator;
331 iter->context=s;
332 if(length>=0) {
333 iter->length=length;
334 } else {
335 iter->length=utf16BE_strlen(s);
336 }
337 iter->limit=iter->length;
338 } else {
339 *iter=noopIterator;
340 }
341 }
342 }
343
344 /* UCharIterator wrapper around CharacterIterator --------------------------- */
345
346 /*
347 * This is wrapper code around a C++ CharacterIterator to
348 * look like a C UCharIterator.
349 *
350 * The UCharIterator.context field holds a pointer to the CharacterIterator.
351 */
352
353 static int32_t U_CALLCONV
354 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
355 switch(origin) {
356 case UITER_ZERO:
357 return 0;
358 case UITER_START:
359 return ((CharacterIterator *)(iter->context))->startIndex();
360 case UITER_CURRENT:
361 return ((CharacterIterator *)(iter->context))->getIndex();
362 case UITER_LIMIT:
363 return ((CharacterIterator *)(iter->context))->endIndex();
364 case UITER_LENGTH:
365 return ((CharacterIterator *)(iter->context))->getLength();
366 default:
367 /* not a valid origin */
368 /* Should never get here! */
369 return -1;
370 }
371 }
372
373 static int32_t U_CALLCONV
374 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
375 switch(origin) {
376 case UITER_ZERO:
377 ((CharacterIterator *)(iter->context))->setIndex(delta);
378 return ((CharacterIterator *)(iter->context))->getIndex();
379 case UITER_START:
380 case UITER_CURRENT:
381 case UITER_LIMIT:
382 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
383 case UITER_LENGTH:
384 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
385 return ((CharacterIterator *)(iter->context))->getIndex();
386 default:
387 /* not a valid origin */
388 /* Should never get here! */
389 return -1;
390 }
391 }
392
393 static UBool U_CALLCONV
394 characterIteratorHasNext(UCharIterator *iter) {
395 return ((CharacterIterator *)(iter->context))->hasNext();
396 }
397
398 static UBool U_CALLCONV
399 characterIteratorHasPrevious(UCharIterator *iter) {
400 return ((CharacterIterator *)(iter->context))->hasPrevious();
401 }
402
403 static UChar32 U_CALLCONV
404 characterIteratorCurrent(UCharIterator *iter) {
405 UChar32 c;
406
407 c=((CharacterIterator *)(iter->context))->current();
408 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
409 return c;
410 } else {
411 return U_SENTINEL;
412 }
413 }
414
415 static UChar32 U_CALLCONV
416 characterIteratorNext(UCharIterator *iter) {
417 if(((CharacterIterator *)(iter->context))->hasNext()) {
418 return ((CharacterIterator *)(iter->context))->nextPostInc();
419 } else {
420 return U_SENTINEL;
421 }
422 }
423
424 static UChar32 U_CALLCONV
425 characterIteratorPrevious(UCharIterator *iter) {
426 if(((CharacterIterator *)(iter->context))->hasPrevious()) {
427 return ((CharacterIterator *)(iter->context))->previous();
428 } else {
429 return U_SENTINEL;
430 }
431 }
432
433 static uint32_t U_CALLCONV
434 characterIteratorGetState(const UCharIterator *iter) {
435 return ((CharacterIterator *)(iter->context))->getIndex();
436 }
437
438 static void U_CALLCONV
439 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
440 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
441 /* do nothing */
442 } else if(iter==NULL || iter->context==NULL) {
443 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
444 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
445 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
446 } else {
447 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
448 }
449 }
450
451 static const UCharIterator characterIteratorWrapper={
452 0, 0, 0, 0, 0, 0,
453 characterIteratorGetIndex,
454 characterIteratorMove,
455 characterIteratorHasNext,
456 characterIteratorHasPrevious,
457 characterIteratorCurrent,
458 characterIteratorNext,
459 characterIteratorPrevious,
460 NULL,
461 characterIteratorGetState,
462 characterIteratorSetState
463 };
464
465 U_CAPI void U_EXPORT2
466 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
467 if(iter!=0) {
468 if(charIter!=0) {
469 *iter=characterIteratorWrapper;
470 iter->context=charIter;
471 } else {
472 *iter=noopIterator;
473 }
474 }
475 }
476
477 /* UCharIterator wrapper around Replaceable --------------------------------- */
478
479 /*
480 * This is an implementation of a code unit (UChar) iterator
481 * based on a Replaceable object.
482 *
483 * The UCharIterator.context field holds a pointer to the Replaceable.
484 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
485 * and the iteration index.
486 */
487
488 static UChar32 U_CALLCONV
489 replaceableIteratorCurrent(UCharIterator *iter) {
490 if(iter->index<iter->limit) {
491 return ((Replaceable *)(iter->context))->charAt(iter->index);
492 } else {
493 return U_SENTINEL;
494 }
495 }
496
497 static UChar32 U_CALLCONV
498 replaceableIteratorNext(UCharIterator *iter) {
499 if(iter->index<iter->limit) {
500 return ((Replaceable *)(iter->context))->charAt(iter->index++);
501 } else {
502 return U_SENTINEL;
503 }
504 }
505
506 static UChar32 U_CALLCONV
507 replaceableIteratorPrevious(UCharIterator *iter) {
508 if(iter->index>iter->start) {
509 return ((Replaceable *)(iter->context))->charAt(--iter->index);
510 } else {
511 return U_SENTINEL;
512 }
513 }
514
515 static const UCharIterator replaceableIterator={
516 0, 0, 0, 0, 0, 0,
517 stringIteratorGetIndex,
518 stringIteratorMove,
519 stringIteratorHasNext,
520 stringIteratorHasPrevious,
521 replaceableIteratorCurrent,
522 replaceableIteratorNext,
523 replaceableIteratorPrevious,
524 NULL,
525 stringIteratorGetState,
526 stringIteratorSetState
527 };
528
529 U_CAPI void U_EXPORT2
530 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
531 if(iter!=0) {
532 if(rep!=0) {
533 *iter=replaceableIterator;
534 iter->context=rep;
535 iter->limit=iter->length=rep->length();
536 } else {
537 *iter=noopIterator;
538 }
539 }
540 }
541
542 /* UCharIterator implementation for UTF-8 strings --------------------------- */
543
544 /*
545 * Possible, probably necessary only for an implementation for arbitrary
546 * converters:
547 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
548 * This would require to turn reservedFn into a close function and
549 * to introduce a uiter_close(iter).
550 */
551
552 #define UITER_CNV_CAPACITY 16
553
554 /*
555 * Minimal implementation:
556 * Maintain a single-UChar buffer for an additional surrogate.
557 * The caller must not modify start and limit because they are used internally.
558 *
559 * Use UCharIterator fields as follows:
560 * context pointer to UTF-8 string
561 * length UTF-16 length of the string; -1 until lazy evaluation
562 * start current UTF-8 index
563 * index current UTF-16 index; may be -1="unknown" after setState()
564 * limit UTF-8 length of the string
565 * reservedField supplementary code point
566 *
567 * Since UCharIterator delivers 16-bit code units, the iteration can be
568 * currently in the middle of the byte sequence for a supplementary code point.
569 * In this case, reservedField will contain that code point and start will
570 * point to after the corresponding byte sequence. The UTF-16 index will be
571 * one less than what it would otherwise be corresponding to the UTF-8 index.
572 * Otherwise, reservedField will be 0.
573 */
574
575 /*
576 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
577 * Add implementations that do not call strlen() for iteration but check for NUL.
578 */
579
580 static int32_t U_CALLCONV
581 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
582 switch(origin) {
583 case UITER_ZERO:
584 case UITER_START:
585 return 0;
586 case UITER_CURRENT:
587 if(iter->index<0) {
588 /* the current UTF-16 index is unknown after setState(), count from the beginning */
589 const uint8_t *s;
590 UChar32 c;
591 int32_t i, limit, index;
592
593 s=(const uint8_t *)iter->context;
594 i=index=0;
595 limit=iter->start; /* count up to the UTF-8 index */
596 while(i<limit) {
597 U8_NEXT(s, i, limit, c);
598 if(c<=0xffff) {
599 ++index;
600 } else {
601 index+=2;
602 }
603 }
604
605 iter->start=i; /* just in case setState() did not get us to a code point boundary */
606 if(i==iter->limit) {
607 iter->length=index; /* in case it was <0 or wrong */
608 }
609 if(iter->reservedField!=0) {
610 --index; /* we are in the middle of a supplementary code point */
611 }
612 iter->index=index;
613 }
614 return iter->index;
615 case UITER_LIMIT:
616 case UITER_LENGTH:
617 if(iter->length<0) {
618 const uint8_t *s;
619 UChar32 c;
620 int32_t i, limit, length;
621
622 s=(const uint8_t *)iter->context;
623 if(iter->index<0) {
624 /*
625 * the current UTF-16 index is unknown after setState(),
626 * we must first count from the beginning to here
627 */
628 i=length=0;
629 limit=iter->start;
630
631 /* count from the beginning to the current index */
632 while(i<limit) {
633 U8_NEXT(s, i, limit, c);
634 if(c<=0xffff) {
635 ++length;
636 } else {
637 length+=2;
638 }
639 }
640
641 /* assume i==limit==iter->start, set the UTF-16 index */
642 iter->start=i; /* just in case setState() did not get us to a code point boundary */
643 iter->index= iter->reservedField!=0 ? length-1 : length;
644 } else {
645 i=iter->start;
646 length=iter->index;
647 if(iter->reservedField!=0) {
648 ++length;
649 }
650 }
651
652 /* count from the current index to the end */
653 limit=iter->limit;
654 while(i<limit) {
655 U8_NEXT(s, i, limit, c);
656 if(c<=0xffff) {
657 ++length;
658 } else {
659 length+=2;
660 }
661 }
662 iter->length=length;
663 }
664 return iter->length;
665 default:
666 /* not a valid origin */
667 /* Should never get here! */
668 return -1;
669 }
670 }
671
672 static int32_t U_CALLCONV
673 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
674 const uint8_t *s;
675 UChar32 c;
676 int32_t pos; /* requested UTF-16 index */
677 int32_t i; /* UTF-8 index */
678 UBool havePos;
679
680 /* calculate the requested UTF-16 index */
681 switch(origin) {
682 case UITER_ZERO:
683 case UITER_START:
684 pos=delta;
685 havePos=TRUE;
686 /* iter->index<0 (unknown) is possible */
687 break;
688 case UITER_CURRENT:
689 if(iter->index>=0) {
690 pos=iter->index+delta;
691 havePos=TRUE;
692 } else {
693 /* the current UTF-16 index is unknown after setState(), use only delta */
694 pos=0;
695 havePos=FALSE;
696 }
697 break;
698 case UITER_LIMIT:
699 case UITER_LENGTH:
700 if(iter->length>=0) {
701 pos=iter->length+delta;
702 havePos=TRUE;
703 } else {
704 /* pin to the end, avoid counting the length */
705 iter->index=-1;
706 iter->start=iter->limit;
707 iter->reservedField=0;
708 if(delta>=0) {
709 return UITER_UNKNOWN_INDEX;
710 } else {
711 /* the current UTF-16 index is unknown, use only delta */
712 pos=0;
713 havePos=FALSE;
714 }
715 }
716 break;
717 default:
718 return -1; /* Error */
719 }
720
721 if(havePos) {
722 /* shortcuts: pinning to the edges of the string */
723 if(pos<=0) {
724 iter->index=iter->start=iter->reservedField=0;
725 return 0;
726 } else if(iter->length>=0 && pos>=iter->length) {
727 iter->index=iter->length;
728 iter->start=iter->limit;
729 iter->reservedField=0;
730 return iter->index;
731 }
732
733 /* minimize the number of U8_NEXT/PREV operations */
734 if(iter->index<0 || pos<iter->index/2) {
735 /* go forward from the start instead of backward from the current index */
736 iter->index=iter->start=iter->reservedField=0;
737 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
738 /*
739 * if we have the UTF-16 index and length and the new position is
740 * closer to the end than the current index,
741 * then go backward from the end instead of forward from the current index
742 */
743 iter->index=iter->length;
744 iter->start=iter->limit;
745 iter->reservedField=0;
746 }
747
748 delta=pos-iter->index;
749 if(delta==0) {
750 return iter->index; /* nothing to do */
751 }
752 } else {
753 /* move relative to unknown UTF-16 index */
754 if(delta==0) {
755 return UITER_UNKNOWN_INDEX; /* nothing to do */
756 } else if(-delta>=iter->start) {
757 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
758 iter->index=iter->start=iter->reservedField=0;
759 return 0;
760 } else if(delta>=(iter->limit-iter->start)) {
761 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
762 iter->index=iter->length; /* may or may not be <0 (unknown) */
763 iter->start=iter->limit;
764 iter->reservedField=0;
765 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
766 }
767 }
768
769 /* delta!=0 */
770
771 /* move towards the requested position, pin to the edges of the string */
772 s=(const uint8_t *)iter->context;
773 pos=iter->index; /* could be <0 (unknown) */
774 i=iter->start;
775 if(delta>0) {
776 /* go forward */
777 int32_t limit=iter->limit;
778 if(iter->reservedField!=0) {
779 iter->reservedField=0;
780 ++pos;
781 --delta;
782 }
783 while(delta>0 && i<limit) {
784 U8_NEXT(s, i, limit, c);
785 if(c<0xffff) {
786 ++pos;
787 --delta;
788 } else if(delta>=2) {
789 pos+=2;
790 delta-=2;
791 } else /* delta==1 */ {
792 /* stop in the middle of a supplementary code point */
793 iter->reservedField=c;
794 ++pos;
795 break; /* delta=0; */
796 }
797 }
798 if(i==limit) {
799 if(iter->length<0 && iter->index>=0) {
800 iter->length= iter->reservedField==0 ? pos : pos+1;
801 } else if(iter->index<0 && iter->length>=0) {
802 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
803 }
804 }
805 } else /* delta<0 */ {
806 /* go backward */
807 if(iter->reservedField!=0) {
808 iter->reservedField=0;
809 i-=4; /* we stayed behind the supplementary code point; go before it now */
810 --pos;
811 ++delta;
812 }
813 while(delta<0 && i>0) {
814 U8_PREV(s, 0, i, c);
815 if(c<0xffff) {
816 --pos;
817 ++delta;
818 } else if(delta<=-2) {
819 pos-=2;
820 delta+=2;
821 } else /* delta==-1 */ {
822 /* stop in the middle of a supplementary code point */
823 i+=4; /* back to behind this supplementary code point for consistent state */
824 iter->reservedField=c;
825 --pos;
826 break; /* delta=0; */
827 }
828 }
829 }
830
831 iter->start=i;
832 if(iter->index>=0) {
833 return iter->index=pos;
834 } else {
835 /* we started with index<0 (unknown) so pos is bogus */
836 if(i<=1) {
837 return iter->index=i; /* reached the beginning */
838 } else {
839 /* we still don't know the UTF-16 index */
840 return UITER_UNKNOWN_INDEX;
841 }
842 }
843 }
844
845 static UBool U_CALLCONV
846 utf8IteratorHasNext(UCharIterator *iter) {
847 return iter->start<iter->limit || iter->reservedField!=0;
848 }
849
850 static UBool U_CALLCONV
851 utf8IteratorHasPrevious(UCharIterator *iter) {
852 return iter->start>0;
853 }
854
855 static UChar32 U_CALLCONV
856 utf8IteratorCurrent(UCharIterator *iter) {
857 if(iter->reservedField!=0) {
858 return U16_TRAIL(iter->reservedField);
859 } else if(iter->start<iter->limit) {
860 const uint8_t *s=(const uint8_t *)iter->context;
861 UChar32 c;
862 int32_t i=iter->start;
863
864 U8_NEXT(s, i, iter->limit, c);
865 if(c<0) {
866 return 0xfffd;
867 } else if(c<=0xffff) {
868 return c;
869 } else {
870 return U16_LEAD(c);
871 }
872 } else {
873 return U_SENTINEL;
874 }
875 }
876
877 static UChar32 U_CALLCONV
878 utf8IteratorNext(UCharIterator *iter) {
879 int32_t index;
880
881 if(iter->reservedField!=0) {
882 UChar trail=U16_TRAIL(iter->reservedField);
883 iter->reservedField=0;
884 if((index=iter->index)>=0) {
885 iter->index=index+1;
886 }
887 return trail;
888 } else if(iter->start<iter->limit) {
889 const uint8_t *s=(const uint8_t *)iter->context;
890 UChar32 c;
891
892 U8_NEXT(s, iter->start, iter->limit, c);
893 if((index=iter->index)>=0) {
894 iter->index=++index;
895 if(iter->length<0 && iter->start==iter->limit) {
896 iter->length= c<=0xffff ? index : index+1;
897 }
898 } else if(iter->start==iter->limit && iter->length>=0) {
899 iter->index= c<=0xffff ? iter->length : iter->length-1;
900 }
901 if(c<0) {
902 return 0xfffd;
903 } else if(c<=0xffff) {
904 return c;
905 } else {
906 iter->reservedField=c;
907 return U16_LEAD(c);
908 }
909 } else {
910 return U_SENTINEL;
911 }
912 }
913
914 static UChar32 U_CALLCONV
915 utf8IteratorPrevious(UCharIterator *iter) {
916 int32_t index;
917
918 if(iter->reservedField!=0) {
919 UChar lead=U16_LEAD(iter->reservedField);
920 iter->reservedField=0;
921 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
922 if((index=iter->index)>0) {
923 iter->index=index-1;
924 }
925 return lead;
926 } else if(iter->start>0) {
927 const uint8_t *s=(const uint8_t *)iter->context;
928 UChar32 c;
929
930 U8_PREV(s, 0, iter->start, c);
931 if((index=iter->index)>0) {
932 iter->index=index-1;
933 } else if(iter->start<=1) {
934 iter->index= c<=0xffff ? iter->start : iter->start+1;
935 }
936 if(c<0) {
937 return 0xfffd;
938 } else if(c<=0xffff) {
939 return c;
940 } else {
941 iter->start+=4; /* back to behind this supplementary code point for consistent state */
942 iter->reservedField=c;
943 return U16_TRAIL(c);
944 }
945 } else {
946 return U_SENTINEL;
947 }
948 }
949
950 static uint32_t U_CALLCONV
951 utf8IteratorGetState(const UCharIterator *iter) {
952 uint32_t state=(uint32_t)(iter->start<<1);
953 if(iter->reservedField!=0) {
954 state|=1;
955 }
956 return state;
957 }
958
959 static void U_CALLCONV
960 utf8IteratorSetState(UCharIterator *iter,
961 uint32_t state,
962 UErrorCode *pErrorCode)
963 {
964 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
965 /* do nothing */
966 } else if(iter==NULL) {
967 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
968 } else if(state==utf8IteratorGetState(iter)) {
969 /* setting to the current state: no-op */
970 } else {
971 int32_t index=(int32_t)(state>>1); /* UTF-8 index */
972 state&=1; /* 1 if in surrogate pair, must be index>=4 */
973
974 if((state==0 ? index<0 : index<4) || iter->limit<index) {
975 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
976 } else {
977 iter->start=index; /* restore UTF-8 byte index */
978 if(index<=1) {
979 iter->index=index;
980 } else {
981 iter->index=-1; /* unknown UTF-16 index */
982 }
983 if(state==0) {
984 iter->reservedField=0;
985 } else {
986 /* verified index>=4 above */
987 UChar32 c;
988 U8_PREV((const uint8_t *)iter->context, 0, index, c);
989 if(c<=0xffff) {
990 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
991 } else {
992 iter->reservedField=c;
993 }
994 }
995 }
996 }
997 }
998
999 static const UCharIterator utf8Iterator={
1000 0, 0, 0, 0, 0, 0,
1001 utf8IteratorGetIndex,
1002 utf8IteratorMove,
1003 utf8IteratorHasNext,
1004 utf8IteratorHasPrevious,
1005 utf8IteratorCurrent,
1006 utf8IteratorNext,
1007 utf8IteratorPrevious,
1008 NULL,
1009 utf8IteratorGetState,
1010 utf8IteratorSetState
1011 };
1012
1013 U_CAPI void U_EXPORT2
1014 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
1015 if(iter!=0) {
1016 if(s!=0 && length>=-1) {
1017 *iter=utf8Iterator;
1018 iter->context=s;
1019 if(length>=0) {
1020 iter->limit=length;
1021 } else {
1022 iter->limit=(int32_t)uprv_strlen(s);
1023 }
1024 iter->length= iter->limit<=1 ? iter->limit : -1;
1025 } else {
1026 *iter=noopIterator;
1027 }
1028 }
1029 }
1030
1031 /* Helper functions --------------------------------------------------------- */
1032
1033 U_CAPI UChar32 U_EXPORT2
1034 uiter_current32(UCharIterator *iter) {
1035 UChar32 c, c2;
1036
1037 c=iter->current(iter);
1038 if(UTF_IS_SURROGATE(c)) {
1039 if(UTF_IS_SURROGATE_FIRST(c)) {
1040 /*
1041 * go to the next code unit
1042 * we know that we are not at the limit because c!=U_SENTINEL
1043 */
1044 iter->move(iter, 1, UITER_CURRENT);
1045 if(UTF_IS_SECOND_SURROGATE(c2=iter->current(iter))) {
1046 c=UTF16_GET_PAIR_VALUE(c, c2);
1047 }
1048
1049 /* undo index movement */
1050 iter->move(iter, -1, UITER_CURRENT);
1051 } else {
1052 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1053 c=UTF16_GET_PAIR_VALUE(c2, c);
1054 }
1055 if(c2>=0) {
1056 /* undo index movement */
1057 iter->move(iter, 1, UITER_CURRENT);
1058 }
1059 }
1060 }
1061 return c;
1062 }
1063
1064 U_CAPI UChar32 U_EXPORT2
1065 uiter_next32(UCharIterator *iter) {
1066 UChar32 c, c2;
1067
1068 c=iter->next(iter);
1069 if(UTF_IS_FIRST_SURROGATE(c)) {
1070 if(UTF_IS_SECOND_SURROGATE(c2=iter->next(iter))) {
1071 c=UTF16_GET_PAIR_VALUE(c, c2);
1072 } else if(c2>=0) {
1073 /* unmatched first surrogate, undo index movement */
1074 iter->move(iter, -1, UITER_CURRENT);
1075 }
1076 }
1077 return c;
1078 }
1079
1080 U_CAPI UChar32 U_EXPORT2
1081 uiter_previous32(UCharIterator *iter) {
1082 UChar32 c, c2;
1083
1084 c=iter->previous(iter);
1085 if(UTF_IS_SECOND_SURROGATE(c)) {
1086 if(UTF_IS_FIRST_SURROGATE(c2=iter->previous(iter))) {
1087 c=UTF16_GET_PAIR_VALUE(c2, c);
1088 } else if(c2>=0) {
1089 /* unmatched second surrogate, undo index movement */
1090 iter->move(iter, 1, UITER_CURRENT);
1091 }
1092 }
1093 return c;
1094 }
1095
1096 U_CAPI uint32_t U_EXPORT2
1097 uiter_getState(const UCharIterator *iter) {
1098 if(iter==NULL || iter->getState==NULL) {
1099 return UITER_NO_STATE;
1100 } else {
1101 return iter->getState(iter);
1102 }
1103 }
1104
1105 U_CAPI void U_EXPORT2
1106 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
1107 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1108 /* do nothing */
1109 } else if(iter==NULL) {
1110 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1111 } else if(iter->setState==NULL) {
1112 *pErrorCode=U_UNSUPPORTED_ERROR;
1113 } else {
1114 iter->setState(iter, state, pErrorCode);
1115 }
1116 }
1117
1118 U_CDECL_END