2 *******************************************************************************
4 * Copyright (C) 2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uit_len8.c
10 * tab size: 8 (not used)
13 * created on: 2003feb10
14 * created by: Markus W. Scherer
16 * This file contains the implementation of the "lenient UTF-8" UCharIterator
17 * as used in the uciter8 sample code.
18 * UTF-8-style macros are defined as well as the UCharIterator.
19 * The macros are incomplete (do not assemble code points from pairs of
20 * surrogates, see comment below)
21 * but sufficient for the iterator.
25 #include "unicode/utypes.h"
26 #include "unicode/uiter.h"
28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
31 * This code leniently reads 8-bit Unicode strings,
32 * which could contain a mix of UTF-8 and CESU-8.
34 * - supplementary code points may be encoded with dedicated 4-byte sequences
36 * - supplementary code points may be encoded with
37 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
39 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
42 * Right now, the macros do not attempt to assemble code points from pairs of
43 * separately encoded surrogates.
44 * This would not be sufficient for processing based on these macros,
45 * but it is sufficient for a UCharIterator that returns only UChars anyway.
47 * The code is copied and modified from utf_impl.c and utf8.h.
48 * The "strict" argument in the implementation functions is completely removed,
49 * using the "<0" branch from the original code.
50 * Checks for surrogate code points are removed for the leniency
55 lenient8_minLegal
[4]={ 0, 0x80, 0x800, 0x10000 };
58 lenient8_nextCharSafeBody(const uint8_t *s
, int32_t *pi
, int32_t length
, UChar32 c
) {
60 uint8_t count
=U8_COUNT_TRAIL_BYTES(c
);
61 if((i
)+count
<=(length
)) {
62 uint8_t trail
, illegal
=0;
64 U8_MASK_LEAD_BYTE((c
), count
);
65 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
67 /* each branch falls through to the next one */
70 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
75 (c
)=((c
)<<6)|(trail
&0x3f);
77 illegal
|=(trail
&0xc0)^0x80;
79 /* code point>0x10ffff, outside Unicode */
85 (c
)=((c
)<<6)|(trail
&0x3f);
86 illegal
|=(trail
&0xc0)^0x80;
89 (c
)=((c
)<<6)|(trail
&0x3f);
90 illegal
|=(trail
&0xc0)^0x80;
94 /* no default branch to optimize switch() - all values are covered */
97 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
98 /* illegal is also set if count>=4 */
99 if(illegal
|| (c
)<lenient8_minLegal
[count
]) {
101 uint8_t errorCount
=count
;
102 /* don't go beyond this sequence */
104 while(count
>0 && U8_IS_TRAIL(s
[i
])) {
110 } else /* too few bytes left */ {
113 /* don't just set (i)=(length) in case there is an illegal sequence */
114 while((i
)<(length
) && U8_IS_TRAIL(s
[i
])) {
124 lenient8_prevCharSafeBody(const uint8_t *s
, int32_t start
, int32_t *pi
, UChar32 c
) {
126 uint8_t b
, count
=1, shift
=6;
128 /* extract value bits from the last trail byte */
133 /* no lead byte at all */
137 /* read another previous byte */
139 if((uint8_t)(b
-0x80)<0x7e) { /* 0x80<=b<0xfe */
141 /* lead byte, this will always end the loop */
142 uint8_t shouldCount
=U8_COUNT_TRAIL_BYTES(b
);
144 if(count
==shouldCount
) {
145 /* set the new position */
147 U8_MASK_LEAD_BYTE(b
, count
);
148 c
|=(UChar32
)b
<<shift
;
149 if(count
>=4 || c
>0x10ffff || c
<lenient8_minLegal
[count
]) {
150 /* illegal sequence */
156 /* exit with correct c */
159 /* the lead byte does not match the number of trail bytes */
160 /* only set the position to the lead byte if it would
161 include the trail byte that we started with */
162 if(count
<shouldCount
) {
170 c
|=(UChar32
)(b
&0x3f)<<shift
;
174 /* more than 5 trail bytes is illegal */
179 /* single-byte character precedes trailing bytes */
187 #define L8_NEXT(s, i, length, c) { \
190 if(U8_IS_LEAD(c)) { \
191 (c)=lenient8_nextCharSafeBody(s, &(i), (int32_t)(length), c); \
198 #define L8_PREV(s, start, i, c) { \
202 (c)=lenient8_prevCharSafeBody(s, start, &(i), c); \
209 /* lenient-8 UCharIterator -------------------------------------------------- */
212 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
213 * except that it uses the lenient-8-bit-Unicode macros above.
217 * Minimal implementation:
218 * Maintain a single-UChar buffer for an additional surrogate.
219 * The caller must not modify start and limit because they are used internally.
221 * Use UCharIterator fields as follows:
222 * context pointer to UTF-8 string
223 * length UTF-16 length of the string; -1 until lazy evaluation
224 * start current UTF-8 index
225 * index current UTF-16 index; may be -1="unknown" after setState()
226 * limit UTF-8 length of the string
227 * reservedField supplementary code point
229 * Since UCharIterator delivers 16-bit code units, the iteration can be
230 * currently in the middle of the byte sequence for a supplementary code point.
231 * In this case, reservedField will contain that code point and start will
232 * point to after the corresponding byte sequence. The UTF-16 index will be
233 * one less than what it would otherwise be corresponding to the UTF-8 index.
234 * Otherwise, reservedField will be 0.
238 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
239 * Add implementations that do not call strlen() for iteration but check for NUL.
242 static int32_t U_CALLCONV
243 lenient8IteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
) {
250 /* the current UTF-16 index is unknown after setState(), count from the beginning */
253 int32_t i
, limit
, index
;
255 s
=(const uint8_t *)iter
->context
;
257 limit
=iter
->start
; /* count up to the UTF-8 index */
259 L8_NEXT(s
, i
, limit
, c
);
267 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
269 iter
->length
=index
; /* in case it was <0 or wrong */
271 if(iter
->reservedField
!=0) {
272 --index
; /* we are in the middle of a supplementary code point */
282 int32_t i
, limit
, length
;
284 s
=(const uint8_t *)iter
->context
;
287 * the current UTF-16 index is unknown after setState(),
288 * we must first count from the beginning to here
293 /* count from the beginning to the current index */
295 L8_NEXT(s
, i
, limit
, c
);
303 /* assume i==limit==iter->start, set the UTF-16 index */
304 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
305 iter
->index
= iter
->reservedField
!=0 ? length
-1 : length
;
309 if(iter
->reservedField
!=0) {
314 /* count from the current index to the end */
317 L8_NEXT(s
, i
, limit
, c
);
328 /* not a valid origin */
329 /* Should never get here! */
334 static int32_t U_CALLCONV
335 lenient8IteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
) {
338 int32_t pos
; /* requested UTF-16 index */
339 int32_t i
; /* UTF-8 index */
342 /* calculate the requested UTF-16 index */
348 /* iter->index<0 (unknown) is possible */
352 pos
=iter
->index
+delta
;
355 /* the current UTF-16 index is unknown after setState(), use only delta */
362 if(iter
->length
>=0) {
363 pos
=iter
->length
+delta
;
366 /* pin to the end, avoid counting the length */
368 iter
->start
=iter
->limit
;
369 iter
->reservedField
=0;
371 return UITER_UNKNOWN_INDEX
;
373 /* the current UTF-16 index is unknown, use only delta */
380 return -1; /* Error */
384 /* shortcuts: pinning to the edges of the string */
386 iter
->index
=iter
->start
=iter
->reservedField
=0;
388 } else if(iter
->length
>=0 && pos
>=iter
->length
) {
389 iter
->index
=iter
->length
;
390 iter
->start
=iter
->limit
;
391 iter
->reservedField
=0;
395 /* minimize the number of L8_NEXT/PREV operations */
396 if(iter
->index
<0 || pos
<iter
->index
/2) {
397 /* go forward from the start instead of backward from the current index */
398 iter
->index
=iter
->start
=iter
->reservedField
=0;
399 } else if(iter
->length
>=0 && (iter
->length
-pos
)<(pos
-iter
->index
)) {
401 * if we have the UTF-16 index and length and the new position is
402 * closer to the end than the current index,
403 * then go backward from the end instead of forward from the current index
405 iter
->index
=iter
->length
;
406 iter
->start
=iter
->limit
;
407 iter
->reservedField
=0;
410 delta
=pos
-iter
->index
;
412 return iter
->index
; /* nothing to do */
415 /* move relative to unknown UTF-16 index */
417 return UITER_UNKNOWN_INDEX
; /* nothing to do */
418 } else if(-delta
>=iter
->start
) {
419 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
420 iter
->index
=iter
->start
=iter
->reservedField
=0;
422 } else if(delta
>=(iter
->limit
-iter
->start
)) {
423 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
424 iter
->index
=iter
->length
; /* may or may not be <0 (unknown) */
425 iter
->start
=iter
->limit
;
426 iter
->reservedField
=0;
427 return iter
->index
>=0 ? iter
->index
: UITER_UNKNOWN_INDEX
;
433 /* move towards the requested position, pin to the edges of the string */
434 s
=(const uint8_t *)iter
->context
;
435 pos
=iter
->index
; /* could be <0 (unknown) */
439 int32_t limit
=iter
->limit
;
440 if(iter
->reservedField
!=0) {
441 iter
->reservedField
=0;
445 while(delta
>0 && i
<limit
) {
446 L8_NEXT(s
, i
, limit
, c
);
450 } else if(delta
>=2) {
453 } else /* delta==1 */ {
454 /* stop in the middle of a supplementary code point */
455 iter
->reservedField
=c
;
457 break; /* delta=0; */
461 if(iter
->length
<0 && iter
->index
>=0) {
462 iter
->length
= iter
->reservedField
==0 ? pos
: pos
+1;
463 } else if(iter
->index
<0 && iter
->length
>=0) {
464 iter
->index
= iter
->reservedField
==0 ? iter
->length
: iter
->length
-1;
467 } else /* delta<0 */ {
469 if(iter
->reservedField
!=0) {
470 iter
->reservedField
=0;
471 i
-=4; /* we stayed behind the supplementary code point; go before it now */
475 while(delta
<0 && i
>0) {
480 } else if(delta
<=-2) {
483 } else /* delta==-1 */ {
484 /* stop in the middle of a supplementary code point */
485 i
+=4; /* back to behind this supplementary code point for consistent state */
486 iter
->reservedField
=c
;
488 break; /* delta=0; */
495 return iter
->index
=pos
;
497 /* we started with index<0 (unknown) so pos is bogus */
499 return iter
->index
=i
; /* reached the beginning */
501 /* we still don't know the UTF-16 index */
502 return UITER_UNKNOWN_INDEX
;
507 static UBool U_CALLCONV
508 lenient8IteratorHasNext(UCharIterator
*iter
) {
509 return iter
->reservedField
!=0 || iter
->start
<iter
->limit
;
512 static UBool U_CALLCONV
513 lenient8IteratorHasPrevious(UCharIterator
*iter
) {
514 return iter
->start
>0;
517 static UChar32 U_CALLCONV
518 lenient8IteratorCurrent(UCharIterator
*iter
) {
519 if(iter
->reservedField
!=0) {
520 return U16_TRAIL(iter
->reservedField
);
521 } else if(iter
->start
<iter
->limit
) {
522 const uint8_t *s
=(const uint8_t *)iter
->context
;
524 int32_t i
=iter
->start
;
526 L8_NEXT(s
, i
, iter
->limit
, c
);
529 } else if(c
<=0xffff) {
539 static UChar32 U_CALLCONV
540 lenient8IteratorNext(UCharIterator
*iter
) {
543 if(iter
->reservedField
!=0) {
544 UChar trail
=U16_TRAIL(iter
->reservedField
);
545 iter
->reservedField
=0;
546 if((index
=iter
->index
)>=0) {
550 } else if(iter
->start
<iter
->limit
) {
551 const uint8_t *s
=(const uint8_t *)iter
->context
;
554 L8_NEXT(s
, iter
->start
, iter
->limit
, c
);
555 if((index
=iter
->index
)>=0) {
557 if(iter
->length
<0 && iter
->start
==iter
->limit
) {
558 iter
->length
= c
<=0xffff ? index
: index
+1;
560 } else if(iter
->start
==iter
->limit
&& iter
->length
>=0) {
561 iter
->index
= c
<=0xffff ? iter
->length
: iter
->length
-1;
565 } else if(c
<=0xffff) {
568 iter
->reservedField
=c
;
576 static UChar32 U_CALLCONV
577 lenient8IteratorPrevious(UCharIterator
*iter
) {
580 if(iter
->reservedField
!=0) {
581 UChar lead
=U16_LEAD(iter
->reservedField
);
582 iter
->reservedField
=0;
583 iter
->start
-=4; /* we stayed behind the supplementary code point; go before it now */
584 if((index
=iter
->index
)>0) {
588 } else if(iter
->start
>0) {
589 const uint8_t *s
=(const uint8_t *)iter
->context
;
592 L8_PREV(s
, 0, iter
->start
, c
);
593 if((index
=iter
->index
)>0) {
595 } else if(iter
->start
<=1) {
596 iter
->index
= c
<=0xffff ? iter
->start
: iter
->start
+1;
600 } else if(c
<=0xffff) {
603 iter
->start
+=4; /* back to behind this supplementary code point for consistent state */
604 iter
->reservedField
=c
;
612 static uint32_t U_CALLCONV
613 lenient8IteratorGetState(const UCharIterator
*iter
) {
614 uint32_t state
=(uint32_t)(iter
->start
<<1);
615 if(iter
->reservedField
!=0) {
621 static void U_CALLCONV
622 lenient8IteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
) {
623 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
625 } else if(iter
==NULL
) {
626 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
627 } else if(state
==lenient8IteratorGetState(iter
)) {
628 /* setting to the current state: no-op */
630 int32_t index
=(int32_t)(state
>>1); /* UTF-8 index */
631 state
&=1; /* 1 if in surrogate pair, must be index>=4 */
633 if((state
==0 ? index
<0 : index
<4) || iter
->limit
<index
) {
634 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
636 iter
->start
=index
; /* restore UTF-8 byte index */
640 iter
->index
=-1; /* unknown UTF-16 index */
643 iter
->reservedField
=0;
645 /* verified index>=4 above */
647 L8_PREV((const uint8_t *)iter
->context
, 0, index
, c
);
649 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
651 iter
->reservedField
=c
;
658 static const UCharIterator lenient8Iterator
={
660 lenient8IteratorGetIndex
,
661 lenient8IteratorMove
,
662 lenient8IteratorHasNext
,
663 lenient8IteratorHasPrevious
,
664 lenient8IteratorCurrent
,
665 lenient8IteratorNext
,
666 lenient8IteratorPrevious
,
668 lenient8IteratorGetState
,
669 lenient8IteratorSetState
672 U_CAPI
void U_EXPORT2
673 uiter_setLenient8(UCharIterator
*iter
, const char *s
, int32_t length
) {
675 if(s
!=0 && length
>=-1) {
676 *iter
=lenient8Iterator
;
681 iter
->limit
=strlen(s
);
683 iter
->length
= iter
->limit
<=1 ? iter
->limit
: -1;
685 /* set no-op iterator */
686 uiter_setString(iter
, NULL
, 0);