2 *******************************************************************************
4 * Copyright (C) 2003-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: uit_len8.c
10 * tab size: 8 (not used)
13 * created on: 2003feb10
14 * created by: Markus W. Scherer
16 * This file contains the implementation of the "lenient UTF-8" UCharIterator
17 * as used in the uciter8 sample code.
18 * UTF-8-style macros are defined as well as the UCharIterator.
19 * The macros are incomplete (do not assemble code points from pairs of
20 * surrogates, see comment below)
21 * but sufficient for the iterator.
25 #include "unicode/utypes.h"
26 #include "unicode/uiter.h"
28 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
31 * This code leniently reads 8-bit Unicode strings,
32 * which could contain a mix of UTF-8 and CESU-8.
34 * - supplementary code points may be encoded with dedicated 4-byte sequences
36 * - supplementary code points may be encoded with
37 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
39 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
42 * Right now, the macros do not attempt to assemble code points from pairs of
43 * separately encoded surrogates.
44 * This would not be sufficient for processing based on these macros,
45 * but it is sufficient for a UCharIterator that returns only UChars anyway.
47 * The code is copied and modified from utf_impl.c and utf8.h.
49 * Change 2006feb08: Much of the implementation code is replaced by calling
50 * the utf_impl.c functions which accept a new "strict" parameter value
51 * of -2 implementing exactly this leniency.
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 * The visible statements load the byte s[i] into c (as uint8_t) while
 * post-incrementing i, and call utf8_nextCharSafeBody() with the strict
 * argument -2 to obtain the final value of c.
 * s and length are used without parentheses/cast protection only in the
 * call line (s is cast to const uint8_t * there), so callers in this file
 * pass plain identifiers or member accesses.
 * NOTE(review): the lines between and after these statements (the condition
 * that decides when utf8_nextCharSafeBody() is called, and the closing of the
 * macro body) are not visible in this view -- confirm against the full source.
 */
54 #define L8_NEXT(s, i, length, c) { \
55 (c)=(uint8_t)(s)[(i)++]; \
58 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 * The visible statements pre-decrement i, load the byte s[i] into c
 * (as uint8_t), and call utf8_prevCharSafeBody() with the strict argument -2
 * to obtain the final value of c; start is passed through as the lower
 * bound argument of that call.
 * NOTE(review): the condition that guards the utf8_prevCharSafeBody() call and
 * the closing of the macro body are not visible in this view -- confirm
 * against the full source.
 */
65 #define L8_PREV(s, start, i, c) { \
66 (c)=(uint8_t)(s)[--(i)]; \
69 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
76 /* lenient-8 UCharIterator -------------------------------------------------- */
79 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
80 * except that it uses the lenient-8-bit-Unicode macros above.
84 * Minimal implementation:
85 * Maintain a single-UChar buffer for an additional surrogate.
86 * The caller must not modify start and limit because they are used internally.
88 * Use UCharIterator fields as follows:
89 * context pointer to UTF-8 string
90 * length UTF-16 length of the string; -1 until lazy evaluation
91 * start current UTF-8 index
92 * index current UTF-16 index; may be -1="unknown" after setState()
93 * limit UTF-8 length of the string
94 * reservedField supplementary code point
96 * Since UCharIterator delivers 16-bit code units, the iteration can be
97 * currently in the middle of the byte sequence for a supplementary code point.
98 * In this case, reservedField will contain that code point and start will
99 * point to after the corresponding byte sequence. The UTF-16 index will be
100 * one less than what it would otherwise be corresponding to the UTF-8 index.
101 * Otherwise, reservedField will be 0.
105 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
106 * Add implementations that do not call strlen() for iteration but check for NUL.
/*
 * getIndex callback of the lenient-8 UCharIterator.
 *
 * The fragments visible here recompute UTF-16 positions by decoding the
 * bytes of iter->context with L8_NEXT:
 * - a first pass decodes up to the current UTF-8 offset (limit=iter->start),
 *   writes the reached byte offset back into iter->start, stores the counted
 *   value into iter->length, and decrements the counted index when
 *   iter->reservedField!=0 (iteration is in the middle of a supplementary
 *   code point);
 * - a second pass sets iter->index to length, or to length-1 when
 *   iter->reservedField!=0, and then continues decoding towards the end.
 *
 * NOTE(review): this view of the function is incomplete. The dispatch on
 * origin, the loops and counters around each L8_NEXT, the declarations of
 * s and c, the conditions guarding the stores, and all return statements are
 * not visible -- confirm details against the full source.
 */
109 static int32_t U_CALLCONV
110 lenient8IteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
) {
117 /* the current UTF-16 index is unknown after setState(), count from the beginning */
/* i: UTF-8 byte offset; limit: byte offset to stop at; index: counted UTF-16 units */
120 int32_t i
, limit
, index
;
122 s
=(const uint8_t *)iter
->context
;
124 limit
=iter
->start
; /* count up to the UTF-8 index */
126 L8_NEXT(s
, i
, limit
, c
);
134 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
136 iter
->length
=index
; /* in case it was <0 or wrong */
138 if(iter
->reservedField
!=0) {
139 --index
; /* we are in the middle of a supplementary code point */
/* second counting pass; it tracks a running length instead of an index */
149 int32_t i
, limit
, length
;
151 s
=(const uint8_t *)iter
->context
;
154 * the current UTF-16 index is unknown after setState(),
155 * we must first count from the beginning to here
160 /* count from the beginning to the current index */
162 L8_NEXT(s
, i
, limit
, c
);
170 /* assume i==limit==iter->start, set the UTF-16 index */
171 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
/* a pending supplementary code point puts the UTF-16 index one unit behind the count */
172 iter
->index
= iter
->reservedField
!=0 ? length
-1 : length
;
176 if(iter
->reservedField
!=0) {
181 /* count from the current index to the end */
184 L8_NEXT(s
, i
, limit
, c
);
195 /* not a valid origin */
196 /* Should never get here! */
/*
 * move callback of the lenient-8 UCharIterator: repositions the iterator by
 * delta UTF-16 code units (presumably relative to origin; the dispatch on
 * origin is not visible in this view).
 *
 * Steps visible in this view:
 * 1. Compute the requested UTF-16 index pos from iter->index+delta or
 *    iter->length+delta; when the needed base is unknown, only delta is used.
 * 2. Shortcuts: pin to the start (index=start=reservedField=0) or to the end
 *    (index=length, start=limit, reservedField=0).
 * 3. To reduce decoding work, optionally restart from the beginning or from
 *    the end before computing the remaining delta=pos-iter->index.
 * 4. With an unknown UTF-16 index, pin to an edge when |delta| is at least
 *    the number of UTF-8 bytes available in that direction.
 * 5. Walk forward with L8_NEXT (or backward, in the delta<0 branch). When the
 *    walk has to stop in the middle of a supplementary code point
 *    (delta==1 / delta==-1), the code point is parked in iter->reservedField;
 *    in the backward case i is moved back behind the byte sequence (+4).
 * 6. Return iter->index (assigned from pos, or from i when the beginning was
 *    reached), UITER_UNKNOWN_INDEX when the UTF-16 index is still unknown,
 *    or -1 on the path marked "Error".
 *
 * NOTE(review): many lines of this function (declarations of s and c, the
 * origin dispatch, several conditions, the per-code-point updates of
 * pos/delta, the backward decoding call, closing braces) are not visible in
 * this view -- confirm details against the full source.
 */
201 static int32_t U_CALLCONV
202 lenient8IteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
) {
205 int32_t pos
; /* requested UTF-16 index */
206 int32_t i
; /* UTF-8 index */
209 /* calculate the requested UTF-16 index */
215 /* iter->index<0 (unknown) is possible */
219 pos
=iter
->index
+delta
;
222 /* the current UTF-16 index is unknown after setState(), use only delta */
229 if(iter
->length
>=0) {
230 pos
=iter
->length
+delta
;
233 /* pin to the end, avoid counting the length */
235 iter
->start
=iter
->limit
;
236 iter
->reservedField
=0;
238 return UITER_UNKNOWN_INDEX
;
240 /* the current UTF-16 index is unknown, use only delta */
247 return -1; /* Error */
251 /* shortcuts: pinning to the edges of the string */
253 iter
->index
=iter
->start
=iter
->reservedField
=0;
255 } else if(iter
->length
>=0 && pos
>=iter
->length
) {
256 iter
->index
=iter
->length
;
257 iter
->start
=iter
->limit
;
258 iter
->reservedField
=0;
262 /* minimize the number of L8_NEXT/PREV operations */
263 if(iter
->index
<0 || pos
<iter
->index
/2) {
264 /* go forward from the start instead of backward from the current index */
265 iter
->index
=iter
->start
=iter
->reservedField
=0;
266 } else if(iter
->length
>=0 && (iter
->length
-pos
)<(pos
-iter
->index
)) {
268 * if we have the UTF-16 index and length and the new position is
269 * closer to the end than the current index,
270 * then go backward from the end instead of forward from the current index
272 iter
->index
=iter
->length
;
273 iter
->start
=iter
->limit
;
274 iter
->reservedField
=0;
/* remaining distance from the (possibly reset) current UTF-16 index */
277 delta
=pos
-iter
->index
;
279 return iter
->index
; /* nothing to do */
282 /* move relative to unknown UTF-16 index */
284 return UITER_UNKNOWN_INDEX
; /* nothing to do */
285 } else if(-delta
>=iter
->start
) {
286 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
287 iter
->index
=iter
->start
=iter
->reservedField
=0;
289 } else if(delta
>=(iter
->limit
-iter
->start
)) {
290 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
291 iter
->index
=iter
->length
; /* may or may not be <0 (unknown) */
292 iter
->start
=iter
->limit
;
293 iter
->reservedField
=0;
294 return iter
->index
>=0 ? iter
->index
: UITER_UNKNOWN_INDEX
;
300 /* move towards the requested position, pin to the edges of the string */
301 s
=(const uint8_t *)iter
->context
;
302 pos
=iter
->index
; /* could be <0 (unknown) */
306 int32_t limit
=iter
->limit
;
/* leave the middle of a pending supplementary code point before walking forward */
307 if(iter
->reservedField
!=0) {
308 iter
->reservedField
=0;
312 while(delta
>0 && i
<limit
) {
313 L8_NEXT(s
, i
, limit
, c
);
317 } else if(delta
>=2) {
320 } else /* delta==1 */ {
321 /* stop in the middle of a supplementary code point */
322 iter
->reservedField
=c
;
324 break; /* delta=0; */
/* whichever of length/index is known is used to derive the other; a pending code point shifts it by one */
328 if(iter
->length
<0 && iter
->index
>=0) {
329 iter
->length
= iter
->reservedField
==0 ? pos
: pos
+1;
330 } else if(iter
->index
<0 && iter
->length
>=0) {
331 iter
->index
= iter
->reservedField
==0 ? iter
->length
: iter
->length
-1;
334 } else /* delta<0 */ {
336 if(iter
->reservedField
!=0) {
337 iter
->reservedField
=0;
/* NOTE(review): the fixed 4 presumably is the length of a dedicated 4-byte sequence -- confirm */
338 i
-=4; /* we stayed behind the supplementary code point; go before it now */
342 while(delta
<0 && i
>0) {
347 } else if(delta
<=-2) {
350 } else /* delta==-1 */ {
351 /* stop in the middle of a supplementary code point */
352 i
+=4; /* back to behind this supplementary code point for consistent state */
353 iter
->reservedField
=c
;
355 break; /* delta=0; */
362 return iter
->index
=pos
;
364 /* we started with index<0 (unknown) so pos is bogus */
366 return iter
->index
=i
; /* reached the beginning */
368 /* we still don't know the UTF-16 index */
369 return UITER_UNKNOWN_INDEX
;
374 static UBool U_CALLCONV
375 lenient8IteratorHasNext(UCharIterator
*iter
) {
376 return iter
->reservedField
!=0 || iter
->start
<iter
->limit
;
379 static UBool U_CALLCONV
380 lenient8IteratorHasPrevious(UCharIterator
*iter
) {
381 return iter
->start
>0;
/*
 * current callback of the lenient-8 UCharIterator: looks at the code unit at
 * the current position without moving the iterator.
 * - If a supplementary code point is pending in iter->reservedField, its
 *   trail surrogate (U16_TRAIL) is returned.
 * - Otherwise, if bytes remain (iter->start<iter->limit), one code point is
 *   decoded with L8_NEXT using a local copy i of iter->start, so the
 *   iterator's byte offset is left untouched by the decoding.
 * NOTE(review): the declaration of c, the handling of the decoded value
 * (including the c<=0xffff case) and the remaining return statements are not
 * visible in this view -- confirm against the full source.
 */
384 static UChar32 U_CALLCONV
385 lenient8IteratorCurrent(UCharIterator
*iter
) {
386 if(iter
->reservedField
!=0) {
387 return U16_TRAIL(iter
->reservedField
);
388 } else if(iter
->start
<iter
->limit
) {
389 const uint8_t *s
=(const uint8_t *)iter
->context
;
/* decode with a scratch offset so that iter->start does not advance */
391 int32_t i
=iter
->start
;
393 L8_NEXT(s
, i
, iter
->limit
, c
);
396 } else if(c
<=0xffff) {
/*
 * next callback of the lenient-8 UCharIterator: delivers the code unit at the
 * current position and advances.
 * - If a supplementary code point is pending in iter->reservedField, its
 *   trail surrogate is computed and reservedField is cleared.
 * - Otherwise, if bytes remain, one code point is decoded with L8_NEXT
 *   directly on iter->start (advancing the byte offset). When the end of the
 *   bytes is reached, the unknown one of iter->length / iter->index is
 *   derived from the known one, adjusted by one for a supplementary code
 *   point (c>0xffff). A supplementary code point is then parked in
 *   iter->reservedField.
 * NOTE(review): the declarations of c and index, the updates of iter->index
 * inside the "index known" branches, and all return statements are not
 * visible in this view -- confirm against the full source.
 */
406 static UChar32 U_CALLCONV
407 lenient8IteratorNext(UCharIterator
*iter
) {
410 if(iter
->reservedField
!=0) {
411 UChar trail
=U16_TRAIL(iter
->reservedField
);
412 iter
->reservedField
=0;
413 if((index
=iter
->index
)>=0) {
417 } else if(iter
->start
<iter
->limit
) {
418 const uint8_t *s
=(const uint8_t *)iter
->context
;
/* decodes in place: iter->start itself is the running byte offset */
421 L8_NEXT(s
, iter
->start
, iter
->limit
, c
);
422 if((index
=iter
->index
)>=0) {
/* end reached with known index: the UTF-16 length can now be recorded */
424 if(iter
->length
<0 && iter
->start
==iter
->limit
) {
425 iter
->length
= c
<=0xffff ? index
: index
+1;
/* end reached with unknown index but known length: recover the index */
427 } else if(iter
->start
==iter
->limit
&& iter
->length
>=0) {
428 iter
->index
= c
<=0xffff ? iter
->length
: iter
->length
-1;
432 } else if(c
<=0xffff) {
435 iter
->reservedField
=c
;
/*
 * previous callback of the lenient-8 UCharIterator: steps back and delivers
 * the code unit before the current position.
 * - If a supplementary code point is pending in iter->reservedField, its
 *   lead surrogate is computed, reservedField is cleared and iter->start is
 *   moved back by 4 bytes to before the code point's byte sequence.
 * - Otherwise, if iter->start>0, one code point is decoded backwards with
 *   L8_PREV directly on iter->start. When the UTF-16 index is not known to be
 *   positive and the byte offset has reached 0 or 1, iter->index is derived
 *   from iter->start (plus one for a supplementary code point).
 *   A supplementary code point is parked in iter->reservedField and
 *   iter->start is moved forward by 4 again so that it stays behind the byte
 *   sequence.
 * NOTE(review): the declarations of c and index, the updates of iter->index
 * inside the "index known" branches, and all return statements are not
 * visible in this view -- confirm against the full source.
 */
443 static UChar32 U_CALLCONV
444 lenient8IteratorPrevious(UCharIterator
*iter
) {
447 if(iter
->reservedField
!=0) {
448 UChar lead
=U16_LEAD(iter
->reservedField
);
449 iter
->reservedField
=0;
450 iter
->start
-=4; /* we stayed behind the supplementary code point; go before it now */
451 if((index
=iter
->index
)>0) {
455 } else if(iter
->start
>0) {
456 const uint8_t *s
=(const uint8_t *)iter
->context
;
/* decodes in place: iter->start is decremented to the start of the code point */
459 L8_PREV(s
, 0, iter
->start
, c
);
460 if((index
=iter
->index
)>0) {
462 } else if(iter
->start
<=1) {
463 iter
->index
= c
<=0xffff ? iter
->start
: iter
->start
+1;
467 } else if(c
<=0xffff) {
470 iter
->start
+=4; /* back to behind this supplementary code point for consistent state */
471 iter
->reservedField
=c
;
/*
 * getState callback of the lenient-8 UCharIterator.
 * Builds the state word from the UTF-8 byte offset: iter->start shifted left
 * by one bit, leaving bit 0 free.
 * NOTE(review): the body of the reservedField!=0 branch and the return
 * statement are not visible in this view. lenient8IteratorSetState() reads
 * bit 0 as "in surrogate pair" and state>>1 as the UTF-8 index, so bit 0 is
 * presumably set here when a supplementary code point is pending -- confirm
 * against the full source.
 */
479 static uint32_t U_CALLCONV
480 lenient8IteratorGetState(const UCharIterator
*iter
) {
481 uint32_t state
=(uint32_t)(iter
->start
<<1);
482 if(iter
->reservedField
!=0) {
/*
 * setState callback of the lenient-8 UCharIterator: restores a position
 * previously encoded as a state word.
 * - Does nothing further when pErrorCode is NULL or already indicates failure.
 * - Sets U_ILLEGAL_ARGUMENT_ERROR when iter is NULL.
 * - Treats a state equal to lenient8IteratorGetState(iter) as a no-op.
 * - Otherwise splits state into the UTF-8 index (state>>1) and the
 *   "in surrogate pair" flag (state&1). The index must be >=0 (flag clear)
 *   or >=4 (flag set) and must not exceed iter->limit; else
 *   U_INDEX_OUTOFBOUNDS_ERROR is set.
 * - On success iter->start is set to the byte index and iter->index to -1
 *   (unknown UTF-16 index). With the flag clear reservedField becomes 0;
 *   with the flag set the code point before the index is decoded with
 *   L8_PREV and stored in reservedField.
 * NOTE(review): the declaration of c, the condition that leads to the second
 * U_INDEX_OUTOFBOUNDS_ERROR after L8_PREV, and the branch structure between
 * the visible statements are not visible in this view -- confirm against the
 * full source.
 */
488 static void U_CALLCONV
489 lenient8IteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
) {
490 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
492 } else if(iter
==NULL
) {
493 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
494 } else if(state
==lenient8IteratorGetState(iter
)) {
495 /* setting to the current state: no-op */
497 int32_t index
=(int32_t)(state
>>1); /* UTF-8 index */
498 state
&=1; /* 1 if in surrogate pair, must be index>=4 */
/* reject offsets outside [0 or 4, iter->limit] depending on the flag */
500 if((state
==0 ? index
<0 : index
<4) || iter
->limit
<index
) {
501 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
503 iter
->start
=index
; /* restore UTF-8 byte index */
507 iter
->index
=-1; /* unknown UTF-16 index */
510 iter
->reservedField
=0;
512 /* verified index>=4 above */
/* index is a local copy here, so iter->start is not moved by the backward read */
514 L8_PREV((const uint8_t *)iter
->context
, 0, index
, c
);
516 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
518 iter
->reservedField
=c
;
/*
 * Prototype UCharIterator for lenient UTF-8/CESU-8 strings.
 * It lists the lenient8Iterator* callbacks defined in this file;
 * uiter_setLenient8() copies this structure by value into the caller's
 * iterator.
 * NOTE(review): some initializer lines (the values for the leading data
 * fields and the entry between lenient8IteratorPrevious and
 * lenient8IteratorGetState) are not visible in this view -- confirm the
 * field order against the UCharIterator declaration.
 */
525 static const UCharIterator lenient8Iterator
={
527 lenient8IteratorGetIndex
,
528 lenient8IteratorMove
,
529 lenient8IteratorHasNext
,
530 lenient8IteratorHasPrevious
,
531 lenient8IteratorCurrent
,
532 lenient8IteratorNext
,
533 lenient8IteratorPrevious
,
535 lenient8IteratorGetState
,
536 lenient8IteratorSetState
/*
 * Public entry point: sets up iter to iterate over the lenient
 * UTF-8/CESU-8 string s.
 *
 * iter   - iterator structure to initialize
 * s      - byte string; a null pointer selects the no-op iterator
 * length - values below -1 select the no-op iterator
 *
 * When s is non-null and length>=-1, *iter is overwritten with a copy of
 * lenient8Iterator. The visible code sets iter->limit from strlen(s) and
 * initializes iter->length (the UTF-16 length) to iter->limit when that is
 * 0 or 1, and to -1 (unknown, evaluated lazily) otherwise.
 * In all other cases iter is made a no-op iterator via
 * uiter_setString(iter, NULL, 0).
 *
 * NOTE(review): the lines that store s in the iterator, the branch that
 * distinguishes an explicit length from the strlen() case, and any check of
 * iter itself are not visible in this view -- confirm against the full
 * source. The strlen() result is assigned to the int32_t limit field without
 * a visible cast.
 */
539 U_CAPI
void U_EXPORT2
540 uiter_setLenient8(UCharIterator
*iter
, const char *s
, int32_t length
) {
542 if(s
!=0 && length
>=-1) {
543 *iter
=lenient8Iterator
;
548 iter
->limit
=strlen(s
);
/* for 0 or 1 bytes the UTF-16 length equals the byte length; otherwise it is computed on demand */
550 iter
->length
= iter
->limit
<=1 ? iter
->limit
: -1;
552 /* set no-op iterator */
553 uiter_setString(iter
, NULL
, 0);