2 *******************************************************************************
4 * © 2016 and later: Unicode, Inc. and others.
5 * License & terms of use: http://www.unicode.org/copyright.html#License
7 *******************************************************************************
8 *******************************************************************************
10 * Copyright (C) 2003-2006, International Business Machines
11 * Corporation and others. All Rights Reserved.
13 *******************************************************************************
14 * file name: uit_len8.c
16 * tab size: 8 (not used)
19 * created on: 2003feb10
20 * created by: Markus W. Scherer
22 * This file contains the implementation of the "lenient UTF-8" UCharIterator
23 * as used in the uciter8 sample code.
24 * UTF-8-style macros are defined as well as the UCharIterator.
25 * The macros are incomplete (do not assemble code points from pairs of
26 * surrogates, see comment below)
27 * but sufficient for the iterator.
31 #include "unicode/utypes.h"
32 #include "unicode/uiter.h"
34 /* lenient UTF-8/CESU-8 macros ---------------------------------------------- */
37 * This code leniently reads 8-bit Unicode strings,
38 * which could contain a mix of UTF-8 and CESU-8.
40 * - supplementary code points may be encoded with dedicated 4-byte sequences
42 * - supplementary code points may be encoded with
43 * pairs of 3-byte sequences, one for each surrogate of the UTF-16 form
45 * - single surrogates are allowed, encoded with their "natural" 3-byte sequences
48 * Right now, the macros do not attempt to assemble code points from pairs of
49 * separately encoded surrogates.
50 * This would not be sufficient for processing based on these macros,
51 * but it is sufficient for a UCharIterator that returns only UChars anyway.
53 * The code is copied and modified from utf_impl.c and utf8.h.
55 * Change 2006feb08: Much of the implementation code is replaced by calling
56 * the utf_impl.c functions which accept a new "strict" parameter value
57 * of -2 implementing exactly this leniency.
/*
 * Lenient code point readers used by the iterator functions in this file.
 *
 * L8_NEXT(s, i, length, c): loads the byte s[i] into c and post-increments i;
 *   multi-byte sequences are completed by utf8_nextCharSafeBody(), called with
 *   the "strict" value -2 that selects the leniency described above.
 * L8_PREV(s, start, i, c): pre-decrements i and loads the byte s[i] into c;
 *   multi-byte sequences are completed by utf8_prevCharSafeBody(), again
 *   called with -2.
 *
 * Both macros expand to a brace block, assign to c and modify i, and mention
 * i and c more than once, so the arguments must be plain variables.
 *
 * NOTE(review): the conditions that decide when the *SafeBody helper is
 * called are not visible in this listing - confirm against the full file.
 */
60 #define L8_NEXT(s, i, length, c) { \
61 (c)=(uint8_t)(s)[(i)++]; \
64 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
71 #define L8_PREV(s, start, i, c) { \
72 (c)=(uint8_t)(s)[--(i)]; \
75 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
82 /* lenient-8 UCharIterator -------------------------------------------------- */
85 * This is a copy of the UTF-8 UCharIterator in uiter.cpp,
86 * except that it uses the lenient-8-bit-Unicode macros above.
90 * Minimal implementation:
91 * Maintain a single-UChar buffer for an additional surrogate.
92 * The caller must not modify start and limit because they are used internally.
94 * Use UCharIterator fields as follows:
95 * context pointer to UTF-8 string
96 * length UTF-16 length of the string; -1 until lazy evaluation
97 * start current UTF-8 index
98 * index current UTF-16 index; may be -1="unknown" after setState()
99 * limit UTF-8 length of the string
100 * reservedField supplementary code point
102 * Since UCharIterator delivers 16-bit code units, the iteration can be
103 * currently in the middle of the byte sequence for a supplementary code point.
104 * In this case, reservedField will contain that code point and start will
105 * point to after the corresponding byte sequence. The UTF-16 index will be
106 * one less than what it would otherwise be corresponding to the UTF-8 index.
107 * Otherwise, reservedField will be 0.
111 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
112 * Add implementations that do not call strlen() for iteration but check for NUL.
/*
 * getIndex callback of the lenient-8 UCharIterator.
 *
 * iter:   the iterator; context is read as the UTF-8 string, start as the
 *         current UTF-8 index, index/length as UTF-16 values that may be
 *         unknown (<0).
 * origin: selects which index the caller is asking for.
 *
 * The visible code lazily recomputes unknown values by decoding with L8_NEXT:
 *   - it counts from the beginning of the string up to iter->start to recover
 *     the UTF-16 index after setState();
 *   - it counts from the current index to the end to determine the length.
 * After counting, iter->start is reassigned from the decoding index i, and the
 * UTF-16 index is one less than the count while reservedField!=0.
 *
 * NOTE(review): the dispatch on origin, the counting loop bodies and the
 * return statements are not visible in this listing - confirm against the
 * full file before relying on details.
 */
115 static int32_t U_CALLCONV
116 lenient8IteratorGetIndex(UCharIterator
*iter
, UCharIteratorOrigin origin
) {
123 /* the current UTF-16 index is unknown after setState(), count from the beginning */
126 int32_t i
, limit
, index
;
128 s
=(const uint8_t *)iter
->context
;
130 limit
=iter
->start
; /* count up to the UTF-8 index */
132 L8_NEXT(s
, i
, limit
, c
);
140 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
142 iter
->length
=index
; /* in case it was <0 or wrong */
144 if(iter
->reservedField
!=0) {
145 --index
; /* we are in the middle of a supplementary code point */
/* second counting section: uses a local "length" instead of "index"; presumably the branch that determines the UTF-16 length - confirm */
155 int32_t i
, limit
, length
;
157 s
=(const uint8_t *)iter
->context
;
160 * the current UTF-16 index is unknown after setState(),
161 * we must first count from the beginning to here
166 /* count from the beginning to the current index */
168 L8_NEXT(s
, i
, limit
, c
);
176 /* assume i==limit==iter->start, set the UTF-16 index */
177 iter
->start
=i
; /* just in case setState() did not get us to a code point boundary */
178 iter
->index
= iter
->reservedField
!=0 ? length
-1 : length
;
182 if(iter
->reservedField
!=0) {
187 /* count from the current index to the end */
190 L8_NEXT(s
, i
, limit
, c
);
201 /* not a valid origin */
202 /* Should never get here! */
/*
 * move callback of the lenient-8 UCharIterator.
 *
 * iter:   the iterator to reposition.
 * delta:  signed number of UTF-16 code units to move.
 * origin: the position that delta is relative to.
 *
 * Visible phases:
 *   1. Compute the requested UTF-16 index pos from origin and delta. When the
 *      relevant index or length is unknown (<0), only delta is used.
 *   2. Shortcuts: pin directly to the start (all of index/start/reservedField
 *      set to 0) or to the end (index=length, start=limit, reservedField=0).
 *   3. Choose the cheapest starting point (string start, string end, or the
 *      current position) and turn pos back into a relative delta.
 *   4. Walk forward with L8_NEXT or backward, possibly stopping in the middle
 *      of a supplementary code point, in which case the code point is kept
 *      in reservedField.
 *
 * Visible return values: the new UTF-16 index, UITER_UNKNOWN_INDEX while the
 * UTF-16 index is still unknown, or -1 on the path marked "Error".
 *
 * NOTE(review): the origin dispatch, several conditions and the bodies of
 * the walking loops are not visible in this listing - confirm against the
 * full file.
 */
207 static int32_t U_CALLCONV
208 lenient8IteratorMove(UCharIterator
*iter
, int32_t delta
, UCharIteratorOrigin origin
) {
211 int32_t pos
; /* requested UTF-16 index */
212 int32_t i
; /* UTF-8 index */
215 /* calculate the requested UTF-16 index */
221 /* iter->index<0 (unknown) is possible */
225 pos
=iter
->index
+delta
;
228 /* the current UTF-16 index is unknown after setState(), use only delta */
235 if(iter
->length
>=0) {
236 pos
=iter
->length
+delta
;
239 /* pin to the end, avoid counting the length */
241 iter
->start
=iter
->limit
;
242 iter
->reservedField
=0;
244 return UITER_UNKNOWN_INDEX
;
246 /* the current UTF-16 index is unknown, use only delta */
253 return -1; /* Error */
257 /* shortcuts: pinning to the edges of the string */
259 iter
->index
=iter
->start
=iter
->reservedField
=0;
261 } else if(iter
->length
>=0 && pos
>=iter
->length
) {
262 iter
->index
=iter
->length
;
263 iter
->start
=iter
->limit
;
264 iter
->reservedField
=0;
268 /* minimize the number of L8_NEXT/PREV operations */
269 if(iter
->index
<0 || pos
<iter
->index
/2) {
270 /* go forward from the start instead of backward from the current index */
271 iter
->index
=iter
->start
=iter
->reservedField
=0;
272 } else if(iter
->length
>=0 && (iter
->length
-pos
)<(pos
-iter
->index
)) {
274 * if we have the UTF-16 index and length and the new position is
275 * closer to the end than the current index,
276 * then go backward from the end instead of forward from the current index
278 iter
->index
=iter
->length
;
279 iter
->start
=iter
->limit
;
280 iter
->reservedField
=0;
/* turn the absolute target back into a delta relative to the (possibly just reset) UTF-16 index */
283 delta
=pos
-iter
->index
;
285 return iter
->index
; /* nothing to do */
288 /* move relative to unknown UTF-16 index */
290 return UITER_UNKNOWN_INDEX
; /* nothing to do */
291 } else if(-delta
>=iter
->start
) {
292 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
293 iter
->index
=iter
->start
=iter
->reservedField
=0;
295 } else if(delta
>=(iter
->limit
-iter
->start
)) {
296 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
297 iter
->index
=iter
->length
; /* may or may not be <0 (unknown) */
298 iter
->start
=iter
->limit
;
299 iter
->reservedField
=0;
300 return iter
->index
>=0 ? iter
->index
: UITER_UNKNOWN_INDEX
;
306 /* move towards the requested position, pin to the edges of the string */
307 s
=(const uint8_t *)iter
->context
;
308 pos
=iter
->index
; /* could be <0 (unknown) */
312 int32_t limit
=iter
->limit
;
313 if(iter
->reservedField
!=0) {
314 iter
->reservedField
=0;
318 while(delta
>0 && i
<limit
) {
319 L8_NEXT(s
, i
, limit
, c
);
323 } else if(delta
>=2) {
326 } else /* delta==1 */ {
327 /* stop in the middle of a supplementary code point */
328 iter
->reservedField
=c
;
330 break; /* delta=0; */
/* derive whichever of length/index is still unknown from the other; presumably only once the end of the string was reached - the guarding condition is not visible here */
334 if(iter
->length
<0 && iter
->index
>=0) {
335 iter
->length
= iter
->reservedField
==0 ? pos
: pos
+1;
336 } else if(iter
->index
<0 && iter
->length
>=0) {
337 iter
->index
= iter
->reservedField
==0 ? iter
->length
: iter
->length
-1;
340 } else /* delta<0 */ {
342 if(iter
->reservedField
!=0) {
343 iter
->reservedField
=0;
344 i
-=4; /* we stayed behind the supplementary code point; go before it now */
348 while(delta
<0 && i
>0) {
353 } else if(delta
<=-2) {
356 } else /* delta==-1 */ {
357 /* stop in the middle of a supplementary code point */
358 i
+=4; /* back to behind this supplementary code point for consistent state */
359 iter
->reservedField
=c
;
361 break; /* delta=0; */
368 return iter
->index
=pos
;
370 /* we started with index<0 (unknown) so pos is bogus */
372 return iter
->index
=i
; /* reached the beginning */
374 /* we still don't know the UTF-16 index */
375 return UITER_UNKNOWN_INDEX
;
/*
 * hasNext callback of the lenient-8 UCharIterator.
 *
 * Returns true when reservedField!=0 (the iteration is in the middle of a
 * supplementary code point, so a trail surrogate is still to be delivered)
 * or when the UTF-8 index iter->start is before iter->limit.
 */
380 static UBool U_CALLCONV
381 lenient8IteratorHasNext(UCharIterator
*iter
) {
382 return iter
->reservedField
!=0 || iter
->start
<iter
->limit
;
/*
 * hasPrevious callback of the lenient-8 UCharIterator.
 *
 * Returns true when the UTF-8 index iter->start is greater than 0.
 * reservedField is not consulted.
 */
385 static UBool U_CALLCONV
386 lenient8IteratorHasPrevious(UCharIterator
*iter
) {
387 return iter
->start
>0;
/*
 * current callback of the lenient-8 UCharIterator: reads the code unit at the
 * current position.
 *
 * - If reservedField!=0, returns the trail surrogate of that code point
 *   (U16_TRAIL).
 * - Otherwise, if start<limit, decodes one code point with L8_NEXT using a
 *   local copy i of iter->start, so iter->start itself is not advanced.
 *
 * NOTE(review): the handling of the decoded value c (apart from the visible
 * c<=0xffff test) and the end-of-string return are not visible in this
 * listing - confirm against the full file.
 */
390 static UChar32 U_CALLCONV
391 lenient8IteratorCurrent(UCharIterator
*iter
) {
392 if(iter
->reservedField
!=0) {
393 return U16_TRAIL(iter
->reservedField
);
394 } else if(iter
->start
<iter
->limit
) {
395 const uint8_t *s
=(const uint8_t *)iter
->context
;
/* decode through a copy of the UTF-8 index */
397 int32_t i
=iter
->start
;
399 L8_NEXT(s
, i
, iter
->limit
, c
);
402 } else if(c
<=0xffff) {
/*
 * next callback of the lenient-8 UCharIterator: reads the code unit at the
 * current position and moves forward.
 *
 * - If reservedField!=0, the pending trail surrogate (U16_TRAIL) is taken and
 *   reservedField is cleared.
 * - Otherwise, if start<limit, one code point is decoded with L8_NEXT
 *   directly on iter->start, which advances the UTF-8 index.
 *   When the end is reached (start==limit), whichever of length/index is
 *   unknown is derived from the other; the adjustment by one accounts for
 *   c>0xffff occupying two UTF-16 code units.
 *   A code point that does not pass the c<=0xffff test is stored in
 *   reservedField.
 *
 * NOTE(review): the local declarations, the index updates inside the
 * (index=iter->index)>=0 branches and all return statements are not visible
 * in this listing - confirm against the full file.
 */
412 static UChar32 U_CALLCONV
413 lenient8IteratorNext(UCharIterator
*iter
) {
416 if(iter
->reservedField
!=0) {
417 UChar trail
=U16_TRAIL(iter
->reservedField
);
418 iter
->reservedField
=0;
419 if((index
=iter
->index
)>=0) {
423 } else if(iter
->start
<iter
->limit
) {
424 const uint8_t *s
=(const uint8_t *)iter
->context
;
427 L8_NEXT(s
, iter
->start
, iter
->limit
, c
);
428 if((index
=iter
->index
)>=0) {
430 if(iter
->length
<0 && iter
->start
==iter
->limit
) {
431 iter
->length
= c
<=0xffff ? index
: index
+1;
433 } else if(iter
->start
==iter
->limit
&& iter
->length
>=0) {
434 iter
->index
= c
<=0xffff ? iter
->length
: iter
->length
-1;
438 } else if(c
<=0xffff) {
441 iter
->reservedField
=c
;
/*
 * previous callback of the lenient-8 UCharIterator: moves backward and reads
 * the code unit there.
 *
 * - If reservedField!=0, the lead surrogate (U16_LEAD) of that code point is
 *   taken, reservedField is cleared and iter->start is moved back by 4 bytes
 *   to before the supplementary code point.
 * - Otherwise, if start>0, one code point is decoded backward with L8_PREV
 *   (lower bound 0) directly on iter->start.
 *   A code point that does not pass the c<=0xffff test is stored in
 *   reservedField, and iter->start is moved forward by 4 again so that it
 *   stays behind the byte sequence, matching the state convention of the
 *   other functions.
 *
 * NOTE(review): the local declarations, the index updates inside the
 * (index=iter->index)>0 branches and all return statements are not visible
 * in this listing - confirm against the full file.
 */
449 static UChar32 U_CALLCONV
450 lenient8IteratorPrevious(UCharIterator
*iter
) {
453 if(iter
->reservedField
!=0) {
454 UChar lead
=U16_LEAD(iter
->reservedField
);
455 iter
->reservedField
=0;
456 iter
->start
-=4; /* we stayed behind the supplementary code point; go before it now */
457 if((index
=iter
->index
)>0) {
461 } else if(iter
->start
>0) {
462 const uint8_t *s
=(const uint8_t *)iter
->context
;
465 L8_PREV(s
, 0, iter
->start
, c
);
466 if((index
=iter
->index
)>0) {
468 } else if(iter
->start
<=1) {
469 iter
->index
= c
<=0xffff ? iter
->start
: iter
->start
+1;
473 } else if(c
<=0xffff) {
476 iter
->start
+=4; /* back to behind this supplementary code point for consistent state */
477 iter
->reservedField
=c
;
/*
 * getState callback of the lenient-8 UCharIterator.
 *
 * Packs the iteration state into a uint32_t: the UTF-8 index iter->start is
 * shifted left by one bit, leaving bit 0 free.
 *
 * NOTE(review): the body of the reservedField!=0 branch and the return are
 * not visible in this listing; presumably bit 0 is set when the iterator is
 * in the middle of a supplementary code point - confirm against the full
 * file.
 */
485 static uint32_t U_CALLCONV
486 lenient8IteratorGetState(const UCharIterator
*iter
) {
487 uint32_t state
=(uint32_t)(iter
->start
<<1);
488 if(iter
->reservedField
!=0) {
/*
 * setState callback of the lenient-8 UCharIterator.
 *
 * iter:       the iterator to restore; NULL sets U_ILLEGAL_ARGUMENT_ERROR.
 * state:      packed state; state>>1 is the UTF-8 index, bit 0 is the
 *             "in surrogate pair" flag.
 * pErrorCode: in/out error code; the checks below are skipped when it is
 *             NULL or already indicates a failure.
 *
 * Visible behavior:
 *   - Setting the state that lenient8IteratorGetState() currently reports is
 *     a no-op.
 *   - The UTF-8 index must be within 0..limit, and at least 4 when bit 0 is
 *     set; otherwise U_INDEX_OUTOFBOUNDS_ERROR is set.
 *   - On success iter->start is restored and iter->index becomes -1
 *     (unknown UTF-16 index).
 *   - reservedField is either cleared or set to the code point c decoded
 *     backward with L8_PREV from a local copy of the index;
 *     U_INDEX_OUTOFBOUNDS_ERROR is also set on a path after that decoding.
 *
 * NOTE(review): the conditions selecting between the reservedField paths and
 * the check applied to c are not visible in this listing - confirm against
 * the full file.
 */
494 static void U_CALLCONV
495 lenient8IteratorSetState(UCharIterator
*iter
, uint32_t state
, UErrorCode
*pErrorCode
) {
496 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
498 } else if(iter
==NULL
) {
499 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
500 } else if(state
==lenient8IteratorGetState(iter
)) {
501 /* setting to the current state: no-op */
503 int32_t index
=(int32_t)(state
>>1); /* UTF-8 index */
504 state
&=1; /* 1 if in surrogate pair, must be index>=4 */
/* reject a negative index (or one below 4 when the surrogate flag is set) and an index beyond the UTF-8 length */
506 if((state
==0 ? index
<0 : index
<4) || iter
->limit
<index
) {
507 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
509 iter
->start
=index
; /* restore UTF-8 byte index */
513 iter
->index
=-1; /* unknown UTF-16 index */
516 iter
->reservedField
=0;
518 /* verified index>=4 above */
520 L8_PREV((const uint8_t *)iter
->context
, 0, index
, c
);
522 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
524 iter
->reservedField
=c
;
/*
 * Constant template for a lenient-8 UCharIterator. The visible initializers
 * are the callback functions defined in this file, in the order getIndex,
 * move, hasNext, hasPrevious, current, next, previous, getState, setState.
 * uiter_setLenient8() copies this structure into the caller's iterator.
 *
 * NOTE(review): the initializers before the first callback and between
 * "previous" and "getState" are not visible in this listing - confirm
 * against the full file.
 */
531 static const UCharIterator lenient8Iterator
={
533 lenient8IteratorGetIndex
,
534 lenient8IteratorMove
,
535 lenient8IteratorHasNext
,
536 lenient8IteratorHasPrevious
,
537 lenient8IteratorCurrent
,
538 lenient8IteratorNext
,
539 lenient8IteratorPrevious
,
541 lenient8IteratorGetState
,
542 lenient8IteratorSetState
/*
 * Public entry point: sets up iter to iterate over the lenient
 * UTF-8/CESU-8 string s.
 *
 * iter:   iterator structure to initialize.
 * s:      the 8-bit string; a null pointer selects the no-op iterator.
 * length: byte length of s; values below -1 select the no-op iterator.
 *
 * When s is non-null and length>=-1, *iter is overwritten with the
 * lenient8Iterator template. The visible code obtains the UTF-8 length with
 * strlen(s) (presumably only when length is -1 - the guard is not visible)
 * and presets the UTF-16 length only for strings of at most one byte, where
 * it equals the byte count; otherwise the length stays -1 until it is
 * evaluated lazily.
 * In the other visible branch, uiter_setString(iter, NULL, 0) installs a
 * no-op iterator.
 *
 * NOTE(review): the remaining field assignments and any check of iter itself
 * are not visible in this listing - confirm against the full file.
 */
545 U_CAPI
void U_EXPORT2
546 uiter_setLenient8(UCharIterator
*iter
, const char *s
, int32_t length
) {
548 if(s
!=0 && length
>=-1) {
549 *iter
=lenient8Iterator
;
554 iter
->limit
=(int32_t)strlen(s
);
556 iter
->length
= iter
->limit
<=1 ? iter
->limit
: -1;
558 /* set no-op iterator */
559 uiter_setString(iter
, NULL
, 0);