2 *******************************************************************************
4 * Copyright (C) 1999-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999sep13
14 * created by: Markus W. Scherer
19 * \brief C API: 8-bit Unicode handling macros
21 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
22 * utf8.h is included by utf.h after unicode/umachine.h
23 * and some common definitions.
25 * For more information see utf.h and the ICU User Guide Strings chapter
26 * (http://icu-project.org/userguide/strings.html).
29 * ICU coding guidelines for if() statements should be followed when using these macros.
30 * Compound statements (curly braces {}) must be used for if-else-while...
31 * bodies and all macro statements should be terminated with semicolon.
37 /* utf.h must be included first. */
39 # include "unicode/utf.h"
42 /* internal definitions ----------------------------------------------------- */
45 * \var utf8_countTrailBytes
46 * Internal array with numbers of trail bytes for any given byte used in
51 U_EXPORT
const uint8_t
52 #elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION)
55 U_CFUNC U_IMPORT
const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/
57 utf8_countTrailBytes
[256];
/**
 * Count the trail bytes for a UTF-8 lead byte.
 *
 * The argument is fully parenthesized before the cast so that an expression
 * argument (for example word>>8) is evaluated first and only then truncated
 * to a byte for the table lookup.
 *
 * @param leadByte the first byte of a UTF-8 sequence
 * @return the corresponding entry of utf8_countTrailBytes
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)])
/**
 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
 *
 * The mask keeps the low (6-countTrailBytes) bits; leadByte is modified in place.
 *
 * @param leadByte modifiable lvalue holding the lead byte
 * @param countTrailBytes number of trail bytes that follow the lead byte
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
72 * Function for handling "next code point" with error-checking.
75 U_INTERNAL UChar32 U_EXPORT2
76 utf8_nextCharSafeBody(const uint8_t *s
, int32_t *pi
, int32_t length
, UChar32 c
, UBool strict
);
79 * Function for handling "append code point" with error-checking.
82 U_INTERNAL
int32_t U_EXPORT2
83 utf8_appendCharSafeBody(uint8_t *s
, int32_t i
, int32_t length
, UChar32 c
, UBool
*pIsError
);
86 * Function for handling "previous code point" with error-checking.
89 U_INTERNAL UChar32 U_EXPORT2
90 utf8_prevCharSafeBody(const uint8_t *s
, int32_t start
, int32_t *pi
, UChar32 c
, UBool strict
);
93 * Function for handling "skip backward one code point" with error-checking.
96 U_INTERNAL
int32_t U_EXPORT2
97 utf8_back1SafeBody(const uint8_t *s
, int32_t start
, int32_t i
);
99 /* single-code point definitions -------------------------------------------- */
/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 */
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
/**
 * Is this code unit (byte) a UTF-8 lead byte?
 * The test accepts the byte values 0xc0..0xfd.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
/**
 * Is this code unit (byte) a UTF-8 trail byte?
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 */
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
        ((uint32_t)(c)<=0x7ff ? 2 : \
            ((uint32_t)(c)<=0xd7ff ? 3 : \
                ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
                    ((uint32_t)(c)<=0xffff ? 3 : 4)\
                ) \
            ) \
        ) \
    )
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * @return 4
 */
#define U8_MAX_LENGTH 4
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 */
#define U8_GET_UNSAFE(s, i, c) { \
    /* work on a private copy so that the caller's offset stays unchanged */ \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
}
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start starting string offset
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 */
#define U8_GET(s, start, i, length, c) { \
    /* work on a private copy so that the caller's offset stays unchanged */ \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT(s, _u8_get_index, length, c); \
}
196 /* definitions with forward iteration --------------------------------------- */
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 */
#define U8_NEXT_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((uint8_t)((c)-0xc0)<0x35) { \
        uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \
        U8_MASK_LEAD_BYTE(c, __count); \
        switch(__count) { \
        /* each following branch falls through to the next one */ \
        case 3: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        case 2: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        case 1: \
            (c)=((c)<<6)|((s)[(i)++]&0x3f); \
        /* no other branches to optimize switch() */ \
            break; \
        } \
    } \
}
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 */
#define U8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        uint8_t __t1, __t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (((i)+1)<(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \
            (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            ((i)<(length)) && \
            (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \
        ) { \
            (c)=(UChar)((((c)&0x1f)<<6)|__t1); \
            ++(i); \
        } else if(U8_IS_LEAD(c)) { \
            /* function call for "complicated" and error cases */ \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -1); \
        } else { \
            /* a trail byte or other non-lead byte at a boundary is an error */ \
            (c)=U_SENTINEL; \
        } \
    } \
}
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s const uint8_t * string buffer
 * @param i string offset
 * @param c code point to append
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else { \
        /* write the lead byte(s) for the length, then share the common trail bytes */ \
        if((uint32_t)(c)<=0x7ff) { \
            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        } else { \
            if((uint32_t)(c)<=0xffff) { \
                (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
            } else { \
                (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
            } \
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        } \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * @param s const uint8_t * string buffer
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        /* everything else, including all error cases, goes through the function */ \
        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(capacity), c, &(isError)); \
    } \
}
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \
}
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U8_FWD_1_UNSAFE
 */
#define U8_FWD_1(s, i, length) { \
    uint8_t __b=(uint8_t)(s)[(i)++]; \
    if(U8_IS_LEAD(__b)) { \
        uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
        /* never look beyond the end of the string */ \
        if((i)+__count>(length)) { \
            __count=(uint8_t)((length)-(i)); \
        } \
        /* skip only as many bytes as really are trail bytes */ \
        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
            ++(i); \
            --__count; \
        } \
    } \
}
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    /* n is evaluated once and copied into a local counter */ \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 */
#define U8_FWD_N(s, i, length, n) { \
    /* n is evaluated once and copied into a local counter */ \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
}
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[i])) { --(i); } \
}
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
462 /* definitions with backward iteration -------------------------------------- */
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(uint8_t)(s)[--(i)]; \
            if(__b>=0xc0) { \
                /* lead byte: contributes its masked bits and ends the sequence */ \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                /* another trail byte: contributes six more bits */ \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
}
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 */
#define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            /* trail byte: let the function find and validate the whole sequence */ \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), (c), -1); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
}
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 */
#define U8_BACK_1(s, start, i) { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 */
#define U8_BACK_N_UNSAFE(s, i, n) { \
    /* n is evaluated once and copied into a local counter */ \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start index of the start of the string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 */
#define U8_BACK_N(s, start, i, n) { \
    /* n is evaluated once and copied into a local counter */ \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
}
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    /* step back to the start of the preceding code point, then past all of it */ \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
}
631 * Adjust a random-access offset to a code point boundary after a code point.
632 * If the offset is behind a partial multi-byte sequence,
633 * then the offset is incremented to behind the whole sequence.
634 * Otherwise, it is not modified.
635 * The input offset may be the same as the string length.
636 * "Safe" macro, checks for illegal sequences and for string boundaries.
638 * @param s const uint8_t * string
639 * @param start starting string offset (usually 0)
640 * @param i string offset, must be start<=i<=length
641 * @param length string length
642 * @see U8_SET_CP_LIMIT_UNSAFE
645 #define U8_SET_CP_LIMIT(s, start, i, length) { \
646 if((start)<(i) && (i)<(length)) { \
647 U8_BACK_1(s, start, i); \
648 U8_FWD_1(s, i, length); \