]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/utf16.h
2 *******************************************************************************
4 * Copyright (C) 1999-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999sep09
14 * created by: Markus W. Scherer
19 * \brief C API: 16-bit Unicode handling macros
21 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
22 * utf16.h is included by utf.h after unicode/umachine.h
23 * and some common definitions.
25 * For more information see utf.h and the ICU User Guide Strings chapter
26 * (http://icu-project.org/userguide/strings.html).
29 * ICU coding guidelines for if() statements should be followed when using these macros.
30 * Compound statements (curly braces {}) must be used for if-else-while...
31 * bodies and all macro statements should be terminated with semicolon.
37 /* utf.h must be included first. */
39 # include "unicode/utf.h"
42 /* single-code point definitions -------------------------------------------- */
/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 *
 * The expansion is fully parenthesized so that it can be embedded in a
 * larger expression without precedence surprises.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 *
 * Masks off the low 10 bits and compares the remaining bits with 0xd800.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 *
 * Masks off the low 10 bits and compares the remaining bits with 0xdc00.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 *
 * Simply forwards to U_IS_SURROGATE().
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 */
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a lead surrogate?
 *
 * Only bit 10 (0x400) is tested, so the result is meaningful only
 * when c is already known to be a surrogate.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 */
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 *
 * It is the value of (lead<<10)+trail for the first surrogate pair
 * (0xd800, 0xdc00) minus 0x10000, the code point that pair encodes.
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
/**
 * Get a supplementary code point value (U+10000..U+10ffff)
 * from its lead and trail surrogates.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The expansion is fully parenthesized so that the cast cannot bind
 * to anything outside the macro.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The expansion is fully parenthesized so that the cast cannot bind
 * to anything outside the macro.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
/**
 * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 *
 * @param c 32-bit code point
 * @return 1 or 2
 */
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
 * @return 2
 */
#define U16_MAX_LENGTH 2
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * The macro expands to a brace-enclosed block; s and i may be evaluated
 * more than once.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
}
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * The backward boundary test is written as (i)>(start) rather than
 * (i)-1>=(start): with an unsigned offset and i==0 the subtraction would
 * wrap around and the macro would read far outside the string.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 */
#define U16_GET(s, start, i, length, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
}
204 /* definitions with forward iteration --------------------------------------- */
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 * @param c output UChar32 variable
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length (modified)
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset (modified)
 * @param c code point to append
 */
#define U16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to TRUE.
 *
 * Note that a single code unit (c<=0xffff) is written without a capacity
 * check; the caller must guarantee i<capacity.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity (modified)
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 */
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
}
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * A lead surrogate that is not followed by a trail surrogate inside the
 * string is skipped as a single code unit.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length (modified)
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 */
#define U16_FWD_1(s, i, length) { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
}
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * n is copied into a local counter first, so the argument is evaluated once.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 * @param n number of code points to skip
 */
#define U16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * Iteration stops early when the offset reaches length.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length (modified)
 * @param length string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 */
#define U16_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
}
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset (may be modified)
 * @see U16_SET_CP_START
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[(i)])) { \
        --(i); \
    } \
}
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i (may be modified)
 * @see U16_SET_CP_START_UNSAFE
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}
426 /* definitions with backward iteration -------------------------------------- */
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 * @param c output UChar32 variable
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
}
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i (modified)
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 */
#define U16_PREV(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
}
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * A trail surrogate that is not preceded by a lead surrogate inside the
 * string is stepped over as a single code unit.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i (modified)
 * @see U16_BACK_1_UNSAFE
 */
#define U16_BACK_1(s, start, i) { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * n is copied into a local counter first, so the argument is evaluated once.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 * @param n number of code points to skip
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * Iteration stops early when the offset reaches start.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i (modified)
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 */
#define U16_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
}
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The macro reads (s)[(i)-1], so the offset must be greater than 0.
 *
 * @param s const UChar * string
 * @param i string offset (may be modified)
 * @see U16_SET_CP_LIMIT
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
}
585 * Adjust a random-access offset to a code point boundary after a code point.
586 * If the offset is behind the lead surrogate of a surrogate pair,
587 * then the offset is incremented.
588 * Otherwise, it is not modified.
589 * The input offset may be the same as the string length.
590 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
592 * @param s const UChar * string
593 * @param start starting string offset (usually 0)
594 * @param i string offset, start<=i<=length
595 * @param length string length
596 * @see U16_SET_CP_LIMIT_UNSAFE
599 #define U16_SET_CP_LIMIT(s, start, i, length) { \
600 if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \