]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unicode/utf16.h
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / unicode / utf16.h
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: utf16.h
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999sep09
14 * created by: Markus W. Scherer
15 */
16
17 /**
18 * \file
19 * \brief C API: 16-bit Unicode handling macros
20 *
21 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
22 * utf16.h is included by utf.h after unicode/umachine.h
23 * and some common definitions.
24 *
25 * For more information see utf.h and the ICU User Guide Strings chapter
26 * (https://unicode-org.github.io/icu/userguide/strings/).
27 *
28 * <em>Usage:</em>
29 * ICU coding guidelines for if() statements should be followed when using these macros.
30 * Compound statements (curly braces {}) must be used for if-else-while...
31 * bodies and all macro statements should be terminated with semicolon.
32 */
33
34 #ifndef __UTF16_H__
35 #define __UTF16_H__
36
37 /* utf.h must be included first. */
38 #ifndef __UTF_H__
39 # include "unicode/utf.h"
40 #endif
41
42 /* single-code point definitions -------------------------------------------- */
43
/**
 * Tests whether one 16-bit code unit is a complete code point by itself,
 * i.e. a BMP value that is not a surrogate.
 * The expansion is fully parenthesized so that it composes safely with
 * surrounding operators.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
51
/**
 * Tests whether a code unit is a lead (high) surrogate, U+d800..U+dbff.
 * The mask keeps every bit above the low ten, so values outside the
 * 16-bit range never match.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
59
/**
 * Tests whether a code unit is a trail (low) surrogate, U+dc00..U+dfff.
 * The mask keeps every bit above the low ten, so values outside the
 * 16-bit range never match.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
67
/**
 * Tests whether a code unit is any surrogate (U+d800..U+dfff).
 * Thin alias for U_IS_SURROGATE from utf.h.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (U_IS_SURROGATE(c))
75
/**
 * Given that c is already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the lead one.
 * Only bit 10 is inspected, so the result is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
84
/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * It is the value of (lead<<10)+trail for the first surrogate pair
 * (d800, dc00) minus 0x10000, so subtracting it from (lead<<10)+trail
 * yields the supplementary code point.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
90
/**
 * Combines a lead and a trail surrogate into the supplementary
 * code point (U+10000..U+10ffff) they encode.
 * No validation is performed: the result is undefined if the inputs
 * are not a lead and a trail surrogate respectively.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
104
105
/**
 * Computes the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The whole expansion is parenthesized so that the cast cannot be
 * mis-parsed next to sizeof or postfix operators.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
114
/**
 * Computes the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The whole expansion is parenthesized so that the cast cannot be
 * mis-parsed next to sizeof or postfix operators.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
123
/**
 * Number of 16-bit code units needed to encode a code point: 1 for
 * values up to U+ffff, 2 otherwise.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
132
/**
 * Upper bound on the number of 16-bit code units that encode one
 * Unicode code point (U+0000..U+10ffff).
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2
139
/**
 * Reads the code point at a random-access offset; the offset itself
 * is left unchanged.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may address either unit of a surrogate pair: for a lead
 * unit the following unit is combined with it, for a trail unit the
 * preceding one is. No bounds or pairing checks are made, so the result
 * is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
}
167
/**
 * Reads the code point at a random-access offset; the offset itself
 * is left unchanged.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may address either unit of a surrogate pair, in which case
 * the adjacent matching unit is read as well, provided it lies inside
 * [start, length). If the offset points to a single, unpaired surrogate,
 * that surrogate itself is returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * The lower-bound test is written as (i)>(start) rather than
 * (i)-1>=(start): the two agree for signed offsets, but the subtraction
 * wraps around for an unsigned offset of 0 and would allow a read
 * before the start of the string.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) { \
    (c)=(s)[(i)]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
}
203
204 /* definitions with forward iteration --------------------------------------- */
205
/**
 * Reads the code point that starts at a code point boundary and moves
 * the offset past it.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * If the unit at the offset is a lead surrogate, the next unit is
 * consumed unconditionally and combined with it; no check is made that
 * it is a trail surrogate or that it lies inside the string.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}
231
/**
 * Reads the code point that starts at a code point boundary and moves
 * the offset past it.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * A lead surrogate is combined with the following unit only when that
 * unit is inside the string (offset < length) and is a trail surrogate.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point and the offset advances by one.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}
262
/**
 * Writes a code point into a string buffer at the given offset,
 * overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) past what was written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s UChar * string buffer (written through, so it must not be const)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
284
/**
 * Writes a code point into a string buffer at the given offset,
 * overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) past what was written.
 * "Safe" macro: rejects values above U+10ffff and, when a surrogate pair
 * has to be written, checks that both units fit below capacity.
 * In either failure case nothing is written, the offset is unchanged and
 * isError is set to TRUE.
 * Note that values up to U+ffff are stored as-is, so surrogate code
 * points (U+d800..U+dfff) are not rejected.
 *
 * @param s UChar * string buffer (written through, so it must not be const)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}
312
/**
 * Moves the offset forward over one code point: one unit, or two when
 * the first unit is a lead surrogate.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16; the unit after a lead
 * surrogate is skipped without being examined.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
}
328
/**
 * Moves the offset forward over one code point.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * a second unit is skipped only when the first is a lead surrogate and
 * the next unit is inside the string and is a trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
}
345
/**
 * Moves the offset forward over n code points by applying
 * U16_FWD_1_UNSAFE n times. Nothing happens when n is zero or negative.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16 and performs no bounds checks.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
365
/**
 * Moves the offset forward over up to n code points by applying
 * U16_FWD_1 repeatedly; stops early once the offset reaches length.
 * Nothing happens when n is zero or negative.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
}
386
/**
 * Snaps a random-access offset back to the start of the code point
 * it falls into.
 * If the unit at the offset is a trail surrogate, the offset is
 * decremented; otherwise it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16: the preceding unit is
 * neither checked for being a lead surrogate nor for being in bounds.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[(i)])) { \
        --(i); \
    } \
}
405
/**
 * Snaps a random-access offset back to the start of the code point
 * it falls into.
 * The offset is decremented only when it addresses a trail surrogate,
 * lies above start, and the preceding unit is a lead surrogate.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}
425
426 /* definitions with backward iteration -------------------------------------- */
427
/**
 * Steps the offset back to the previous code point boundary and reads
 * the code point that lies between the two boundaries.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the unit before the offset is a trail surrogate, the unit before
 * that is consumed unconditionally and combined with it.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
}
454
/**
 * Steps the offset back to the previous code point boundary and reads
 * the code point that lies between the two boundaries.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * A trail surrogate is combined with the unit before it only when that
 * unit lies above start and is a lead surrogate.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}
486
/**
 * Steps the offset back over one code point: one unit, or two when the
 * unit just before the offset is a trail surrogate.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16; the unit before a trail
 * surrogate is skipped without being examined.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
}
503
/**
 * Steps the offset back over one code point.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * a second unit is stepped over only when the first is a trail surrogate,
 * the offset is still above start, and the preceding unit is a lead surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}
521
/**
 * Steps the offset back over n code points by applying
 * U16_BACK_1_UNSAFE n times. Nothing happens when n is zero or negative.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16 and performs no bounds checks.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
542
/**
 * Steps the offset back over up to n code points by applying
 * U16_BACK_1 repeatedly; stops early once the offset reaches start.
 * Nothing happens when n is zero or negative.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
}
564
/**
 * Snaps a random-access offset forward to the boundary after the code
 * point it falls into.
 * If the unit just before the offset is a lead surrogate, the offset is
 * incremented; otherwise it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16: the unit at the offset is
 * neither checked for being a trail surrogate nor for being in bounds.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
}
583
/**
 * Snaps a random-access offset forward to the boundary after the code
 * point it falls into.
 * The offset is incremented only when it lies strictly between start and
 * length, the unit before it is a lead surrogate, and the unit at it is
 * a trail surrogate. Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<=length
 * @param length string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
}
604
605 #endif