1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1998-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
11 * Modification History:
13 * Date Name Description
14 * 07/24/2000 Madhu Creation
15 *******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/utf8.h"
20 #include "unicode/utf_old.h"
24 /* lenient UTF-8 ------------------------------------------------------------ */
27 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
28 * code points with their "natural" encoding.
29 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
32 * This is not conformant with UTF-8.
34 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
35 * the macros below do not attempt to assemble such pairs.
/*
 * L8_NEXT(s, i, length, c): lenient forward read.
 * Loads the byte at s[i] into c and post-increments i; the multi-byte case is
 * delegated to utf8_nextCharSafeBody() with -2 as its last argument.
 * NOTE(review): the condition guarding that call is not visible in this
 * excerpt -- confirm against the full file.
 */
38 #define L8_NEXT(s, i, length, c) { \
39 (c)=(uint8_t)(s)[(i)++]; \
42 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
/*
 * L8_PREV(s, start, i, c): lenient backward read.
 * Pre-decrements i and loads the byte at s[i] into c; the multi-byte case is
 * delegated to utf8_prevCharSafeBody() with -2 as its last argument.
 * NOTE(review): the condition guarding that call is not visible in this
 * excerpt -- confirm against the full file.
 */
49 #define L8_PREV(s, start, i, c) { \
50 (c)=(uint8_t)(s)[--(i)]; \
53 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
60 /* -------------------------------------------------------------------------- */
62 // Obsolete macros from obsolete unicode/utf_old.h, for some old test data.
63 #ifndef UTF8_ERROR_VALUE_1
64 # define UTF8_ERROR_VALUE_1 0x15
66 #ifndef UTF8_ERROR_VALUE_2
67 # define UTF8_ERROR_VALUE_2 0x9f
69 #ifndef UTF_ERROR_VALUE
70 # define UTF_ERROR_VALUE 0xffff
/*
 * True when the low 16 bits of c are 0xfffe or 0xffff (higher bits are ignored
 * by the mask), or when c equals UTF8_ERROR_VALUE_1 or UTF8_ERROR_VALUE_2.
 */
73 # define UTF_IS_ERROR(c) \
74 (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
77 #if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Diagnostic helper: reports bytes of uchars through log_err(), each formatted
 * as "0x%02x ".
 * NOTE(review): the loop that drives i is not visible in this excerpt;
 * presumably it runs over len bytes -- confirm against the full file.
 */
78 static void printUChars(const uint8_t *uchars
, int16_t len
){
81 log_err("0x%02x ", *(uchars
+i
));
86 static void TestCodeUnitValues(void);
87 static void TestCharLength(void);
88 static void TestGetChar(void);
89 static void TestNextPrevChar(void);
90 static void TestNulTerminated(void);
91 static void TestNextPrevNonCharacters(void);
92 static void TestNextPrevCharUnsafe(void);
93 static void TestFwdBack(void);
94 static void TestFwdBackUnsafe(void);
95 static void TestSetChar(void);
96 static void TestSetCharUnsafe(void);
97 static void TestTruncateIfIncomplete(void);
98 static void TestAppendChar(void);
99 static void TestAppend(void);
100 static void TestSurrogates(void);
102 void addUTF8Test(TestNode
** root
);
/*
 * Registers each of the fifteen static test functions declared above with the
 * test tree at root, one addTest() call per function, using the path
 * "utf8tst/<FunctionName>".
 */
105 addUTF8Test(TestNode
** root
)
107 addTest(root
, &TestCodeUnitValues
, "utf8tst/TestCodeUnitValues");
108 addTest(root
, &TestCharLength
, "utf8tst/TestCharLength");
109 addTest(root
, &TestGetChar
, "utf8tst/TestGetChar");
110 addTest(root
, &TestNextPrevChar
, "utf8tst/TestNextPrevChar");
111 addTest(root
, &TestNulTerminated
, "utf8tst/TestNulTerminated");
112 addTest(root
, &TestNextPrevNonCharacters
, "utf8tst/TestNextPrevNonCharacters");
113 addTest(root
, &TestNextPrevCharUnsafe
, "utf8tst/TestNextPrevCharUnsafe");
114 addTest(root
, &TestFwdBack
, "utf8tst/TestFwdBack");
115 addTest(root
, &TestFwdBackUnsafe
, "utf8tst/TestFwdBackUnsafe");
116 addTest(root
, &TestSetChar
, "utf8tst/TestSetChar");
117 addTest(root
, &TestSetCharUnsafe
, "utf8tst/TestSetCharUnsafe");
118 addTest(root
, &TestTruncateIfIncomplete
, "utf8tst/TestTruncateIfIncomplete");
119 addTest(root
, &TestAppendChar
, "utf8tst/TestAppendChar");
120 addTest(root
, &TestAppend
, "utf8tst/TestAppend");
121 addTest(root
, &TestSurrogates
, "utf8tst/TestSurrogates");
124 static void TestCodeUnitValues()
/*
 * Sample code units, in three groups of four: single bytes (0x00..0x7f),
 * lead bytes (0xc2, 0xc4, 0xf0, 0xf4), then trail bytes (0x80..0xbf).
 * NOTE(review): the index ranges used to pick each group are not visible in
 * this excerpt -- confirm they match this grouping.
 */
126 static const uint8_t codeunit
[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
129 for(i
=0; i
<UPRV_LENGTHOF(codeunit
); i
++){
130 uint8_t c
=codeunit
[i
];
131 log_verbose("Testing code unit value of %x\n", c
);
134 #if !U_HIDE_OBSOLETE_UTF_OLD_H
135 !UTF8_IS_SINGLE(c
) || UTF8_IS_LEAD(c
) || UTF8_IS_TRAIL(c
) ||
137 !U8_IS_SINGLE(c
) || U8_IS_LEAD(c
) || U8_IS_TRAIL(c
)) {
138 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
139 c
, U8_IS_SINGLE(c
) ? 'y' : 'n', U8_IS_LEAD(c
) ? 'y' : 'n', U8_IS_TRAIL(c
) ? 'y' : 'n');
143 #if !U_HIDE_OBSOLETE_UTF_OLD_H
144 !UTF8_IS_LEAD(c
) || UTF8_IS_SINGLE(c
) || UTF8_IS_TRAIL(c
) ||
146 !U8_IS_LEAD(c
) || U8_IS_SINGLE(c
) || U8_IS_TRAIL(c
)) {
147 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
148 c
, U8_IS_SINGLE(c
) ? 'y' : 'n', U8_IS_LEAD(c
) ? 'y' : 'n', U8_IS_TRAIL(c
) ? 'y' : 'n');
152 #if !U_HIDE_OBSOLETE_UTF_OLD_H
153 !UTF8_IS_TRAIL(c
) || UTF8_IS_SINGLE(c
) || UTF8_IS_LEAD(c
) ||
155 !U8_IS_TRAIL(c
) || U8_IS_SINGLE(c
) || U8_IS_LEAD(c
)){
156 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
157 c
, U8_IS_SINGLE(c
) ? 'y' : 'n', U8_IS_LEAD(c
) ? 'y' : 'n', U8_IS_TRAIL(c
) ? 'y' : 'n');
163 static void TestCharLength()
165 static const uint32_t codepoint
[]={
183 #if !U_HIDE_OBSOLETE_UTF_OLD_H
186 for(i
=0; i
<UPRV_LENGTHOF(codepoint
); i
=(int16_t)(i
+2)){
187 UChar32 c
=codepoint
[i
+1];
189 #if !U_HIDE_OBSOLETE_UTF_OLD_H
190 UTF8_CHAR_LENGTH(c
) != (uint16_t)codepoint
[i
] ||
192 U8_LENGTH(c
) != (uint16_t)codepoint
[i
]) {
193 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c
, codepoint
[i
], U8_LENGTH(c
));
195 log_verbose("The no: of code units for %lx is %d\n",c
, U8_LENGTH(c
));
197 #if !U_HIDE_OBSOLETE_UTF_OLD_H
198 multiple
=(UBool
)(codepoint
[i
] == 1 ? FALSE
: TRUE
);
199 if(UTF8_NEED_MULTIPLE_UCHAR(c
) != multiple
){
200 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c
);
206 static void TestGetChar()
208 static const uint8_t input
[]={
225 static const UChar32 result
[]={
226 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
229 0x4e8c, 0x4e8c, 0x4e8c,
230 0x4e8c, 0x4e8c, 0x4e8c ,
231 0x4e8c, 0x4e8c, 0x4e8c,
232 0x10401, 0x10401, 0x10401 ,
233 0x10401, 0x10401, 0x10401 ,
234 0x10401, 0x10401, 0x10401 ,
235 0x10401, 0x10401, 0x10401,
236 -1, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
239 -1, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
240 -1, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
246 for(offset
=0; offset
<sizeof(input
); offset
++) {
247 expected
= result
[i
];
248 if (expected
>= 0 && offset
< sizeof(input
) - 1) {
249 #if !U_HIDE_OBSOLETE_UTF_OLD_H
250 UTF8_GET_CHAR_UNSAFE(input
, offset
, c
);
252 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
253 offset
, expected
, c
);
257 U8_GET_UNSAFE(input
, offset
, c
);
259 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
260 offset
, expected
, c
);
264 expected
=result
[i
+1];
265 #if !U_HIDE_OBSOLETE_UTF_OLD_H
266 UTF8_GET_CHAR_SAFE(input
, 0, offset
, sizeof(input
), c
, FALSE
);
268 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
271 U8_GET(input
, 0, offset
, sizeof(input
), c
);
272 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
274 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
277 U8_GET_OR_FFFD(input
, 0, offset
, sizeof(input
), c
);
278 if(expected
<0) { expected
=0xfffd; }
280 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
282 #if !U_HIDE_OBSOLETE_UTF_OLD_H
283 UTF8_GET_CHAR_SAFE(input
, 0, offset
, sizeof(input
), c
, TRUE
);
284 if(c
!= result
[i
+2]){
285 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
+2], c
);
292 static void TestNextPrevChar() {
293 static const uint8_t input
[]={
295 0xf0, 0x90, 0x90, 0x81,
296 0xc0, 0x80, // non-shortest form
297 0xf3, 0xbe, // truncated
300 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
303 static const UChar32 result
[]={
304 /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
305 0x0061, 0x0061, 0x0000, 0x0000,
306 0x10401, 0x10401, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
307 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
308 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
309 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
310 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x61, 0x61,
311 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
312 UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
,
313 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
314 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
315 0x61, 0x61, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
316 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x10401, 0x10401,
317 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF_ERROR_VALUE
, UTF_ERROR_VALUE
,
318 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
,
319 UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
320 0x0000, 0x0000, 0x0061, 0x0061
322 static const int32_t movedOffset
[]={
323 /* next_safe prev_safe_s */
346 for(offset
=0; offset
<sizeof(input
); offset
++){
347 expected
=result
[i
]; // next_safe_ns
348 #if !U_HIDE_OBSOLETE_UTF_OLD_H
350 UTF8_NEXT_CHAR_SAFE(input
, setOffset
, sizeof(input
), c
, FALSE
);
351 if(setOffset
!= movedOffset
[j
]) {
352 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353 offset
, movedOffset
[j
], setOffset
);
356 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
360 U8_NEXT(input
, setOffset
, sizeof(input
), c
);
361 if(setOffset
!= movedOffset
[j
]) {
362 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
363 offset
, movedOffset
[j
], setOffset
);
365 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
367 log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
371 U8_NEXT_OR_FFFD(input
, setOffset
, sizeof(input
), c
);
372 if(setOffset
!= movedOffset
[j
]) {
373 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
374 offset
, movedOffset
[j
], setOffset
);
376 if(expected
<0) { expected
=0xfffd; }
378 log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
380 #if !U_HIDE_OBSOLETE_UTF_OLD_H
382 UTF8_NEXT_CHAR_SAFE(input
, setOffset
, sizeof(input
), c
, TRUE
);
383 if(setOffset
!= movedOffset
[j
]) {
384 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
385 offset
, movedOffset
[j
], setOffset
);
387 expected
=result
[i
+1]; // next_safe_s
389 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
390 offset
, expected
, c
);
398 for(offset
=sizeof(input
); offset
> 0; --offset
){
399 expected
=result
[i
+2]; // prev_safe_ns
400 #if !U_HIDE_OBSOLETE_UTF_OLD_H
402 UTF8_PREV_CHAR_SAFE(input
, 0, setOffset
, c
, FALSE
);
403 if(setOffset
!= movedOffset
[j
+1]) {
404 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
405 offset
, movedOffset
[j
+1], setOffset
);
408 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
412 U8_PREV(input
, 0, setOffset
, c
);
413 if(setOffset
!= movedOffset
[j
+1]) {
414 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
415 offset
, movedOffset
[j
+1], setOffset
);
417 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
419 log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
423 U8_PREV_OR_FFFD(input
, 0, setOffset
, c
);
424 if(setOffset
!= movedOffset
[j
+1]) {
425 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
426 offset
, movedOffset
[j
+1], setOffset
);
428 if(expected
<0) { expected
=0xfffd; }
430 log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
432 #if !U_HIDE_OBSOLETE_UTF_OLD_H
434 UTF8_PREV_CHAR_SAFE(input
, 0, setOffset
, c
, TRUE
);
435 if(setOffset
!= movedOffset
[j
+1]) {
436 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
437 offset
, movedOffset
[j
+1], setOffset
);
439 expected
=result
[i
+3]; // prev_safe_s
441 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
442 offset
, expected
, c
);
450 /* keep this in sync with utf16tst.c's TestNulTerminated() */
451 static void TestNulTerminated() {
452 static const uint8_t input
[]={
454 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
462 /* 13 */ 0xe0, 0xa0, 0x80,
463 /* 16 */ 0xe2, 0x82, 0xac,
464 /* 19 */ 0xf0, 0x90, 0x90,
468 static const UChar32 result
[]={
471 U_SENTINEL
, // C0 not a lead byte
476 U_SENTINEL
, // FD not a lead byte
480 U_SENTINEL
, // truncated F0 90 90
484 UChar32 c
, c2
, expected
;
485 int32_t i0
, i
=0, j
, k
, expectedIndex
;
489 U8_NEXT(input
, i
, -1, c
);
490 expected
=result
[cpIndex
];
492 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0
, c
, expected
);
495 U8_NEXT_OR_FFFD(input
, j
, -1, c
);
496 if(expected
<0) { expected
=0xfffd; }
498 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0
, c
, expected
);
501 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j
, i
);
504 U8_FWD_1(input
, j
, -1);
506 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j
, i
);
510 * Move by this many code points from the start.
511 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
513 expectedIndex
= (c
==0) ? i
-1 : i
;
515 U8_FWD_N(input
, k
, -1, cpIndex
);
516 if(k
!=expectedIndex
) {
517 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k
, expectedIndex
);
524 U8_NEXT(input
, i
, -1, c
);
526 U8_GET(input
, 0, j
, -1, c2
);
528 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0
, c
, c2
, j
);
530 U8_GET_OR_FFFD(input
, 0, j
, -1, c2
);
531 expected
= (c
>=0) ? c
: 0xfffd;
533 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0
, expected
, c2
, j
);
535 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
537 U8_SET_CP_LIMIT(input
, 0, k
, -1);
539 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i
, j
+1, k
);
545 static void TestNextPrevNonCharacters() {
546 /* test non-characters */
547 static const uint8_t nonChars
[]={
548 0xef, 0xb7, 0x90, /* U+fdd0 */
549 0xef, 0xbf, 0xbf, /* U+ffff */
550 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
551 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
552 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
558 for(idx
=0; idx
<(int32_t)sizeof(nonChars
);) {
559 U8_NEXT(nonChars
, idx
, sizeof(nonChars
), ch
);
560 if(!U_IS_UNICODE_NONCHAR(ch
)) {
561 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx
);
564 for(idx
=(int32_t)sizeof(nonChars
); idx
>0;) {
565 U8_PREV(nonChars
, 0, idx
, ch
);
566 if(!U_IS_UNICODE_NONCHAR(ch
)) {
567 log_err("U8_PREV(at %d) failed to read a non-character\n", idx
);
570 #if !U_HIDE_OBSOLETE_UTF_OLD_H
571 for(idx
=0; idx
<(int32_t)sizeof(nonChars
);) {
572 UChar32 expected
= nonChars
[idx
]<0xf0 ? 0xffff : 0x10ffff;
573 UTF8_NEXT_CHAR_SAFE(nonChars
, idx
, sizeof(nonChars
), ch
, TRUE
);
575 log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx
);
578 for(idx
=(int32_t)sizeof(nonChars
); idx
>0;) {
579 UTF8_PREV_CHAR_SAFE(nonChars
, 0, idx
, ch
, TRUE
);
580 UChar32 expected
= nonChars
[idx
]<0xf0 ? 0xffff : 0x10ffff;
582 log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx
);
588 static void TestNextPrevCharUnsafe() {
590 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
591 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
593 static const uint8_t input
[]={
595 0xf0, 0x90, 0x90, 0x81,
596 0xc0, 0x80, /* non-shortest form */
599 0xf4, 0x8f, 0xbf, 0xbf,
602 static const UChar32 codePoints
[]={
615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
616 for(i
=0, offset
=0; offset
<sizeof(input
); ++i
) {
617 UTF8_NEXT_CHAR_UNSAFE(input
, offset
, c
);
618 expected
= codePoints
[i
];
619 if(expected
>= 0 && c
!= expected
) {
620 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
621 offset
, expected
, c
);
624 // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
625 // while the new one skips C0 80 together.
630 for(i
=0, offset
=0; offset
<sizeof(input
); ++i
) {
631 U8_NEXT_UNSAFE(input
, offset
, c
);
632 expected
= codePoints
[i
];
633 if(expected
>= 0 && c
!= expected
) {
634 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
635 offset
, expected
, c
);
638 #if !U_HIDE_OBSOLETE_UTF_OLD_H
639 for(i
=UPRV_LENGTHOF(codePoints
)-1, offset
=sizeof(input
); offset
> 0; --i
){
640 UTF8_PREV_CHAR_UNSAFE(input
, offset
, c
);
641 expected
= codePoints
[i
];
642 if(expected
>= 0 && c
!= expected
) {
643 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
644 offset
, expected
, c
);
648 for(i
=UPRV_LENGTHOF(codePoints
)-1, offset
=sizeof(input
); offset
> 0; --i
){
649 U8_PREV_UNSAFE(input
, offset
, c
);
650 expected
= codePoints
[i
];
651 if(expected
>= 0 && c
!= expected
) {
652 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
653 offset
, expected
, c
);
658 static void TestFwdBack() {
659 static const uint8_t input
[]={
661 0xF0, 0x90, 0x90, 0x81,
676 static const uint16_t fwd_safe
[] ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
677 static const uint16_t back_safe
[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
679 static const uint16_t Nvalue
[]= {0, 1, 2, 4, 1, 2, 1, 5};
680 static const uint16_t fwd_N_safe
[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
681 static const uint16_t back_N_safe
[] ={18, 17, 15, 11, 10, 8, 7, 0};
686 #if !U_HIDE_OBSOLETE_UTF_OLD_H
687 while(offsafe
< sizeof(input
)){
688 UTF8_FWD_1_SAFE(input
, offsafe
, sizeof(input
));
689 if(offsafe
!= fwd_safe
[i
]){
690 log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe
[i
], offsafe
);
697 while(offsafe
< sizeof(input
)){
698 U8_FWD_1(input
, offsafe
, sizeof(input
));
699 if(offsafe
!= fwd_safe
[i
]){
700 log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe
[i
], offsafe
);
704 #if !U_HIDE_OBSOLETE_UTF_OLD_H
706 offsafe
=sizeof(input
);
708 UTF8_BACK_1_SAFE(input
, 0, offsafe
);
709 if(offsafe
!= back_safe
[i
]){
710 log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe
[i
], offsafe
);
716 offsafe
=sizeof(input
);
718 U8_BACK_1(input
, 0, offsafe
);
719 if(offsafe
!= back_safe
[i
]){
720 log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe
[i
], offsafe
);
724 #if !U_HIDE_OBSOLETE_UTF_OLD_H
726 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
727 UTF8_FWD_N_SAFE(input
, offsafe
, sizeof(input
), Nvalue
[i
]);
728 if(offsafe
!= fwd_N_safe
[i
]){
729 log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i
, fwd_N_safe
[i
], offsafe
);
735 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
736 U8_FWD_N(input
, offsafe
, sizeof(input
), Nvalue
[i
]);
737 if(offsafe
!= fwd_N_safe
[i
]){
738 log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i
, fwd_N_safe
[i
], offsafe
);
742 #if !U_HIDE_OBSOLETE_UTF_OLD_H
743 offsafe
=sizeof(input
);
744 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
745 UTF8_BACK_N_SAFE(input
, 0, offsafe
, Nvalue
[i
]);
746 if(offsafe
!= back_N_safe
[i
]){
747 log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i
, back_N_safe
[i
], offsafe
);
751 offsafe
=sizeof(input
);
752 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
753 U8_BACK_N(input
, 0, offsafe
, Nvalue
[i
]);
754 if(offsafe
!= back_N_safe
[i
]){
755 log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i
, back_N_safe
[i
], offsafe
);
761 * Ticket #13636 - Visual Studio 2017 has problems optimizing this function.
762 * As a workaround, we will turn off optimization just for this function on VS2017 and above.
764 #if defined(_MSC_VER) && (_MSC_VER > 1900)
765 #pragma optimize( "", off )
768 static void TestFwdBackUnsafe() {
770 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
771 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
773 static const uint8_t input
[]={
775 0xf0, 0x90, 0x90, 0x81,
776 0xc0, 0x80, /* non-shortest form */
779 0xf4, 0x8f, 0xbf, 0xbf,
782 // forward unsafe skips only C0
783 static const int8_t boundaries
[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
784 // backward unsafe skips C0 80 together
785 static const int8_t backBoundaries
[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
789 #if !U_HIDE_OBSOLETE_UTF_OLD_H
790 for(i
=1, offset
=0; offset
<UPRV_LENGTHOF(input
); ++i
) {
791 UTF8_FWD_1_UNSAFE(input
, offset
);
792 if(offset
!= boundaries
[i
]){
793 log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
797 for(i
=1, offset
=0; offset
<UPRV_LENGTHOF(input
); ++i
) {
798 U8_FWD_1_UNSAFE(input
, offset
);
799 if(offset
!= boundaries
[i
]){
800 log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
803 #if !U_HIDE_OBSOLETE_UTF_OLD_H
804 for(i
=UPRV_LENGTHOF(backBoundaries
)-2, offset
=UPRV_LENGTHOF(input
); offset
>0; --i
) {
805 UTF8_BACK_1_UNSAFE(input
, offset
);
806 if(offset
!= backBoundaries
[i
]){
807 log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries
[i
], offset
);
811 for(i
=UPRV_LENGTHOF(backBoundaries
)-2, offset
=UPRV_LENGTHOF(input
); offset
>0; --i
) {
812 U8_BACK_1_UNSAFE(input
, offset
);
813 if(offset
!= backBoundaries
[i
]){
814 log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries
[i
], offset
);
817 #if !U_HIDE_OBSOLETE_UTF_OLD_H
818 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
820 UTF8_FWD_N_UNSAFE(input
, offset
, i
);
821 if(offset
!= boundaries
[i
]) {
822 log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
826 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
828 U8_FWD_N_UNSAFE(input
, offset
, i
);
829 if(offset
!= boundaries
[i
]) {
830 log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
833 #if !U_HIDE_OBSOLETE_UTF_OLD_H
834 for(i
=0; i
<UPRV_LENGTHOF(backBoundaries
); ++i
) {
835 int32_t j
=UPRV_LENGTHOF(backBoundaries
)-1-i
;
836 offset
=UPRV_LENGTHOF(input
);
837 UTF8_BACK_N_UNSAFE(input
, offset
, i
);
838 if(offset
!= backBoundaries
[j
]) {
839 log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries
[j
], offset
);
843 for(i
=0; i
<UPRV_LENGTHOF(backBoundaries
); ++i
) {
844 int32_t j
=UPRV_LENGTHOF(backBoundaries
)-1-i
;
845 offset
=UPRV_LENGTHOF(input
);
846 U8_BACK_N_UNSAFE(input
, offset
, i
);
847 if(offset
!= backBoundaries
[j
]) {
848 log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries
[j
], offset
);
854 * Ticket #13636 - Turn optimization back on.
856 #if defined(_MSC_VER) && (_MSC_VER > 1900)
857 #pragma optimize( "", on )
860 static void TestSetChar() {
861 static const uint8_t input
[]
862 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
863 static const int16_t start_safe
[]
864 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
865 static const int16_t limit_safe
[]
866 = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
869 int32_t offset
=0, setOffset
=0;
870 for(offset
=0; offset
<=UPRV_LENGTHOF(input
); offset
++){
871 if (offset
<UPRV_LENGTHOF(input
)){
872 #if !U_HIDE_OBSOLETE_UTF_OLD_H
874 UTF8_SET_CHAR_START_SAFE(input
, 0, setOffset
);
875 if(setOffset
!= start_safe
[i
]){
876 log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_safe
[i
], setOffset
);
880 U8_SET_CP_START(input
, 0, setOffset
);
881 if(setOffset
!= start_safe
[i
]){
882 log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_safe
[i
], setOffset
);
885 #if !U_HIDE_OBSOLETE_UTF_OLD_H
887 UTF8_SET_CHAR_LIMIT_SAFE(input
,0, setOffset
, sizeof(input
));
888 if(setOffset
!= limit_safe
[i
]){
889 log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_safe
[i
], setOffset
);
893 U8_SET_CP_LIMIT(input
,0, setOffset
, sizeof(input
));
894 if(setOffset
!= limit_safe
[i
]){
895 log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_safe
[i
], setOffset
);
902 static void TestSetCharUnsafe() {
903 static const uint8_t input
[]
904 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
905 static const int16_t start_unsafe
[]
906 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
907 static const int16_t limit_unsafe
[]
908 = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };
911 int32_t offset
=0, setOffset
=0;
912 for(offset
=0; offset
<=UPRV_LENGTHOF(input
); offset
++){
913 if (offset
<UPRV_LENGTHOF(input
)){
914 #if !U_HIDE_OBSOLETE_UTF_OLD_H
916 UTF8_SET_CHAR_START_UNSAFE(input
, setOffset
);
917 if(setOffset
!= start_unsafe
[i
]){
918 log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_unsafe
[i
], setOffset
);
922 U8_SET_CP_START_UNSAFE(input
, setOffset
);
923 if(setOffset
!= start_unsafe
[i
]){
924 log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_unsafe
[i
], setOffset
);
928 if (offset
!= 0) { /* Can't have it go off the end of the array */
929 #if !U_HIDE_OBSOLETE_UTF_OLD_H
931 UTF8_SET_CHAR_LIMIT_UNSAFE(input
, setOffset
);
932 if(setOffset
!= limit_unsafe
[i
]){
933 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_unsafe
[i
], setOffset
);
937 U8_SET_CP_LIMIT_UNSAFE(input
, setOffset
);
938 if(setOffset
!= limit_unsafe
[i
]){
939 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_unsafe
[i
], setOffset
);
947 static void TestTruncateIfIncomplete() {
948 // Difference from U8_SET_CP_START():
949 // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
950 // Therefore, if the last byte is a lead byte, then this macro truncates
951 // even if the byte at the input index cannot continue a valid sequence
952 // (including when that is not a trail byte).
953 // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
954 static const struct {
977 { "\x80\x80\x80", 3 },
978 { "\xC2\xA0\x80", 3 },
979 { "\xE0\xA0\x80", 3 },
980 { "\xF0\x8F\x80", 3 },
981 { "\xF0\x90\x80", 0 },
982 { "\xF4\x8F\x80", 0 },
983 { "\xF4\x90\x80", 3 },
984 { "\xF5\x80\x80", 3 },
985 { "\x80\x80\x80\x80", 4 },
986 { "\xC2\xA0\x80\x80", 4 },
987 { "\xE0\xA0\x80\x80", 4 },
988 { "\xF0\x90\x80\x80", 4 },
989 { "\xF5\x80\x80\x80", 4 }
// Run every table entry: take the C string's full strlen() as the starting
// length, let U8_TRUNCATE_IF_INCOMPLETE adjust it, and compare with the
// expected length recorded in the table.
992 for (i
= 0; i
< UPRV_LENGTHOF(cases
); ++i
) {
993 const char *s
= cases
[i
].s
;
994 int32_t expected
= cases
[i
].expected
;
995 int32_t length
= (int32_t)strlen(s
);
// The macro updates adjusted in place; length is kept for the error message.
996 int32_t adjusted
= length
;
997 U8_TRUNCATE_IF_INCOMPLETE(s
, 0, adjusted
);
998 if (adjusted
!= expected
) {
999 log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
1000 (int)i
, (int)length
, (int)expected
, (int)adjusted
);
1005 static void TestAppendChar(){
1006 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1007 static const uint8_t s
[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
1008 static const uint32_t test
[]={
1009 /* append-position(unsafe), CHAR to be appended */
1024 /* append-position(safe), CHAR to be appended */
1028 3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
1040 static const uint16_t movedOffset
[]={
1041 /* offset-moved-to(unsafe) */
1042 4, /*for append-pos: 0 , CHAR 0x10401*/
1056 /* offset-moved-to(safe) */
1057 4, /*for append-pos: 0, CHAR 0x10401*/
1073 static const uint8_t result
[][11]={
1075 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1076 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1077 {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1078 {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
1079 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1080 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
1082 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
1083 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
1084 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
1086 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
1087 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
1088 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
1090 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
1092 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1093 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1094 {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1095 {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
1096 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
1097 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/
1099 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
1100 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
1101 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
1103 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
1104 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
1105 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
1107 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
1110 uint16_t i
, count
=0;
1114 uint16_t size
=UPRV_LENGTHOF(s
);
1115 for(i
=0; i
<UPRV_LENGTHOF(test
); i
=(uint16_t)(i
+2)){
1116 uprv_memcpy(str
, s
, size
);
1119 UTF8_APPEND_CHAR_UNSAFE(str
, offset
, test
[i
+1]);
1120 if(offset
!= movedOffset
[count
]){
1121 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
1122 count
, movedOffset
[count
], offset
);
1125 if(uprv_memcmp(str
, result
[count
], size
) !=0){
1126 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count
);
1127 printUChars(result
[count
], size
);
1129 printUChars(str
, size
);
1133 UTF8_APPEND_CHAR_SAFE(str
, offset
, size
, test
[i
+1]);
1134 if(offset
!= movedOffset
[count
]){
1135 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
1136 count
, movedOffset
[count
], offset
);
1139 if(uprv_memcmp(str
, result
[count
], size
) !=0){
1140 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count
);
1141 printUChars(result
[count
], size
);
1143 printUChars(str
, size
);
1146 /*call the API instead of MACRO
1147 uprv_memcpy(str, s, size);
1150 if((uint32_t)(c)<=0x7f) {
1151 (str)[(offset)++]=(uint8_t)(c);
1153 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
1155 if(offset != movedOffset[count]){
1156 log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
1157 count, movedOffset[count], offset);
1160 if(uprv_memcmp(str, result[count], size) !=0){
1161 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
1162 printUChars(result[count], size);
1164 printUChars(str, size);
/*
 * TestAppend: checks the two UTF-8 append macros against fixed expected bytes.
 *
 * The entries of codePoints[] are appended to buffer[] in two passes:
 *   1. with U8_APPEND_UNSAFE, after skipping values that are negative or
 *      above 0x10ffff; the byte count and bytes are compared with
 *      expectUnsafe[].
 *   2. with U8_APPEND, using UPRV_LENGTHOF(buffer) as the capacity; the
 *      isError flag is compared per code point with the expectation
 *      (negative, above 0x10ffff, or a surrogate), and the byte count and
 *      bytes are compared with expectSafe[].
 *
 * Takes no arguments and returns nothing; failures are reported via log_err().
 */
1174 static void TestAppend() {
1175 static const UChar32 codePoints
[]={
1176 0x61, 0xdf, 0x901, 0x3040,
1177 0xac00, 0xd800, 0xdbff, 0xdcde,
1178 0xdffd, 0xe000, 0xffff, 0x10000,
1179 0x12345, 0xe0021, 0x10ffff, 0x110000,
1180 0x234567, 0x7fffffff, -1, -1000,
1183 static const uint8_t expectUnsafe
[]={
1184 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1185 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
1186 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1187 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1188 /* none from this line */
1191 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1192 0xea, 0xb0, 0x80, /* no surrogates */
1193 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1194 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1195 /* none from this line */
1199 uint8_t buffer
[100];
1202 UBool isError
, expectIsError
, wrongIsError
;
/* Pass 1: U8_APPEND_UNSAFE, fed only with values in the range 0..0x10ffff. */
1205 for(i
=0; i
<UPRV_LENGTHOF(codePoints
); ++i
) {
1207 if(c
<0 || 0x10ffff<c
) {
1208 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1211 U8_APPEND_UNSAFE(buffer
, length
, c
);
/* Both the total byte count and the byte values must match expectUnsafe[]. */
1213 if(length
!=UPRV_LENGTHOF(expectUnsafe
) || 0!=memcmp(buffer
, expectUnsafe
, length
)) {
1214 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
/* Pass 2: U8_APPEND, with the buffer capacity passed in and isError checked. */
1219 for(i
=0; i
<UPRV_LENGTHOF(codePoints
); ++i
) {
/* An error is expected for negatives, values above 0x10ffff, and surrogates. */
1221 expectIsError
= c
<0 || 0x10ffff<c
|| U_IS_SURROGATE(c
);
1224 U8_APPEND(buffer
, length
, UPRV_LENGTHOF(buffer
), c
, isError
);
/* Accumulate: one unexpected isError value is enough to mark the run as wrong. */
1225 wrongIsError
|= isError
!=expectIsError
;
1228 log_err("U8_APPEND did not set isError correctly\n");
/* Both the total byte count and the byte values must match expectSafe[]. */
1230 if(length
!=UPRV_LENGTHOF(expectSafe
) || 0!=memcmp(buffer
, expectSafe
, length
)) {
1231 log_err("U8_APPEND did not generate the expected output\n");
1237 static const uint8_t b
[]={
1238 0xc3, 0x9f, /* 00DF */
1239 0xed, 0x9f, 0xbf, /* D7FF */
1240 0xed, 0xa0, 0x81, /* D801 */
1241 0xed, 0xbf, 0xbe, /* DFFE */
1242 0xee, 0x80, 0x80, /* E000 */
1243 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1245 static const UChar32 cp
[]={
1246 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1250 int32_t i
, j
, k
, iu
, is
, il
, length
;
1252 k
=0; /* index into cp[] */
1253 length
=UPRV_LENGTHOF(b
);
1254 for(i
=0; i
<length
;) {
1256 U8_NEXT_UNSAFE(b
, j
, cu
);
1260 U8_NEXT(b
, j
, length
, cs
);
1264 L8_NEXT(b
, j
, length
, cl
);
1268 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cu
, (long)cp
[k
]);
1271 /* U8_NEXT() returns <0 for surrogate code points */
1272 if(U_IS_SURROGATE(cu
) ? cs
>=0 : cs
!=cu
) {
1273 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cs
, (long)cu
);
1276 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1278 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cl
, (long)cu
);
1281 // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1282 if(U_IS_SURROGATE(cu
) ? is
!=(i
+1) : is
!=iu
) {
1283 log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1286 log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1289 ++k
; /* next code point */
1290 i
=iu
; /* advance by one UTF-8 sequence */
1294 --k
; /* previous code point */
1297 U8_PREV_UNSAFE(b
, j
, cu
);
1301 U8_PREV(b
, 0, j
, cs
);
1305 L8_PREV(b
, 0, j
, cl
);
1309 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cu
, (long)cp
[k
]);
1312 /* U8_PREV() returns <0 for surrogate code points */
1313 if(U_IS_SURROGATE(cu
) ? cs
>=0 : cs
!=cu
) {
1314 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cs
, (long)cu
);
1317 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1319 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cl
, (long)cu
);
1322 // U8_PREV() skips only the last byte of a surrogate byte sequence.
1323 if(U_IS_SURROGATE(cu
) ? is
!=(i
-1) : is
!=iu
) {
1324 log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1327 log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1330 i
=iu
; /* go back by one UTF-8 sequence */