1 /********************************************************************
3 * Copyright (c) 1998-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
9 * Modification History:
11 * Date Name Description
12 * 07/24/2000 Madhu Creation
13 *******************************************************************************
16 #include "unicode/utypes.h"
17 #include "unicode/utf8.h"
21 /* lenient UTF-8 ------------------------------------------------------------ */
24 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
25 * code points with their "natural" encoding.
26 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
29 * This is not conformant with UTF-8.
31 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
32 * the macros below do not attempt to assemble such pairs.
/* Lenient forward read: loads the byte at s[i] into c and post-increments i;
 * the multi-byte path hands off to utf8_nextCharSafeBody() with -2 as its
 * final argument.
 * NOTE(review): the lines between these statements (the guard and the
 * closing brace) are not visible in this excerpt. */
35 #define L8_NEXT(s, i, length, c) { \
36 (c)=(uint8_t)(s)[(i)++]; \
39 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
/* Lenient backward read: pre-decrements i and loads the byte at s[i] into c;
 * the multi-byte path hands off to utf8_prevCharSafeBody() with -2 as its
 * final argument.
 * NOTE(review): the guard and the closing brace are not visible in this
 * excerpt. */
46 #define L8_PREV(s, start, i, c) { \
47 (c)=(uint8_t)(s)[--(i)]; \
50 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
57 /* -------------------------------------------------------------------------- */
/* Forward declarations: the byte-dump helper, the file-local test functions,
 * and the externally visible registration entry point addUTF8Test(). */
59 static void printUChars(const uint8_t *uchars
, int16_t len
);
61 static void TestCodeUnitValues(void);
62 static void TestCharLength(void);
63 static void TestGetChar(void);
64 static void TestNextPrevChar(void);
65 static void TestNulTerminated(void);
66 static void TestNextPrevNonCharacters(void);
67 static void TestNextPrevCharUnsafe(void);
68 static void TestFwdBack(void);
69 static void TestFwdBackUnsafe(void);
70 static void TestSetChar(void);
71 static void TestSetCharUnsafe(void);
72 static void TestAppendChar(void);
73 static void TestAppend(void);
74 static void TestSurrogates(void);
76 void addUTF8Test(TestNode
** root
);
/* Registers every test function of this file through addTest(root, ...),
 * each under a path of the form "utf8tst/<function name>". */
79 addUTF8Test(TestNode
** root
)
81 addTest(root
, &TestCodeUnitValues
, "utf8tst/TestCodeUnitValues");
82 addTest(root
, &TestCharLength
, "utf8tst/TestCharLength");
83 addTest(root
, &TestGetChar
, "utf8tst/TestGetChar");
84 addTest(root
, &TestNextPrevChar
, "utf8tst/TestNextPrevChar");
85 addTest(root
, &TestNulTerminated
, "utf8tst/TestNulTerminated");
86 addTest(root
, &TestNextPrevNonCharacters
, "utf8tst/TestNextPrevNonCharacters");
87 addTest(root
, &TestNextPrevCharUnsafe
, "utf8tst/TestNextPrevCharUnsafe");
88 addTest(root
, &TestFwdBack
, "utf8tst/TestFwdBack");
89 addTest(root
, &TestFwdBackUnsafe
, "utf8tst/TestFwdBackUnsafe");
90 addTest(root
, &TestSetChar
, "utf8tst/TestSetChar");
91 addTest(root
, &TestSetCharUnsafe
, "utf8tst/TestSetCharUnsafe");
92 addTest(root
, &TestAppendChar
, "utf8tst/TestAppendChar");
93 addTest(root
, &TestAppend
, "utf8tst/TestAppend");
94 addTest(root
, &TestSurrogates
, "utf8tst/TestSurrogates");
/* Checks the byte-classification macros (UTF8_IS_SINGLE/LEAD/TRAIL and their
 * U8_ counterparts) against every entry of codeunit[], reporting any
 * mismatch through log_err.
 * NOTE(review): the conditions that choose between the single/lead/trail
 * checks for a given index are not visible in this excerpt. */
97 static void TestCodeUnitValues()
99 static const uint8_t codeunit
[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};
102 for(i
=0; i
<UPRV_LENGTHOF(codeunit
); i
++){
103 uint8_t c
=codeunit
[i
];
104 log_verbose("Testing code unit value of %x\n", c
);
/* Expectation: single only -- neither lead nor trail. */
106 if(!UTF8_IS_SINGLE(c
) || UTF8_IS_LEAD(c
) || UTF8_IS_TRAIL(c
) || !U8_IS_SINGLE(c
) || U8_IS_LEAD(c
) || U8_IS_TRAIL(c
)){
107 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
108 c
, UTF8_IS_SINGLE(c
) ? 'y' : 'n', UTF8_IS_LEAD(c
) ? 'y' : 'n', UTF8_IS_TRAIL(c
) ? 'y' : 'n');
/* Expectation: lead only -- neither single nor trail. */
111 if(!UTF8_IS_LEAD(c
) || UTF8_IS_SINGLE(c
) || UTF8_IS_TRAIL(c
) || !U8_IS_LEAD(c
) || U8_IS_SINGLE(c
) || U8_IS_TRAIL(c
)){
112 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
113 c
, UTF8_IS_SINGLE(c
) ? 'y' : 'n', UTF8_IS_LEAD(c
) ? 'y' : 'n', UTF8_IS_TRAIL(c
) ? 'y' : 'n');
/* Expectation: trail only -- neither single nor lead. */
116 if(!UTF8_IS_TRAIL(c
) || UTF8_IS_SINGLE(c
) || UTF8_IS_LEAD(c
) || !U8_IS_TRAIL(c
) || U8_IS_SINGLE(c
) || U8_IS_LEAD(c
)){
117 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
118 c
, UTF8_IS_SINGLE(c
) ? 'y' : 'n', UTF8_IS_LEAD(c
) ? 'y' : 'n', UTF8_IS_TRAIL(c
) ? 'y' : 'n');
/* Verifies UTF8_CHAR_LENGTH and U8_LENGTH for the entries of codepoint[].
 * The table is walked two elements at a time: codepoint[i] is the expected
 * number of code units and codepoint[i+1] is the code point. Also checks
 * that UTF8_NEED_MULTIPLE_UCHAR is true exactly when the expected length is
 * not 1. */
124 static void TestCharLength()
126 static const uint32_t codepoint
[]={
145 for(i
=0; i
<UPRV_LENGTHOF(codepoint
); i
=(int16_t)(i
+2)){
146 UChar32 c
=codepoint
[i
+1];
147 if(UTF8_CHAR_LENGTH(c
) != (uint16_t)codepoint
[i
] || U8_LENGTH(c
) != (uint16_t)codepoint
[i
]){
148 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c
, codepoint
[i
], UTF8_CHAR_LENGTH(c
));
150 log_verbose("The no: of code units for %lx is %d\n",c
, UTF8_CHAR_LENGTH(c
));
/* A code point needs multiple code units unless its expected length is 1. */
152 multiple
=(UBool
)(codepoint
[i
] == 1 ? FALSE
: TRUE
);
153 if(UTF8_NEED_MULTIPLE_UCHAR(c
) != multiple
){
154 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c
);
/* Random-access read test. For each offset into input[] it reads a code
 * point with UTF8_GET_CHAR_UNSAFE / U8_GET_UNSAFE (only while
 * offset < sizeof(input) - 1), UTF8_GET_CHAR_SAFE (non-strict and strict),
 * U8_GET and U8_GET_OR_FFFD, and compares against result[], which holds
 * three columns per row: unsafe, safe non-strict, safe strict.
 * NOTE(review): the comparisons guarding several log_err calls and the
 * stepping of i are not visible in this excerpt. */
159 static void TestGetChar()
161 static const uint8_t input
[]={
178 static const UChar32 result
[]={
179 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
182 0x4e8c, 0x4e8c, 0x4e8c,
183 0x4e8c, 0x4e8c, 0x4e8c ,
184 0x4e8c, 0x4e8c, 0x4e8c,
185 0x10401, 0x10401, 0x10401 ,
186 0x10401, 0x10401, 0x10401 ,
187 0x10401, 0x10401, 0x10401 ,
188 0x10401, 0x10401, 0x10401,
189 0x25, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
192 0x31, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
193 0x240, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
199 for(offset
=0; offset
<sizeof(input
); offset
++) {
/* The unsafe readers are skipped for the last byte of input[]. */
200 if (offset
< sizeof(input
) - 1) {
201 UTF8_GET_CHAR_UNSAFE(input
, offset
, c
);
203 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
], c
);
207 U8_GET_UNSAFE(input
, offset
, c
);
209 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
], c
);
214 UTF8_GET_CHAR_SAFE(input
, 0, offset
, sizeof(input
), c
, FALSE
);
215 expected
=result
[i
+1];
217 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
220 U8_GET(input
, 0, offset
, sizeof(input
), c
);
/* For U8_GET an expected error value is replaced by U_SENTINEL. */
221 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
223 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
226 U8_GET_OR_FFFD(input
, 0, offset
, sizeof(input
), c
);
/* For U8_GET_OR_FFFD a negative expectation is replaced by 0xfffd. */
227 if(expected
<0) { expected
=0xfffd; }
229 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
232 UTF8_GET_CHAR_SAFE(input
, 0, offset
, sizeof(input
), c
, TRUE
);
233 if(c
!= result
[i
+2]){
234 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
+2], c
);
/* Sequential read test over a partly ill-formed input[]. From every offset
 * it reads forward with UTF8_NEXT_CHAR_SAFE (non-strict and strict), U8_NEXT
 * and U8_NEXT_OR_FFFD, then backward with the corresponding PREV macros,
 * comparing both the value read and the offset reached against result[] and
 * movedOffset[] (six columns per row, see the column comments).
 * For U8_NEXT / U8_PREV an expected error value is replaced by U_SENTINEL;
 * for the _OR_FFFD forms a negative expectation is replaced by 0xfffd.
 * NOTE(review): the assignments to setOffset, the stepping of i and several
 * value comparisons are not visible in this excerpt. */
241 static void TestNextPrevChar() {
242 static const uint8_t input
[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
243 static const UChar32 result
[]={
244 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
245 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000,
246 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
247 0x90, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x2841410, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
248 0x90, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0xa1050, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
249 0x81, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x2841, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
250 0x00, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
, 0x61, 0x61, 0x61,
251 0x80, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0xc2, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
252 0xfd, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
, 0x77e, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
,
253 0xbe, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0xfd, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
254 0xa1, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x00, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
,
255 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
256 0x81, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x10401, 0x10401, 0x10401,
257 0x90, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x410, UTF_ERROR_VALUE
, UTF_ERROR_VALUE
,
258 0x90, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0x410, UTF8_ERROR_VALUE_2
, UTF8_ERROR_VALUE_2
,
259 0x0840, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
, 0xf0, UTF8_ERROR_VALUE_1
, UTF8_ERROR_VALUE_1
,
260 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061
262 static const int32_t movedOffset
[]={
263 /* next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s */
265 5, 5, 5, 14, 14 , 14,
281 /* TODO: remove unused columns for next_unsafe & prev_unsafe, and adjust the test code */
/* Forward pass: one iteration per starting offset. */
287 for(offset
=0; offset
<sizeof(input
); offset
++){
289 UTF8_NEXT_CHAR_SAFE(input
, setOffset
, sizeof(input
), c
, FALSE
);
290 if(setOffset
!= movedOffset
[i
+1]){
291 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
292 offset
, movedOffset
[i
+1], setOffset
);
294 expected
=result
[i
+1];
296 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
300 U8_NEXT(input
, setOffset
, sizeof(input
), c
);
301 if(setOffset
!= movedOffset
[i
+1]){
302 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
303 offset
, movedOffset
[i
+1], setOffset
);
305 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
307 log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
311 U8_NEXT_OR_FFFD(input
, setOffset
, sizeof(input
), c
);
312 if(setOffset
!= movedOffset
[i
+1]){
313 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
314 offset
, movedOffset
[i
+1], setOffset
);
316 if(expected
<0) { expected
=0xfffd; }
318 log_err("ERROR: U8_NEXT_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
322 UTF8_NEXT_CHAR_SAFE(input
, setOffset
, sizeof(input
), c
, TRUE
);
/* NOTE(review): this comparison uses movedOffset[i+1], while the message
 * below prints movedOffset[i+2]; confirm which column is intended. */
323 if(setOffset
!= movedOffset
[i
+1]){
324 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
325 offset
, movedOffset
[i
+2], setOffset
);
327 if(c
!= result
[i
+2]){
328 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
+2], c
);
/* Backward pass: one iteration per starting offset, from the end down to 1. */
335 for(offset
=sizeof(input
); offset
> 0; --offset
){
337 UTF8_PREV_CHAR_SAFE(input
, 0, setOffset
, c
, FALSE
);
338 if(setOffset
!= movedOffset
[i
+4]){
339 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
340 offset
, movedOffset
[i
+4], setOffset
);
342 expected
=result
[i
+4];
344 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
348 U8_PREV(input
, 0, setOffset
, c
);
349 if(setOffset
!= movedOffset
[i
+4]){
350 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
351 offset
, movedOffset
[i
+4], setOffset
);
353 if(UTF_IS_ERROR(expected
)) { expected
=U_SENTINEL
; }
355 log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
359 U8_PREV_OR_FFFD(input
, 0, setOffset
, c
);
360 if(setOffset
!= movedOffset
[i
+4]){
361 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
362 offset
, movedOffset
[i
+4], setOffset
);
364 if(expected
<0) { expected
=0xfffd; }
366 log_err("ERROR: U8_PREV_OR_FFFD failed for input=%ld. Expected:%lx Got:%lx\n", offset
, expected
, c
);
370 UTF8_PREV_CHAR_SAFE(input
, 0, setOffset
, c
, TRUE
);
371 if(setOffset
!= movedOffset
[i
+5]){
372 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
373 offset
, movedOffset
[i
+5], setOffset
);
375 if(c
!= result
[i
+5]){
376 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset
, result
[i
+5], c
);
383 /* keep this in sync with utf16tst.c's TestNulTerminated() */
/* Exercises U8_NEXT, U8_NEXT_OR_FFFD, U8_FWD_1, U8_FWD_N, U8_GET,
 * U8_GET_OR_FFFD and U8_SET_CP_LIMIT with a length argument of -1, and
 * cross-checks the code points and indexes they produce against result[]
 * and against each other.
 * NOTE(review): the loop headers and the comparisons guarding the log_err
 * calls are not visible in this excerpt. */
384 static void TestNulTerminated() {
385 static const uint8_t input
[]={
387 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
393 /* 13 */ 0xe0, 0xa0, 0x80,
394 /* 16 */ 0xe2, 0x82, 0xac,
395 /* 19 */ 0xf0, 0x90, 0x90,
399 static const UChar32 result
[]={
413 UChar32 c
, c2
, expected
;
414 int32_t i0
, i
=0, j
, k
, expectedIndex
;
418 U8_NEXT(input
, i
, -1, c
);
419 expected
=result
[cpIndex
];
421 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0
, c
, expected
);
424 U8_NEXT_OR_FFFD(input
, j
, -1, c
);
425 if(expected
<0) { expected
=0xfffd; }
427 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0
, c
, expected
);
430 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j
, i
);
433 U8_FWD_1(input
, j
, -1);
435 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j
, i
);
439 * Move by this many code points from the start.
440 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
442 expectedIndex
= (c
==0) ? i
-1 : i
;
444 U8_FWD_N(input
, k
, -1, cpIndex
);
445 if(k
!=expectedIndex
) {
446 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k
, expectedIndex
);
453 U8_NEXT(input
, i
, -1, c
);
455 U8_GET(input
, 0, j
, -1, c2
);
457 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0
, c
, c2
, j
);
459 U8_GET_OR_FFFD(input
, 0, j
, -1, c2
);
460 expected
= (c
>=0) ? c
: 0xfffd;
462 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0
, expected
, c2
, j
);
464 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
466 U8_SET_CP_LIMIT(input
, 0, k
, -1);
468 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i
, j
+1, k
);
/* Reads nonChars[] forward with U8_NEXT and backward with U8_PREV and
 * reports an error whenever the value read does not satisfy
 * U_IS_UNICODE_NONCHAR. */
474 static void TestNextPrevNonCharacters() {
475 /* test non-characters */
476 static const uint8_t nonChars
[]={
477 0xef, 0xb7, 0x90, /* U+fdd0 */
478 0xef, 0xbf, 0xbf, /* U+ffff */
479 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
480 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
481 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
/* Forward pass: U8_NEXT advances idx, so the loop header has no increment. */
487 for(idx
=0; idx
<(int32_t)sizeof(nonChars
);) {
488 U8_NEXT(nonChars
, idx
, sizeof(nonChars
), ch
);
489 if(!U_IS_UNICODE_NONCHAR(ch
)) {
490 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx
);
/* Backward pass: U8_PREV moves idx back, so the loop header has no decrement. */
493 for(idx
=(int32_t)sizeof(nonChars
); idx
>0;) {
494 U8_PREV(nonChars
, 0, idx
, ch
);
495 if(!U_IS_UNICODE_NONCHAR(ch
)) {
496 log_err("U8_PREV(at %d) failed to read a non-character\n", idx
);
/* Iterates input[] forward with UTF8_NEXT_CHAR_UNSAFE and U8_NEXT_UNSAFE,
 * then backward with UTF8_PREV_CHAR_UNSAFE and U8_PREV_UNSAFE, comparing
 * each value read with codePoints[]: indexed upward from 0 on the forward
 * passes and downward from the last element on the backward passes. The
 * macros themselves move offset; the loop headers only step i. */
501 static void TestNextPrevCharUnsafe() {
503 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
504 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
506 static const uint8_t input
[]={
508 0xf0, 0x90, 0x90, 0x81,
509 0xc0, 0x80, /* non-shortest form */
512 0xf4, 0x8f, 0xbf, 0xbf,
515 static const UChar32 codePoints
[]={
528 for(i
=0, offset
=0; offset
<sizeof(input
); ++i
) {
529 UTF8_NEXT_CHAR_UNSAFE(input
, offset
, c
);
530 if(c
!= codePoints
[i
]){
531 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
532 offset
, codePoints
[i
], c
);
535 for(i
=0, offset
=0; offset
<sizeof(input
); ++i
) {
536 U8_NEXT_UNSAFE(input
, offset
, c
);
537 if(c
!= codePoints
[i
]){
538 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
539 offset
, codePoints
[i
], c
);
543 for(i
=UPRV_LENGTHOF(codePoints
)-1, offset
=sizeof(input
); offset
> 0; --i
){
544 UTF8_PREV_CHAR_UNSAFE(input
, offset
, c
);
545 if(c
!= codePoints
[i
]){
546 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
547 offset
, codePoints
[i
], c
);
550 for(i
=UPRV_LENGTHOF(codePoints
)-1, offset
=sizeof(input
); offset
> 0; --i
){
551 U8_PREV_UNSAFE(input
, offset
, c
);
552 if(c
!= codePoints
[i
]){
553 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
554 offset
, codePoints
[i
], c
);
/* Checks the bounds-checked index movers on a partly ill-formed input[]:
 *  - UTF8_FWD_1_SAFE / U8_FWD_1 against fwd_safe[],
 *  - UTF8_BACK_1_SAFE / U8_BACK_1 against back_safe[],
 *  - UTF8_FWD_N_SAFE / U8_FWD_N, stepping by Nvalue[i], against fwd_N_safe[],
 *  - UTF8_BACK_N_SAFE / U8_BACK_N, stepping by Nvalue[i], against
 *    back_N_safe[].
 * NOTE(review): the resets of offsafe/i before the forward loops and the
 * headers of the backward single-step loops are not visible in this
 * excerpt. */
559 static void TestFwdBack() {
560 static const uint8_t input
[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
561 static const uint16_t fwd_safe
[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
562 static const uint16_t back_safe
[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};
564 static const uint16_t Nvalue
[]= {0, 1, 2, 3, 1, 2, 1, 5};
565 static const uint16_t fwd_N_safe
[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
566 static const uint16_t back_N_safe
[] ={18, 17, 15, 12, 11, 9, 7, 0};
571 while(offsafe
< sizeof(input
)){
572 UTF8_FWD_1_SAFE(input
, offsafe
, sizeof(input
));
573 if(offsafe
!= fwd_safe
[i
]){
574 log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe
[i
], offsafe
);
580 while(offsafe
< sizeof(input
)){
581 U8_FWD_1(input
, offsafe
, sizeof(input
));
582 if(offsafe
!= fwd_safe
[i
]){
583 log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe
[i
], offsafe
);
/* Backward single steps start from the end of input[]. */
589 offsafe
=sizeof(input
);
591 UTF8_BACK_1_SAFE(input
, 0, offsafe
);
592 if(offsafe
!= back_safe
[i
]){
593 log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe
[i
], offsafe
);
599 offsafe
=sizeof(input
);
601 U8_BACK_1(input
, 0, offsafe
);
602 if(offsafe
!= back_safe
[i
]){
603 log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe
[i
], offsafe
);
609 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
610 UTF8_FWD_N_SAFE(input
, offsafe
, sizeof(input
), Nvalue
[i
]);
611 if(offsafe
!= fwd_N_safe
[i
]){
612 log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i
, fwd_N_safe
[i
], offsafe
);
618 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
619 U8_FWD_N(input
, offsafe
, sizeof(input
), Nvalue
[i
]);
620 if(offsafe
!= fwd_N_safe
[i
]){
621 log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i
, fwd_N_safe
[i
], offsafe
);
/* Backward N steps: offsafe is set once before the loop, not per iteration. */
626 offsafe
=sizeof(input
);
627 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
628 UTF8_BACK_N_SAFE(input
, 0, offsafe
, Nvalue
[i
]);
629 if(offsafe
!= back_N_safe
[i
]){
630 log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i
, back_N_safe
[i
], offsafe
);
634 offsafe
=sizeof(input
);
635 for(i
=0; i
<UPRV_LENGTHOF(Nvalue
); i
++){
636 U8_BACK_N(input
, 0, offsafe
, Nvalue
[i
]);
637 if(offsafe
!= back_N_safe
[i
]){
638 log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i
, back_N_safe
[i
], offsafe
);
/* Checks the unchecked index movers against boundaries[], the list of
 * expected offsets in input[]:
 *  - single forward steps walk boundaries[] upward from index 1,
 *  - single backward steps walk it downward from the next-to-last index,
 *  - the FWD_N checks expect boundaries[i] after a move of i code points,
 *  - the BACK_N checks restart from the end of input[] on every iteration
 *    and expect boundaries[last - i] after moving back i code points.
 * NOTE(review): the offset reset inside the FWD_N loops is not visible in
 * this excerpt. */
643 static void TestFwdBackUnsafe() {
645 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
646 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
648 static const uint8_t input
[]={
650 0xf0, 0x90, 0x90, 0x81,
651 0xc0, 0x80, /* non-shortest form */
654 0xf4, 0x8f, 0xbf, 0xbf,
657 static const int8_t boundaries
[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
661 for(i
=1, offset
=0; offset
<UPRV_LENGTHOF(input
); ++i
) {
662 UTF8_FWD_1_UNSAFE(input
, offset
);
663 if(offset
!= boundaries
[i
]){
664 log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
667 for(i
=1, offset
=0; offset
<UPRV_LENGTHOF(input
); ++i
) {
668 U8_FWD_1_UNSAFE(input
, offset
);
669 if(offset
!= boundaries
[i
]){
670 log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
674 for(i
=UPRV_LENGTHOF(boundaries
)-2, offset
=UPRV_LENGTHOF(input
); offset
>0; --i
) {
675 UTF8_BACK_1_UNSAFE(input
, offset
);
676 if(offset
!= boundaries
[i
]){
677 log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
680 for(i
=UPRV_LENGTHOF(boundaries
)-2, offset
=UPRV_LENGTHOF(input
); offset
>0; --i
) {
681 U8_BACK_1_UNSAFE(input
, offset
);
682 if(offset
!= boundaries
[i
]){
683 log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
687 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
689 UTF8_FWD_N_UNSAFE(input
, offset
, i
);
690 if(offset
!= boundaries
[i
]) {
691 log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
694 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
696 U8_FWD_N_UNSAFE(input
, offset
, i
);
697 if(offset
!= boundaries
[i
]) {
698 log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[i
], offset
);
702 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
/* j mirrors i from the far end of boundaries[]. */
703 int32_t j
=UPRV_LENGTHOF(boundaries
)-1-i
;
704 offset
=UPRV_LENGTHOF(input
);
705 UTF8_BACK_N_UNSAFE(input
, offset
, i
);
706 if(offset
!= boundaries
[j
]) {
707 log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[j
], offset
);
710 for(i
=0; i
<UPRV_LENGTHOF(boundaries
); ++i
) {
711 int32_t j
=UPRV_LENGTHOF(boundaries
)-1-i
;
712 offset
=UPRV_LENGTHOF(input
);
713 U8_BACK_N_UNSAFE(input
, offset
, i
);
714 if(offset
!= boundaries
[j
]) {
715 log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries
[j
], offset
);
/* Checks the bounds-checked "snap to boundary" macros on input[]:
 * UTF8_SET_CHAR_START_SAFE / U8_SET_CP_START against start_safe[] and
 * UTF8_SET_CHAR_LIMIT_SAFE / U8_SET_CP_LIMIT against limit_safe[].
 * The loop runs offset from 0 through UPRV_LENGTHOF(input) inclusive; an
 * inner guard restricts part of the body to offset < UPRV_LENGTHOF(input).
 * NOTE(review): the extent of that guard and the assignments to setOffset
 * and i are not visible in this excerpt.
 * NOTE(review): the messages use %ld for int32_t and int16_t arguments;
 * confirm that log_err tolerates this where long is wider than int. */
720 static void TestSetChar() {
721 static const uint8_t input
[]
722 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
723 static const int16_t start_safe
[]
724 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
725 static const int16_t limit_safe
[]
726 = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
729 int32_t offset
=0, setOffset
=0;
730 for(offset
=0; offset
<=UPRV_LENGTHOF(input
); offset
++){
731 if (offset
<UPRV_LENGTHOF(input
)){
733 UTF8_SET_CHAR_START_SAFE(input
, 0, setOffset
);
734 if(setOffset
!= start_safe
[i
]){
735 log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_safe
[i
], setOffset
);
739 U8_SET_CP_START(input
, 0, setOffset
);
740 if(setOffset
!= start_safe
[i
]){
741 log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_safe
[i
], setOffset
);
746 UTF8_SET_CHAR_LIMIT_SAFE(input
,0, setOffset
, sizeof(input
));
747 if(setOffset
!= limit_safe
[i
]){
748 log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_safe
[i
], setOffset
);
752 U8_SET_CP_LIMIT(input
,0, setOffset
, sizeof(input
));
753 if(setOffset
!= limit_safe
[i
]){
754 log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_safe
[i
], setOffset
);
/* Checks the unchecked "snap to boundary" macros on input[]:
 * UTF8_SET_CHAR_START_UNSAFE / U8_SET_CP_START_UNSAFE against start_unsafe[]
 * and UTF8_SET_CHAR_LIMIT_UNSAFE / U8_SET_CP_LIMIT_UNSAFE against
 * limit_unsafe[]. The loop runs offset from 0 through UPRV_LENGTHOF(input)
 * inclusive, with one guard for offset < UPRV_LENGTHOF(input) and another
 * for offset != 0.
 * NOTE(review): the extent of both guards and the assignments to setOffset
 * and i are not visible in this excerpt. */
761 static void TestSetCharUnsafe() {
762 static const uint8_t input
[]
763 = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
764 static const int16_t start_unsafe
[]
765 = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
766 static const int16_t limit_unsafe
[]
767 = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };
770 int32_t offset
=0, setOffset
=0;
771 for(offset
=0; offset
<=UPRV_LENGTHOF(input
); offset
++){
772 if (offset
<UPRV_LENGTHOF(input
)){
774 UTF8_SET_CHAR_START_UNSAFE(input
, setOffset
);
775 if(setOffset
!= start_unsafe
[i
]){
776 log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_unsafe
[i
], setOffset
);
780 U8_SET_CP_START_UNSAFE(input
, setOffset
);
781 if(setOffset
!= start_unsafe
[i
]){
782 log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, start_unsafe
[i
], setOffset
);
786 if (offset
!= 0) { /* Can't have it go off the end of the array */
788 UTF8_SET_CHAR_LIMIT_UNSAFE(input
, setOffset
);
789 if(setOffset
!= limit_unsafe
[i
]){
790 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_unsafe
[i
], setOffset
);
794 U8_SET_CP_LIMIT_UNSAFE(input
, setOffset
);
795 if(setOffset
!= limit_unsafe
[i
]){
796 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset
, limit_unsafe
[i
], setOffset
);
/* Append test for UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE.
 * test[] is walked two elements at a time; test[i+1] is the code point to
 * append (the table comments describe each pair as append position, then
 * character). Each case copies s into str, appends, and compares the new
 * offset with movedOffset[count] and the buffer with result[count]; on a
 * buffer mismatch both buffers are dumped with printUChars.
 * NOTE(review): the assignment of offset, the stepping of count and the
 * selection between the unsafe and safe cases are not visible in this
 * excerpt. */
804 static void TestAppendChar(){
805 static const uint8_t s
[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
806 static const uint32_t test
[]={
807 /* append-position(unsafe), CHAR to be appended */
822 /* append-position(safe), CHAR to be appended */
826 3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
838 static const uint16_t movedOffset
[]={
839 /* offset-moved-to(unsafe) */
840 4, /*for append-pos: 0 , CHAR 0x10401*/
854 /* offset-moved-to(safe) */
855 4, /*for append-pos: 0, CHAR 0x10401*/
871 static const uint8_t result
[][11]={
873 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
874 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
875 {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
876 {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
877 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
878 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},
880 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
881 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
882 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
884 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
885 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
886 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
888 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
890 {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
891 {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
892 {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
893 {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
894 {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
895 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /* gets UTF8_ERROR_VALUE_2, which takes 2 bytes: 0xc2, 0x9f */
897 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
898 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
899 {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},
901 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
902 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
903 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},
905 {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
912 uint16_t size
=UPRV_LENGTHOF(s
);
913 for(i
=0; i
<UPRV_LENGTHOF(test
); i
=(uint16_t)(i
+2)){
/* Start every case from a fresh copy of the template string. */
914 uprv_memcpy(str
, s
, size
);
917 UTF8_APPEND_CHAR_UNSAFE(str
, offset
, test
[i
+1]);
918 if(offset
!= movedOffset
[count
]){
919 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
920 count
, movedOffset
[count
], offset
);
923 if(uprv_memcmp(str
, result
[count
], size
) !=0){
924 log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", count
);
925 printUChars(result
[count
], size
);
927 printUChars(str
, size
);
931 UTF8_APPEND_CHAR_SAFE(str
, offset
, size
, test
[i
+1]);
932 if(offset
!= movedOffset
[count
]){
933 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
934 count
, movedOffset
[count
], offset
);
937 if(uprv_memcmp(str
, result
[count
], size
) !=0){
938 log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", count
);
939 printUChars(result
[count
], size
);
941 printUChars(str
, size
);
944 /*call the API instead of MACRO
945 uprv_memcpy(str, s, size);
948 if((uint32_t)(c)<=0x7f) {
949 (str)[(offset)++]=(uint8_t)(c);
951 (offset)=utf8_appendCharSafeBody(str, (int32_t)(offset), (int32_t)(size), c);
953 if(offset != movedOffset[count]){
954 log_err("ERROR: utf8_appendCharSafeBody() failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
955 count, movedOffset[count], offset);
958 if(uprv_memcmp(str, result[count], size) !=0){
959 log_err("ERROR: utf8_appendCharSafeBody() failed for count=%d. \nExpected:", count);
960 printUChars(result[count], size);
962 printUChars(str, size);
/* Appends the entries of codePoints[] to buffer, first with U8_APPEND_UNSAFE
 * (skipping values below 0 or above 0x10ffff) and then with U8_APPEND, and
 * compares the bytes produced with expectUnsafe[] and expectSafe[].
 * For U8_APPEND the isError output is expected to be set exactly for
 * negative values, values above 0x10ffff, and surrogates; any disagreement
 * is accumulated in wrongIsError.
 * NOTE(review): the declarations of buffer/length/c and the resets between
 * the two passes are not visible in this excerpt. */
973 static void TestAppend() {
974 static const UChar32 codePoints
[]={
975 0x61, 0xdf, 0x901, 0x3040,
976 0xac00, 0xd800, 0xdbff, 0xdcde,
977 0xdffd, 0xe000, 0xffff, 0x10000,
978 0x12345, 0xe0021, 0x10ffff, 0x110000,
979 0x234567, 0x7fffffff, -1, -1000,
982 static const uint8_t expectUnsafe
[]={
983 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
984 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
985 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
986 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
987 /* none from this line */
990 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
991 0xea, 0xb0, 0x80, /* no surrogates */
992 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
993 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
994 /* none from this line */
1001 UBool isError
, expectIsError
, wrongIsError
;
1004 for(i
=0; i
<UPRV_LENGTHOF(codePoints
); ++i
) {
1006 if(c
<0 || 0x10ffff<c
) {
1007 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1010 U8_APPEND_UNSAFE(buffer
, length
, c
);
1012 if(length
!=UPRV_LENGTHOF(expectUnsafe
) || 0!=memcmp(buffer
, expectUnsafe
, length
)) {
1013 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1018 for(i
=0; i
<UPRV_LENGTHOF(codePoints
); ++i
) {
/* U8_APPEND is expected to flag out-of-range values and surrogates. */
1020 expectIsError
= c
<0 || 0x10ffff<c
|| U_IS_SURROGATE(c
);
1023 U8_APPEND(buffer
, length
, UPRV_LENGTHOF(buffer
), c
, isError
);
1024 wrongIsError
|= isError
!=expectIsError
;
1027 log_err("U8_APPEND did not set isError correctly\n");
1029 if(length
!=UPRV_LENGTHOF(expectSafe
) || 0!=memcmp(buffer
, expectSafe
, length
)) {
1030 log_err("U8_APPEND did not generate the expected output\n");
/* NOTE(review): the enclosing function's signature line is not visible in
 * this excerpt; presumably this is the body of TestSurrogates().
 * b[] holds UTF-8 byte sequences and cp[] the code points named in the
 * trailing comments. Each sequence is read forward with U8_NEXT_UNSAFE,
 * U8_NEXT and L8_NEXT, then backward with U8_PREV_UNSAFE, U8_PREV and
 * L8_PREV. The strict readers must return a negative value where the
 * unsafe reader returns a surrogate and the same value otherwise, and the
 * three readers must reach the same index. */
1036 static const uint8_t b
[]={
1037 0xc3, 0x9f, /* 00DF */
1038 0xed, 0x9f, 0xbf, /* D7FF */
1039 0xed, 0xa0, 0x81, /* D801 */
1040 0xed, 0xbf, 0xbe, /* DFFE */
1041 0xee, 0x80, 0x80, /* E000 */
1042 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1044 static const UChar32 cp
[]={
1045 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1049 int32_t i
, j
, k
, iu
, is
, il
, length
;
1051 k
=0; /* index into cp[] */
1052 length
=UPRV_LENGTHOF(b
);
1053 for(i
=0; i
<length
;) {
1055 U8_NEXT_UNSAFE(b
, j
, cu
);
1059 U8_NEXT(b
, j
, length
, cs
);
1063 L8_NEXT(b
, j
, length
, cl
);
1067 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cu
, (long)cp
[k
]);
1070 /* U8_NEXT() returns <0 for surrogate code points */
1071 if(U_IS_SURROGATE(cu
) ? cs
>=0 : cs
!=cu
) {
1072 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cs
, (long)cu
);
1075 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1077 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cl
, (long)cu
);
1080 if(is
!=iu
|| il
!=iu
) {
1081 log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1084 ++k
; /* next code point */
1085 i
=iu
; /* advance by one UTF-8 sequence */
1089 --k
; /* previous code point */
1092 U8_PREV_UNSAFE(b
, j
, cu
);
1096 U8_PREV(b
, 0, j
, cs
);
1100 L8_PREV(b
, 0, j
, cl
);
1104 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cu
, (long)cp
[k
]);
1107 /* U8_PREV() returns <0 for surrogate code points */
1108 if(U_IS_SURROGATE(cu
) ? cs
>=0 : cs
!=cu
) {
1109 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cs
, (long)cu
);
1112 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1114 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i
, (long)cl
, (long)cu
);
1117 if(is
!=iu
|| il
!=iu
) {
1118 log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i
, (long)i
);
1121 i
=iu
; /* go back by one UTF-8 sequence */
/* Dumps len bytes starting at uchars through log_err, each formatted as
 * 0x%02x followed by a space.
 * uchars: first byte to print; len: number of bytes to print. */
1125 static void printUChars(const uint8_t *uchars
, int16_t len
){
1127 for(i
=0; i
<len
; i
++){
1128 log_err("0x%02x ", *(uchars
+i
));