]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/cintltst/utf8tst.c
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / test / cintltst / utf8tst.c
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f 3/********************************************************************
51004dcb 4 * COPYRIGHT:
b331163b 5 * Copyright (c) 1998-2014, International Business Machines Corporation and
b75a7d8f
A
6 * others. All Rights Reserved.
7 ********************************************************************/
8/*
51004dcb 9* File utf8tst.c
b75a7d8f
A
10*
11* Modification History:
12*
13* Date Name Description
51004dcb 14* 07/24/2000 Madhu Creation
b75a7d8f
A
15*******************************************************************************
16*/
17
18#include "unicode/utypes.h"
19#include "unicode/utf8.h"
0f5d89e8 20#include "unicode/utf_old.h"
b75a7d8f
A
21#include "cmemory.h"
22#include "cintltst.h"
23
73c04bcf
A
24/* lenient UTF-8 ------------------------------------------------------------ */
25
26/*
27 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
28 * code points with their "natural" encoding.
29 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
30 * single surrogates.
31 *
32 * This is not conformant with UTF-8.
33 *
34 * Supplementary code points may be encoded as pairs of 3-byte sequences, but
35 * the macros below do not attempt to assemble such pairs.
36 */
37
/*
 * L8_NEXT(s, i, length, c): lenient forward read of one code point.
 *
 * Reads the byte at s[i] into c and post-increments i.
 * - A byte below 0x80 is returned as is.
 * - A lead byte (per U8_IS_LEAD) is handed to utf8_nextCharSafeBody() together
 *   with &i and length; the mode argument -2 is presumably the "lenient"
 *   mode described in the comment above -- confirm against utf_impl.
 * - Any other byte yields U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 * All pointer/value arguments are parenthesized so that expression arguments
 * such as "p+1" bind correctly inside the cast.
 */
#define L8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
48
/*
 * L8_PREV(s, start, i, c): lenient backward read of one code point.
 *
 * Pre-decrements i and reads the byte at s[i] into c.
 * - A byte below 0x80 is returned as is.
 * - A byte in 0x80..0xbf is handed to utf8_prevCharSafeBody() together with
 *   start and &i; the mode argument -2 is presumably the "lenient" mode
 *   described in the comment above -- confirm against utf_impl.
 * - Any byte above 0xbf yields U_SENTINEL.
 *
 * s, i and c are evaluated more than once; i and c must be modifiable lvalues.
 * All pointer/value arguments are parenthesized so that expression arguments
 * bind correctly inside the cast.
 */
#define L8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
59
60/* -------------------------------------------------------------------------- */
61
0f5d89e8
A
/*
 * Fallbacks for obsolete macros from the obsolete unicode/utf_old.h.
 * Some old test data below still refers to these error values; each macro is
 * only defined here when the header did not already provide it.
 */
#if !defined(UTF8_ERROR_VALUE_1)
#   define UTF8_ERROR_VALUE_1 0x15
#endif

#if !defined(UTF8_ERROR_VALUE_2)
#   define UTF8_ERROR_VALUE_2 0x9f
#endif

#if !defined(UTF_ERROR_VALUE)
#   define UTF_ERROR_VALUE 0xffff
#endif

/* True for xxfffe/xxffff values (low 16 bits) and for the two UTF-8 error values. */
#if !defined(UTF_IS_ERROR)
#   define UTF_IS_ERROR(c) \
        ( ((c)&0xfffe)==0xfffe || \
          (c)==UTF8_ERROR_VALUE_1 || \
          (c)==UTF8_ERROR_VALUE_2 )
#endif
76
#if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Logs (via log_err) the first len bytes of uchars, each as "0xNN ".
 * Nothing is logged when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    int16_t pos;
    for(pos=0; pos<len; ++pos){
        log_err("0x%02x ", uchars[pos]);
    }
}
#endif
b75a7d8f
A
85
86static void TestCodeUnitValues(void);
87static void TestCharLength(void);
88static void TestGetChar(void);
89static void TestNextPrevChar(void);
51004dcb
A
90static void TestNulTerminated(void);
91static void TestNextPrevNonCharacters(void);
92static void TestNextPrevCharUnsafe(void);
b75a7d8f 93static void TestFwdBack(void);
51004dcb 94static void TestFwdBackUnsafe(void);
b75a7d8f 95static void TestSetChar(void);
51004dcb 96static void TestSetCharUnsafe(void);
0f5d89e8 97static void TestTruncateIfIncomplete(void);
b75a7d8f
A
98static void TestAppendChar(void);
99static void TestAppend(void);
73c04bcf 100static void TestSurrogates(void);
b75a7d8f
A
101
102void addUTF8Test(TestNode** root);
103
104void
105addUTF8Test(TestNode** root)
106{
51004dcb
A
107 addTest(root, &TestCodeUnitValues, "utf8tst/TestCodeUnitValues");
108 addTest(root, &TestCharLength, "utf8tst/TestCharLength");
109 addTest(root, &TestGetChar, "utf8tst/TestGetChar");
110 addTest(root, &TestNextPrevChar, "utf8tst/TestNextPrevChar");
111 addTest(root, &TestNulTerminated, "utf8tst/TestNulTerminated");
112 addTest(root, &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters");
113 addTest(root, &TestNextPrevCharUnsafe, "utf8tst/TestNextPrevCharUnsafe");
114 addTest(root, &TestFwdBack, "utf8tst/TestFwdBack");
115 addTest(root, &TestFwdBackUnsafe, "utf8tst/TestFwdBackUnsafe");
116 addTest(root, &TestSetChar, "utf8tst/TestSetChar");
117 addTest(root, &TestSetCharUnsafe, "utf8tst/TestSetCharUnsafe");
0f5d89e8 118 addTest(root, &TestTruncateIfIncomplete, "utf8tst/TestTruncateIfIncomplete");
51004dcb
A
119 addTest(root, &TestAppendChar, "utf8tst/TestAppendChar");
120 addTest(root, &TestAppend, "utf8tst/TestAppend");
121 addTest(root, &TestSurrogates, "utf8tst/TestSurrogates");
b75a7d8f
A
122}
123
124static void TestCodeUnitValues()
125{
0f5d89e8 126 static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
51004dcb 127
b75a7d8f 128 int16_t i;
b331163b 129 for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
b75a7d8f
A
130 uint8_t c=codeunit[i];
131 log_verbose("Testing code unit value of %x\n", c);
132 if(i<4){
0f5d89e8
A
133 if(
134#if !U_HIDE_OBSOLETE_UTF_OLD_H
135 !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||
136#endif
137 !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {
b75a7d8f 138 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
0f5d89e8 139 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
b75a7d8f
A
140 }
141 } else if(i< 8){
0f5d89e8
A
142 if(
143#if !U_HIDE_OBSOLETE_UTF_OLD_H
144 !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||
145#endif
146 !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {
b75a7d8f 147 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
0f5d89e8 148 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
b75a7d8f
A
149 }
150 } else if(i< 12){
0f5d89e8
A
151 if(
152#if !U_HIDE_OBSOLETE_UTF_OLD_H
153 !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||
154#endif
155 !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
b75a7d8f 156 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
0f5d89e8 157 c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
b75a7d8f
A
158 }
159 }
160 }
161}
162
163static void TestCharLength()
164{
165 static const uint32_t codepoint[]={
166 1, 0x0061,
167 1, 0x007f,
168 2, 0x016f,
169 2, 0x07ff,
170 3, 0x0865,
171 3, 0x20ac,
172 4, 0x20402,
173 4, 0x23456,
174 4, 0x24506,
175 4, 0x20402,
176 4, 0x10402,
177 3, 0xd7ff,
178 3, 0xe000,
51004dcb 179
b75a7d8f 180 };
51004dcb 181
b75a7d8f 182 int16_t i;
0f5d89e8 183#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f 184 UBool multiple;
0f5d89e8 185#endif
b331163b 186 for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
b75a7d8f 187 UChar32 c=codepoint[i+1];
0f5d89e8
A
188 if(
189#if !U_HIDE_OBSOLETE_UTF_OLD_H
190 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
191#endif
192 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
193 log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
b75a7d8f 194 }else{
0f5d89e8 195 log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
b75a7d8f 196 }
0f5d89e8 197#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f
A
198 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
199 if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
200 log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
201 }
0f5d89e8 202#endif
b75a7d8f
A
203 }
204}
205
206static void TestGetChar()
207{
208 static const uint8_t input[]={
209 /* code unit,*/
210 0x61,
211 0x7f,
212 0xe4,
51004dcb 213 0xba,
b75a7d8f 214 0x8c,
51004dcb
A
215 0xF0,
216 0x90,
217 0x90,
b75a7d8f
A
218 0x81,
219 0xc0,
220 0x65,
221 0x31,
222 0x9a,
223 0xc9
224 };
225 static const UChar32 result[]={
51004dcb
A
226 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */
227 0x61, 0x61, 0x61,
228 0x7f, 0x7f, 0x7f,
b75a7d8f
A
229 0x4e8c, 0x4e8c, 0x4e8c,
230 0x4e8c, 0x4e8c, 0x4e8c ,
231 0x4e8c, 0x4e8c, 0x4e8c,
232 0x10401, 0x10401, 0x10401 ,
233 0x10401, 0x10401, 0x10401 ,
234 0x10401, 0x10401, 0x10401 ,
235 0x10401, 0x10401, 0x10401,
0f5d89e8 236 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
51004dcb
A
237 0x65, 0x65, 0x65,
238 0x31, 0x31, 0x31,
0f5d89e8
A
239 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
240 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1
b75a7d8f
A
241 };
242 uint16_t i=0;
51004dcb 243 UChar32 c, expected;
b75a7d8f
A
244 uint32_t offset=0;
245
246 for(offset=0; offset<sizeof(input); offset++) {
0f5d89e8
A
247 expected = result[i];
248 if (expected >= 0 && offset < sizeof(input) - 1) {
249#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f 250 UTF8_GET_CHAR_UNSAFE(input, offset, c);
0f5d89e8
A
251 if(c != expected) {
252 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
253 offset, expected, c);
51004dcb 254
b75a7d8f 255 }
0f5d89e8 256#endif
b75a7d8f 257 U8_GET_UNSAFE(input, offset, c);
0f5d89e8
A
258 if(c != expected) {
259 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
260 offset, expected, c);
51004dcb 261
b75a7d8f
A
262 }
263 }
51004dcb 264 expected=result[i+1];
0f5d89e8
A
265#if !U_HIDE_OBSOLETE_UTF_OLD_H
266 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
51004dcb
A
267 if(c != expected){
268 log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
269 }
0f5d89e8 270#endif
b75a7d8f 271 U8_GET(input, 0, offset, sizeof(input), c);
51004dcb
A
272 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
273 if(c != expected){
274 log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
b75a7d8f
A
275 }
276
51004dcb
A
277 U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
278 if(expected<0) { expected=0xfffd; }
279 if(c != expected){
280 log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
b75a7d8f 281 }
0f5d89e8 282#if !U_HIDE_OBSOLETE_UTF_OLD_H
b75a7d8f
A
283 UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
284 if(c != result[i+2]){
285 log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
286 }
0f5d89e8 287#endif
51004dcb 288 i=(uint16_t)(i+3);
b75a7d8f
A
289 }
290}
291
51004dcb 292static void TestNextPrevChar() {
0f5d89e8
A
293 static const uint8_t input[]={
294 0x61,
295 0xf0, 0x90, 0x90, 0x81,
296 0xc0, 0x80, // non-shortest form
297 0xf3, 0xbe, // truncated
298 0xc2, // truncated
299 0x61,
300 0x81, 0x90, 0x90, 0xf0, // "backwards" sequence
301 0x00
302 };
b75a7d8f 303 static const UChar32 result[]={
0f5d89e8
A
304 /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */
305 0x0061, 0x0061, 0x0000, 0x0000,
306 0x10401, 0x10401, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
307 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
308 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
309 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
310 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x61, 0x61,
311 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
312 UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
313 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
314 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
315 0x61, 0x61, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
316 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401,
317 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,
318 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
319 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
320 0x0000, 0x0000, 0x0061, 0x0061
b75a7d8f
A
321 };
322 static const int32_t movedOffset[]={
0f5d89e8
A
323 /* next_safe prev_safe_s */
324 1, 15,
325 5, 14,
326 3, 13,
327 4, 12,
328 5, 11,
329 6, 10,
330 7, 9,
331 9, 7,
332 9, 7,
333 10, 6,
334 11, 5,
335 12, 1,
336 13, 1,
337 14, 1,
338 15, 1,
339 16, 0,
b75a7d8f
A
340 };
341
51004dcb 342 UChar32 c, expected;
0f5d89e8 343 uint32_t i=0, j=0;
b75a7d8f
A
344 uint32_t offset=0;
345 int32_t setOffset=0;
346 for(offset=0; offset<sizeof(input); offset++){
0f5d89e8
A
347 expected=result[i]; // next_safe_ns
348#if !U_HIDE_OBSOLETE_UTF_OLD_H
349 setOffset=offset;
350 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
351 if(setOffset != movedOffset[j]) {
352 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
353 offset, movedOffset[j], setOffset);
354 }
355 if(c != expected) {
356 log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
357 }
358#endif
359 setOffset=offset;
360 U8_NEXT(input, setOffset, sizeof(input), c);
361 if(setOffset != movedOffset[j]) {
362 log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
363 offset, movedOffset[j], setOffset);
51004dcb 364 }
51004dcb 365 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
0f5d89e8
A
366 if(c != expected) {
367 log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
51004dcb
A
368 }
369
370 setOffset=offset;
371 U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
0f5d89e8 372 if(setOffset != movedOffset[j]) {
51004dcb 373 log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
0f5d89e8 374 offset, movedOffset[j], setOffset);
51004dcb
A
375 }
376 if(expected<0) { expected=0xfffd; }
0f5d89e8
A
377 if(c != expected) {
378 log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
51004dcb 379 }
0f5d89e8
A
380#if !U_HIDE_OBSOLETE_UTF_OLD_H
381 setOffset=offset;
382 UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
383 if(setOffset != movedOffset[j]) {
384 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
385 offset, movedOffset[j], setOffset);
386 }
387 expected=result[i+1]; // next_safe_s
388 if(c != expected) {
389 log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
390 offset, expected, c);
391 }
392#endif
393 i=i+4;
394 j=j+2;
b75a7d8f
A
395 }
396
0f5d89e8 397 i=j=0;
b75a7d8f 398 for(offset=sizeof(input); offset > 0; --offset){
0f5d89e8
A
399 expected=result[i+2]; // prev_safe_ns
400#if !U_HIDE_OBSOLETE_UTF_OLD_H
401 setOffset=offset;
402 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
403 if(setOffset != movedOffset[j+1]) {
404 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
405 offset, movedOffset[j+1], setOffset);
406 }
407 if(c != expected) {
408 log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
409 }
410#endif
411 setOffset=offset;
412 U8_PREV(input, 0, setOffset, c);
413 if(setOffset != movedOffset[j+1]) {
414 log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
415 offset, movedOffset[j+1], setOffset);
51004dcb 416 }
51004dcb 417 if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
0f5d89e8
A
418 if(c != expected) {
419 log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
51004dcb
A
420 }
421
422 setOffset=offset;
423 U8_PREV_OR_FFFD(input, 0, setOffset, c);
0f5d89e8 424 if(setOffset != movedOffset[j+1]) {
51004dcb 425 log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
0f5d89e8 426 offset, movedOffset[j+1], setOffset);
51004dcb
A
427 }
428 if(expected<0) { expected=0xfffd; }
0f5d89e8
A
429 if(c != expected) {
430 log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
51004dcb 431 }
0f5d89e8
A
432#if !U_HIDE_OBSOLETE_UTF_OLD_H
433 setOffset=offset;
434 UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
435 if(setOffset != movedOffset[j+1]) {
436 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
437 offset, movedOffset[j+1], setOffset);
438 }
439 expected=result[i+3]; // prev_safe_s
440 if(c != expected) {
441 log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
442 offset, expected, c);
443 }
444#endif
445 i=i+4;
446 j=j+2;
b75a7d8f 447 }
51004dcb 448}
b75a7d8f 449
51004dcb
A
450/* keep this in sync with utf16tst.c's TestNulTerminated() */
451static void TestNulTerminated() {
452 static const uint8_t input[]={
453 /* 0 */ 0x61,
454 /* 1 */ 0xf0, 0x90, 0x90, 0x81,
0f5d89e8
A
455 /* 5 */ 0xc0,
456 /* 6 */ 0x80,
51004dcb
A
457 /* 7 */ 0xdf, 0x80,
458 /* 9 */ 0xc2,
459 /* 10 */ 0x62,
0f5d89e8
A
460 /* 11 */ 0xfd,
461 /* 12 */ 0xbe,
51004dcb
A
462 /* 13 */ 0xe0, 0xa0, 0x80,
463 /* 16 */ 0xe2, 0x82, 0xac,
464 /* 19 */ 0xf0, 0x90, 0x90,
465 /* 22 */ 0x00
466 /* 23 */
467 };
468 static const UChar32 result[]={
469 0x61,
470 0x10401,
0f5d89e8
A
471 U_SENTINEL, // C0 not a lead byte
472 U_SENTINEL, // 80
51004dcb 473 0x7c0,
0f5d89e8 474 U_SENTINEL, // C2
51004dcb 475 0x62,
0f5d89e8
A
476 U_SENTINEL, // FD not a lead byte
477 U_SENTINEL, // BE
51004dcb
A
478 0x800,
479 0x20ac,
0f5d89e8 480 U_SENTINEL, // truncated F0 90 90
51004dcb
A
481 0
482 };
483
484 UChar32 c, c2, expected;
485 int32_t i0, i=0, j, k, expectedIndex;
486 int32_t cpIndex=0;
487 do {
488 i0=i;
489 U8_NEXT(input, i, -1, c);
490 expected=result[cpIndex];
491 if(c!=expected) {
492 log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
493 }
494 j=i0;
495 U8_NEXT_OR_FFFD(input, j, -1, c);
496 if(expected<0) { expected=0xfffd; }
497 if(c!=expected) {
498 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
499 }
500 if(j!=i) {
501 log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
b75a7d8f 502 }
51004dcb
A
503 j=i0;
504 U8_FWD_1(input, j, -1);
505 if(j!=i) {
506 log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
507 }
508 ++cpIndex;
509 /*
510 * Move by this many code points from the start.
511 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
512 */
513 expectedIndex= (c==0) ? i-1 : i;
514 k=0;
515 U8_FWD_N(input, k, -1, cpIndex);
516 if(k!=expectedIndex) {
517 log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
518 }
519 } while(c!=0);
520
521 i=0;
522 do {
523 j=i0=i;
524 U8_NEXT(input, i, -1, c);
525 do {
526 U8_GET(input, 0, j, -1, c2);
527 if(c2!=c) {
528 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
529 }
530 U8_GET_OR_FFFD(input, 0, j, -1, c2);
531 expected= (c>=0) ? c : 0xfffd;
532 if(c2!=expected) {
533 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
534 }
535 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
536 k=j+1;
537 U8_SET_CP_LIMIT(input, 0, k, -1);
538 if(k!=i) {
539 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
b75a7d8f 540 }
51004dcb
A
541 } while(++j<i);
542 } while(c!=0);
543}
544
545static void TestNextPrevNonCharacters() {
546 /* test non-characters */
547 static const uint8_t nonChars[]={
548 0xef, 0xb7, 0x90, /* U+fdd0 */
549 0xef, 0xbf, 0xbf, /* U+feff */
550 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
551 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
552 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */
553 };
554
555 UChar32 ch;
556 int32_t idx;
557
558 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
559 U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
560 if(!U_IS_UNICODE_NONCHAR(ch)) {
561 log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
562 }
563 }
564 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
565 U8_PREV(nonChars, 0, idx, ch);
566 if(!U_IS_UNICODE_NONCHAR(ch)) {
567 log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
b75a7d8f
A
568 }
569 }
0f5d89e8
A
570#if !U_HIDE_OBSOLETE_UTF_OLD_H
571 for(idx=0; idx<(int32_t)sizeof(nonChars);) {
572 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
573 UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
574 if(ch!=expected) {
575 log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
576 }
577 }
578 for(idx=(int32_t)sizeof(nonChars); idx>0;) {
579 UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
580 UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
581 if(ch!=expected) {
582 log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
583 }
584 }
585#endif
b75a7d8f
A
586}
587
51004dcb
A
588static void TestNextPrevCharUnsafe() {
589 /*
590 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
591 * The behavior of _UNSAFE macros for ill-formed strings is undefined.
592 */
593 static const uint8_t input[]={
594 0x61,
595 0xf0, 0x90, 0x90, 0x81,
596 0xc0, 0x80, /* non-shortest form */
597 0xe2, 0x82, 0xac,
598 0xc2, 0xa1,
599 0xf4, 0x8f, 0xbf, 0xbf,
600 0x00
601 };
602 static const UChar32 codePoints[]={
603 0x61,
604 0x10401,
0f5d89e8 605 -1,
51004dcb
A
606 0x20ac,
607 0xa1,
608 0x10ffff,
609 0
610 };
611
0f5d89e8 612 UChar32 c, expected;
51004dcb
A
613 int32_t i;
614 uint32_t offset;
0f5d89e8 615#if !U_HIDE_OBSOLETE_UTF_OLD_H
51004dcb
A
616 for(i=0, offset=0; offset<sizeof(input); ++i) {
617 UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
0f5d89e8
A
618 expected = codePoints[i];
619 if(expected >= 0 && c != expected) {
51004dcb 620 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
0f5d89e8
A
621 offset, expected, c);
622 }
623 if(offset==6) {
624 // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
625 // while the new one skips C0 80 together.
626 ++offset;
51004dcb
A
627 }
628 }
0f5d89e8 629#endif
51004dcb
A
630 for(i=0, offset=0; offset<sizeof(input); ++i) {
631 U8_NEXT_UNSAFE(input, offset, c);
0f5d89e8
A
632 expected = codePoints[i];
633 if(expected >= 0 && c != expected) {
51004dcb 634 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
0f5d89e8 635 offset, expected, c);
51004dcb
A
636 }
637 }
0f5d89e8 638#if !U_HIDE_OBSOLETE_UTF_OLD_H
b331163b 639 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
0f5d89e8
A
640 UTF8_PREV_CHAR_UNSAFE(input, offset, c);
641 expected = codePoints[i];
642 if(expected >= 0 && c != expected) {
643 log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
644 offset, expected, c);
645 }
51004dcb 646 }
0f5d89e8 647#endif
b331163b 648 for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
0f5d89e8
A
649 U8_PREV_UNSAFE(input, offset, c);
650 expected = codePoints[i];
651 if(expected >= 0 && c != expected) {
652 log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
653 offset, expected, c);
654 }
51004dcb
A
655 }
656}
657
/*
 * Checks the safe skipping macros U8_FWD_1, U8_BACK_1, U8_FWD_N and U8_BACK_N
 * (and, unless hidden, the obsolete UTF8_*_SAFE counterparts) on a string
 * that mixes well-formed and ill-formed sequences.
 *
 * fwd_safe/back_safe list the expected offset after each single step;
 * fwd_N_safe/back_N_safe list the expected offset after skipping Nvalue[i]
 * code points, cumulatively (the offset is not reset between steps).
 *
 * The two backward-N messages print the 32-bit offset with %d; the former
 * %ld conversion did not match the 32-bit argument.
 */
static void TestFwdBack() {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
}
759
0f5d89e8
A
/**
* Ticket #13636 - Visual Studio 2017 has problems optimizing this function.
* As a workaround, we will turn off optimization just for this function on VS2017 and above.
*/
#if defined(_MSC_VER) && (_MSC_VER > 1900)
#pragma optimize( "", off )
#endif

/*
 * Checks the unsafe skipping macros U8_FWD_1_UNSAFE, U8_BACK_1_UNSAFE,
 * U8_FWD_N_UNSAFE and U8_BACK_N_UNSAFE (and, unless hidden, the obsolete
 * UTF8_*_UNSAFE counterparts) against tables of expected boundaries.
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // forward unsafe skips only C0
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // backward unsafe skips C0 80 together
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;
    int32_t step;

    /* Single steps forward: after each step, pos must equal the next boundary. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }
#endif
    step=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[step]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
        ++step;
    }

    /* Single steps backward, starting from the end of the input. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    step=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }
#endif
    step=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[step]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[step], pos);
        }
        --step;
    }

    /* N steps forward, always restarting from offset 0. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<UPRV_LENGTHOF(boundaries); ++step) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }
#endif
    for(step=0; step<UPRV_LENGTHOF(boundaries); ++step) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, step);
        if(pos != boundaries[step]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[step], pos);
        }
    }

    /* N steps backward, always restarting from the end of the input. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(step=0; step<UPRV_LENGTHOF(backBoundaries); ++step) {
        int32_t want=UPRV_LENGTHOF(backBoundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[want]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[want], pos);
        }
    }
#endif
    for(step=0; step<UPRV_LENGTHOF(backBoundaries); ++step) {
        int32_t want=UPRV_LENGTHOF(backBoundaries)-1-step;
        pos=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, step);
        if(pos != backBoundaries[want]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[want], pos);
        }
    }
}

/**
* Ticket #13636 - Turn optimization back on.
*/
#if defined(_MSC_VER) && (_MSC_VER > 1900)
#pragma optimize( "", on )
#endif
859
/*
 * Checks U8_SET_CP_START and U8_SET_CP_LIMIT (and, unless hidden, the
 * obsolete UTF8_SET_CHAR_START_SAFE and UTF8_SET_CHAR_LIMIT_SAFE) at every
 * offset of input, including the offset one past the last byte for the
 * "limit" macros.
 *
 * The log messages use %d for the 32-bit/16-bit integer arguments; the
 * former %ld conversions did not match them.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    static const int16_t limit_safe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };

    uint32_t i=0;
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* The "start" macros are only exercised at offsets inside the array. */
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%d. Expected:%d Got:%d\n", offset, start_safe[i], setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%d. Expected:%d Got:%d\n", offset, start_safe[i], setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%d. Expected:%d Got:%d\n", offset, limit_safe[i], setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%d. Expected:%d Got:%d\n", offset, limit_safe[i], setOffset);
        }

        i++;
    }
}
b75a7d8f 901
51004dcb
A
/*
 * Checks U8_SET_CP_START_UNSAFE and U8_SET_CP_LIMIT_UNSAFE (and, unless
 * hidden, the obsolete UTF8_SET_CHAR_START_UNSAFE and
 * UTF8_SET_CHAR_LIMIT_UNSAFE) at the offsets of input.
 *
 * The log messages use %d for the 32-bit/16-bit integer arguments; the
 * former %ld conversions did not match them.
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0, 1, 1, 1, 4, 5, 6, 7, 8, 9, 9, 9, 12, 12, 12, 15 };
    static const int16_t limit_unsafe[]
        = {0, 1, 4, 4, 4, 5, 6, 7, 9, 9, 10, 10, 10, 15, 15, 15, 16 };

    uint32_t i=0;
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        /* The "start" macros are only exercised at offsets inside the array. */
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%d. Expected:%d Got:%d\n", offset, start_unsafe[i], setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[i]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%d. Expected:%d Got:%d\n", offset, start_unsafe[i], setOffset);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%d. Expected:%d Got:%d\n", offset, limit_unsafe[i], setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[i]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%d. Expected:%d Got:%d\n", offset, limit_unsafe[i], setOffset);
            }
        }

        i++;
    }
}
946
0f5d89e8
A
/*
 * Runs U8_TRUNCATE_IF_INCOMPLETE() over a table of short NUL-terminated byte
 * strings and compares the adjusted length with the expected one.
 */
static void TestTruncateIfIncomplete() {
    // How this differs from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() never inspects s[length].
    // So when the final byte is a lead byte, the macro truncates even though
    // the byte at the input index could not continue a valid sequence
    // (which includes the case where it is not a trail byte at all).
    // When the final byte is a trail byte, both macros give the same result.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t caseIndex = 0;
    while (caseIndex < UPRV_LENGTHOF(cases)) {
        const char *bytes = cases[caseIndex].s;
        const int32_t fullLength = (int32_t)strlen(bytes);
        int32_t truncated = fullLength;  // the macro adjusts this in place
        U8_TRUNCATE_IF_INCOMPLETE(bytes, 0, truncated);
        if (truncated != cases[caseIndex].expected) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)caseIndex, (int)fullLength, (int)cases[caseIndex].expected, (int)truncated);
        }
        ++caseIndex;
    }
}
1004
/*
 * Exercises the obsolete UTF8_APPEND_CHAR_UNSAFE and UTF8_APPEND_CHAR_SAFE
 * macros (the whole body is compiled out when utf_old.h is hidden).
 *
 * Each test case copies the template string s into a scratch buffer, appends
 * one code point at a given position, and then checks both the resulting
 * offset (movedOffset[]) and the resulting bytes (result[]). The first half
 * of every table belongs to the unsafe macro, the second half to the safe
 * macro. Mismatches are reported through log_err().
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
        /* append-position(unsafe), CHAR to be appended */
        0, 0x10401,
        2, 0x0028,
        2, 0x007f,
        3, 0xd801,
        1, 0x20402,
        8, 0x10401,
        5, 0xc0,
        5, 0xc1,
        5, 0xfd,
        6, 0x80,
        6, 0x81,
        6, 0xbf,
        7, 0xfe,

        /* append-position(safe), CHAR to be appended */
        0, 0x10401,
        2, 0x0028,
        3, 0x7f,
        3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
        1, 0x20402,
        9, 0x10401,
        5, 0xc0,
        5, 0xc1,
        5, 0xfd,
        6, 0x80,
        6, 0x81,
        6, 0xbf,
        7, 0xfe,

    };
    static const uint16_t movedOffset[]={
        /* offset-moved-to(unsafe) */
        4, /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

        /* offset-moved-to(safe) */
        4, /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    /* The tables list the unsafe cases first; they make up exactly half. */
    const uint16_t unsafeCount=(uint16_t)(UPRV_LENGTHOF(movedOffset)/2);
    uint16_t i, count=0;
    /*
     * One byte larger than s: the unsafe append of a 4-byte sequence at
     * position 8 writes through index 11.
     */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);

    /* test[] holds (position, code point) pairs, so step by two. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<unsafeCount){
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                /* %d requires int arguments, hence the explicit casts. */
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                        (int)count, (int)movedOffset[count], (int)offset);
            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", (int)count);
                printUChars(result[count], size);
                log_err("\nGot: ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d currentOffset=%d\n",
                        (int)count, (int)movedOffset[count], (int)offset);
            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", (int)count);
                printUChars(result[count], size);
                log_err("\nGot: ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
#endif
}
1173
1174static void TestAppend() {
1175 static const UChar32 codePoints[]={
1176 0x61, 0xdf, 0x901, 0x3040,
1177 0xac00, 0xd800, 0xdbff, 0xdcde,
1178 0xdffd, 0xe000, 0xffff, 0x10000,
1179 0x12345, 0xe0021, 0x10ffff, 0x110000,
1180 0x234567, 0x7fffffff, -1, -1000,
1181 0, 0x400
1182 };
1183 static const uint8_t expectUnsafe[]={
1184 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1185 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
1186 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1187 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1188 /* none from this line */
1189 0, 0xd0, 0x80
1190 }, expectSafe[]={
1191 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
1192 0xea, 0xb0, 0x80, /* no surrogates */
1193 /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
1194 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
1195 /* none from this line */
1196 0, 0xd0, 0x80
1197 };
1198
1199 uint8_t buffer[100];
1200 UChar32 c;
1201 int32_t i, length;
1202 UBool isError, expectIsError, wrongIsError;
1203
1204 length=0;
b331163b 1205 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
b75a7d8f
A
1206 c=codePoints[i];
1207 if(c<0 || 0x10ffff<c) {
1208 continue; /* skip non-code points for U8_APPEND_UNSAFE */
1209 }
1210
1211 U8_APPEND_UNSAFE(buffer, length, c);
1212 }
b331163b 1213 if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
b75a7d8f
A
1214 log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1215 }
1216
1217 length=0;
1218 wrongIsError=FALSE;
b331163b 1219 for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
b75a7d8f
A
1220 c=codePoints[i];
1221 expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1222 isError=FALSE;
1223
b331163b 1224 U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
b75a7d8f
A
1225 wrongIsError|= isError!=expectIsError;
1226 }
1227 if(wrongIsError) {
1228 log_err("U8_APPEND did not set isError correctly\n");
1229 }
b331163b 1230 if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
b75a7d8f
A
1231 log_err("U8_APPEND did not generate the expected output\n");
1232 }
1233}
1234
73c04bcf
A
1235static void
1236TestSurrogates() {
1237 static const uint8_t b[]={
1238 0xc3, 0x9f, /* 00DF */
1239 0xed, 0x9f, 0xbf, /* D7FF */
1240 0xed, 0xa0, 0x81, /* D801 */
1241 0xed, 0xbf, 0xbe, /* DFFE */
1242 0xee, 0x80, 0x80, /* E000 */
1243 0xf0, 0x97, 0xbf, 0xbe /* 17FFE */
1244 };
1245 static const UChar32 cp[]={
1246 0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1247 };
1248
1249 UChar32 cu, cs, cl;
1250 int32_t i, j, k, iu, is, il, length;
1251
1252 k=0; /* index into cp[] */
b331163b 1253 length=UPRV_LENGTHOF(b);
73c04bcf
A
1254 for(i=0; i<length;) {
1255 j=i;
1256 U8_NEXT_UNSAFE(b, j, cu);
1257 iu=j;
1258
1259 j=i;
1260 U8_NEXT(b, j, length, cs);
1261 is=j;
1262
1263 j=i;
1264 L8_NEXT(b, j, length, cl);
1265 il=j;
1266
1267 if(cu!=cp[k]) {
1268 log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1269 }
1270
1271 /* U8_NEXT() returns <0 for surrogate code points */
1272 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1273 log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1274 }
1275
1276 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1277 if(cl!=cu) {
1278 log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1279 }
1280
0f5d89e8
A
1281 // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1282 if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1283 log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1284 }
1285 if(il!=iu) {
1286 log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
73c04bcf
A
1287 }
1288
1289 ++k; /* next code point */
1290 i=iu; /* advance by one UTF-8 sequence */
1291 }
1292
1293 while(i>0) {
1294 --k; /* previous code point */
1295
1296 j=i;
1297 U8_PREV_UNSAFE(b, j, cu);
1298 iu=j;
1299
1300 j=i;
1301 U8_PREV(b, 0, j, cs);
1302 is=j;
1303
1304 j=i;
1305 L8_PREV(b, 0, j, cl);
1306 il=j;
1307
1308 if(cu!=cp[k]) {
1309 log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1310 }
1311
1312 /* U8_PREV() returns <0 for surrogate code points */
1313 if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1314 log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1315 }
1316
1317 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1318 if(cl!=cu) {
1319 log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1320 }
1321
0f5d89e8
A
1322 // U8_PREV() skips only the last byte of a surrogate byte sequence.
1323 if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1324 log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1325 }
1326 if(il !=iu) {
1327 log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
73c04bcf
A
1328 }
1329
1330 i=iu; /* go back by one UTF-8 sequence */
1331 }
1332}