icuSources/test/cintltst/utf8tst.c

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1998-2014, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /*
   9 * File utf8tst.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date          Name        Description
  14 *   07/24/2000    Madhu       Creation
  15 *******************************************************************************
  16 */
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/utf8.h"
  20 #include "unicode/utf_old.h"
  21 #include "cmemory.h"
  22 #include "cintltst.h"
  23
  24 /* lenient UTF-8 ------------------------------------------------------------ */
  25
  26 /*
  27  * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate
  28  * code points with their "natural" encoding.
  29  * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of
  30  * single surrogates.
  31  *
  32  * This is not conformant with UTF-8.
  33  *
  34  * Supplementary code points may be encoded as pairs of 3-byte sequences, but
  35  * the macros below do not attempt to assemble such pairs.
  36  */
  37
  38 #define L8_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
  39     (c)=(uint8_t)(s)[(i)++]; \
  40     if((c)>=0x80) { \
  41         if(U8_IS_LEAD(c)) { \
  42             (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -2); \
  43         } else { \
  44             (c)=U_SENTINEL; \
  45         } \
  46     } \
  47 } UPRV_BLOCK_MACRO_END
  48
  49 #define L8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
  50     (c)=(uint8_t)(s)[--(i)]; \
  51     if((c)>=0x80) { \
  52         if((c)<=0xbf) { \
  53             (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -2); \
  54         } else { \
  55             (c)=U_SENTINEL; \
  56         } \
  57     } \
  58 } UPRV_BLOCK_MACRO_END
  59
  60 /* -------------------------------------------------------------------------- */
  61
  62 // Obsolete macros from obsolete unicode/utf_old.h, for some old test data.
  63 #ifndef UTF8_ERROR_VALUE_1
  64 #   define UTF8_ERROR_VALUE_1 0x15
  65 #endif
  66 #ifndef UTF8_ERROR_VALUE_2
  67 #   define UTF8_ERROR_VALUE_2 0x9f
  68 #endif
  69 #ifndef UTF_ERROR_VALUE
  70 #   define UTF_ERROR_VALUE 0xffff
  71 #endif
  72 #ifndef UTF_IS_ERROR
  73 #   define UTF_IS_ERROR(c) \
  74         (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
  75 #endif
  76
  77 #if !U_HIDE_OBSOLETE_UTF_OLD_H
  78 static void printUChars(const uint8_t *uchars, int16_t len){
  79     int16_t i=0;
  80     for(i=0; i<len; i++){
  81         log_err("0x%02x ", *(uchars+i));
  82     }
  83 }
  84 #endif
  85
  86 static void TestCodeUnitValues(void);
  87 static void TestCharLength(void);
  88 static void TestGetChar(void);
  89 static void TestNextPrevChar(void);
  90 static void TestNulTerminated(void);
  91 static void TestNextPrevNonCharacters(void);
  92 static void TestNextPrevCharUnsafe(void);
  93 static void TestFwdBack(void);
  94 static void TestFwdBackUnsafe(void);
  95 static void TestSetChar(void);
  96 static void TestSetCharUnsafe(void);
  97 static void TestTruncateIfIncomplete(void);
  98 static void TestAppendChar(void);
  99 static void TestAppend(void);
 100 static void TestSurrogates(void);
 101
 102 void addUTF8Test(TestNode** root);
 103
 104 void
 105 addUTF8Test(TestNode** root)
 106 {
 107     addTest(root, &TestCodeUnitValues,          "utf8tst/TestCodeUnitValues");
 108     addTest(root, &TestCharLength,              "utf8tst/TestCharLength");
 109     addTest(root, &TestGetChar,                 "utf8tst/TestGetChar");
 110     addTest(root, &TestNextPrevChar,            "utf8tst/TestNextPrevChar");
 111     addTest(root, &TestNulTerminated,           "utf8tst/TestNulTerminated");
 112     addTest(root, &TestNextPrevNonCharacters,   "utf8tst/TestNextPrevNonCharacters");
 113     addTest(root, &TestNextPrevCharUnsafe,      "utf8tst/TestNextPrevCharUnsafe");
 114     addTest(root, &TestFwdBack,                 "utf8tst/TestFwdBack");
 115     addTest(root, &TestFwdBackUnsafe,           "utf8tst/TestFwdBackUnsafe");
 116     addTest(root, &TestSetChar,                 "utf8tst/TestSetChar");
 117     addTest(root, &TestSetCharUnsafe,           "utf8tst/TestSetCharUnsafe");
 118     addTest(root, &TestTruncateIfIncomplete,    "utf8tst/TestTruncateIfIncomplete");
 119     addTest(root, &TestAppendChar,              "utf8tst/TestAppendChar");
 120     addTest(root, &TestAppend,                  "utf8tst/TestAppend");
 121     addTest(root, &TestSurrogates,              "utf8tst/TestSurrogates");
 122 }
 123
 124 static void TestCodeUnitValues()
 125 {
 126     static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};
 127
 128     int16_t i;
 129     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
 130         uint8_t c=codeunit[i];
 131         log_verbose("Testing code unit value of %x\n", c);
 132         if(i<4){
 133             if(
 134 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 135                     !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||
 136 #endif
 137                     !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {
 138                 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
 139                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
 140             }
 141         } else if(i< 8){
 142             if(
 143 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 144                     !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||
 145 #endif
 146                     !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {
 147                 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
 148                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
 149             }
 150         } else if(i< 12){
 151             if(
 152 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 153                     !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||
 154 #endif
 155                     !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
 156                 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
 157                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');
 158             }
 159         }
 160     }
 161 }
 162
 163 static void TestCharLength()
 164 {
 165     static const uint32_t codepoint[]={
 166         1, 0x0061,
 167         1, 0x007f,
 168         2, 0x016f,
 169         2, 0x07ff,
 170         3, 0x0865,
 171         3, 0x20ac,
 172         4, 0x20402,
 173         4, 0x23456,
 174         4, 0x24506,
 175         4, 0x20402,
 176         4, 0x10402,
 177         3, 0xd7ff,
 178         3, 0xe000,
 179
 180     };
 181
 182     int16_t i;
 183 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 184     UBool multiple;
 185 #endif
 186     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
 187         UChar32 c=codepoint[i+1];
 188         if(
 189 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 190                 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||
 191 #endif
 192                 U8_LENGTH(c) != (uint16_t)codepoint[i]) {
 193             log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));
 194         }else{
 195               log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));
 196         }
 197 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 198         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
 199         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){
 200               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);
 201         }
 202 #endif
 203     }
 204 }
 205
 206 static void TestGetChar()
 207 {
 208     static const uint8_t input[]={
 209     /*  code unit,*/
 210         0x61,
 211         0x7f,
 212         0xe4,
 213         0xba,
 214         0x8c,
 215         0xF0,
 216         0x90,
 217         0x90,
 218         0x81,
 219         0xc0,
 220         0x65,
 221         0x31,
 222         0x9a,
 223         0xc9
 224     };
 225     static const UChar32 result[]={
 226     /*  codepoint-unsafe, codepoint-safe(not strict)  codepoint-safe(strict) */
 227         0x61,             0x61,                       0x61,
 228         0x7f,             0x7f,                       0x7f,
 229         0x4e8c,           0x4e8c,                     0x4e8c,
 230         0x4e8c,           0x4e8c,                     0x4e8c ,
 231         0x4e8c,           0x4e8c,                     0x4e8c,
 232         0x10401,          0x10401,                    0x10401 ,
 233         0x10401,          0x10401,                    0x10401 ,
 234         0x10401,          0x10401,                    0x10401 ,
 235         0x10401,          0x10401,                    0x10401,
 236         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
 237         0x65,             0x65,                       0x65,
 238         0x31,             0x31,                       0x31,
 239         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
 240         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
 241     };
 242     uint16_t i=0;
 243     UChar32 c, expected;
 244     uint32_t offset=0;
 245
 246     for(offset=0; offset<sizeof(input); offset++) {
 247         expected = result[i];
 248         if (expected >= 0 && offset < sizeof(input) - 1) {
 249 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 250             UTF8_GET_CHAR_UNSAFE(input, offset, c);
 251             if(c != expected) {
 252                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 253                         offset, expected, c);
 254
 255             }
 256 #endif
 257             U8_GET_UNSAFE(input, offset, c);
 258             if(c != expected) {
 259                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 260                         offset, expected, c);
 261
 262             }
 263         }
 264         expected=result[i+1];
 265 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 266         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
 267         if(c != expected){
 268             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 269         }
 270 #endif
 271         U8_GET(input, 0, offset, sizeof(input), c);
 272         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
 273         if(c != expected){
 274             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 275         }
 276
 277         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);
 278         if(expected<0) { expected=0xfffd; }
 279         if(c != expected){
 280             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 281         }
 282 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 283         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
 284         if(c != result[i+2]){
 285             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
 286         }
 287 #endif
 288         i=(uint16_t)(i+3);
 289     }
 290 }
 291
 292 static void TestNextPrevChar() {
 293     static const uint8_t input[]={
 294         0x61,
 295         0xf0, 0x90, 0x90, 0x81,
 296         0xc0, 0x80,  // non-shortest form
 297         0xf3, 0xbe,  // truncated
 298         0xc2,  // truncated
 299         0x61,
 300         0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence
 301         0x00
 302     };
 303     static const UChar32 result[]={
 304     /*  next_safe_ns        next_safe_s          prev_safe_ns        prev_safe_s */
 305         0x0061,             0x0061,              0x0000,             0x0000,
 306         0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 307         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 308         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 309         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 310         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,
 311         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 312         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
 313         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 314         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 315         0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 316         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,
 317         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
 318         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
 319         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
 320         0x0000,             0x0000,              0x0061,             0x0061
 321     };
 322     static const int32_t movedOffset[]={
 323     /*  next_safe    prev_safe_s */
 324         1,           15,
 325         5,           14,
 326         3,           13,
 327         4,           12,
 328         5,           11,
 329         6,           10,
 330         7,           9,
 331         9,           7,
 332         9,           7,
 333         10,          6,
 334         11,          5,
 335         12,          1,
 336         13,          1,
 337         14,          1,
 338         15,          1,
 339         16,          0,
 340     };
 341
 342     UChar32 c, expected;
 343     uint32_t i=0, j=0;
 344     uint32_t offset=0;
 345     int32_t setOffset=0;
 346     for(offset=0; offset<sizeof(input); offset++){
 347         expected=result[i];  // next_safe_ns
 348 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 349         setOffset=offset;
 350         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
 351         if(setOffset != movedOffset[j]) {
 352             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 353                 offset, movedOffset[j], setOffset);
 354         }
 355         if(c != expected) {
 356             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 357         }
 358 #endif
 359         setOffset=offset;
 360         U8_NEXT(input, setOffset, sizeof(input), c);
 361         if(setOffset != movedOffset[j]) {
 362             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 363                 offset, movedOffset[j], setOffset);
 364         }
 365         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
 366         if(c != expected) {
 367             log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 368         }
 369
 370         setOffset=offset;
 371         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);
 372         if(setOffset != movedOffset[j]) {
 373             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 374                 offset, movedOffset[j], setOffset);
 375         }
 376         if(expected<0) { expected=0xfffd; }
 377         if(c != expected) {
 378             log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 379         }
 380 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 381         setOffset=offset;
 382         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
 383         if(setOffset != movedOffset[j]) {
 384             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 385                 offset, movedOffset[j], setOffset);
 386         }
 387         expected=result[i+1];  // next_safe_s
 388         if(c != expected) {
 389             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
 390                     offset, expected, c);
 391         }
 392 #endif
 393         i=i+4;
 394         j=j+2;
 395     }
 396
 397     i=j=0;
 398     for(offset=sizeof(input); offset > 0; --offset){
 399         expected=result[i+2];  // prev_safe_ns
 400 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 401         setOffset=offset;
 402         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
 403         if(setOffset != movedOffset[j+1]) {
 404             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 405                 offset, movedOffset[j+1], setOffset);
 406         }
 407         if(c != expected) {
 408             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 409         }
 410 #endif
 411         setOffset=offset;
 412         U8_PREV(input, 0, setOffset, c);
 413         if(setOffset != movedOffset[j+1]) {
 414             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 415                 offset, movedOffset[j+1], setOffset);
 416         }
 417         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }
 418         if(c != expected) {
 419             log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 420         }
 421
 422         setOffset=offset;
 423         U8_PREV_OR_FFFD(input, 0, setOffset, c);
 424         if(setOffset != movedOffset[j+1]) {
 425             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 426                 offset, movedOffset[j+1], setOffset);
 427         }
 428         if(expected<0) { expected=0xfffd; }
 429         if(c != expected) {
 430             log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);
 431         }
 432 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 433         setOffset=offset;
 434         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
 435         if(setOffset != movedOffset[j+1]) {
 436             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
 437                 offset, movedOffset[j+1], setOffset);
 438         }
 439         expected=result[i+3];  // prev_safe_s
 440         if(c != expected) {
 441             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",
 442                     offset, expected, c);
 443         }
 444 #endif
 445         i=i+4;
 446         j=j+2;
 447     }
 448 }
 449
 450 /* keep this in sync with utf16tst.c's TestNulTerminated() */
 451 static void TestNulTerminated() {
 452     static const uint8_t input[]={
 453         /*  0 */  0x61,
 454         /*  1 */  0xf0, 0x90, 0x90, 0x81,
 455         /*  5 */  0xc0,
 456         /*  6 */  0x80,
 457         /*  7 */  0xdf, 0x80,
 458         /*  9 */  0xc2,
 459         /* 10 */  0x62,
 460         /* 11 */  0xfd,
 461         /* 12 */  0xbe,
 462         /* 13 */  0xe0, 0xa0, 0x80,
 463         /* 16 */  0xe2, 0x82, 0xac,
 464         /* 19 */  0xf0, 0x90, 0x90,
 465         /* 22 */  0x00
 466         /* 23 */
 467     };
 468     static const UChar32 result[]={
 469         0x61,
 470         0x10401,
 471         U_SENTINEL,  // C0 not a lead byte
 472         U_SENTINEL,  // 80
 473         0x7c0,
 474         U_SENTINEL,  // C2
 475         0x62,
 476         U_SENTINEL,  // FD not a lead byte
 477         U_SENTINEL,  // BE
 478         0x800,
 479         0x20ac,
 480         U_SENTINEL,  // truncated F0 90 90
 481         0
 482     };
 483
 484     UChar32 c, c2, expected;
 485     int32_t i0, i=0, j, k, expectedIndex;
 486     int32_t cpIndex=0;
 487     do {
 488         i0=i;
 489         U8_NEXT(input, i, -1, c);
 490         expected=result[cpIndex];
 491         if(c!=expected) {
 492             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
 493         }
 494         j=i0;
 495         U8_NEXT_OR_FFFD(input, j, -1, c);
 496         if(expected<0) { expected=0xfffd; }
 497         if(c!=expected) {
 498             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
 499         }
 500         if(j!=i) {
 501             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
 502         }
 503         j=i0;
 504         U8_FWD_1(input, j, -1);
 505         if(j!=i) {
 506             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
 507         }
 508         ++cpIndex;
 509         /*
 510          * Move by this many code points from the start.
 511          * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
 512          */
 513         expectedIndex= (c==0) ? i-1 : i;
 514         k=0;
 515         U8_FWD_N(input, k, -1, cpIndex);
 516         if(k!=expectedIndex) {
 517             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
 518         }
 519     } while(c!=0);
 520
 521     i=0;
 522     do {
 523         j=i0=i;
 524         U8_NEXT(input, i, -1, c);
 525         do {
 526             U8_GET(input, 0, j, -1, c2);
 527             if(c2!=c) {
 528                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
 529             }
 530             U8_GET_OR_FFFD(input, 0, j, -1, c2);
 531             expected= (c>=0) ? c : 0xfffd;
 532             if(c2!=expected) {
 533                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
 534             }
 535             /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
 536             k=j+1;
 537             U8_SET_CP_LIMIT(input, 0, k, -1);
 538             if(k!=i) {
 539                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
 540             }
 541         } while(++j<i);
 542     } while(c!=0);
 543 }
 544
 545 static void TestNextPrevNonCharacters() {
 546     /* test non-characters */
 547     static const uint8_t nonChars[]={
 548         0xef, 0xb7, 0x90,       /* U+fdd0 */
 549         0xef, 0xbf, 0xbf,       /* U+feff */
 550         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
 551         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
 552         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
 553     };
 554
 555     UChar32 ch;
 556     int32_t idx;
 557
 558     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
 559         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
 560         if(!U_IS_UNICODE_NONCHAR(ch)) {
 561             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
 562         }
 563     }
 564     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
 565         U8_PREV(nonChars, 0, idx, ch);
 566         if(!U_IS_UNICODE_NONCHAR(ch)) {
 567             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
 568         }
 569     }
 570 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 571     for(idx=0; idx<(int32_t)sizeof(nonChars);) {
 572         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
 573         UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);
 574         if(ch!=expected) {
 575             log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);
 576         }
 577     }
 578     for(idx=(int32_t)sizeof(nonChars); idx>0;) {
 579         UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);
 580         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;
 581         if(ch!=expected) {
 582             log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);
 583         }
 584     }
 585 #endif
 586 }
 587
 588 static void TestNextPrevCharUnsafe() {
 589     /*
 590      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
 591      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
 592      */
 593     static const uint8_t input[]={
 594         0x61,
 595         0xf0, 0x90, 0x90, 0x81,
 596         0xc0, 0x80,  /* non-shortest form */
 597         0xe2, 0x82, 0xac,
 598         0xc2, 0xa1,
 599         0xf4, 0x8f, 0xbf, 0xbf,
 600         0x00
 601     };
 602     static const UChar32 codePoints[]={
 603         0x61,
 604         0x10401,
 605         -1,
 606         0x20ac,
 607         0xa1,
 608         0x10ffff,
 609         0
 610     };
 611
 612     UChar32 c, expected;
 613     int32_t i;
 614     uint32_t offset;
 615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 616     for(i=0, offset=0; offset<sizeof(input); ++i) {
 617         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
 618         expected = codePoints[i];
 619         if(expected >= 0 && c != expected) {
 620             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 621                     offset, expected, c);
 622         }
 623         if(offset==6) {
 624             // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes
 625             // while the new one skips C0 80 together.
 626             ++offset;
 627         }
 628     }
 629 #endif
 630     for(i=0, offset=0; offset<sizeof(input); ++i) {
 631         U8_NEXT_UNSAFE(input, offset, c);
 632         expected = codePoints[i];
 633         if(expected >= 0 && c != expected) {
 634             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 635                     offset, expected, c);
 636         }
 637     }
 638 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 639     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
 640         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
 641         expected = codePoints[i];
 642         if(expected >= 0 && c != expected) {
 643             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 644                     offset, expected, c);
 645         }
 646     }
 647 #endif
 648     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
 649         U8_PREV_UNSAFE(input, offset, c);
 650         expected = codePoints[i];
 651         if(expected >= 0 && c != expected) {
 652             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
 653                     offset, expected, c);
 654         }
 655     }
 656 }
 657
 658 static void TestFwdBack() {
 659     static const uint8_t input[]={
 660         0x61,
 661         0xF0, 0x90, 0x90, 0x81,
 662         0xff,
 663         0x62,
 664         0xc0,
 665         0x80,
 666         0x7f,
 667         0x8f,
 668         0xc0,
 669         0x63,
 670         0x81,
 671         0x90,
 672         0x90,
 673         0xF0,
 674         0x00
 675     };
 676     static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
 677     static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};
 678
 679     static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
 680     static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
 681     static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};
 682
 683     uint32_t offsafe=0;
 684
 685     uint32_t i=0;
 686 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 687     while(offsafe < sizeof(input)){
 688         UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
 689         if(offsafe != fwd_safe[i]){
 690             log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
 691         }
 692         i++;
 693     }
 694 #endif
 695     offsafe=0;
 696     i=0;
 697     while(offsafe < sizeof(input)){
 698         U8_FWD_1(input, offsafe, sizeof(input));
 699         if(offsafe != fwd_safe[i]){
 700             log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
 701         }
 702         i++;
 703     }
 704 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 705     i=0;
 706     offsafe=sizeof(input);
 707     while(offsafe > 0){
 708         UTF8_BACK_1_SAFE(input, 0,  offsafe);
 709         if(offsafe != back_safe[i]){
 710             log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
 711         }
 712         i++;
 713     }
 714 #endif
 715     i=0;
 716     offsafe=sizeof(input);
 717     while(offsafe > 0){
 718         U8_BACK_1(input, 0,  offsafe);
 719         if(offsafe != back_safe[i]){
 720             log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
 721         }
 722         i++;
 723     }
 724 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 725     offsafe=0;
 726     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
 727         UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
 728         if(offsafe != fwd_N_safe[i]){
 729             log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
 730         }
 731
 732     }
 733 #endif
 734     offsafe=0;
 735     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
 736         U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
 737         if(offsafe != fwd_N_safe[i]){
 738             log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
 739         }
 740
 741     }
 742 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 743     offsafe=sizeof(input);
 744     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
 745         UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
 746         if(offsafe != back_N_safe[i]){
 747             log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
 748         }
 749     }
 750 #endif
 751     offsafe=sizeof(input);
 752     for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
 753         U8_BACK_N(input, 0, offsafe, Nvalue[i]);
 754         if(offsafe != back_N_safe[i]){
 755             log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
 756         }
 757     }
 758 }
 759
 760 /**
 761 * Ticket #13636 - Visual Studio 2017 has problems optimizing this function.
 762 * As a workaround, we will turn off optimization just for this function on VS2017 and above.
 763 */
 764 #if defined(_MSC_VER) && (_MSC_VER > 1900)
 765 #pragma optimize( "", off )
 766 #endif
 767
 768 static void TestFwdBackUnsafe() {
 769     /*
 770      * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
 771      * The behavior of _UNSAFE macros for ill-formed strings is undefined.
 772      */
 773     static const uint8_t input[]={
 774         0x61,
 775         0xf0, 0x90, 0x90, 0x81,
 776         0xc0, 0x80,  /* non-shortest form */
 777         0xe2, 0x82, 0xac,
 778         0xc2, 0xa1,
 779         0xf4, 0x8f, 0xbf, 0xbf,
 780         0x00
 781     };
 782     // forward unsafe skips only C0
 783     static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
 784     // backward unsafe skips C0 80 together
 785     static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };
 786
 787     int32_t offset;
 788     int32_t i;
 789 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 790     for(i=1, offset=0; offset<UPRV_LENGTHOF(input); ++i) {
 791         UTF8_FWD_1_UNSAFE(input, offset);
 792         if(offset != boundaries[i]){
 793             log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
 794         }
 795     }
 796 #endif
 797     for(i=1, offset=0; offset<UPRV_LENGTHOF(input); ++i) {
 798         U8_FWD_1_UNSAFE(input, offset);
 799         if(offset != boundaries[i]){
 800             log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
 801         }
 802     }
 803 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 804     for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
 805         UTF8_BACK_1_UNSAFE(input, offset);
 806         if(offset != backBoundaries[i]){
 807             log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
 808         }
 809     }
 810 #endif
 811     for(i=UPRV_LENGTHOF(backBoundaries)-2, offset=UPRV_LENGTHOF(input); offset>0; --i) {
 812         U8_BACK_1_UNSAFE(input, offset);
 813         if(offset != backBoundaries[i]){
 814             log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[i], offset);
 815         }
 816     }
 817 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 818     for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
 819         offset=0;
 820         UTF8_FWD_N_UNSAFE(input, offset, i);
 821         if(offset != boundaries[i]) {
 822             log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
 823         }
 824     }
 825 #endif
 826     for(i=0; i<UPRV_LENGTHOF(boundaries); ++i) {
 827         offset=0;
 828         U8_FWD_N_UNSAFE(input, offset, i);
 829         if(offset != boundaries[i]) {
 830             log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
 831         }
 832     }
 833 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 834     for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
 835         int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
 836         offset=UPRV_LENGTHOF(input);
 837         UTF8_BACK_N_UNSAFE(input, offset, i);
 838         if(offset != backBoundaries[j]) {
 839             log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
 840         }
 841     }
 842 #endif
 843     for(i=0; i<UPRV_LENGTHOF(backBoundaries); ++i) {
 844         int32_t j=UPRV_LENGTHOF(backBoundaries)-1-i;
 845         offset=UPRV_LENGTHOF(input);
 846         U8_BACK_N_UNSAFE(input, offset, i);
 847         if(offset != backBoundaries[j]) {
 848             log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[j], offset);
 849         }
 850     }
 851 }
 852
 853 /**
 854 * Ticket #13636 - Turn optimization back on.
 855 */
 856 #if defined(_MSC_VER) && (_MSC_VER > 1900)
 857 #pragma optimize( "", on )
 858 #endif
 859
 860 static void TestSetChar() {
 861     static const uint8_t input[]
 862         = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
 863     static const int16_t start_safe[]
 864         = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
 865     static const int16_t limit_safe[]
 866         = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
 867
 868     uint32_t i=0;
 869     int32_t offset=0, setOffset=0;
 870     for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
 871         if (offset<UPRV_LENGTHOF(input)){
 872 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 873             setOffset=offset;
 874             UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
 875             if(setOffset != start_safe[i]){
 876                 log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
 877             }
 878 #endif
 879             setOffset=offset;
 880             U8_SET_CP_START(input, 0, setOffset);
 881             if(setOffset != start_safe[i]){
 882                 log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
 883             }
 884         }
 885 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 886         setOffset=offset;
 887         UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, (int32_t)sizeof(input));
 888         if(setOffset != limit_safe[i]){
 889             log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
 890         }
 891 #endif
 892         setOffset=offset;
 893         U8_SET_CP_LIMIT(input,0, setOffset, (int32_t)sizeof(input));
 894         if(setOffset != limit_safe[i]){
 895             log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
 896         }
 897
 898         i++;
 899     }
 900 }
 901
 902 static void TestSetCharUnsafe() {
 903     static const uint8_t input[]
 904         = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
 905     static const int16_t start_unsafe[]
 906         = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
 907     static const int16_t limit_unsafe[]
 908         = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };
 909
 910     uint32_t i=0;
 911     int32_t offset=0, setOffset=0;
 912     for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
 913         if (offset<UPRV_LENGTHOF(input)){
 914 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 915             setOffset=offset;
 916             UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
 917             if(setOffset != start_unsafe[i]){
 918                 log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
 919             }
 920 #endif
 921             setOffset=offset;
 922             U8_SET_CP_START_UNSAFE(input, setOffset);
 923             if(setOffset != start_unsafe[i]){
 924                 log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
 925             }
 926         }
 927
 928         if (offset != 0) { /* Can't have it go off the end of the array */
 929 #if !U_HIDE_OBSOLETE_UTF_OLD_H
 930             setOffset=offset;
 931             UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
 932             if(setOffset != limit_unsafe[i]){
 933                 log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
 934             }
 935 #endif
 936             setOffset=offset;
 937             U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
 938             if(setOffset != limit_unsafe[i]){
 939                 log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
 940             }
 941         }
 942
 943         i++;
 944     }
 945 }
 946
 947 static void TestTruncateIfIncomplete() {
 948     // Difference from U8_SET_CP_START():
 949     // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
 950     // Therefore, if the last byte is a lead byte, then this macro truncates
 951     // even if the byte at the input index cannot continue a valid sequence
 952     // (including when that is not a trail byte).
 953     // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
 954     static const struct {
 955         const char *s;
 956         int32_t expected;
 957     } cases[] = {
 958         { "", 0 },
 959         { "a", 1 },
 960         { "\x80", 1 },
 961         { "\xC1", 1 },
 962         { "\xC2", 0 },
 963         { "\xE0", 0 },
 964         { "\xF4", 0 },
 965         { "\xF5", 1 },
 966         { "\x80\x80", 2 },
 967         { "\xC2\xA0", 2 },
 968         { "\xE0\x9F", 2 },
 969         { "\xE0\xA0", 0 },
 970         { "\xED\x9F", 0 },
 971         { "\xED\xA0", 2 },
 972         { "\xF0\x8F", 2 },
 973         { "\xF0\x90", 0 },
 974         { "\xF4\x8F", 0 },
 975         { "\xF4\x90", 2 },
 976         { "\xF5\x80", 2 },
 977         { "\x80\x80\x80", 3 },
 978         { "\xC2\xA0\x80", 3 },
 979         { "\xE0\xA0\x80", 3 },
 980         { "\xF0\x8F\x80", 3 },
 981         { "\xF0\x90\x80", 0 },
 982         { "\xF4\x8F\x80", 0 },
 983         { "\xF4\x90\x80", 3 },
 984         { "\xF5\x80\x80", 3 },
 985         { "\x80\x80\x80\x80", 4 },
 986         { "\xC2\xA0\x80\x80", 4 },
 987         { "\xE0\xA0\x80\x80", 4 },
 988         { "\xF0\x90\x80\x80", 4 },
 989         { "\xF5\x80\x80\x80", 4 }
 990     };
 991     int32_t i;
 992     for (i = 0; i < UPRV_LENGTHOF(cases); ++i) {
 993         const char *s = cases[i].s;
 994         int32_t expected = cases[i].expected;
 995         int32_t length = (int32_t)strlen(s);
 996         int32_t adjusted = length;
 997         U8_TRUNCATE_IF_INCOMPLETE(s, 0, adjusted);
 998         if (adjusted != expected) {
 999             log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
1000                     (int)i, (int)length, (int)expected, (int)adjusted);
1001         }
1002     }
1003 }
1004
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* Template string; every case starts from a fresh copy of it. */
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    /* Flat list of (append position, code point) pairs: unsafe cases first, then safe cases. */
    static const uint32_t test[]={
    /*  append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    /*  append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    /* Offset expected after each append, one entry per pair in test[]. */
    static const uint16_t movedOffset[]={
    /* offset-moved-to(unsafe) */
          4,              /*for append-pos: 0 , CHAR 0x10401*/
          3,
          3,
          6,
          5,
          12,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    /* offset-moved-to(safe) */
          4,              /*for append-pos: 0, CHAR  0x10401*/
          3,
          4,
          6,
          5,
          11,
          7,
          7,
          7,
          8,
          8,
          8,
          9,

    };

    /* Buffer contents (first 11 bytes) expected after each append, one row per pair in test[]. */
    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2, stored as the 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };

    /* The first 13 pairs of test[] go through the unsafe macro, the rest through the safe one. */
    enum { UNSAFE_CASE_COUNT=13 };

    const int32_t caseCount=UPRV_LENGTHOF(test)/2;
    uint16_t size=UPRV_LENGTHOF(s);
    uint8_t work[12];   /* one byte larger than s: the unsafe case at position 8 writes 4 bytes */
    uint32_t pos;
    int32_t caseIndex;

    /* Unsafe appends: no capacity is passed to the macro. */
    for(caseIndex=0; caseIndex<UNSAFE_CASE_COUNT && caseIndex<caseCount; ++caseIndex){
        const uint32_t codePoint=test[2*caseIndex+1];

        uprv_memcpy(work, s, size);
        pos=test[2*caseIndex];
        UTF8_APPEND_CHAR_UNSAFE(work, pos, codePoint);
        if(pos != movedOffset[caseIndex]){
            log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                caseIndex, movedOffset[caseIndex], pos);
        }
        if(uprv_memcmp(work, result[caseIndex], size) !=0){
            log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", caseIndex);
            printUChars(result[caseIndex], size);
            log_err("\nGot:      ");
            printUChars(work, size);
            log_err("\n");
        }
    }

    /*
     * Safe appends: the macro is given 'size' as the buffer capacity.
     * Only the macro is exercised; utf8_appendCharSafeBody() is not called
     * directly by this test.
     */
    for(; caseIndex<caseCount; ++caseIndex){
        const uint32_t codePoint=test[2*caseIndex+1];

        uprv_memcpy(work, s, size);
        pos=test[2*caseIndex];
        UTF8_APPEND_CHAR_SAFE(work, pos, size, codePoint);
        if(pos != movedOffset[caseIndex]){
            log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                caseIndex, movedOffset[caseIndex], pos);
        }
        if(uprv_memcmp(work, result[caseIndex], size) !=0){
            log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", caseIndex);
            printUChars(result[caseIndex], size);
            log_err("\nGot:     ");
            printUChars(work, size);
            log_err("\n");
        }
    }
#endif
}
1173
1174 static void TestAppend() {
1175     static const UChar32 codePoints[]={
1176         0x61, 0xdf, 0x901, 0x3040,
1177         0xac00, 0xd800, 0xdbff, 0xdcde,
1178         0xdffd, 0xe000, 0xffff, 0x10000,
1179         0x12345, 0xe0021, 0x10ffff, 0x110000,
1180         0x234567, 0x7fffffff, -1, -1000,
1181         0, 0x400
1182     };
1183     static const uint8_t expectUnsafe[]={
1184         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1185         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
1186         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1187         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1188         /* none from this line */
1189         0,  0xd0, 0x80
1190     }, expectSafe[]={
1191         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
1192         0xea, 0xb0, 0x80,  /* no surrogates */
1193         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
1194         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
1195         /* none from this line */
1196         0,  0xd0, 0x80
1197     };
1198
1199     uint8_t buffer[100];
1200     UChar32 c;
1201     int32_t i, length;
1202     UBool isError, expectIsError, wrongIsError;
1203
1204     length=0;
1205     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1206         c=codePoints[i];
1207         if(c<0 || 0x10ffff<c) {
1208             continue; /* skip non-code points for U8_APPEND_UNSAFE */
1209         }
1210
1211         U8_APPEND_UNSAFE(buffer, length, c);
1212     }
1213     if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
1214         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
1215     }
1216
1217     length=0;
1218     wrongIsError=FALSE;
1219     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {
1220         c=codePoints[i];
1221         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
1222         isError=FALSE;
1223
1224         U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);
1225         wrongIsError|= isError!=expectIsError;
1226     }
1227     if(wrongIsError) {
1228         log_err("U8_APPEND did not set isError correctly\n");
1229     }
1230     if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
1231         log_err("U8_APPEND did not generate the expected output\n");
1232     }
1233 }
1234
1235 static void
1236 TestSurrogates() {
1237     static const uint8_t b[]={
1238         0xc3, 0x9f,             /*  00DF */
1239         0xed, 0x9f, 0xbf,       /*  D7FF */
1240         0xed, 0xa0, 0x81,       /*  D801 */
1241         0xed, 0xbf, 0xbe,       /*  DFFE */
1242         0xee, 0x80, 0x80,       /*  E000 */
1243         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
1244     };
1245     static const UChar32 cp[]={
1246         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
1247     };
1248
1249     UChar32 cu, cs, cl;
1250     int32_t i, j, k, iu, is, il, length;
1251
1252     k=0; /* index into cp[] */
1253     length=UPRV_LENGTHOF(b);
1254     for(i=0; i<length;) {
1255         j=i;
1256         U8_NEXT_UNSAFE(b, j, cu);
1257         iu=j;
1258
1259         j=i;
1260         U8_NEXT(b, j, length, cs);
1261         is=j;
1262
1263         j=i;
1264         L8_NEXT(b, j, length, cl);
1265         il=j;
1266
1267         if(cu!=cp[k]) {
1268             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1269         }
1270
1271         /* U8_NEXT() returns <0 for surrogate code points */
1272         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1273             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1274         }
1275
1276         /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
1277         if(cl!=cu) {
1278             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1279         }
1280
1281         // U8_NEXT() skips only the first byte of a surrogate byte sequence.
1282         if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {
1283             log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1284         }
1285         if(il!=iu) {
1286             log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1287         }
1288
1289         ++k;    /* next code point */
1290         i=iu;   /* advance by one UTF-8 sequence */
1291     }
1292
1293     while(i>0) {
1294         --k; /* previous code point */
1295
1296         j=i;
1297         U8_PREV_UNSAFE(b, j, cu);
1298         iu=j;
1299
1300         j=i;
1301         U8_PREV(b, 0, j, cs);
1302         is=j;
1303
1304         j=i;
1305         L8_PREV(b, 0, j, cl);
1306         il=j;
1307
1308         if(cu!=cp[k]) {
1309             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);
1310         }
1311
1312         /* U8_PREV() returns <0 for surrogate code points */
1313         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
1314             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);
1315         }
1316
1317         /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
1318         if(cl!=cu) {
1319             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);
1320         }
1321
1322         // U8_PREV() skips only the last byte of a surrogate byte sequence.
1323         if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {
1324             log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1325         }
1326         if(il !=iu) {
1327             log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);
1328         }
1329
1330         i=iu;   /* go back by one UTF-8 sequence */
1331     }
1332 }