git.saurik.com Git - apple/icu.git/blob - icuSources/test/cintltst/utf8tst.c

2 // License & terms of use: http://www.unicode.org/copyright.html

3 /********************************************************************

4 * COPYRIGHT:

7 ********************************************************************/

8 /*

9 * File utf8tst.c

10 *

11 * Modification History:

12 *

13 * Date Name Description

14 * 07/24/2000 Madhu Creation

15 *******************************************************************************

16 */

18 #include "unicode/utypes.h"

19 #include "unicode/utf8.h"

20 #include "unicode/utf_old.h"

21 #include "cmemory.h"

22 #include "cintltst.h"

24 /* lenient UTF-8 ------------------------------------------------------------ */

26 /*

27 * Lenient UTF-8 differs from conformant UTF-8 in that it allows surrogate

28 * code points with their "natural" encoding.

29 * Effectively, this allows a mix of UTF-8 and CESU-8 as well as encodings of

30 * single surrogates.

31 *

32 * This is not conformant with UTF-8.

33 *

34 * Supplementary code points may be encoded as pairs of 3-byte sequences, but

35 * the macros below do not attempt to assemble such pairs.

36 */

/*
 * L8_NEXT(s, i, length, c): read one code point forward from s[i].
 *
 * Stores the byte at s[i] in c and post-increments i. Then:
 *  - a byte below 0x80 is left in c unchanged;
 *  - a byte accepted by U8_IS_LEAD() is passed to utf8_nextCharSafeBody()
 *    together with &i and length, with -2 as the final argument
 *    (NOTE(review): presumably the "lenient" mode -- confirm against the
 *    utf8_nextCharSafeBody() documentation);
 *  - any other byte sets c to U_SENTINEL.
 *
 * i must be a modifiable lvalue (it is incremented and its address is taken);
 * c must be an lvalue. Both are evaluated more than once.
 * Every argument is parenthesized at each use so that expression arguments
 * (for example a conditional expression for s) expand correctly.
 */
#define L8_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

/*
 * L8_PREV(s, start, i, c): read one code point backward, ending before s[i].
 *
 * Pre-decrements i and stores the byte at s[i] in c. Then:
 *  - a byte below 0x80 is left in c unchanged;
 *  - a byte in 0x80..0xbf is passed to utf8_prevCharSafeBody() together with
 *    start and &i, with -2 as the final argument
 *    (NOTE(review): presumably the "lenient" mode -- confirm against the
 *    utf8_prevCharSafeBody() documentation);
 *  - any byte above 0xbf sets c to U_SENTINEL.
 *
 * i must be a modifiable lvalue (it is decremented and its address is taken);
 * c must be an lvalue. Both are evaluated more than once.
 * Every argument is parenthesized at each use so that expression arguments
 * expand correctly.
 */
#define L8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), (start), &(i), (c), -2); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

60 /* -------------------------------------------------------------------------- */

/*
 * Fallback definitions of obsolete macros from the obsolete unicode/utf_old.h.
 * Some old test data in this file still refers to them, so each one is
 * defined here only when no earlier header has already provided it.
 */
#if !defined(UTF8_ERROR_VALUE_1)
#   define UTF8_ERROR_VALUE_1 0x15
#endif

#if !defined(UTF8_ERROR_VALUE_2)
#   define UTF8_ERROR_VALUE_2 0x9f
#endif

#if !defined(UTF_ERROR_VALUE)
#   define UTF_ERROR_VALUE 0xffff
#endif

/* True when the low 16 bits of c are fffe/ffff, or c equals one of the two UTF-8 error values. */
#if !defined(UTF_IS_ERROR)
#   define UTF_IS_ERROR(c) \
        (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
#endif

#if !U_HIDE_OBSOLETE_UTF_OLD_H
/*
 * Reports the first len bytes of uchars through log_err(), each one
 * formatted as "0xNN " (two hex digits followed by a space).
 * Nothing is reported when len is zero or negative.
 */
static void printUChars(const uint8_t *uchars, int16_t len){
    const uint8_t *cursor=uchars;
    int16_t remaining=len;

    while(remaining>0) {
        log_err("0x%02x ", *cursor);
        ++cursor;
        --remaining;
    }
}
#endif

 /* Forward declarations of the static test functions that addUTF8Test() registers. */
 static void TestCodeUnitValues(void);

 static void TestCharLength(void);

 static void TestGetChar(void);

 static void TestNextPrevChar(void);

 static void TestNulTerminated(void);

 static void TestNextPrevNonCharacters(void);

 static void TestNextPrevCharUnsafe(void);

 static void TestFwdBack(void);

 static void TestFwdBackUnsafe(void);

 static void TestSetChar(void);

 static void TestSetCharUnsafe(void);

 static void TestTruncateIfIncomplete(void);

 static void TestAppendChar(void);

 static void TestAppend(void);

 static void TestSurrogates(void);

101

 /* Adds every UTF-8 test of this file to the test tree at root, under "utf8tst/". */
void addUTF8Test(TestNode** root);

void
addUTF8Test(TestNode** root)
{
    /* Test function and registration path, in registration order. */
    static const struct {
        void (*run)(void);
        const char *path;
    } entries[]={
        { &TestCodeUnitValues,        "utf8tst/TestCodeUnitValues" },
        { &TestCharLength,            "utf8tst/TestCharLength" },
        { &TestGetChar,               "utf8tst/TestGetChar" },
        { &TestNextPrevChar,          "utf8tst/TestNextPrevChar" },
        { &TestNulTerminated,         "utf8tst/TestNulTerminated" },
        { &TestNextPrevNonCharacters, "utf8tst/TestNextPrevNonCharacters" },
        { &TestNextPrevCharUnsafe,    "utf8tst/TestNextPrevCharUnsafe" },
        { &TestFwdBack,               "utf8tst/TestFwdBack" },
        { &TestFwdBackUnsafe,         "utf8tst/TestFwdBackUnsafe" },
        { &TestSetChar,               "utf8tst/TestSetChar" },
        { &TestSetCharUnsafe,         "utf8tst/TestSetCharUnsafe" },
        { &TestTruncateIfIncomplete,  "utf8tst/TestTruncateIfIncomplete" },
        { &TestAppendChar,            "utf8tst/TestAppendChar" },
        { &TestAppend,                "utf8tst/TestAppend" },
        { &TestSurrogates,            "utf8tst/TestSurrogates" }
    };
    int32_t k;

    for(k=0; k<UPRV_LENGTHOF(entries); ++k) {
        addTest(root, entries[k].run, entries[k].path);
    }
}

123

124 static void TestCodeUnitValues()

125 {

     static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc2, 0xc4, 0xf0, 0xf4, 0x80, 0x81, 0xbc, 0xbe,};

127

128 int16_t i;

     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){

130 uint8_t c=codeunit[i];

         log_verbose("Testing code unit value of %x\n", c);

         if(i<4){

133 if(

134 #if !U_HIDE_OBSOLETE_UTF_OLD_H

                     !UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) ||

136 #endif

                     !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)) {

                 log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",

                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');

140 }

         } else if(i< 8){

142 if(

143 #if !U_HIDE_OBSOLETE_UTF_OLD_H

                     !UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) ||

145 #endif

                     !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)) {

                 log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",

                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');

149 }

         } else if(i< 12){

151 if(

152 #if !U_HIDE_OBSOLETE_UTF_OLD_H

                     !UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) ||

154 #endif

                     !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){

                 log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",

                     c, U8_IS_SINGLE(c) ? 'y' : 'n', U8_IS_LEAD(c) ? 'y' : 'n', U8_IS_TRAIL(c) ? 'y' : 'n');

158 }

159 }

160 }

161 }

162

163 static void TestCharLength()

164 {

165 static const uint32_t codepoint[]={

166 1, 0x0061,

167 1, 0x007f,

168 2, 0x016f,

169 2, 0x07ff,

170 3, 0x0865,

171 3, 0x20ac,

172 4, 0x20402,

173 4, 0x23456,

174 4, 0x24506,

175 4, 0x20402,

176 4, 0x10402,

177 3, 0xd7ff,

178 3, 0xe000,

179

180 };

181

182 int16_t i;

183 #if !U_HIDE_OBSOLETE_UTF_OLD_H

184 UBool multiple;

185 #endif

     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){

         UChar32 c=codepoint[i+1];

188 if(

189 #if !U_HIDE_OBSOLETE_UTF_OLD_H

                 UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] ||

191 #endif

                 U8_LENGTH(c) != (uint16_t)codepoint[i]) {

             log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], U8_LENGTH(c));

194 }else{

               log_verbose("The no: of code units for %lx is %d\n",c, U8_LENGTH(c));

196 }

197 #if !U_HIDE_OBSOLETE_UTF_OLD_H

         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);

         if(UTF8_NEED_MULTIPLE_UCHAR(c) != multiple){

               log_err("ERROR: UTF8_NEED_MULTIPLE_UCHAR failed for %lx\n", c);

201 }

202 #endif

203 }

204 }

205

206 static void TestGetChar()

207 {

208 static const uint8_t input[]={

209 /* code unit,*/

210 0x61,

211 0x7f,

212 0xe4,

213 0xba,

214 0x8c,

215 0xF0,

216 0x90,

217 0x90,

218 0x81,

219 0xc0,

220 0x65,

221 0x31,

222 0x9a,

223 0xc9

224 };

225 static const UChar32 result[]={

226 /* codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict) */

         0x61,             0x61,                       0x61,

         0x7f,             0x7f,                       0x7f,

         0x4e8c,           0x4e8c,                     0x4e8c,

         0x4e8c,           0x4e8c,                     0x4e8c ,

         0x4e8c,           0x4e8c,                     0x4e8c,

         0x10401,          0x10401,                    0x10401 ,

         0x10401,          0x10401,                    0x10401 ,

         0x10401,          0x10401,                    0x10401 ,

         0x10401,          0x10401,                    0x10401,

         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,

         0x65,             0x65,                       0x65,

         0x31,             0x31,                       0x31,

         -1,               UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,

240 -1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1

241 };

242 uint16_t i=0;

243 UChar32 c, expected;

244 uint32_t offset=0;

245

     for(offset=0; offset<sizeof(input); offset++) {

247 expected = result[i];

         if (expected >= 0 && offset < sizeof(input) - 1) {

249 #if !U_HIDE_OBSOLETE_UTF_OLD_H

             UTF8_GET_CHAR_UNSAFE(input, offset, c);

251 if(c != expected) {

                 log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

253 offset, expected, c);

254

255 }

256 #endif

             U8_GET_UNSAFE(input, offset, c);

258 if(c != expected) {

                 log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

260 offset, expected, c);

261

262 }

263 }

         expected=result[i+1];

265 #if !U_HIDE_OBSOLETE_UTF_OLD_H

         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);

267 if(c != expected){

             log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

269 }

270 #endif

         U8_GET(input, 0, offset, sizeof(input), c);

         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }

273 if(c != expected){

             log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

275 }

276

         U8_GET_OR_FFFD(input, 0, offset, sizeof(input), c);

         if(expected<0) { expected=0xfffd; }

279 if(c != expected){

             log_err("ERROR: U8_GET_OR_FFFD failed for offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

281 }

282 #if !U_HIDE_OBSOLETE_UTF_OLD_H

         UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);

         if(c != result[i+2]){

             log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);

286 }

287 #endif

         i=(uint16_t)(i+3);

289 }

290 }

291

292 static void TestNextPrevChar() {

293 static const uint8_t input[]={

294 0x61,

         0xf0, 0x90, 0x90, 0x81,

         0xc0, 0x80,  // non-shortest form

         0xf3, 0xbe,  // truncated

298 0xc2, // truncated

299 0x61,

         0x81, 0x90, 0x90, 0xf0,  // "backwards" sequence

301 0x00

302 };

303 static const UChar32 result[]={

304 /* next_safe_ns next_safe_s prev_safe_ns prev_safe_s */

         0x0061,             0x0061,              0x0000,             0x0000,

         0x10401,            0x10401,             UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

307 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

308 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

309 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x61,               0x61,

311 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

312 UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,

313 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

314 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

         0x61,               0x61,                UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,            0x10401,

317 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF_ERROR_VALUE, UTF_ERROR_VALUE,

318 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,

319 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,

         0x0000,             0x0000,              0x0061,             0x0061

321 };

322 static const int32_t movedOffset[]={

323 /* next_safe prev_safe_s */

324 1, 15,

325 5, 14,

326 3, 13,

327 4, 12,

328 5, 11,

329 6, 10,

330 7, 9,

331 9, 7,

332 9, 7,

333 10, 6,

334 11, 5,

335 12, 1,

336 13, 1,

337 14, 1,

338 15, 1,

339 16, 0,

340 };

341

342 UChar32 c, expected;

     uint32_t i=0, j=0;

344 uint32_t offset=0;

345 int32_t setOffset=0;

     for(offset=0; offset<sizeof(input); offset++){

347 expected=result[i]; // next_safe_ns

348 #if !U_HIDE_OBSOLETE_UTF_OLD_H

349 setOffset=offset;

         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);

         if(setOffset != movedOffset[j]) {

             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

353 offset, movedOffset[j], setOffset);

354 }

355 if(c != expected) {

             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

357 }

358 #endif

359 setOffset=offset;

         U8_NEXT(input, setOffset, sizeof(input), c);

         if(setOffset != movedOffset[j]) {

             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

363 offset, movedOffset[j], setOffset);

364 }

         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }

366 if(c != expected) {

             log_err("ERROR: U8_NEXT failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

368 }

369

370 setOffset=offset;

         U8_NEXT_OR_FFFD(input, setOffset, sizeof(input), c);

         if(setOffset != movedOffset[j]) {

             log_err("ERROR: U8_NEXT_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

374 offset, movedOffset[j], setOffset);

375 }

         if(expected<0) { expected=0xfffd; }

377 if(c != expected) {

             log_err("ERROR: U8_NEXT_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

379 }

380 #if !U_HIDE_OBSOLETE_UTF_OLD_H

381 setOffset=offset;

         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);

         if(setOffset != movedOffset[j]) {

             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

385 offset, movedOffset[j], setOffset);

386 }

         expected=result[i+1];  // next_safe_s

388 if(c != expected) {

             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",

390 offset, expected, c);

391 }

392 #endif

393 i=i+4;

394 j=j+2;

395 }

396

397 i=j=0;

     for(offset=sizeof(input); offset > 0; --offset){

         expected=result[i+2];  // prev_safe_ns

400 #if !U_HIDE_OBSOLETE_UTF_OLD_H

401 setOffset=offset;

         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);

         if(setOffset != movedOffset[j+1]) {

             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

                 offset, movedOffset[j+1], setOffset);

406 }

407 if(c != expected) {

             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

409 }

410 #endif

411 setOffset=offset;

         U8_PREV(input, 0, setOffset, c);

         if(setOffset != movedOffset[j+1]) {

             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

                 offset, movedOffset[j+1], setOffset);

416 }

         if(UTF_IS_ERROR(expected)) { expected=U_SENTINEL; }

418 if(c != expected) {

             log_err("ERROR: U8_PREV failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

420 }

421

422 setOffset=offset;

         U8_PREV_OR_FFFD(input, 0, setOffset, c);

         if(setOffset != movedOffset[j+1]) {

             log_err("ERROR: U8_PREV_OR_FFFD failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

                 offset, movedOffset[j+1], setOffset);

427 }

         if(expected<0) { expected=0xfffd; }

429 if(c != expected) {

             log_err("ERROR: U8_PREV_OR_FFFD failed at offset=%ld. Expected:%lx Got:%lx\n", offset, expected, c);

431 }

432 #if !U_HIDE_OBSOLETE_UTF_OLD_H

433 setOffset=offset;

         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);

         if(setOffset != movedOffset[j+1]) {

             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",

                 offset, movedOffset[j+1], setOffset);

438 }

         expected=result[i+3];  // prev_safe_s

440 if(c != expected) {

             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed at offset=%ld. Expected:%lx Got:%lx\n",

442 offset, expected, c);

443 }

444 #endif

445 i=i+4;

446 j=j+2;

447 }

448 }

449

450 /* keep this in sync with utf16tst.c's TestNulTerminated() */

451 static void TestNulTerminated() {

452 static const uint8_t input[]={

453 /* 0 */ 0x61,

         /*  1 */  0xf0, 0x90, 0x90, 0x81,

455 /* 5 */ 0xc0,

456 /* 6 */ 0x80,

         /*  7 */  0xdf, 0x80,

458 /* 9 */ 0xc2,

459 /* 10 */ 0x62,

460 /* 11 */ 0xfd,

461 /* 12 */ 0xbe,

         /* 13 */  0xe0, 0xa0, 0x80,

         /* 16 */  0xe2, 0x82, 0xac,

         /* 19 */  0xf0, 0x90, 0x90,

465 /* 22 */ 0x00

466 /* 23 */

467 };

468 static const UChar32 result[]={

469 0x61,

470 0x10401,

471 U_SENTINEL, // C0 not a lead byte

472 U_SENTINEL, // 80

473 0x7c0,

474 U_SENTINEL, // C2

475 0x62,

476 U_SENTINEL, // FD not a lead byte

477 U_SENTINEL, // BE

478 0x800,

479 0x20ac,

480 U_SENTINEL, // truncated F0 90 90

481 0

482 };

483

484 UChar32 c, c2, expected;

     int32_t i0, i=0, j, k, expectedIndex;

486 int32_t cpIndex=0;

487 do {

488 i0=i;

         U8_NEXT(input, i, -1, c);

490 expected=result[cpIndex];

491 if(c!=expected) {

             log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);

493 }

494 j=i0;

         U8_NEXT_OR_FFFD(input, j, -1, c);

         if(expected<0) { expected=0xfffd; }

497 if(c!=expected) {

             log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);

499 }

500 if(j!=i) {

             log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);

502 }

503 j=i0;

         U8_FWD_1(input, j, -1);

505 if(j!=i) {

             log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);

507 }

508 ++cpIndex;

509 /*

510 * Move by this many code points from the start.

511 * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.

512 */

         expectedIndex= (c==0) ? i-1 : i;

514 k=0;

         U8_FWD_N(input, k, -1, cpIndex);

516 if(k!=expectedIndex) {

             log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);

518 }

     } while(c!=0);

520

521 i=0;

522 do {

523 j=i0=i;

         U8_NEXT(input, i, -1, c);

525 do {

             U8_GET(input, 0, j, -1, c2);

527 if(c2!=c) {

                 log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);

529 }

             U8_GET_OR_FFFD(input, 0, j, -1, c2);

             expected= (c>=0) ? c : 0xfffd;

532 if(c2!=expected) {

                 log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);

534 }

535 /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */

536 k=j+1;

             U8_SET_CP_LIMIT(input, 0, k, -1);

538 if(k!=i) {

                 log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);

540 }

         } while(++j<i);

     } while(c!=0);

543 }

544

545 static void TestNextPrevNonCharacters() {

546 /* test non-characters */

547 static const uint8_t nonChars[]={

         0xef, 0xb7, 0x90,       /* U+fdd0 */

         0xef, 0xbf, 0xbf,       /* U+feff */

         0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */

         0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */

         0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */

553 };

554

555 UChar32 ch;

556 int32_t idx;

557

     for(idx=0; idx<(int32_t)sizeof(nonChars);) {

         U8_NEXT(nonChars, idx, sizeof(nonChars), ch);

         if(!U_IS_UNICODE_NONCHAR(ch)) {

             log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);

562 }

563 }

     for(idx=(int32_t)sizeof(nonChars); idx>0;) {

         U8_PREV(nonChars, 0, idx, ch);

         if(!U_IS_UNICODE_NONCHAR(ch)) {

             log_err("U8_PREV(at %d) failed to read a non-character\n", idx);

568 }

569 }

570 #if !U_HIDE_OBSOLETE_UTF_OLD_H

     for(idx=0; idx<(int32_t)sizeof(nonChars);) {

         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;

         UTF8_NEXT_CHAR_SAFE(nonChars, idx, sizeof(nonChars), ch, TRUE);

574 if(ch!=expected) {

             log_err("UTF8_NEXT_CHAR_SAFE(strict, before %d) failed to read a non-character\n", idx);

576 }

577 }

     for(idx=(int32_t)sizeof(nonChars); idx>0;) {

         UTF8_PREV_CHAR_SAFE(nonChars, 0, idx, ch, TRUE);

         UChar32 expected= nonChars[idx]<0xf0 ? 0xffff : 0x10ffff;

581 if(ch!=expected) {

             log_err("UTF8_PREV_CHAR_SAFE(strict, at %d) failed to read a non-character\n", idx);

583 }

584 }

585 #endif

586 }

587

588 static void TestNextPrevCharUnsafe() {

589 /*

590 * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.

591 * The behavior of _UNSAFE macros for ill-formed strings is undefined.

592 */

593 static const uint8_t input[]={

594 0x61,

         0xf0, 0x90, 0x90, 0x81,

         0xc0, 0x80,  /* non-shortest form */

         0xe2, 0x82, 0xac,

598 0xc2, 0xa1,

         0xf4, 0x8f, 0xbf, 0xbf,

600 0x00

601 };

602 static const UChar32 codePoints[]={

603 0x61,

604 0x10401,

605 -1,

606 0x20ac,

607 0xa1,

608 0x10ffff,

609 0

610 };

611

612 UChar32 c, expected;

613 int32_t i;

614 uint32_t offset;

615 #if !U_HIDE_OBSOLETE_UTF_OLD_H

     for(i=0, offset=0; offset<sizeof(input); ++i) {

         UTF8_NEXT_CHAR_UNSAFE(input, offset, c);

618 expected = codePoints[i];

         if(expected >= 0 && c != expected) {

             log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

621 offset, expected, c);

622 }

         if(offset==6) {

624 // The obsolete UTF8_NEXT_CHAR_UNSAFE() skips 1+UTF8_COUNT_TRAIL_BYTES(lead) bytes

625 // while the new one skips C0 80 together.

626 ++offset;

627 }

628 }

629 #endif

     for(i=0, offset=0; offset<sizeof(input); ++i) {

         U8_NEXT_UNSAFE(input, offset, c);

632 expected = codePoints[i];

         if(expected >= 0 && c != expected) {

             log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

635 offset, expected, c);

636 }

637 }

638 #if !U_HIDE_OBSOLETE_UTF_OLD_H

     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){

         UTF8_PREV_CHAR_UNSAFE(input, offset, c);

641 expected = codePoints[i];

         if(expected >= 0 && c != expected) {

             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

644 offset, expected, c);

645 }

646 }

647 #endif

     for(i=UPRV_LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){

         U8_PREV_UNSAFE(input, offset, c);

650 expected = codePoints[i];

         if(expected >= 0 && c != expected) {

             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",

653 offset, expected, c);

654 }

655 }

656 }

657

/*
 * Exercises the checked forward/backward skipping macros.
 *
 * fwd_safe / back_safe list the expected offset after each single step from
 * the start / from the end of input. Nvalue lists step counts for the _N
 * macros, applied cumulatively; fwd_N_safe / back_N_safe list the expected
 * offset after each of those moves.
 *
 * The log calls cast their arguments to the exact types the "%d" and "%ld"
 * conversions require; offsafe and i are uint32_t.
 */
static void TestFwdBack() {
    static const uint8_t input[]={
        0x61,
        0xF0, 0x90, 0x90, 0x81,
        0xff,
        0x62,
        0xc0,
        0x80,
        0x7f,
        0x8f,
        0xc0,
        0x63,
        0x81,
        0x90,
        0x90,
        0xF0,
        0x00
    };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 4, 1, 2, 1, 5};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_safe[]  ={18, 17, 15, 11, 10, 8, 7, 0};

    uint32_t offsafe=0;

    uint32_t i=0;
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }
#endif
    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], (int)offsafe);
        }
        i++;
    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    /* offsafe is not reset inside the loop: each move continues from the previous position. */
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }
#endif
    offsafe=0;
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", (int)i, fwd_N_safe[i], (int)offsafe);
        }

    }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }
#endif
    offsafe=sizeof(input);
    for(i=0; i<UPRV_LENGTHOF(Nvalue); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", (int)i, back_N_safe[i], (long)offsafe);
        }
    }
}

759

760 /**

761 * Ticket #13636 - Visual Studio 2017 has problems optimizing this function.

762 * As a workaround, we will turn off optimization just for this function on VS2017 and above.

763 */

764 #if defined(_MSC_VER) && (_MSC_VER > 1900)

765 #pragma optimize( "", off )

766 #endif

767

/*
 * Exercises the unchecked ("unsafe") forward/backward skipping macros.
 *
 * boundaries lists the offsets reached when stepping forward from 0;
 * backBoundaries lists the offsets reached when stepping backward from the
 * end. The two differ only around the non-shortest form C0 80.
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    // forward unsafe skips only C0
    static const int8_t boundaries[]={ 0, 1, 5, 6, 7, 10, 12, 16, 17 };
    // backward unsafe skips C0 80 together
    static const int8_t backBoundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;
    int32_t n;

    /* Single steps forward: boundaries[1], boundaries[2], ... */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    n=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        UTF8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        ++n;
    }
#endif
    n=1;
    pos=0;
    while(pos<UPRV_LENGTHOF(input)) {
        U8_FWD_1_UNSAFE(input, pos);
        if(pos != boundaries[n]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
        ++n;
    }

    /* Single steps backward: second-to-last entry of backBoundaries down to the first. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    n=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[n]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[n], pos);
        }
        --n;
    }
#endif
    n=UPRV_LENGTHOF(backBoundaries)-2;
    pos=UPRV_LENGTHOF(input);
    while(pos>0) {
        U8_BACK_1_UNSAFE(input, pos);
        if(pos != backBoundaries[n]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[n], pos);
        }
        --n;
    }

    /* n steps forward from the start must land on boundaries[n]. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(n=0; n<UPRV_LENGTHOF(boundaries); ++n) {
        pos=0;
        UTF8_FWD_N_UNSAFE(input, pos, n);
        if(pos != boundaries[n]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
    }
#endif
    for(n=0; n<UPRV_LENGTHOF(boundaries); ++n) {
        pos=0;
        U8_FWD_N_UNSAFE(input, pos, n);
        if(pos != boundaries[n]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[n], pos);
        }
    }

    /* n steps backward from the end must land on the n-th entry from the end of backBoundaries. */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    for(n=0; n<UPRV_LENGTHOF(backBoundaries); ++n) {
        const int32_t fromEnd=UPRV_LENGTHOF(backBoundaries)-1-n;
        pos=UPRV_LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, pos, n);
        if(pos != backBoundaries[fromEnd]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[fromEnd], pos);
        }
    }
#endif
    for(n=0; n<UPRV_LENGTHOF(backBoundaries); ++n) {
        const int32_t fromEnd=UPRV_LENGTHOF(backBoundaries)-1-n;
        pos=UPRV_LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, pos, n);
        if(pos != backBoundaries[fromEnd]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", backBoundaries[fromEnd], pos);
        }
    }
}

852

853 /**

854 * Ticket #13636 - Turn optimization back on.

855 */

856 #if defined(_MSC_VER) && (_MSC_VER > 1900)

857 #pragma optimize( "", on )

858 #endif

859

/*
 * Exercises the checked "set to code point start / limit" macros at every
 * offset of input, including the offset one past the last byte.
 *
 * start_safe[k] / limit_safe[k] is the expected adjusted offset when starting
 * from offset k. The start macros are skipped for the one-past-the-end offset.
 *
 * The log calls cast their arguments to long to match the "%ld" conversions;
 * the offsets are int32_t and the table entries are int16_t.
 */
static void TestSetChar() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0xfe, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x00 };
    static const int16_t start_safe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };
    static const int16_t limit_safe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,  14 };

    uint32_t i=0;
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START(input, 0, setOffset);
            if(setOffset != start_safe[i]){
                log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_safe[i], (long)setOffset);
            }
        }
#if !U_HIDE_OBSOLETE_UTF_OLD_H
        setOffset=offset;
        UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }
#endif
        setOffset=offset;
        U8_SET_CP_LIMIT(input,0, setOffset, (int32_t)sizeof(input));
        if(setOffset != limit_safe[i]){
            log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n",
                    (long)offset, (long)limit_safe[i], (long)setOffset);
        }

        i++;
    }
}

901

/**
 * Checks the unchecked ("unsafe") code point boundary macros.
 *
 * Every offset into input[] is run through U8_SET_CP_START_UNSAFE() and
 * U8_SET_CP_LIMIT_UNSAFE() and, unless U_HIDE_OBSOLETE_UTF_OLD_H is set,
 * through the obsolete UTF8_SET_CHAR_START_UNSAFE() /
 * UTF8_SET_CHAR_LIMIT_UNSAFE() macros. The adjusted offset must equal
 * start_unsafe[offset] / limit_unsafe[offset].
 *
 * The start macros are only called for offsets inside the array. The limit
 * macros are skipped for offset 0 and additionally called with
 * offset == length, so limit_unsafe[] has one more entry than input[].
 */
static void TestSetCharUnsafe() {
    static const uint8_t input[]
        = {0x61, 0xe4, 0xba, 0x8c, 0x7f, 0x2e, 0x62, 0xc5, 0x7f, 0x61, 0x80, 0x80, 0xe0, 0x80, 0x80, 0x00 };
    static const int16_t start_unsafe[]
        = {0,    1,    1,    1,    4,    5,    6,    7,    8,    9,    9,    9,    12,   12,   12,   15 };
    static const int16_t limit_unsafe[]
        = {0,    1,    4,    4,    4,    5,    6,    7,    9,    9,    10,   10,   10,   15,   15,   15,   16 };

    /* The expectation tables are indexed directly by the input offset. */
    int32_t offset=0, setOffset=0;
    for(offset=0; offset<=UPRV_LENGTHOF(input); offset++){
        if (offset<UPRV_LENGTHOF(input)){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                /* %ld requires long arguments, so cast explicitly. */
                log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_START_UNSAFE(input, setOffset);
            if(setOffset != start_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)start_unsafe[offset], (long)setOffset);
            }
        }

        if (offset != 0) { /* Can't have it go off the end of the array */
#if !U_HIDE_OBSOLETE_UTF_OLD_H
            setOffset=offset;
            UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
#endif
            setOffset=offset;
            U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
            if(setOffset != limit_unsafe[offset]){
                log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n",
                        (long)offset, (long)limit_unsafe[offset], (long)setOffset);
            }
        }
    }
}

946

/**
 * Table-driven check of U8_TRUNCATE_IF_INCOMPLETE().
 *
 * Each case is a NUL-terminated byte string; the macro is applied to its full
 * strlen() length (start index 0) and the adjusted length is compared with
 * the expected value stored next to the string.
 */
static void TestTruncateIfIncomplete() {
    // Difference from U8_SET_CP_START():
    // U8_TRUNCATE_IF_INCOMPLETE() does not look at s[length].
    // Therefore, if the last byte is a lead byte, then this macro truncates
    // even if the byte at the input index cannot continue a valid sequence
    // (including when that is not a trail byte).
    // On the other hand, if the last byte is a trail byte, then the two macros behave the same.
    static const struct {
        const char *s;
        int32_t expected;
    } cases[] = {
        /* zero or one byte */
        { "", 0 },
        { "a", 1 },
        { "\x80", 1 },
        { "\xC1", 1 },
        { "\xC2", 0 },
        { "\xE0", 0 },
        { "\xF4", 0 },
        { "\xF5", 1 },
        /* two bytes */
        { "\x80\x80", 2 },
        { "\xC2\xA0", 2 },
        { "\xE0\x9F", 2 },
        { "\xE0\xA0", 0 },
        { "\xED\x9F", 0 },
        { "\xED\xA0", 2 },
        { "\xF0\x8F", 2 },
        { "\xF0\x90", 0 },
        { "\xF4\x8F", 0 },
        { "\xF4\x90", 2 },
        { "\xF5\x80", 2 },
        /* three bytes */
        { "\x80\x80\x80", 3 },
        { "\xC2\xA0\x80", 3 },
        { "\xE0\xA0\x80", 3 },
        { "\xF0\x8F\x80", 3 },
        { "\xF0\x90\x80", 0 },
        { "\xF4\x8F\x80", 0 },
        { "\xF4\x90\x80", 3 },
        { "\xF5\x80\x80", 3 },
        /* four bytes */
        { "\x80\x80\x80\x80", 4 },
        { "\xC2\xA0\x80\x80", 4 },
        { "\xE0\xA0\x80\x80", 4 },
        { "\xF0\x90\x80\x80", 4 },
        { "\xF5\x80\x80\x80", 4 }
    };
    int32_t caseIndex = 0;
    while (caseIndex < UPRV_LENGTHOF(cases)) {
        const char *text = cases[caseIndex].s;
        int32_t wanted = cases[caseIndex].expected;
        int32_t fullLength = (int32_t)strlen(text);
        /* The macro modifies its length argument in place. */
        int32_t truncated = fullLength;
        U8_TRUNCATE_IF_INCOMPLETE(text, 0, truncated);
        if (truncated != wanted) {
            log_err("ERROR: U8_TRUNCATE_IF_INCOMPLETE failed for i=%d, length=%d. Expected:%d Got:%d\n",
                    (int)caseIndex, (int)fullLength, (int)wanted, (int)truncated);
        }
        ++caseIndex;
    }
}

1004

/**
 * Checks the obsolete UTF8_APPEND_CHAR_UNSAFE() and UTF8_APPEND_CHAR_SAFE()
 * macros (compiled only when U_HIDE_OBSOLETE_UTF_OLD_H is not set).
 *
 * test[] holds (append position, code point) pairs: the first 13 pairs are
 * run through the unsafe macro, the remaining 13 through the safe macro.
 * For each pair the template string s[] is copied into str[], the code point
 * is written at the given position, and both the advanced offset
 * (movedOffset[]) and the resulting first `size` bytes (result[]) are checked.
 */
static void TestAppendChar(){
#if !U_HIDE_OBSOLETE_UTF_OLD_H
    static const uint8_t s[11]={0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00};
    static const uint32_t test[]={
        /* append-position(unsafe),  CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        2,                        0x007f,
        3,                        0xd801,
        1,                        0x20402,
        8,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

        /* append-position(safe),    CHAR to be appended */
        0,                        0x10401,
        2,                        0x0028,
        3,                        0x7f,
        3,                        0xd801,   /* illegal for UTF-8 starting with Unicode 3.2 */
        1,                        0x20402,
        9,                        0x10401,
        5,                        0xc0,
        5,                        0xc1,
        5,                        0xfd,
        6,                        0x80,
        6,                        0x81,
        6,                        0xbf,
        7,                        0xfe,

    };
    static const uint16_t movedOffset[]={
        /* offset-moved-to(unsafe) */
        4,              /*for append-pos: 0 , CHAR 0x10401*/
        3,
        3,
        6,
        5,
        12,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

        /* offset-moved-to(safe) */
        4,              /*for append-pos: 0, CHAR 0x10401*/
        3,
        4,
        6,
        5,
        11,
        7,
        7,
        7,
        8,
        8,
        8,
        9,

    };

    static const uint8_t result[][11]={
        /*unsafe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x7f, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0xF0, 0x90, 0x90},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},
        /*safe*/
        {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc2, 0x9f*/

        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x80, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0x81, 0x68, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0xc3, 0xbd, 0x68, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x80, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0x81, 0x69, 0x6a, 0x00},
        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0xc2, 0xbf, 0x69, 0x6a, 0x00},

        {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xc3, 0xbe, 0x6a, 0x00},

    };
    uint16_t i, count=0;
    /* One byte larger than s[]: the unsafe append of a 4-byte sequence at
     * position 8 advances the offset to 12, i.e. it writes up to str[11]. */
    uint8_t str[12];
    uint32_t offset;
    uint16_t size=UPRV_LENGTHOF(s);
    /* i walks test[] in (position, code point) pairs; count is the pair number. */
    for(i=0; i<UPRV_LENGTHOF(test); i=(uint16_t)(i+2)){
        uprv_memcpy(str, s, size);
        offset=test[i];
        if(count<13){   /* the first 13 pairs are the unsafe cases */
            UTF8_APPEND_CHAR_UNSAFE(str, offset, test[i+1]);
            if(offset != movedOffset[count]){
                /* %d requires int arguments, so cast explicitly. */
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    (int)count, (int)movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_UNSAFE failed for count=%d. \nExpected:", (int)count);
                printUChars(result[count], size);
                log_err("\nGot:      ");
                printUChars(str, size);
                log_err("\n");
            }
        }else{
            UTF8_APPEND_CHAR_SAFE(str, offset, size, test[i+1]);
            if(offset != movedOffset[count]){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed to move the offset correctly for count=%d.\nExpectedOffset=%d  currentOffset=%d\n",
                    (int)count, (int)movedOffset[count], (int)offset);

            }
            if(uprv_memcmp(str, result[count], size) !=0){
                log_err("ERROR: UTF8_APPEND_CHAR_SAFE failed for count=%d. \nExpected:", (int)count);
                printUChars(result[count], size);
                log_err("\nGot:     ");
                printUChars(str, size);
                log_err("\n");
            }
        }
        count++;
    }
#endif
}

1173

1174 static void TestAppend() {

1175 static const UChar32 codePoints[]={

         0x61, 0xdf, 0x901, 0x3040,

         0xac00, 0xd800, 0xdbff, 0xdcde,

         0xdffd, 0xe000, 0xffff, 0x10000,

         0x12345, 0xe0021, 0x10ffff, 0x110000,

         0x234567, 0x7fffffff, -1, -1000,

1181 0, 0x400

1182 };

1183 static const uint8_t expectUnsafe[]={

         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,

         0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,

         0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,

         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */

1188 /* none from this line */

         0,  0xd0, 0x80

1190 }, expectSafe[]={

         0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,

         0xea, 0xb0, 0x80,  /* no surrogates */

         /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,

         0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */

1195 /* none from this line */

         0,  0xd0, 0x80

1197 };

1198

1199 uint8_t buffer[100];

1200 UChar32 c;

1201 int32_t i, length;

1202 UBool isError, expectIsError, wrongIsError;

1203

1204 length=0;

     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {

1206 c=codePoints[i];

         if(c<0 || 0x10ffff<c) {

1208 continue; /* skip non-code points for U8_APPEND_UNSAFE */

1209 }

1210

         U8_APPEND_UNSAFE(buffer, length, c);

1212 }

     if(length!=UPRV_LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {

         log_err("U8_APPEND_UNSAFE did not generate the expected output\n");

1215 }

1216

1217 length=0;

1218 wrongIsError=FALSE;

     for(i=0; i<UPRV_LENGTHOF(codePoints); ++i) {

1220 c=codePoints[i];

         expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);

1222 isError=FALSE;

1223

         U8_APPEND(buffer, length, UPRV_LENGTHOF(buffer), c, isError);

1225 wrongIsError|= isError!=expectIsError;

1226 }

1227 if(wrongIsError) {

         log_err("U8_APPEND did not set isError correctly\n");

1229 }

     if(length!=UPRV_LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {

         log_err("U8_APPEND did not generate the expected output\n");

1232 }

1233 }

1234

1235 static void

1236 TestSurrogates() {

1237 static const uint8_t b[]={

         0xc3, 0x9f,             /*  00DF */

         0xed, 0x9f, 0xbf,       /*  D7FF */

         0xed, 0xa0, 0x81,       /*  D801 */

         0xed, 0xbf, 0xbe,       /*  DFFE */

         0xee, 0x80, 0x80,       /*  E000 */

         0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */

1244 };

1245 static const UChar32 cp[]={

         0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe

1247 };

1248

1249 UChar32 cu, cs, cl;

     int32_t i, j, k, iu, is, il, length;

1251

1252 k=0; /* index into cp[] */

1253 length=UPRV_LENGTHOF(b);

     for(i=0; i<length;) {

1255 j=i;

         U8_NEXT_UNSAFE(b, j, cu);

1257 iu=j;

1258

1259 j=i;

         U8_NEXT(b, j, length, cs);

1261 is=j;

1262

1263 j=i;

         L8_NEXT(b, j, length, cl);

1265 il=j;

1266

         if(cu!=cp[k]) {

             log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);

1269 }

1270

1271 /* U8_NEXT() returns <0 for surrogate code points */

         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {

             log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);

1274 }

1275

1276 /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */

1277 if(cl!=cu) {

             log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);

1279 }

1280

1281 // U8_NEXT() skips only the first byte of a surrogate byte sequence.

         if(U_IS_SURROGATE(cu) ? is!=(i+1) : is!=iu) {

             log_err("U8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

1284 }

1285 if(il!=iu) {

             log_err("L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

1287 }

1288

1289 ++k; /* next code point */

1290 i=iu; /* advance by one UTF-8 sequence */

1291 }

1292

     while(i>0) {

1294 --k; /* previous code point */

1295

1296 j=i;

         U8_PREV_UNSAFE(b, j, cu);

1298 iu=j;

1299

1300 j=i;

         U8_PREV(b, 0, j, cs);

1302 is=j;

1303

1304 j=i;

         L8_PREV(b, 0, j, cl);

1306 il=j;

1307

         if(cu!=cp[k]) {

             log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);

1310 }

1311

1312 /* U8_PREV() returns <0 for surrogate code points */

         if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {

             log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);

1315 }

1316

1317 /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */

1318 if(cl!=cu) {

             log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);

1320 }

1321

1322 // U8_PREV() skips only the last byte of a surrogate byte sequence.

         if(U_IS_SURROGATE(cu) ? is!=(i-1) : is!=iu) {

             log_err("U8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

1325 }

1326 if(il !=iu) {

             log_err("L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

1328 }

1329

1330 i=iu; /* go back by one UTF-8 sequence */

1331 }

1332 }