icuSources/test/intltest/convtest.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2003-2014, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  convtest.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2003jul15
  16 *   created by: Markus W. Scherer
  17 *
  18 *   Test file for data-driven conversion tests.
  19 */
  20
  21 #include "unicode/utypes.h"
  22
  23 #if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION is set
 * is slightly unnecessary - it removes tests for Unicode charsets
 * like UTF-8 that should work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to only exclude those.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
  33
  34 #include "unicode/ucnv.h"
  35 #include "unicode/unistr.h"
  36 #include "unicode/parsepos.h"
  37 #include "unicode/uniset.h"
  38 #include "unicode/ustring.h"
  39 #include "unicode/ures.h"
  40 #include "convtest.h"
  41 #include "cmemory.h"
  42 #include "unicode/tstdtmod.h"
  43 #include <string.h>
  44 #include <stdlib.h>
  45
  46 enum {
  47     // characters used in test data for callbacks
  48     SUB_CB='?',
  49     SKIP_CB='0',
  50     STOP_CB='.',
  51     ESC_CB='&'
  52 };
  53
  54 ConversionTest::ConversionTest() {
  55     UErrorCode errorCode=U_ZERO_ERROR;
  56     utf8Cnv=ucnv_open("UTF-8", &errorCode);
  57     ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
  58     if(U_FAILURE(errorCode)) {
  59         errln("unable to open UTF-8 converter");
  60     }
  61 }
  62
  63 ConversionTest::~ConversionTest() {
  64     ucnv_close(utf8Cnv);
  65 }
  66
  67 void
  68 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
  69     if (exec) logln("TestSuite ConversionTest: ");
  70     switch (index) {
  71 #if !UCONFIG_NO_FILE_IO
  72         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
  73         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
  74         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
  75         case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break;
  76 #else
  77         case 0:
  78         case 1:
  79         case 2:
  80         case 3: name="skip"; break;
  81 #endif
  82         case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
  83         default: name=""; break; //needed to end loop
  84     }
  85 }
  86
  87 // test data interface ----------------------------------------------------- ***
  88
  89 void
  90 ConversionTest::TestToUnicode() {
  91     ConversionCase cc;
  92     char charset[100], cbopt[4];
  93     const char *option;
  94     UnicodeString s, unicode;
  95     int32_t offsetsLength;
  96     UConverterToUCallback callback;
  97
  98     TestDataModule *dataModule;
  99     TestData *testData;
 100     const DataMap *testCase;
 101     UErrorCode errorCode;
 102     int32_t i;
 103
 104     errorCode=U_ZERO_ERROR;
 105     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
 106     if(U_SUCCESS(errorCode)) {
 107         testData=dataModule->createTestData("toUnicode", errorCode);
 108         if(U_SUCCESS(errorCode)) {
 109             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 110                 if(U_FAILURE(errorCode)) {
 111                     errln("error retrieving conversion/toUnicode test case %d - %s",
 112                             i, u_errorName(errorCode));
 113                     errorCode=U_ZERO_ERROR;
 114                     continue;
 115                 }
 116
 117                 cc.caseNr=i;
 118
 119                 s=testCase->getString("charset", errorCode);
 120                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 121                 cc.charset=charset;
 122
 123                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
 124                 unicode=testCase->getString("unicode", errorCode);
 125                 cc.unicode=unicode.getBuffer();
 126                 cc.unicodeLength=unicode.length();
 127
 128                 offsetsLength=0;
 129                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
 130                 if(offsetsLength==0) {
 131                     cc.offsets=NULL;
 132                 } else if(offsetsLength!=unicode.length()) {
 133                     errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
 134                             i, unicode.length(), offsetsLength);
 135                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 136                 }
 137
 138                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
 139                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
 140
 141                 s=testCase->getString("errorCode", errorCode);
 142                 if(s==UNICODE_STRING("invalid", 7)) {
 143                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
 144                 } else if(s==UNICODE_STRING("illegal", 7)) {
 145                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
 146                 } else if(s==UNICODE_STRING("truncated", 9)) {
 147                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
 148                 } else if(s==UNICODE_STRING("illesc", 6)) {
 149                     cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
 150                 } else if(s==UNICODE_STRING("unsuppesc", 9)) {
 151                     cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
 152                 } else {
 153                     cc.outErrorCode=U_ZERO_ERROR;
 154                 }
 155
 156                 s=testCase->getString("callback", errorCode);
 157                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
 158                 cc.cbopt=cbopt;
 159                 switch(cbopt[0]) {
 160                 case SUB_CB:
 161                     callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
 162                     break;
 163                 case SKIP_CB:
 164                     callback=UCNV_TO_U_CALLBACK_SKIP;
 165                     break;
 166                 case STOP_CB:
 167                     callback=UCNV_TO_U_CALLBACK_STOP;
 168                     break;
 169                 case ESC_CB:
 170                     callback=UCNV_TO_U_CALLBACK_ESCAPE;
 171                     break;
 172                 default:
 173                     callback=NULL;
 174                     break;
 175                 }
 176                 option=callback==NULL ? cbopt : cbopt+1;
 177                 if(*option==0) {
 178                     option=NULL;
 179                 }
 180
 181                 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
 182
 183                 if(U_FAILURE(errorCode)) {
 184                     errln("error parsing conversion/toUnicode test case %d - %s",
 185                             i, u_errorName(errorCode));
 186                     errorCode=U_ZERO_ERROR;
 187                 } else {
 188                     logln("TestToUnicode[%d] %s", i, charset);
 189                     ToUnicodeCase(cc, callback, option);
 190                 }
 191             }
 192             delete testData;
 193         }
 194         delete dataModule;
 195     }
 196     else {
 197         dataerrln("Could not load test conversion data");
 198     }
 199 }
 200
 201 void
 202 ConversionTest::TestFromUnicode() {
 203     ConversionCase cc;
 204     char charset[100], cbopt[4];
 205     const char *option;
 206     UnicodeString s, unicode, invalidUChars;
 207     int32_t offsetsLength, index;
 208     UConverterFromUCallback callback;
 209
 210     TestDataModule *dataModule;
 211     TestData *testData;
 212     const DataMap *testCase;
 213     const UChar *p;
 214     UErrorCode errorCode;
 215     int32_t i, length;
 216
 217     errorCode=U_ZERO_ERROR;
 218     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
 219     if(U_SUCCESS(errorCode)) {
 220         testData=dataModule->createTestData("fromUnicode", errorCode);
 221         if(U_SUCCESS(errorCode)) {
 222             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 223                 if(U_FAILURE(errorCode)) {
 224                     errln("error retrieving conversion/fromUnicode test case %d - %s",
 225                             i, u_errorName(errorCode));
 226                     errorCode=U_ZERO_ERROR;
 227                     continue;
 228                 }
 229
 230                 cc.caseNr=i;
 231
 232                 s=testCase->getString("charset", errorCode);
 233                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 234                 cc.charset=charset;
 235
 236                 unicode=testCase->getString("unicode", errorCode);
 237                 cc.unicode=unicode.getBuffer();
 238                 cc.unicodeLength=unicode.length();
 239                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
 240
 241                 offsetsLength=0;
 242                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
 243                 if(offsetsLength==0) {
 244                     cc.offsets=NULL;
 245                 } else if(offsetsLength!=cc.bytesLength) {
 246                     errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
 247                             i, cc.bytesLength, offsetsLength);
 248                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 249                 }
 250
 251                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
 252                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
 253
 254                 s=testCase->getString("errorCode", errorCode);
 255                 if(s==UNICODE_STRING("invalid", 7)) {
 256                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
 257                 } else if(s==UNICODE_STRING("illegal", 7)) {
 258                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
 259                 } else if(s==UNICODE_STRING("truncated", 9)) {
 260                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
 261                 } else {
 262                     cc.outErrorCode=U_ZERO_ERROR;
 263                 }
 264
 265                 s=testCase->getString("callback", errorCode);
 266                 cc.setSub=0; // default: no subchar
 267
 268                 if((index=s.indexOf((UChar)0))>0) {
 269                     // read NUL-separated subchar first, if any
 270                     // copy the subchar from Latin-1 characters
 271                     // start after the NUL
 272                     p=s.getTerminatedBuffer();
 273                     length=index+1;
 274                     p+=length;
 275                     length=s.length()-length;
 276                     if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
 277                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 278                     } else {
 279                         int32_t j;
 280
 281                         for(j=0; j<length; ++j) {
 282                             cc.subchar[j]=(char)p[j];
 283                         }
 284                         // NUL-terminate the subchar
 285                         cc.subchar[j]=0;
 286                         cc.setSub=1;
 287                     }
 288
 289                     // remove the NUL and subchar from s
 290                     s.truncate(index);
 291                 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
 292                     // read a substitution string, separated by an equal sign
 293                     p=s.getBuffer()+index+1;
 294                     length=s.length()-(index+1);
 295                     if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
 296                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 297                     } else {
 298                         u_memcpy(cc.subString, p, length);
 299                         // NUL-terminate the subString
 300                         cc.subString[length]=0;
 301                         cc.setSub=-1;
 302                     }
 303
 304                     // remove the equal sign and subString from s
 305                     s.truncate(index);
 306                 }
 307
 308                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
 309                 cc.cbopt=cbopt;
 310                 switch(cbopt[0]) {
 311                 case SUB_CB:
 312                     callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 313                     break;
 314                 case SKIP_CB:
 315                     callback=UCNV_FROM_U_CALLBACK_SKIP;
 316                     break;
 317                 case STOP_CB:
 318                     callback=UCNV_FROM_U_CALLBACK_STOP;
 319                     break;
 320                 case ESC_CB:
 321                     callback=UCNV_FROM_U_CALLBACK_ESCAPE;
 322                     break;
 323                 default:
 324                     callback=NULL;
 325                     break;
 326                 }
 327                 option=callback==NULL ? cbopt : cbopt+1;
 328                 if(*option==0) {
 329                     option=NULL;
 330                 }
 331
 332                 invalidUChars=testCase->getString("invalidUChars", errorCode);
 333                 cc.invalidUChars=invalidUChars.getBuffer();
 334                 cc.invalidLength=invalidUChars.length();
 335
 336                 if(U_FAILURE(errorCode)) {
 337                     errln("error parsing conversion/fromUnicode test case %d - %s",
 338                             i, u_errorName(errorCode));
 339                     errorCode=U_ZERO_ERROR;
 340                 } else {
 341                     logln("TestFromUnicode[%d] %s", i, charset);
 342                     FromUnicodeCase(cc, callback, option);
 343                 }
 344             }
 345             delete testData;
 346         }
 347         delete dataModule;
 348     }
 349     else {
 350         dataerrln("Could not load test conversion data");
 351     }
 352 }
 353
 354 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
 355
 356 void
 357 ConversionTest::TestGetUnicodeSet() {
 358     char charset[100];
 359     UnicodeString s, map, mapnot;
 360     int32_t which;
 361
 362     ParsePosition pos;
 363     UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
 364     UnicodeSet *cnvSetPtr = &cnvSet;
 365     LocalUConverterPointer cnv;
 366
 367     TestDataModule *dataModule;
 368     TestData *testData;
 369     const DataMap *testCase;
 370     UErrorCode errorCode;
 371     int32_t i;
 372
 373     errorCode=U_ZERO_ERROR;
 374     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
 375     if(U_SUCCESS(errorCode)) {
 376         testData=dataModule->createTestData("getUnicodeSet", errorCode);
 377         if(U_SUCCESS(errorCode)) {
 378             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 379                 if(U_FAILURE(errorCode)) {
 380                     errln("error retrieving conversion/getUnicodeSet test case %d - %s",
 381                             i, u_errorName(errorCode));
 382                     errorCode=U_ZERO_ERROR;
 383                     continue;
 384                 }
 385
 386                 s=testCase->getString("charset", errorCode);
 387                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 388
 389                 map=testCase->getString("map", errorCode);
 390                 mapnot=testCase->getString("mapnot", errorCode);
 391
 392                 which=testCase->getInt28("which", errorCode);
 393
 394                 if(U_FAILURE(errorCode)) {
 395                     errln("error parsing conversion/getUnicodeSet test case %d - %s",
 396                             i, u_errorName(errorCode));
 397                     errorCode=U_ZERO_ERROR;
 398                     continue;
 399                 }
 400
 401                 // test this test case
 402                 mapSet.clear();
 403                 mapnotSet.clear();
 404
 405                 pos.setIndex(0);
 406                 mapSet.applyPattern(map, pos, 0, NULL, errorCode);
 407                 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
 408                     errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
 409                           "    error index %d  index %d  U+%04x",
 410                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
 411                     errorCode=U_ZERO_ERROR;
 412                     continue;
 413                 }
 414
 415                 pos.setIndex(0);
 416                 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
 417                 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
 418                     errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
 419                           "    error index %d  index %d  U+%04x",
 420                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
 421                     errorCode=U_ZERO_ERROR;
 422                     continue;
 423                 }
 424
 425                 logln("TestGetUnicodeSet[%d] %s", i, charset);
 426
 427                 cnv.adoptInstead(cnv_open(charset, errorCode));
 428                 if(U_FAILURE(errorCode)) {
 429                     errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
 430                             charset, i, u_errorName(errorCode));
 431                     errorCode=U_ZERO_ERROR;
 432                     continue;
 433                 }
 434
 435                 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode);
 436
 437                 if(U_FAILURE(errorCode)) {
 438                     errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
 439                             charset, i, u_errorName(errorCode));
 440                     errorCode=U_ZERO_ERROR;
 441                     continue;
 442                 }
 443
 444                 // are there items that must be in cnvSet but are not?
 445                 (diffSet=mapSet).removeAll(cnvSet);
 446                 if(!diffSet.isEmpty()) {
 447                     diffSet.toPattern(s, TRUE);
 448                     if(s.length()>100) {
 449                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
 450                     }
 451                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
 452                             charset, i);
 453                     errln(s);
 454                 }
 455
 456                 // are there items that must not be in cnvSet but are?
 457                 (diffSet=mapnotSet).retainAll(cnvSet);
 458                 if(!diffSet.isEmpty()) {
 459                     diffSet.toPattern(s, TRUE);
 460                     if(s.length()>100) {
 461                         s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
 462                     }
 463                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
 464                             charset, i);
 465                     errln(s);
 466                 }
 467             }
 468             delete testData;
 469         }
 470         delete dataModule;
 471     }
 472     else {
 473         dataerrln("Could not load test conversion data");
 474     }
 475 }
 476
 477 U_CDECL_BEGIN
 478 static void U_CALLCONV
 479 getUnicodeSetCallback(const void *context,
 480                       UConverterFromUnicodeArgs * /*fromUArgs*/,
 481                       const UChar* /*codeUnits*/,
 482                       int32_t /*length*/,
 483                       UChar32 codePoint,
 484                       UConverterCallbackReason reason,
 485                       UErrorCode *pErrorCode) {
 486     if(reason<=UCNV_IRREGULAR) {
 487         ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
 488         *pErrorCode=U_ZERO_ERROR;                    // skip
 489     }  // else ignore the reset, close and clone calls.
 490 }
 491 U_CDECL_END
 492
 493 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
 494 void
 495 ConversionTest::TestGetUnicodeSet2() {
 496     // Build a string with all code points.
 497     UChar32 cpLimit;
 498     int32_t s0Length;
 499     if(quick) {
 500         cpLimit=s0Length=0x10000;  // BMP only
 501     } else {
 502         cpLimit=0x110000;
 503         s0Length=0x10000+0x200000;  // BMP + surrogate pairs
 504     }
 505     UChar *s0=new UChar[s0Length];
 506     if(s0==NULL) {
 507         return;
 508     }
 509     UChar *s=s0;
 510     UChar32 c;
 511     UChar c2;
 512     // low BMP
 513     for(c=0; c<=0xd7ff; ++c) {
 514         *s++=(UChar)c;
 515     }
 516     // trail surrogates
 517     for(c=0xdc00; c<=0xdfff; ++c) {
 518         *s++=(UChar)c;
 519     }
 520     // lead surrogates
 521     // (after trails so that there is not even one surrogate pair in between)
 522     for(c=0xd800; c<=0xdbff; ++c) {
 523         *s++=(UChar)c;
 524     }
 525     // high BMP
 526     for(c=0xe000; c<=0xffff; ++c) {
 527         *s++=(UChar)c;
 528     }
 529     // supplementary code points = surrogate pairs
 530     if(cpLimit==0x110000) {
 531         for(c=0xd800; c<=0xdbff; ++c) {
 532             for(c2=0xdc00; c2<=0xdfff; ++c2) {
 533                 *s++=(UChar)c;
 534                 *s++=c2;
 535             }
 536         }
 537     }
 538
 539     static const char *const cnvNames[]={
 540         "UTF-8",
 541         "UTF-7",
 542         "UTF-16",
 543         "US-ASCII",
 544         "ISO-8859-1",
 545         "windows-1252",
 546         "Shift-JIS",
 547         "ibm-1390",  // EBCDIC_STATEFUL table
 548         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
 549         "HZ",
 550         "ISO-2022-JP",
 551         "JIS7",
 552         "ISO-2022-CN",
 553         "ISO-2022-CN-EXT",
 554         "LMBCS"
 555     };
 556     LocalUConverterPointer cnv;
 557     char buffer[1024];
 558     int32_t i;
 559     for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
 560         UErrorCode errorCode=U_ZERO_ERROR;
 561         cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
 562         if(U_FAILURE(errorCode)) {
 563             errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
 564             continue;
 565         }
 566         UnicodeSet expected;
 567         ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
 568         if(U_FAILURE(errorCode)) {
 569             errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
 570             continue;
 571         }
 572         UConverterUnicodeSet which;
 573         for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
 574             if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
 575                 ucnv_setFallback(cnv.getAlias(), TRUE);
 576             }
 577             expected.add(0, cpLimit-1);
 578             s=s0;
 579             UBool flush;
 580             do {
 581                 char *t=buffer;
 582                 flush=(UBool)(s==s0+s0Length);
 583                 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
 584                 if(U_FAILURE(errorCode)) {
 585                     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 586                         errorCode=U_ZERO_ERROR;
 587                         continue;
 588                     } else {
 589                         break;  // unexpected error, should not occur
 590                     }
 591                 }
 592             } while(!flush);
 593             UnicodeSet set;
 594             ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode);
 595             if(cpLimit<0x110000) {
 596                 set.remove(cpLimit, 0x10ffff);
 597             }
 598             if(which==UCNV_ROUNDTRIP_SET) {
 599                 // ignore PUA code points because they will be converted even if they
 600                 // are fallbacks and when other fallbacks are turned off,
 601                 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
 602                 expected.remove(0xe000, 0xf8ff);
 603                 expected.remove(0xf0000, 0xffffd);
 604                 expected.remove(0x100000, 0x10fffd);
 605                 set.remove(0xe000, 0xf8ff);
 606                 set.remove(0xf0000, 0xffffd);
 607                 set.remove(0x100000, 0x10fffd);
 608             }
 609             if(set!=expected) {
 610                 // First try to see if we have different sets because ucnv_getUnicodeSet()
 611                 // added strings: The above conversion method does not tell us what strings might be convertible.
 612                 // Remove strings from the set and compare again.
 613                 // Unfortunately, there are no good, direct set methods for finding out whether there are strings
 614                 // in the set, nor for enumerating or removing just them.
 615                 // Intersect all code points with the set. The intersection will not contain strings.
 616                 UnicodeSet temp(0, 0x10ffff);
 617                 temp.retainAll(set);
 618                 set=temp;
 619             }
 620             if(set!=expected) {
 621                 UnicodeSet diffSet;
 622                 UnicodeString out;
 623
 624                 // are there items that must be in the set but are not?
 625                 (diffSet=expected).removeAll(set);
 626                 if(!diffSet.isEmpty()) {
 627                     diffSet.toPattern(out, TRUE);
 628                     if(out.length()>100) {
 629                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
 630                     }
 631                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
 632                             cnvNames[i], which);
 633                     errln(out);
 634                 }
 635
 636                 // are there items that must not be in the set but are?
 637                 (diffSet=set).removeAll(expected);
 638                 if(!diffSet.isEmpty()) {
 639                     diffSet.toPattern(out, TRUE);
 640                     if(out.length()>100) {
 641                         out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
 642                     }
 643                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
 644                             cnvNames[i], which);
 645                     errln(out);
 646                 }
 647             }
 648         }
 649     }
 650
 651     delete [] s0;
 652 }
 653
 654 // Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping
 655 // If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated
 656 void
 657 ConversionTest::TestDefaultIgnorableCallback() {
 658     UErrorCode status = U_ZERO_ERROR;
 659     const char *cnv_name = "euc-jp-2007";
 660     const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
 661     const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
 662
 663     UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status);
 664     if (U_FAILURE(status)) {
 665         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
 666         return;
 667     }
 668
 669     UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status);
 670     if (U_FAILURE(status)) {
 671         dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
 672         return;
 673     }
 674
 675     UConverter *cnv = cnv_open(cnv_name, status);
 676     if (U_FAILURE(status)) {
 677         dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
 678         return;
 679     }
 680
 681     // set callback for the converter
 682     ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
 683
 684     UChar32 input[1];
 685     char output[10];
 686     int32_t outputLength;
 687
 688     // test default ignorables are ignored
 689     int size = set_ignorable->size();
 690     for (int i = 0; i < size; i++) {
 691         status = U_ZERO_ERROR;
 692         outputLength= 0;
 693
 694         input[0] = set_ignorable->charAt(i);
 695
 696         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
 697         if (U_FAILURE(status) || outputLength != 0) {
 698             errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
 699         }
 700     }
 701
 702     // test non-ignorables are not ignored
 703     size = set_not_ignorable->size();
 704     for (int i = 0; i < size; i++) {
 705         status = U_ZERO_ERROR;
 706         outputLength= 0;
 707
 708         input[0] = set_not_ignorable->charAt(i);
 709
 710         if (input[0] == 0) {
 711             continue;
 712         }
 713
 714         outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
 715         if (U_FAILURE(status) || outputLength <= 0) {
 716             errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
 717         }
 718     }
 719
 720     ucnv_close(cnv);
 721     delete set_not_ignorable;
 722     delete set_ignorable;
 723 }
 724
 725 // open testdata or ICU data converter ------------------------------------- ***
 726
 727 UConverter *
 728 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
 729     if(name!=NULL && *name=='+') {
 730         // Converter names that start with '+' are ignored in ICU4J tests.
 731         ++name;
 732     }
 733     if(name!=NULL && *name=='*') {
 734         /* loadTestData(): set the data directory */
 735         return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
 736     } else {
 737         return ucnv_open(name, &errorCode);
 738     }
 739 }
 740
 741 // output helpers ---------------------------------------------------------- ***
 742
 743 static inline char
 744 hexDigit(uint8_t digit) {
 745     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
 746 }
 747
 748 static char *
 749 printBytes(const uint8_t *bytes, int32_t length, char *out) {
 750     uint8_t b;
 751
 752     if(length>0) {
 753         b=*bytes++;
 754         --length;
 755         *out++=hexDigit((uint8_t)(b>>4));
 756         *out++=hexDigit((uint8_t)(b&0xf));
 757     }
 758
 759     while(length>0) {
 760         b=*bytes++;
 761         --length;
 762         *out++=' ';
 763         *out++=hexDigit((uint8_t)(b>>4));
 764         *out++=hexDigit((uint8_t)(b&0xf));
 765     }
 766     *out++=0;
 767     return out;
 768 }
 769
 770 static char *
 771 printUnicode(const UChar *unicode, int32_t length, char *out) {
 772     UChar32 c;
 773     int32_t i;
 774
 775     for(i=0; i<length;) {
 776         if(i>0) {
 777             *out++=' ';
 778         }
 779         U16_NEXT(unicode, i, length, c);
 780         // write 4..6 digits
 781         if(c>=0x100000) {
 782             *out++='1';
 783         }
 784         if(c>=0x10000) {
 785             *out++=hexDigit((uint8_t)((c>>16)&0xf));
 786         }
 787         *out++=hexDigit((uint8_t)((c>>12)&0xf));
 788         *out++=hexDigit((uint8_t)((c>>8)&0xf));
 789         *out++=hexDigit((uint8_t)((c>>4)&0xf));
 790         *out++=hexDigit((uint8_t)(c&0xf));
 791     }
 792     *out++=0;
 793     return out;
 794 }
 795
 796 static char *
 797 printOffsets(const int32_t *offsets, int32_t length, char *out) {
 798     int32_t i, o, d;
 799
 800     if(offsets==NULL) {
 801         length=0;
 802     }
 803
 804     for(i=0; i<length; ++i) {
 805         if(i>0) {
 806             *out++=' ';
 807         }
 808         o=offsets[i];
 809
 810         // print all offsets with 2 characters each (-x, -9..99, xx)
 811         if(o<-9) {
 812             *out++='-';
 813             *out++='x';
 814         } else if(o<0) {
 815             *out++='-';
 816             *out++=(char)('0'-o);
 817         } else if(o<=99) {
 818             *out++=(d=o/10)==0 ? ' ' : (char)('0'+d);
 819             *out++=(char)('0'+o%10);
 820         } else /* o>99 */ {
 821             *out++='x';
 822             *out++='x';
 823         }
 824     }
 825     *out++=0;
 826     return out;
 827 }
 828
 829 // toUnicode test worker functions ----------------------------------------- ***
 830
 831 static int32_t
 832 stepToUnicode(ConversionCase &cc, UConverter *cnv,
 833               UChar *result, int32_t resultCapacity,
 834               int32_t *resultOffsets, /* also resultCapacity */
 835               int32_t step,
 836               UErrorCode *pErrorCode) {
 837     const char *source, *sourceLimit, *bytesLimit;
 838     UChar *target, *targetLimit, *resultLimit;
 839     UBool flush;
 840
 841     source=(const char *)cc.bytes;
 842     target=result;
 843     bytesLimit=source+cc.bytesLength;
 844     resultLimit=result+resultCapacity;
 845
 846     if(step>=0) {
 847         // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
 848         // move only one buffer (in vs. out) at a time to be extra mean
 849         // step==0 performs bulk conversion and generates offsets
 850
 851         // initialize the partial limits for the loop
 852         if(step==0) {
 853             // use the entire buffers
 854             sourceLimit=bytesLimit;
 855             targetLimit=resultLimit;
 856             flush=cc.finalFlush;
 857         } else {
 858             // start with empty partial buffers
 859             sourceLimit=source;
 860             targetLimit=target;
 861             flush=FALSE;
 862
 863             // output offsets only for bulk conversion
 864             resultOffsets=NULL;
 865         }
 866
 867         for(;;) {
 868             // resetting the opposite conversion direction must not affect this one
 869             ucnv_resetFromUnicode(cnv);
 870
 871             // convert
 872             ucnv_toUnicode(cnv,
 873                 &target, targetLimit,
 874                 &source, sourceLimit,
 875                 resultOffsets,
 876                 flush, pErrorCode);
 877
 878             // check pointers and errors
 879             if(source>sourceLimit || target>targetLimit) {
 880                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 881                 break;
 882             } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
 883                 if(target!=targetLimit) {
 884                     // buffer overflow must only be set when the target is filled
 885                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 886                     break;
 887                 } else if(targetLimit==resultLimit) {
 888                     // not just a partial overflow
 889                     break;
 890                 }
 891
 892                 // the partial target is filled, set a new limit, reset the error and continue
 893                 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
 894                 *pErrorCode=U_ZERO_ERROR;
 895             } else if(U_FAILURE(*pErrorCode)) {
 896                 // some other error occurred, done
 897                 break;
 898             } else {
 899                 if(source!=sourceLimit) {
 900                     // when no error occurs, then the input must be consumed
 901                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 902                     break;
 903                 }
 904
 905                 if(sourceLimit==bytesLimit) {
 906                     // we are done
 907                     break;
 908                 }
 909
 910                 // the partial conversion succeeded, set a new limit and continue
 911                 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
 912                 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
 913             }
 914         }
 915     } else /* step<0 */ {
 916         /*
 917          * step==-1: call only ucnv_getNextUChar()
 918          * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
 919          *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
 920          *   else give it at most (-step-2)/2 bytes
 921          */
 922         UChar32 c;
 923
 924         // end the loop by getting an index out of bounds error
 925         for(;;) {
 926             // resetting the opposite conversion direction must not affect this one
 927             ucnv_resetFromUnicode(cnv);
 928
 929             // convert
 930             if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
 931                 sourceLimit=source; // use sourceLimit not as a real limit
 932                                     // but to remember the pre-getNextUChar source pointer
 933                 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
 934
 935                 // check pointers and errors
 936                 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 937                     if(source!=bytesLimit) {
 938                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 939                     } else {
 940                         *pErrorCode=U_ZERO_ERROR;
 941                     }
 942                     break;
 943                 } else if(U_FAILURE(*pErrorCode)) {
 944                     break;
 945                 }
 946                 // source may not move if c is from previous overflow
 947
 948                 if(target==resultLimit) {
 949                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 950                     break;
 951                 }
 952                 if(c<=0xffff) {
 953                     *target++=(UChar)c;
 954                 } else {
 955                     *target++=U16_LEAD(c);
 956                     if(target==resultLimit) {
 957                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 958                         break;
 959                     }
 960                     *target++=U16_TRAIL(c);
 961                 }
 962
 963                 // alternate between -n-1 and -n but leave -1 alone
 964                 if(step<-1) {
 965                     ++step;
 966                 }
 967             } else /* step is even */ {
 968                 // allow only one UChar output
 969                 targetLimit=target<resultLimit ? target+1 : resultLimit;
 970
 971                 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
 972                 // and never output offsets
 973                 if(step==-2) {
 974                     sourceLimit=bytesLimit;
 975                 } else {
 976                     sourceLimit=source+(-step-2)/2;
 977                     if(sourceLimit>bytesLimit) {
 978                         sourceLimit=bytesLimit;
 979                     }
 980                 }
 981
 982                 ucnv_toUnicode(cnv,
 983                     &target, targetLimit,
 984                     &source, sourceLimit,
 985                     NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
 986
 987                 // check pointers and errors
 988                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
 989                     if(target!=targetLimit) {
 990                         // buffer overflow must only be set when the target is filled
 991                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 992                         break;
 993                     } else if(targetLimit==resultLimit) {
 994                         // not just a partial overflow
 995                         break;
 996                     }
 997
 998                     // the partial target is filled, set a new limit and continue
 999                     *pErrorCode=U_ZERO_ERROR;
1000                 } else if(U_FAILURE(*pErrorCode)) {
1001                     // some other error occurred, done
1002                     break;
1003                 } else {
1004                     if(source!=sourceLimit) {
1005                         // when no error occurs, then the input must be consumed
1006                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1007                         break;
1008                     }
1009
1010                     // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
1011                 }
1012
1013                 --step;
1014             }
1015         }
1016     }
1017
1018     return (int32_t)(target-result);
1019 }
1020
1021 UBool
1022 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
1023     // open the converter
1024     IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
1025     LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
1026     // with no data, the above crashes with "pointer being freed was not allocated" for charset "x11-compound-text", see #13078
1027     if(errorCode.isFailure()) {
1028         errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1029                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
1030         errorCode.reset();
1031         return FALSE;
1032     }
1033
1034     // set the callback
1035     if(callback!=NULL) {
1036         ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode);
1037         if(U_FAILURE(errorCode)) {
1038             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
1039                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1040             return FALSE;
1041         }
1042     }
1043
1044     int32_t resultOffsets[256];
1045     UChar result[256];
1046     int32_t resultLength;
1047     UBool ok;
1048
1049     static const struct {
1050         int32_t step;
1051         const char *name;
1052     } steps[]={
1053         { 0, "bulk" }, // must be first for offsets to be checked
1054         { 1, "step=1" },
1055         { 3, "step=3" },
1056         { 7, "step=7" },
1057         { -1, "getNext" },
1058         { -2, "toU(bulk)+getNext" },
1059         { -3, "getNext+toU(bulk)" },
1060         { -4, "toU(1)+getNext" },
1061         { -5, "getNext+toU(1)" },
1062         { -12, "toU(5)+getNext" },
1063         { -13, "getNext+toU(5)" },
1064     };
1065     int32_t i, step;
1066
1067     ok=TRUE;
1068     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1069         step=steps[i].step;
1070         if(step<0 && !cc.finalFlush) {
1071             // skip ucnv_getNextUChar() if !finalFlush because
1072             // ucnv_getNextUChar() always implies flush
1073             continue;
1074         }
1075         if(step!=0) {
1076             // bulk test is first, then offsets are not checked any more
1077             cc.offsets=NULL;
1078         }
1079         else {
1080             memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1081         }
1082         memset(result, -1, UPRV_LENGTHOF(result));
1083         errorCode.reset();
1084         resultLength=stepToUnicode(cc, cnv.getAlias(),
1085                                 result, UPRV_LENGTHOF(result),
1086                                 step==0 ? resultOffsets : NULL,
1087                                 step, errorCode);
1088         ok=checkToUnicode(
1089                 cc, cnv.getAlias(), steps[i].name,
1090                 result, resultLength,
1091                 cc.offsets!=NULL ? resultOffsets : NULL,
1092                 errorCode);
1093         if(errorCode.isFailure() || !cc.finalFlush) {
1094             // reset if an error occurred or we did not flush
1095             // otherwise do nothing to make sure that flushing resets
1096             ucnv_resetToUnicode(cnv.getAlias());
1097         }
1098         if (cc.offsets != NULL && resultOffsets[resultLength] != -1) {
1099             errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1100                 cc.caseNr, cc.charset, resultLength);
1101         }
1102         if (result[resultLength] != (UChar)-1) {
1103             errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1104                 cc.caseNr, cc.charset, resultLength);
1105         }
1106     }
1107
1108     // not a real loop, just a convenience for breaking out of the block
1109     while(ok && cc.finalFlush) {
1110         // test ucnv_toUChars()
1111         memset(result, 0, sizeof(result));
1112
1113         errorCode.reset();
1114         resultLength=ucnv_toUChars(cnv.getAlias(),
1115                         result, UPRV_LENGTHOF(result),
1116                         (const char *)cc.bytes, cc.bytesLength,
1117                         errorCode);
1118         ok=checkToUnicode(
1119                 cc, cnv.getAlias(), "toUChars",
1120                 result, resultLength,
1121                 NULL,
1122                 errorCode);
1123         if(!ok) {
1124             break;
1125         }
1126
1127         // test preflighting
1128         // keep the correct result for simple checking
1129         errorCode.reset();
1130         resultLength=ucnv_toUChars(cnv.getAlias(),
1131                         NULL, 0,
1132                         (const char *)cc.bytes, cc.bytesLength,
1133                         errorCode);
1134         if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) {
1135             errorCode.reset();
1136         }
1137         ok=checkToUnicode(
1138                 cc, cnv.getAlias(), "preflight toUChars",
1139                 result, resultLength,
1140                 NULL,
1141                 errorCode);
1142         break;
1143     }
1144
1145     errorCode.reset();  // all errors have already been reported
1146     return ok;
1147 }
1148
1149 UBool
1150 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1151                                const UChar *result, int32_t resultLength,
1152                                const int32_t *resultOffsets,
1153                                UErrorCode resultErrorCode) {
1154     char resultInvalidChars[8];
1155     int8_t resultInvalidLength;
1156     UErrorCode errorCode;
1157
1158     const char *msg;
1159
1160     // reset the message; NULL will mean "ok"
1161     msg=NULL;
1162
1163     errorCode=U_ZERO_ERROR;
1164     resultInvalidLength=sizeof(resultInvalidChars);
1165     ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1166     if(U_FAILURE(errorCode)) {
1167         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1168                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1169         return FALSE;
1170     }
1171
1172     // check everything that might have gone wrong
1173     if(cc.unicodeLength!=resultLength) {
1174         msg="wrong result length";
1175     } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1176         msg="wrong result string";
1177     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1178         msg="wrong offsets";
1179     } else if(cc.outErrorCode!=resultErrorCode) {
1180         msg="wrong error code";
1181     } else if(cc.invalidLength!=resultInvalidLength) {
1182         msg="wrong length of last invalid input";
1183     } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1184         msg="wrong last invalid input";
1185     }
1186
1187     if(msg==NULL) {
1188         return TRUE;
1189     } else {
1190         char buffer[2000]; // one buffer for all strings
1191         char *s, *bytesString, *unicodeString, *resultString,
1192             *offsetsString, *resultOffsetsString,
1193             *invalidCharsString, *resultInvalidCharsString;
1194
1195         bytesString=s=buffer;
1196         s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1197         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1198         s=printUnicode(result, resultLength, resultString=s);
1199         s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1200         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1201         s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1202         s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1203
1204         if((s-buffer)>(int32_t)sizeof(buffer)) {
1205             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1206                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1207             exit(1);
1208         }
1209
1210         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1211               "  bytes <%s>[%d]\n"
1212               " expected <%s>[%d]\n"
1213               "  result  <%s>[%d]\n"
1214               " offsets         <%s>\n"
1215               "  result offsets <%s>\n"
1216               " error code expected %s got %s\n"
1217               "  invalidChars expected <%s> got <%s>\n",
1218               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1219               bytesString, cc.bytesLength,
1220               unicodeString, cc.unicodeLength,
1221               resultString, resultLength,
1222               offsetsString,
1223               resultOffsetsString,
1224               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1225               invalidCharsString, resultInvalidCharsString);
1226
1227         return FALSE;
1228     }
1229 }
1230
1231 // fromUnicode test worker functions --------------------------------------- ***
1232
1233 static int32_t
1234 stepFromUTF8(ConversionCase &cc,
1235              UConverter *utf8Cnv, UConverter *cnv,
1236              char *result, int32_t resultCapacity,
1237              int32_t step,
1238              UErrorCode *pErrorCode) {
1239     const char *source, *sourceLimit, *utf8Limit;
1240     UChar pivotBuffer[32];
1241     UChar *pivotSource, *pivotTarget, *pivotLimit;
1242     char *target, *targetLimit, *resultLimit;
1243     UBool flush;
1244
1245     source=cc.utf8;
1246     pivotSource=pivotTarget=pivotBuffer;
1247     target=result;
1248     utf8Limit=source+cc.utf8Length;
1249     resultLimit=result+resultCapacity;
1250
1251     // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1252     // move only one buffer (in vs. out) at a time to be extra mean
1253     // step==0 performs bulk conversion
1254
1255     // initialize the partial limits for the loop
1256     if(step==0) {
1257         // use the entire buffers
1258         sourceLimit=utf8Limit;
1259         targetLimit=resultLimit;
1260         flush=cc.finalFlush;
1261
1262         pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
1263     } else {
1264         // start with empty partial buffers
1265         sourceLimit=source;
1266         targetLimit=target;
1267         flush=FALSE;
1268
1269         // empty pivot is not allowed, make it of length step
1270         pivotLimit=pivotBuffer+step;
1271     }
1272
1273     for(;;) {
1274         // resetting the opposite conversion direction must not affect this one
1275         ucnv_resetFromUnicode(utf8Cnv);
1276         ucnv_resetToUnicode(cnv);
1277
1278         // convert
1279         ucnv_convertEx(cnv, utf8Cnv,
1280             &target, targetLimit,
1281             &source, sourceLimit,
1282             pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1283             FALSE, flush, pErrorCode);
1284
1285         // check pointers and errors
1286         if(source>sourceLimit || target>targetLimit) {
1287             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1288             break;
1289         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1290             if(target!=targetLimit) {
1291                 // buffer overflow must only be set when the target is filled
1292                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1293                 break;
1294             } else if(targetLimit==resultLimit) {
1295                 // not just a partial overflow
1296                 break;
1297             }
1298
1299             // the partial target is filled, set a new limit, reset the error and continue
1300             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1301             *pErrorCode=U_ZERO_ERROR;
1302         } else if(U_FAILURE(*pErrorCode)) {
1303             if(pivotSource==pivotBuffer) {
1304                 // toUnicode error, should not occur
1305                 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1306                 break;
1307             } else {
1308                 // fromUnicode error
1309                 // some other error occurred, done
1310                 break;
1311             }
1312         } else {
1313             if(source!=sourceLimit) {
1314                 // when no error occurs, then the input must be consumed
1315                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1316                 break;
1317             }
1318
1319             if(sourceLimit==utf8Limit) {
1320                 // we are done
1321                 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1322                     // ucnv_convertEx() warns about not terminating the output
1323                     // but ucnv_fromUnicode() does not and so
1324                     // checkFromUnicode() does not expect it
1325                     *pErrorCode=U_ZERO_ERROR;
1326                 }
1327                 break;
1328             }
1329
1330             // the partial conversion succeeded, set a new limit and continue
1331             sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1332             flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1333         }
1334     }
1335
1336     return (int32_t)(target-result);
1337 }
1338
1339 static int32_t
1340 stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1341                 char *result, int32_t resultCapacity,
1342                 int32_t *resultOffsets, /* also resultCapacity */
1343                 int32_t step,
1344                 UErrorCode *pErrorCode) {
1345     const UChar *source, *sourceLimit, *unicodeLimit;
1346     char *target, *targetLimit, *resultLimit;
1347     UBool flush;
1348
1349     source=cc.unicode;
1350     target=result;
1351     unicodeLimit=source+cc.unicodeLength;
1352     resultLimit=result+resultCapacity;
1353
1354     // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1355     // move only one buffer (in vs. out) at a time to be extra mean
1356     // step==0 performs bulk conversion and generates offsets
1357
1358     // initialize the partial limits for the loop
1359     if(step==0) {
1360         // use the entire buffers
1361         sourceLimit=unicodeLimit;
1362         targetLimit=resultLimit;
1363         flush=cc.finalFlush;
1364     } else {
1365         // start with empty partial buffers
1366         sourceLimit=source;
1367         targetLimit=target;
1368         flush=FALSE;
1369
1370         // output offsets only for bulk conversion
1371         resultOffsets=NULL;
1372     }
1373
1374     for(;;) {
1375         // resetting the opposite conversion direction must not affect this one
1376         ucnv_resetToUnicode(cnv);
1377
1378         // convert
1379         ucnv_fromUnicode(cnv,
1380             &target, targetLimit,
1381             &source, sourceLimit,
1382             resultOffsets,
1383             flush, pErrorCode);
1384
1385         // check pointers and errors
1386         if(source>sourceLimit || target>targetLimit) {
1387             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1388             break;
1389         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1390             if(target!=targetLimit) {
1391                 // buffer overflow must only be set when the target is filled
1392                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1393                 break;
1394             } else if(targetLimit==resultLimit) {
1395                 // not just a partial overflow
1396                 break;
1397             }
1398
1399             // the partial target is filled, set a new limit, reset the error and continue
1400             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1401             *pErrorCode=U_ZERO_ERROR;
1402         } else if(U_FAILURE(*pErrorCode)) {
1403             // some other error occurred, done
1404             break;
1405         } else {
1406             if(source!=sourceLimit) {
1407                 // when no error occurs, then the input must be consumed
1408                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1409                 break;
1410             }
1411
1412             if(sourceLimit==unicodeLimit) {
1413                 // we are done
1414                 break;
1415             }
1416
1417             // the partial conversion succeeded, set a new limit and continue
1418             sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1419             flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1420         }
1421     }
1422
1423     return (int32_t)(target-result);
1424 }
1425
1426 UBool
1427 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1428     UConverter *cnv;
1429     UErrorCode errorCode;
1430
1431     // open the converter
1432     errorCode=U_ZERO_ERROR;
1433     cnv=cnv_open(cc.charset, errorCode);
1434     if(U_FAILURE(errorCode)) {
1435         errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1436                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1437         return FALSE;
1438     }
1439     ucnv_resetToUnicode(utf8Cnv);
1440
1441     // set the callback
1442     if(callback!=NULL) {
1443         ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1444         if(U_FAILURE(errorCode)) {
1445             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1446                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1447             ucnv_close(cnv);
1448             return FALSE;
1449         }
1450     }
1451
1452     // set the fallbacks flag
1453     // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1454     ucnv_setFallback(cnv, cc.fallbacks);
1455
1456     // set the subchar
1457     int32_t length;
1458
1459     if(cc.setSub>0) {
1460         length=(int32_t)strlen(cc.subchar);
1461         ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1462         if(U_FAILURE(errorCode)) {
1463             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1464                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1465             ucnv_close(cnv);
1466             return FALSE;
1467         }
1468     } else if(cc.setSub<0) {
1469         ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1470         if(U_FAILURE(errorCode)) {
1471             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1472                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1473             ucnv_close(cnv);
1474             return FALSE;
1475         }
1476     }
1477
1478     // convert unicode to utf8
1479     char utf8[256];
1480     cc.utf8=utf8;
1481     u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
1482                 cc.unicode, cc.unicodeLength,
1483                 &errorCode);
1484     if(U_FAILURE(errorCode)) {
1485         // skip UTF-8 testing of a string with an unpaired surrogate,
1486         // or of one that's too long
1487         // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1488         cc.utf8Length=-1;
1489     }
1490
1491     int32_t resultOffsets[256];
1492     char result[256];
1493     int32_t resultLength;
1494     UBool ok;
1495
1496     static const struct {
1497         int32_t step;
1498         const char *name, *utf8Name;
1499     } steps[]={
1500         { 0, "bulk",   "utf8" }, // must be first for offsets to be checked
1501         { 1, "step=1", "utf8 step=1" },
1502         { 3, "step=3", "utf8 step=3" },
1503         { 7, "step=7", "utf8 step=7" }
1504     };
1505     int32_t i, step;
1506
1507     ok=TRUE;
1508     for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
1509         step=steps[i].step;
1510         memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
1511         memset(result, -1, UPRV_LENGTHOF(result));
1512         errorCode=U_ZERO_ERROR;
1513         resultLength=stepFromUnicode(cc, cnv,
1514                                 result, UPRV_LENGTHOF(result),
1515                                 step==0 ? resultOffsets : NULL,
1516                                 step, &errorCode);
1517         ok=checkFromUnicode(
1518                 cc, cnv, steps[i].name,
1519                 (uint8_t *)result, resultLength,
1520                 cc.offsets!=NULL ? resultOffsets : NULL,
1521                 errorCode);
1522         if(U_FAILURE(errorCode) || !cc.finalFlush) {
1523             // reset if an error occurred or we did not flush
1524             // otherwise do nothing to make sure that flushing resets
1525             ucnv_resetFromUnicode(cnv);
1526         }
1527         if (resultOffsets[resultLength] != -1) {
1528             errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1529                 cc.caseNr, cc.charset, resultLength);
1530         }
1531         if (result[resultLength] != (char)-1) {
1532             errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1533                 cc.caseNr, cc.charset, resultLength);
1534         }
1535
1536         // bulk test is first, then offsets are not checked any more
1537         cc.offsets=NULL;
1538
1539         // test direct conversion from UTF-8
1540         if(cc.utf8Length>=0) {
1541             errorCode=U_ZERO_ERROR;
1542             resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1543                                     result, UPRV_LENGTHOF(result),
1544                                     step, &errorCode);
1545             ok=checkFromUnicode(
1546                     cc, cnv, steps[i].utf8Name,
1547                     (uint8_t *)result, resultLength,
1548                     NULL,
1549                     errorCode);
1550             if(U_FAILURE(errorCode) || !cc.finalFlush) {
1551                 // reset if an error occurred or we did not flush
1552                 // otherwise do nothing to make sure that flushing resets
1553                 ucnv_resetToUnicode(utf8Cnv);
1554                 ucnv_resetFromUnicode(cnv);
1555             }
1556         }
1557     }
1558
1559     // not a real loop, just a convenience for breaking out of the block
1560     while(ok && cc.finalFlush) {
1561         // test ucnv_fromUChars()
1562         memset(result, 0, sizeof(result));
1563
1564         errorCode=U_ZERO_ERROR;
1565         resultLength=ucnv_fromUChars(cnv,
1566                         result, UPRV_LENGTHOF(result),
1567                         cc.unicode, cc.unicodeLength,
1568                         &errorCode);
1569         ok=checkFromUnicode(
1570                 cc, cnv, "fromUChars",
1571                 (uint8_t *)result, resultLength,
1572                 NULL,
1573                 errorCode);
1574         if(!ok) {
1575             break;
1576         }
1577
1578         // test preflighting
1579         // keep the correct result for simple checking
1580         errorCode=U_ZERO_ERROR;
1581         resultLength=ucnv_fromUChars(cnv,
1582                         NULL, 0,
1583                         cc.unicode, cc.unicodeLength,
1584                         &errorCode);
1585         if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1586             errorCode=U_ZERO_ERROR;
1587         }
1588         ok=checkFromUnicode(
1589                 cc, cnv, "preflight fromUChars",
1590                 (uint8_t *)result, resultLength,
1591                 NULL,
1592                 errorCode);
1593         break;
1594     }
1595
1596     ucnv_close(cnv);
1597     return ok;
1598 }
1599
1600 UBool
1601 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1602                                  const uint8_t *result, int32_t resultLength,
1603                                  const int32_t *resultOffsets,
1604                                  UErrorCode resultErrorCode) {
1605     UChar resultInvalidUChars[8];
1606     int8_t resultInvalidLength;
1607     UErrorCode errorCode;
1608
1609     const char *msg;
1610
1611     // reset the message; NULL will mean "ok"
1612     msg=NULL;
1613
1614     errorCode=U_ZERO_ERROR;
1615     resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
1616     ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1617     if(U_FAILURE(errorCode)) {
1618         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1619                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1620         return FALSE;
1621     }
1622
1623     // check everything that might have gone wrong
1624     if(cc.bytesLength!=resultLength) {
1625         msg="wrong result length";
1626     } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1627         msg="wrong result string";
1628     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1629         msg="wrong offsets";
1630     } else if(cc.outErrorCode!=resultErrorCode) {
1631         msg="wrong error code";
1632     } else if(cc.invalidLength!=resultInvalidLength) {
1633         msg="wrong length of last invalid input";
1634     } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1635         msg="wrong last invalid input";
1636     }
1637
1638     if(msg==NULL) {
1639         return TRUE;
1640     } else {
1641         char buffer[2000]; // one buffer for all strings
1642         char *s, *unicodeString, *bytesString, *resultString,
1643             *offsetsString, *resultOffsetsString,
1644             *invalidCharsString, *resultInvalidUCharsString;
1645
1646         unicodeString=s=buffer;
1647         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1648         s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1649         s=printBytes(result, resultLength, resultString=s);
1650         s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1651         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1652         s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1653         s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1654
1655         if((s-buffer)>(int32_t)sizeof(buffer)) {
1656             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1657                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1658             exit(1);
1659         }
1660
1661         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1662               "  unicode <%s>[%d]\n"
1663               " expected <%s>[%d]\n"
1664               "  result  <%s>[%d]\n"
1665               " offsets         <%s>\n"
1666               "  result offsets <%s>\n"
1667               " error code expected %s got %s\n"
1668               "  invalidChars expected <%s> got <%s>\n",
1669               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1670               unicodeString, cc.unicodeLength,
1671               bytesString, cc.bytesLength,
1672               resultString, resultLength,
1673               offsetsString,
1674               resultOffsetsString,
1675               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1676               invalidCharsString, resultInvalidUCharsString);
1677
1678         return FALSE;
1679     }
1680 }
1681
1682 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */