icuSources/test/intltest/convtest.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2003-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  convtest.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2003jul15
  14 *   created by: Markus W. Scherer
  15 *
  16 *   Test file for data-driven conversion tests.
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_LEGACY_CONVERSION
/*
 * Note: Turning off all of convtest.cpp if UCONFIG_NO_LEGACY_CONVERSION is set
 * is slightly unnecessary - it removes tests for Unicode charsets
 * like UTF-8 that should work.
 * However, there is no easy way for the test to detect whether a test case
 * is for a Unicode charset, so it would be difficult to exclude only the
 * test cases for the other (legacy) charsets.
 * Also, regular testing of ICU is done with all modules on, therefore
 * not testing conversion for a custom configuration like this should be ok.
 */
  31
  32 #include "unicode/ucnv.h"
  33 #include "unicode/unistr.h"
  34 #include "unicode/parsepos.h"
  35 #include "unicode/uniset.h"
  36 #include "unicode/ustring.h"
  37 #include "unicode/ures.h"
  38 #include "convtest.h"
  39 #include "unicode/tstdtmod.h"
  40 #include <string.h>
  41 #include <stdlib.h>
  42
  43 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  44
  45 enum {
  46     // characters used in test data for callbacks
  47     SUB_CB='?',
  48     SKIP_CB='0',
  49     STOP_CB='.',
  50     ESC_CB='&'
  51 };
  52
  53 ConversionTest::ConversionTest() {
  54     UErrorCode errorCode=U_ZERO_ERROR;
  55     utf8Cnv=ucnv_open("UTF-8", &errorCode);
  56     ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
  57     if(U_FAILURE(errorCode)) {
  58         errln("unable to open UTF-8 converter");
  59     }
  60 }
  61
  62 ConversionTest::~ConversionTest() {
  63     ucnv_close(utf8Cnv);
  64 }
  65
  66 void
  67 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
  68     if (exec) logln("TestSuite ConversionTest: ");
  69     switch (index) {
  70         case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
  71         case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
  72         case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
  73         case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
  74         default: name=""; break; //needed to end loop
  75     }
  76 }
  77
  78 // test data interface ----------------------------------------------------- ***
  79
  80 void
  81 ConversionTest::TestToUnicode() {
  82     ConversionCase cc;
  83     char charset[100], cbopt[4];
  84     const char *option;
  85     UnicodeString s, unicode;
  86     int32_t offsetsLength;
  87     UConverterToUCallback callback;
  88
  89     TestDataModule *dataModule;
  90     TestData *testData;
  91     const DataMap *testCase;
  92     UErrorCode errorCode;
  93     int32_t i;
  94
  95     errorCode=U_ZERO_ERROR;
  96     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
  97     if(U_SUCCESS(errorCode)) {
  98         testData=dataModule->createTestData("toUnicode", errorCode);
  99         if(U_SUCCESS(errorCode)) {
 100             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 101                 if(U_FAILURE(errorCode)) {
 102                     errln("error retrieving conversion/toUnicode test case %d - %s",
 103                             i, u_errorName(errorCode));
 104                     errorCode=U_ZERO_ERROR;
 105                     continue;
 106                 }
 107
 108                 cc.caseNr=i;
 109
 110                 s=testCase->getString("charset", errorCode);
 111                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 112                 cc.charset=charset;
 113
 114                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
 115                 unicode=testCase->getString("unicode", errorCode);
 116                 cc.unicode=unicode.getBuffer();
 117                 cc.unicodeLength=unicode.length();
 118
 119                 offsetsLength=0;
 120                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
 121                 if(offsetsLength==0) {
 122                     cc.offsets=NULL;
 123                 } else if(offsetsLength!=unicode.length()) {
 124                     errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length",
 125                             i, unicode.length(), offsetsLength);
 126                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 127                 }
 128
 129                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
 130                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
 131
 132                 s=testCase->getString("errorCode", errorCode);
 133                 if(s==UNICODE_STRING("invalid", 7)) {
 134                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
 135                 } else if(s==UNICODE_STRING("illegal", 7)) {
 136                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
 137                 } else if(s==UNICODE_STRING("truncated", 9)) {
 138                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
 139                 } else if(s==UNICODE_STRING("illesc", 6)) {
 140                     cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
 141                 } else if(s==UNICODE_STRING("unsuppesc", 9)) {
 142                     cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE;
 143                 } else {
 144                     cc.outErrorCode=U_ZERO_ERROR;
 145                 }
 146
 147                 s=testCase->getString("callback", errorCode);
 148                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
 149                 cc.cbopt=cbopt;
 150                 switch(cbopt[0]) {
 151                 case SUB_CB:
 152                     callback=UCNV_TO_U_CALLBACK_SUBSTITUTE;
 153                     break;
 154                 case SKIP_CB:
 155                     callback=UCNV_TO_U_CALLBACK_SKIP;
 156                     break;
 157                 case STOP_CB:
 158                     callback=UCNV_TO_U_CALLBACK_STOP;
 159                     break;
 160                 case ESC_CB:
 161                     callback=UCNV_TO_U_CALLBACK_ESCAPE;
 162                     break;
 163                 default:
 164                     callback=NULL;
 165                     break;
 166                 }
 167                 option=callback==NULL ? cbopt : cbopt+1;
 168                 if(*option==0) {
 169                     option=NULL;
 170                 }
 171
 172                 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode);
 173
 174                 if(U_FAILURE(errorCode)) {
 175                     errln("error parsing conversion/toUnicode test case %d - %s",
 176                             i, u_errorName(errorCode));
 177                     errorCode=U_ZERO_ERROR;
 178                 } else {
 179                     logln("TestToUnicode[%d] %s", i, charset);
 180                     ToUnicodeCase(cc, callback, option);
 181                 }
 182             }
 183             delete testData;
 184         }
 185         delete dataModule;
 186     }
 187     else {
 188         dataerrln("[DATA] Could not load test conversion data");
 189     }
 190 }
 191
 192 void
 193 ConversionTest::TestFromUnicode() {
 194     ConversionCase cc;
 195     char charset[100], cbopt[4];
 196     const char *option;
 197     UnicodeString s, unicode, invalidUChars;
 198     int32_t offsetsLength, index;
 199     UConverterFromUCallback callback;
 200
 201     TestDataModule *dataModule;
 202     TestData *testData;
 203     const DataMap *testCase;
 204     const UChar *p;
 205     UErrorCode errorCode;
 206     int32_t i, length;
 207
 208     errorCode=U_ZERO_ERROR;
 209     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
 210     if(U_SUCCESS(errorCode)) {
 211         testData=dataModule->createTestData("fromUnicode", errorCode);
 212         if(U_SUCCESS(errorCode)) {
 213             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 214                 if(U_FAILURE(errorCode)) {
 215                     errln("error retrieving conversion/fromUnicode test case %d - %s",
 216                             i, u_errorName(errorCode));
 217                     errorCode=U_ZERO_ERROR;
 218                     continue;
 219                 }
 220
 221                 cc.caseNr=i;
 222
 223                 s=testCase->getString("charset", errorCode);
 224                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 225                 cc.charset=charset;
 226
 227                 unicode=testCase->getString("unicode", errorCode);
 228                 cc.unicode=unicode.getBuffer();
 229                 cc.unicodeLength=unicode.length();
 230                 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode);
 231
 232                 offsetsLength=0;
 233                 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode);
 234                 if(offsetsLength==0) {
 235                     cc.offsets=NULL;
 236                 } else if(offsetsLength!=cc.bytesLength) {
 237                     errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length",
 238                             i, cc.bytesLength, offsetsLength);
 239                     errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 240                 }
 241
 242                 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode);
 243                 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode);
 244
 245                 s=testCase->getString("errorCode", errorCode);
 246                 if(s==UNICODE_STRING("invalid", 7)) {
 247                     cc.outErrorCode=U_INVALID_CHAR_FOUND;
 248                 } else if(s==UNICODE_STRING("illegal", 7)) {
 249                     cc.outErrorCode=U_ILLEGAL_CHAR_FOUND;
 250                 } else if(s==UNICODE_STRING("truncated", 9)) {
 251                     cc.outErrorCode=U_TRUNCATED_CHAR_FOUND;
 252                 } else {
 253                     cc.outErrorCode=U_ZERO_ERROR;
 254                 }
 255
 256                 s=testCase->getString("callback", errorCode);
 257                 cc.setSub=0; // default: no subchar
 258
 259                 if((index=s.indexOf((UChar)0))>0) {
 260                     // read NUL-separated subchar first, if any
 261                     // copy the subchar from Latin-1 characters
 262                     // start after the NUL
 263                     p=s.getTerminatedBuffer();
 264                     length=index+1;
 265                     p+=length;
 266                     length=s.length()-length;
 267                     if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) {
 268                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 269                     } else {
 270                         int32_t j;
 271
 272                         for(j=0; j<length; ++j) {
 273                             cc.subchar[j]=(char)p[j];
 274                         }
 275                         // NUL-terminate the subchar
 276                         cc.subchar[j]=0;
 277                         cc.setSub=1;
 278                     }
 279
 280                     // remove the NUL and subchar from s
 281                     s.truncate(index);
 282                 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ {
 283                     // read a substitution string, separated by an equal sign
 284                     p=s.getBuffer()+index+1;
 285                     length=s.length()-(index+1);
 286                     if(length<0 || length>=LENGTHOF(cc.subString)) {
 287                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
 288                     } else {
 289                         u_memcpy(cc.subString, p, length);
 290                         // NUL-terminate the subString
 291                         cc.subString[length]=0;
 292                         cc.setSub=-1;
 293                     }
 294
 295                     // remove the equal sign and subString from s
 296                     s.truncate(index);
 297                 }
 298
 299                 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), "");
 300                 cc.cbopt=cbopt;
 301                 switch(cbopt[0]) {
 302                 case SUB_CB:
 303                     callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE;
 304                     break;
 305                 case SKIP_CB:
 306                     callback=UCNV_FROM_U_CALLBACK_SKIP;
 307                     break;
 308                 case STOP_CB:
 309                     callback=UCNV_FROM_U_CALLBACK_STOP;
 310                     break;
 311                 case ESC_CB:
 312                     callback=UCNV_FROM_U_CALLBACK_ESCAPE;
 313                     break;
 314                 default:
 315                     callback=NULL;
 316                     break;
 317                 }
 318                 option=callback==NULL ? cbopt : cbopt+1;
 319                 if(*option==0) {
 320                     option=NULL;
 321                 }
 322
 323                 invalidUChars=testCase->getString("invalidUChars", errorCode);
 324                 cc.invalidUChars=invalidUChars.getBuffer();
 325                 cc.invalidLength=invalidUChars.length();
 326
 327                 if(U_FAILURE(errorCode)) {
 328                     errln("error parsing conversion/fromUnicode test case %d - %s",
 329                             i, u_errorName(errorCode));
 330                     errorCode=U_ZERO_ERROR;
 331                 } else {
 332                     logln("TestFromUnicode[%d] %s", i, charset);
 333                     FromUnicodeCase(cc, callback, option);
 334                 }
 335             }
 336             delete testData;
 337         }
 338         delete dataModule;
 339     }
 340     else {
 341         dataerrln("[DATA] Could not load test conversion data");
 342     }
 343 }
 344
 345 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e };
 346
 347 void
 348 ConversionTest::TestGetUnicodeSet() {
 349     char charset[100];
 350     UnicodeString s, map, mapnot;
 351     int32_t which;
 352
 353     ParsePosition pos;
 354     UnicodeSet cnvSet, mapSet, mapnotSet, diffSet;
 355     UnicodeSet *cnvSetPtr = &cnvSet;
 356     UConverter *cnv;
 357
 358     TestDataModule *dataModule;
 359     TestData *testData;
 360     const DataMap *testCase;
 361     UErrorCode errorCode;
 362     int32_t i;
 363
 364     errorCode=U_ZERO_ERROR;
 365     dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode);
 366     if(U_SUCCESS(errorCode)) {
 367         testData=dataModule->createTestData("getUnicodeSet", errorCode);
 368         if(U_SUCCESS(errorCode)) {
 369             for(i=0; testData->nextCase(testCase, errorCode); ++i) {
 370                 if(U_FAILURE(errorCode)) {
 371                     errln("error retrieving conversion/getUnicodeSet test case %d - %s",
 372                             i, u_errorName(errorCode));
 373                     errorCode=U_ZERO_ERROR;
 374                     continue;
 375                 }
 376
 377                 s=testCase->getString("charset", errorCode);
 378                 s.extract(0, 0x7fffffff, charset, sizeof(charset), "");
 379
 380                 map=testCase->getString("map", errorCode);
 381                 mapnot=testCase->getString("mapnot", errorCode);
 382
 383                 which=testCase->getInt28("which", errorCode);
 384
 385                 if(U_FAILURE(errorCode)) {
 386                     errln("error parsing conversion/getUnicodeSet test case %d - %s",
 387                             i, u_errorName(errorCode));
 388                     errorCode=U_ZERO_ERROR;
 389                     continue;
 390                 }
 391
 392                 // test this test case
 393                 mapSet.clear();
 394                 mapnotSet.clear();
 395
 396                 pos.setIndex(0);
 397                 mapSet.applyPattern(map, pos, 0, NULL, errorCode);
 398                 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) {
 399                     errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n"
 400                           "    error index %d  index %d  U+%04x",
 401                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex()));
 402                     errorCode=U_ZERO_ERROR;
 403                     continue;
 404                 }
 405
 406                 pos.setIndex(0);
 407                 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode);
 408                 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) {
 409                     errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n"
 410                           "    error index %d  index %d  U+%04x",
 411                             i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex()));
 412                     errorCode=U_ZERO_ERROR;
 413                     continue;
 414                 }
 415
 416                 logln("TestGetUnicodeSet[%d] %s", i, charset);
 417
 418                 cnv=cnv_open(charset, errorCode);
 419                 if(U_FAILURE(errorCode)) {
 420                     errln("error opening \"%s\" for conversion/getUnicodeSet test case %d - %s",
 421                             charset, i, u_errorName(errorCode));
 422                     errorCode=U_ZERO_ERROR;
 423                     continue;
 424                 }
 425
 426                 ucnv_getUnicodeSet(cnv, (USet *)cnvSetPtr, (UConverterUnicodeSet)which, &errorCode);
 427                 ucnv_close(cnv);
 428
 429                 if(U_FAILURE(errorCode)) {
 430                     errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s",
 431                             charset, i, u_errorName(errorCode));
 432                     errorCode=U_ZERO_ERROR;
 433                     continue;
 434                 }
 435
 436                 // are there items that must be in cnvSet but are not?
 437                 (diffSet=mapSet).removeAll(cnvSet);
 438                 if(!diffSet.isEmpty()) {
 439                     diffSet.toPattern(s, TRUE);
 440                     if(s.length()>100) {
 441                         s.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
 442                     }
 443                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
 444                             charset, i);
 445                     errln(s);
 446                 }
 447
 448                 // are there items that must not be in cnvSet but are?
 449                 (diffSet=mapnotSet).retainAll(cnvSet);
 450                 if(!diffSet.isEmpty()) {
 451                     diffSet.toPattern(s, TRUE);
 452                     if(s.length()>100) {
 453                         s.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
 454                     }
 455                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
 456                             charset, i);
 457                     errln(s);
 458                 }
 459             }
 460             delete testData;
 461         }
 462         delete dataModule;
 463     }
 464     else {
 465         dataerrln("[DATA] Could not load test conversion data");
 466     }
 467 }
 468
 469 U_CDECL_BEGIN
 470 static void U_CALLCONV
 471 getUnicodeSetCallback(const void *context,
 472                       UConverterFromUnicodeArgs * /*fromUArgs*/,
 473                       const UChar* /*codeUnits*/,
 474                       int32_t /*length*/,
 475                       UChar32 codePoint,
 476                       UConverterCallbackReason reason,
 477                       UErrorCode *pErrorCode) {
 478     if(reason<=UCNV_IRREGULAR) {
 479         ((UnicodeSet *)context)->remove(codePoint);  // the converter cannot convert this code point
 480         *pErrorCode=U_ZERO_ERROR;                    // skip
 481     }  // else ignore the reset, close and clone calls.
 482 }
 483 U_CDECL_END
 484
 485 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
 486 void
 487 ConversionTest::TestGetUnicodeSet2() {
 488     // Build a string with all code points.
 489     UChar32 cpLimit;
 490     int32_t s0Length;
 491     if(quick) {
 492         cpLimit=s0Length=0x10000;  // BMP only
 493     } else {
 494         cpLimit=0x110000;
 495         s0Length=0x10000+0x200000;  // BMP + surrogate pairs
 496     }
 497     UChar *s0=new UChar[s0Length];
 498     if(s0==NULL) {
 499         return;
 500     }
 501     UChar *s=s0;
 502     UChar32 c;
 503     UChar c2;
 504     // low BMP
 505     for(c=0; c<=0xd7ff; ++c) {
 506         *s++=(UChar)c;
 507     }
 508     // trail surrogates
 509     for(c=0xdc00; c<=0xdfff; ++c) {
 510         *s++=(UChar)c;
 511     }
 512     // lead surrogates
 513     // (after trails so that there is not even one surrogate pair in between)
 514     for(c=0xd800; c<=0xdbff; ++c) {
 515         *s++=(UChar)c;
 516     }
 517     // high BMP
 518     for(c=0xe000; c<=0xffff; ++c) {
 519         *s++=(UChar)c;
 520     }
 521     // supplementary code points = surrogate pairs
 522     if(cpLimit==0x110000) {
 523         for(c=0xd800; c<=0xdbff; ++c) {
 524             for(c2=0xdc00; c2<=0xdfff; ++c2) {
 525                 *s++=(UChar)c;
 526                 *s++=c2;
 527             }
 528         }
 529     }
 530
 531     static const char *const cnvNames[]={
 532         "UTF-8",
 533         "UTF-7",
 534         "UTF-16",
 535         "US-ASCII",
 536         "ISO-8859-1",
 537         "windows-1252",
 538         "Shift-JIS",
 539         "ibm-1390",  // EBCDIC_STATEFUL table
 540         "ibm-16684",  // DBCS-only extension table based on EBCDIC_STATEFUL table
 541         "HZ",
 542         "ISO-2022-JP",
 543         "JIS7",
 544         "ISO-2022-CN",
 545         "ISO-2022-CN-EXT",
 546         "LMBCS"
 547     };
 548     char buffer[1024];
 549     int32_t i;
 550     for(i=0; i<LENGTHOF(cnvNames); ++i) {
 551         UErrorCode errorCode=U_ZERO_ERROR;
 552         UConverter *cnv=cnv_open(cnvNames[i], errorCode);
 553         if(U_FAILURE(errorCode)) {
 554             errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
 555             continue;
 556         }
 557         UnicodeSet expected;
 558         ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
 559         if(U_FAILURE(errorCode)) {
 560             errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
 561             ucnv_close(cnv);
 562             continue;
 563         }
 564         UConverterUnicodeSet which;
 565         for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
 566             if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
 567                 ucnv_setFallback(cnv, TRUE);
 568             }
 569             expected.add(0, cpLimit-1);
 570             s=s0;
 571             UBool flush;
 572             do {
 573                 char *t=buffer;
 574                 flush=(UBool)(s==s0+s0Length);
 575                 ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
 576                 if(U_FAILURE(errorCode)) {
 577                     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 578                         errorCode=U_ZERO_ERROR;
 579                         continue;
 580                     } else {
 581                         break;  // unexpected error, should not occur
 582                     }
 583                 }
 584             } while(!flush);
 585             UnicodeSet set;
 586             ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
 587             if(cpLimit<0x110000) {
 588                 set.remove(cpLimit, 0x10ffff);
 589             }
 590             if(which==UCNV_ROUNDTRIP_SET) {
 591                 // ignore PUA code points because they will be converted even if they
 592                 // are fallbacks and when other fallbacks are turned off,
 593                 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
 594                 expected.remove(0xe000, 0xf8ff);
 595                 expected.remove(0xf0000, 0xffffd);
 596                 expected.remove(0x100000, 0x10fffd);
 597                 set.remove(0xe000, 0xf8ff);
 598                 set.remove(0xf0000, 0xffffd);
 599                 set.remove(0x100000, 0x10fffd);
 600             }
 601             if(set!=expected) {
 602                 // First try to see if we have different sets because ucnv_getUnicodeSet()
 603                 // added strings: The above conversion method does not tell us what strings might be convertible.
 604                 // Remove strings from the set and compare again.
 605                 // Unfortunately, there are no good, direct set methods for finding out whether there are strings
 606                 // in the set, nor for enumerating or removing just them.
 607                 // Intersect all code points with the set. The intersection will not contain strings.
 608                 UnicodeSet temp(0, 0x10ffff);
 609                 temp.retainAll(set);
 610                 set=temp;
 611             }
 612             if(set!=expected) {
 613                 UnicodeSet diffSet;
 614                 UnicodeString out;
 615
 616                 // are there items that must be in the set but are not?
 617                 (diffSet=expected).removeAll(set);
 618                 if(!diffSet.isEmpty()) {
 619                     diffSet.toPattern(out, TRUE);
 620                     if(out.length()>100) {
 621                         out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
 622                     }
 623                     errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
 624                             cnvNames[i], which);
 625                     errln(out);
 626                 }
 627
 628                 // are there items that must not be in the set but are?
 629                 (diffSet=set).removeAll(expected);
 630                 if(!diffSet.isEmpty()) {
 631                     diffSet.toPattern(out, TRUE);
 632                     if(out.length()>100) {
 633                         out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
 634                     }
 635                     errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
 636                             cnvNames[i], which);
 637                     errln(out);
 638                 }
 639             }
 640         }
 641         ucnv_close(cnv);
 642     }
 643
 644     delete [] s0;
 645 }
 646
 647 // open testdata or ICU data converter ------------------------------------- ***
 648
 649 UConverter *
 650 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
 651     if(name!=NULL && *name=='*') {
 652         /* loadTestData(): set the data directory */
 653         return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
 654     } else {
 655         return ucnv_open(name, &errorCode);
 656     }
 657 }
 658
 659 // output helpers ---------------------------------------------------------- ***
 660
 661 static inline char
 662 hexDigit(uint8_t digit) {
 663     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
 664 }
 665
 666 static char *
 667 printBytes(const uint8_t *bytes, int32_t length, char *out) {
 668     uint8_t b;
 669
 670     if(length>0) {
 671         b=*bytes++;
 672         --length;
 673         *out++=hexDigit((uint8_t)(b>>4));
 674         *out++=hexDigit((uint8_t)(b&0xf));
 675     }
 676
 677     while(length>0) {
 678         b=*bytes++;
 679         --length;
 680         *out++=' ';
 681         *out++=hexDigit((uint8_t)(b>>4));
 682         *out++=hexDigit((uint8_t)(b&0xf));
 683     }
 684     *out++=0;
 685     return out;
 686 }
 687
 688 static char *
 689 printUnicode(const UChar *unicode, int32_t length, char *out) {
 690     UChar32 c;
 691     int32_t i;
 692
 693     for(i=0; i<length;) {
 694         if(i>0) {
 695             *out++=' ';
 696         }
 697         U16_NEXT(unicode, i, length, c);
 698         // write 4..6 digits
 699         if(c>=0x100000) {
 700             *out++='1';
 701         }
 702         if(c>=0x10000) {
 703             *out++=hexDigit((uint8_t)((c>>16)&0xf));
 704         }
 705         *out++=hexDigit((uint8_t)((c>>12)&0xf));
 706         *out++=hexDigit((uint8_t)((c>>8)&0xf));
 707         *out++=hexDigit((uint8_t)((c>>4)&0xf));
 708         *out++=hexDigit((uint8_t)(c&0xf));
 709     }
 710     *out++=0;
 711     return out;
 712 }
 713
 714 static char *
 715 printOffsets(const int32_t *offsets, int32_t length, char *out) {
 716     int32_t i, o, d;
 717
 718     if(offsets==NULL) {
 719         length=0;
 720     }
 721
 722     for(i=0; i<length; ++i) {
 723         if(i>0) {
 724             *out++=' ';
 725         }
 726         o=offsets[i];
 727
 728         // print all offsets with 2 characters each (-x, -9..99, xx)
 729         if(o<-9) {
 730             *out++='-';
 731             *out++='x';
 732         } else if(o<0) {
 733             *out++='-';
 734             *out++=(char)('0'-o);
 735         } else if(o<=99) {
 736             *out++=(d=o/10)==0 ? ' ' : (char)('0'+d);
 737             *out++=(char)('0'+o%10);
 738         } else /* o>99 */ {
 739             *out++='x';
 740             *out++='x';
 741         }
 742     }
 743     *out++=0;
 744     return out;
 745 }
 746
 747 // toUnicode test worker functions ----------------------------------------- ***
 748
 749 static int32_t
 750 stepToUnicode(ConversionCase &cc, UConverter *cnv,
 751               UChar *result, int32_t resultCapacity,
 752               int32_t *resultOffsets, /* also resultCapacity */
 753               int32_t step,
 754               UErrorCode *pErrorCode) {
 755     const char *source, *sourceLimit, *bytesLimit;
 756     UChar *target, *targetLimit, *resultLimit;
 757     UBool flush;
 758
 759     source=(const char *)cc.bytes;
 760     target=result;
 761     bytesLimit=source+cc.bytesLength;
 762     resultLimit=result+resultCapacity;
 763
 764     if(step>=0) {
 765         // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time
 766         // move only one buffer (in vs. out) at a time to be extra mean
 767         // step==0 performs bulk conversion and generates offsets
 768
 769         // initialize the partial limits for the loop
 770         if(step==0) {
 771             // use the entire buffers
 772             sourceLimit=bytesLimit;
 773             targetLimit=resultLimit;
 774             flush=cc.finalFlush;
 775         } else {
 776             // start with empty partial buffers
 777             sourceLimit=source;
 778             targetLimit=target;
 779             flush=FALSE;
 780
 781             // output offsets only for bulk conversion
 782             resultOffsets=NULL;
 783         }
 784
 785         for(;;) {
 786             // resetting the opposite conversion direction must not affect this one
 787             ucnv_resetFromUnicode(cnv);
 788
 789             // convert
 790             ucnv_toUnicode(cnv,
 791                 &target, targetLimit,
 792                 &source, sourceLimit,
 793                 resultOffsets,
 794                 flush, pErrorCode);
 795
 796             // check pointers and errors
 797             if(source>sourceLimit || target>targetLimit) {
 798                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 799                 break;
 800             } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
 801                 if(target!=targetLimit) {
 802                     // buffer overflow must only be set when the target is filled
 803                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 804                     break;
 805                 } else if(targetLimit==resultLimit) {
 806                     // not just a partial overflow
 807                     break;
 808                 }
 809
 810                 // the partial target is filled, set a new limit, reset the error and continue
 811                 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
 812                 *pErrorCode=U_ZERO_ERROR;
 813             } else if(U_FAILURE(*pErrorCode)) {
 814                 // some other error occurred, done
 815                 break;
 816             } else {
 817                 if(source!=sourceLimit) {
 818                     // when no error occurs, then the input must be consumed
 819                     *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 820                     break;
 821                 }
 822
 823                 if(sourceLimit==bytesLimit) {
 824                     // we are done
 825                     break;
 826                 }
 827
 828                 // the partial conversion succeeded, set a new limit and continue
 829                 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit;
 830                 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit);
 831             }
 832         }
 833     } else /* step<0 */ {
 834         /*
 835          * step==-1: call only ucnv_getNextUChar()
 836          * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
 837          *   if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
 838          *   else give it at most (-step-2)/2 bytes
 839          */
 840         UChar32 c;
 841
 842         // end the loop by getting an index out of bounds error
 843         for(;;) {
 844             // resetting the opposite conversion direction must not affect this one
 845             ucnv_resetFromUnicode(cnv);
 846
 847             // convert
 848             if((step&1)!=0 /* odd: -1, -3, -5, ... */) {
 849                 sourceLimit=source; // use sourceLimit not as a real limit
 850                                     // but to remember the pre-getNextUChar source pointer
 851                 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode);
 852
 853                 // check pointers and errors
 854                 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 855                     if(source!=bytesLimit) {
 856                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 857                     } else {
 858                         *pErrorCode=U_ZERO_ERROR;
 859                     }
 860                     break;
 861                 } else if(U_FAILURE(*pErrorCode)) {
 862                     break;
 863                 }
 864                 // source may not move if c is from previous overflow
 865
 866                 if(target==resultLimit) {
 867                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 868                     break;
 869                 }
 870                 if(c<=0xffff) {
 871                     *target++=(UChar)c;
 872                 } else {
 873                     *target++=U16_LEAD(c);
 874                     if(target==resultLimit) {
 875                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
 876                         break;
 877                     }
 878                     *target++=U16_TRAIL(c);
 879                 }
 880
 881                 // alternate between -n-1 and -n but leave -1 alone
 882                 if(step<-1) {
 883                     ++step;
 884                 }
 885             } else /* step is even */ {
 886                 // allow only one UChar output
 887                 targetLimit=target<resultLimit ? target+1 : resultLimit;
 888
 889                 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit)
 890                 // and never output offsets
 891                 if(step==-2) {
 892                     sourceLimit=bytesLimit;
 893                 } else {
 894                     sourceLimit=source+(-step-2)/2;
 895                     if(sourceLimit>bytesLimit) {
 896                         sourceLimit=bytesLimit;
 897                     }
 898                 }
 899
 900                 ucnv_toUnicode(cnv,
 901                     &target, targetLimit,
 902                     &source, sourceLimit,
 903                     NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode);
 904
 905                 // check pointers and errors
 906                 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
 907                     if(target!=targetLimit) {
 908                         // buffer overflow must only be set when the target is filled
 909                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 910                         break;
 911                     } else if(targetLimit==resultLimit) {
 912                         // not just a partial overflow
 913                         break;
 914                     }
 915
 916                     // the partial target is filled, set a new limit and continue
 917                     *pErrorCode=U_ZERO_ERROR;
 918                 } else if(U_FAILURE(*pErrorCode)) {
 919                     // some other error occurred, done
 920                     break;
 921                 } else {
 922                     if(source!=sourceLimit) {
 923                         // when no error occurs, then the input must be consumed
 924                         *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
 925                         break;
 926                     }
 927
 928                     // we are done (flush==TRUE) but we continue, to get the index out of bounds error above
 929                 }
 930
 931                 --step;
 932             }
 933         }
 934     }
 935
 936     return (int32_t)(target-result);
 937 }
 938
 939 UBool
 940 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) {
 941     UConverter *cnv;
 942     UErrorCode errorCode;
 943
 944     // open the converter
 945     errorCode=U_ZERO_ERROR;
 946     cnv=cnv_open(cc.charset, errorCode);
 947     if(U_FAILURE(errorCode)) {
 948         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
 949                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
 950         return FALSE;
 951     }
 952
 953     // set the callback
 954     if(callback!=NULL) {
 955         ucnv_setToUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
 956         if(U_FAILURE(errorCode)) {
 957             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s",
 958                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
 959             ucnv_close(cnv);
 960             return FALSE;
 961         }
 962     }
 963
 964     int32_t resultOffsets[256];
 965     UChar result[256];
 966     int32_t resultLength;
 967     UBool ok;
 968
 969     static const struct {
 970         int32_t step;
 971         const char *name;
 972     } steps[]={
 973         { 0, "bulk" }, // must be first for offsets to be checked
 974         { 1, "step=1" },
 975         { 3, "step=3" },
 976         { 7, "step=7" },
 977         { -1, "getNext" },
 978         { -2, "toU(bulk)+getNext" },
 979         { -3, "getNext+toU(bulk)" },
 980         { -4, "toU(1)+getNext" },
 981         { -5, "getNext+toU(1)" },
 982         { -12, "toU(5)+getNext" },
 983         { -13, "getNext+toU(5)" },
 984     };
 985     int32_t i, step;
 986
 987     ok=TRUE;
 988     for(i=0; i<LENGTHOF(steps) && ok; ++i) {
 989         step=steps[i].step;
 990         if(step<0 && !cc.finalFlush) {
 991             // skip ucnv_getNextUChar() if !finalFlush because
 992             // ucnv_getNextUChar() always implies flush
 993             continue;
 994         }
 995         if(step!=0) {
 996             // bulk test is first, then offsets are not checked any more
 997             cc.offsets=NULL;
 998         }
 999         else {
1000             memset(resultOffsets, -1, LENGTHOF(resultOffsets));
1001         }
1002         memset(result, -1, LENGTHOF(result));
1003         errorCode=U_ZERO_ERROR;
1004         resultLength=stepToUnicode(cc, cnv,
1005                                 result, LENGTHOF(result),
1006                                 step==0 ? resultOffsets : NULL,
1007                                 step, &errorCode);
1008         ok=checkToUnicode(
1009                 cc, cnv, steps[i].name,
1010                 result, resultLength,
1011                 cc.offsets!=NULL ? resultOffsets : NULL,
1012                 errorCode);
1013         if(U_FAILURE(errorCode) || !cc.finalFlush) {
1014             // reset if an error occurred or we did not flush
1015             // otherwise do nothing to make sure that flushing resets
1016             ucnv_resetToUnicode(cnv);
1017         }
1018         if (resultOffsets[resultLength] != -1) {
1019             errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1020                 cc.caseNr, cc.charset, resultLength);
1021         }
1022         if (result[resultLength] != (UChar)-1) {
1023             errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d",
1024                 cc.caseNr, cc.charset, resultLength);
1025         }
1026     }
1027
1028     // not a real loop, just a convenience for breaking out of the block
1029     while(ok && cc.finalFlush) {
1030         // test ucnv_toUChars()
1031         memset(result, 0, sizeof(result));
1032
1033         errorCode=U_ZERO_ERROR;
1034         resultLength=ucnv_toUChars(cnv,
1035                         result, LENGTHOF(result),
1036                         (const char *)cc.bytes, cc.bytesLength,
1037                         &errorCode);
1038         ok=checkToUnicode(
1039                 cc, cnv, "toUChars",
1040                 result, resultLength,
1041                 NULL,
1042                 errorCode);
1043         if(!ok) {
1044             break;
1045         }
1046
1047         // test preflighting
1048         // keep the correct result for simple checking
1049         errorCode=U_ZERO_ERROR;
1050         resultLength=ucnv_toUChars(cnv,
1051                         NULL, 0,
1052                         (const char *)cc.bytes, cc.bytesLength,
1053                         &errorCode);
1054         if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1055             errorCode=U_ZERO_ERROR;
1056         }
1057         ok=checkToUnicode(
1058                 cc, cnv, "preflight toUChars",
1059                 result, resultLength,
1060                 NULL,
1061                 errorCode);
1062         break;
1063     }
1064
1065     ucnv_close(cnv);
1066     return ok;
1067 }
1068
1069 UBool
1070 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1071                                const UChar *result, int32_t resultLength,
1072                                const int32_t *resultOffsets,
1073                                UErrorCode resultErrorCode) {
1074     char resultInvalidChars[8];
1075     int8_t resultInvalidLength;
1076     UErrorCode errorCode;
1077
1078     const char *msg;
1079
1080     // reset the message; NULL will mean "ok"
1081     msg=NULL;
1082
1083     errorCode=U_ZERO_ERROR;
1084     resultInvalidLength=sizeof(resultInvalidChars);
1085     ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode);
1086     if(U_FAILURE(errorCode)) {
1087         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s",
1088                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1089         return FALSE;
1090     }
1091
1092     // check everything that might have gone wrong
1093     if(cc.unicodeLength!=resultLength) {
1094         msg="wrong result length";
1095     } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) {
1096         msg="wrong result string";
1097     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) {
1098         msg="wrong offsets";
1099     } else if(cc.outErrorCode!=resultErrorCode) {
1100         msg="wrong error code";
1101     } else if(cc.invalidLength!=resultInvalidLength) {
1102         msg="wrong length of last invalid input";
1103     } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) {
1104         msg="wrong last invalid input";
1105     }
1106
1107     if(msg==NULL) {
1108         return TRUE;
1109     } else {
1110         char buffer[2000]; // one buffer for all strings
1111         char *s, *bytesString, *unicodeString, *resultString,
1112             *offsetsString, *resultOffsetsString,
1113             *invalidCharsString, *resultInvalidCharsString;
1114
1115         bytesString=s=buffer;
1116         s=printBytes(cc.bytes, cc.bytesLength, bytesString);
1117         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s);
1118         s=printUnicode(result, resultLength, resultString=s);
1119         s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s);
1120         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1121         s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s);
1122         s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s);
1123
1124         if((s-buffer)>(int32_t)sizeof(buffer)) {
1125             errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n",
1126                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1127             exit(1);
1128         }
1129
1130         errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1131               "  bytes <%s>[%d]\n"
1132               " expected <%s>[%d]\n"
1133               "  result  <%s>[%d]\n"
1134               " offsets         <%s>\n"
1135               "  result offsets <%s>\n"
1136               " error code expected %s got %s\n"
1137               "  invalidChars expected <%s> got <%s>\n",
1138               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1139               bytesString, cc.bytesLength,
1140               unicodeString, cc.unicodeLength,
1141               resultString, resultLength,
1142               offsetsString,
1143               resultOffsetsString,
1144               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1145               invalidCharsString, resultInvalidCharsString);
1146
1147         return FALSE;
1148     }
1149 }
1150
1151 // fromUnicode test worker functions --------------------------------------- ***
1152
1153 static int32_t
1154 stepFromUTF8(ConversionCase &cc,
1155              UConverter *utf8Cnv, UConverter *cnv,
1156              char *result, int32_t resultCapacity,
1157              int32_t step,
1158              UErrorCode *pErrorCode) {
1159     const char *source, *sourceLimit, *utf8Limit;
1160     UChar pivotBuffer[32];
1161     UChar *pivotSource, *pivotTarget, *pivotLimit;
1162     char *target, *targetLimit, *resultLimit;
1163     UBool flush;
1164
1165     source=cc.utf8;
1166     pivotSource=pivotTarget=pivotBuffer;
1167     target=result;
1168     utf8Limit=source+cc.utf8Length;
1169     resultLimit=result+resultCapacity;
1170
1171     // call ucnv_convertEx() with in/out buffers no larger than (step) at a time
1172     // move only one buffer (in vs. out) at a time to be extra mean
1173     // step==0 performs bulk conversion
1174
1175     // initialize the partial limits for the loop
1176     if(step==0) {
1177         // use the entire buffers
1178         sourceLimit=utf8Limit;
1179         targetLimit=resultLimit;
1180         flush=cc.finalFlush;
1181
1182         pivotLimit=pivotBuffer+LENGTHOF(pivotBuffer);
1183     } else {
1184         // start with empty partial buffers
1185         sourceLimit=source;
1186         targetLimit=target;
1187         flush=FALSE;
1188
1189         // empty pivot is not allowed, make it of length step
1190         pivotLimit=pivotBuffer+step;
1191     }
1192
1193     for(;;) {
1194         // resetting the opposite conversion direction must not affect this one
1195         ucnv_resetFromUnicode(utf8Cnv);
1196         ucnv_resetToUnicode(cnv);
1197
1198         // convert
1199         ucnv_convertEx(cnv, utf8Cnv,
1200             &target, targetLimit,
1201             &source, sourceLimit,
1202             pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
1203             FALSE, flush, pErrorCode);
1204
1205         // check pointers and errors
1206         if(source>sourceLimit || target>targetLimit) {
1207             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1208             break;
1209         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1210             if(target!=targetLimit) {
1211                 // buffer overflow must only be set when the target is filled
1212                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1213                 break;
1214             } else if(targetLimit==resultLimit) {
1215                 // not just a partial overflow
1216                 break;
1217             }
1218
1219             // the partial target is filled, set a new limit, reset the error and continue
1220             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1221             *pErrorCode=U_ZERO_ERROR;
1222         } else if(U_FAILURE(*pErrorCode)) {
1223             if(pivotSource==pivotBuffer) {
1224                 // toUnicode error, should not occur
1225                 // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1226                 break;
1227             } else {
1228                 // fromUnicode error
1229                 // some other error occurred, done
1230                 break;
1231             }
1232         } else {
1233             if(source!=sourceLimit) {
1234                 // when no error occurs, then the input must be consumed
1235                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1236                 break;
1237             }
1238
1239             if(sourceLimit==utf8Limit) {
1240                 // we are done
1241                 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
1242                     // ucnv_convertEx() warns about not terminating the output
1243                     // but ucnv_fromUnicode() does not and so
1244                     // checkFromUnicode() does not expect it
1245                     *pErrorCode=U_ZERO_ERROR;
1246                 }
1247                 break;
1248             }
1249
1250             // the partial conversion succeeded, set a new limit and continue
1251             sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
1252             flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
1253         }
1254     }
1255
1256     return (int32_t)(target-result);
1257 }
1258
1259 static int32_t
1260 stepFromUnicode(ConversionCase &cc, UConverter *cnv,
1261                 char *result, int32_t resultCapacity,
1262                 int32_t *resultOffsets, /* also resultCapacity */
1263                 int32_t step,
1264                 UErrorCode *pErrorCode) {
1265     const UChar *source, *sourceLimit, *unicodeLimit;
1266     char *target, *targetLimit, *resultLimit;
1267     UBool flush;
1268
1269     source=cc.unicode;
1270     target=result;
1271     unicodeLimit=source+cc.unicodeLength;
1272     resultLimit=result+resultCapacity;
1273
1274     // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time
1275     // move only one buffer (in vs. out) at a time to be extra mean
1276     // step==0 performs bulk conversion and generates offsets
1277
1278     // initialize the partial limits for the loop
1279     if(step==0) {
1280         // use the entire buffers
1281         sourceLimit=unicodeLimit;
1282         targetLimit=resultLimit;
1283         flush=cc.finalFlush;
1284     } else {
1285         // start with empty partial buffers
1286         sourceLimit=source;
1287         targetLimit=target;
1288         flush=FALSE;
1289
1290         // output offsets only for bulk conversion
1291         resultOffsets=NULL;
1292     }
1293
1294     for(;;) {
1295         // resetting the opposite conversion direction must not affect this one
1296         ucnv_resetToUnicode(cnv);
1297
1298         // convert
1299         ucnv_fromUnicode(cnv,
1300             &target, targetLimit,
1301             &source, sourceLimit,
1302             resultOffsets,
1303             flush, pErrorCode);
1304
1305         // check pointers and errors
1306         if(source>sourceLimit || target>targetLimit) {
1307             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1308             break;
1309         } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
1310             if(target!=targetLimit) {
1311                 // buffer overflow must only be set when the target is filled
1312                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1313                 break;
1314             } else if(targetLimit==resultLimit) {
1315                 // not just a partial overflow
1316                 break;
1317             }
1318
1319             // the partial target is filled, set a new limit, reset the error and continue
1320             targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
1321             *pErrorCode=U_ZERO_ERROR;
1322         } else if(U_FAILURE(*pErrorCode)) {
1323             // some other error occurred, done
1324             break;
1325         } else {
1326             if(source!=sourceLimit) {
1327                 // when no error occurs, then the input must be consumed
1328                 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
1329                 break;
1330             }
1331
1332             if(sourceLimit==unicodeLimit) {
1333                 // we are done
1334                 break;
1335             }
1336
1337             // the partial conversion succeeded, set a new limit and continue
1338             sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit;
1339             flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit);
1340         }
1341     }
1342
1343     return (int32_t)(target-result);
1344 }
1345
1346 UBool
1347 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) {
1348     UConverter *cnv;
1349     UErrorCode errorCode;
1350
1351     // open the converter
1352     errorCode=U_ZERO_ERROR;
1353     cnv=cnv_open(cc.charset, errorCode);
1354     if(U_FAILURE(errorCode)) {
1355         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
1356                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1357         return FALSE;
1358     }
1359     ucnv_resetToUnicode(utf8Cnv);
1360
1361     // set the callback
1362     if(callback!=NULL) {
1363         ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode);
1364         if(U_FAILURE(errorCode)) {
1365             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s",
1366                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1367             ucnv_close(cnv);
1368             return FALSE;
1369         }
1370     }
1371
1372     // set the fallbacks flag
1373     // TODO change with Jitterbug 2401, then add a similar call for toUnicode too
1374     ucnv_setFallback(cnv, cc.fallbacks);
1375
1376     // set the subchar
1377     int32_t length;
1378
1379     if(cc.setSub>0) {
1380         length=(int32_t)strlen(cc.subchar);
1381         ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode);
1382         if(U_FAILURE(errorCode)) {
1383             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s",
1384                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1385             ucnv_close(cnv);
1386             return FALSE;
1387         }
1388     } else if(cc.setSub<0) {
1389         ucnv_setSubstString(cnv, cc.subString, -1, &errorCode);
1390         if(U_FAILURE(errorCode)) {
1391             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s",
1392                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
1393             ucnv_close(cnv);
1394             return FALSE;
1395         }
1396     }
1397
1398     // convert unicode to utf8
1399     char utf8[256];
1400     cc.utf8=utf8;
1401     u_strToUTF8(utf8, LENGTHOF(utf8), &cc.utf8Length,
1402                 cc.unicode, cc.unicodeLength,
1403                 &errorCode);
1404     if(U_FAILURE(errorCode)) {
1405         // skip UTF-8 testing of a string with an unpaired surrogate,
1406         // or of one that's too long
1407         // toUnicode errors are tested in cintltst TestConvertExFromUTF8()
1408         cc.utf8Length=-1;
1409     }
1410
1411     int32_t resultOffsets[256];
1412     char result[256];
1413     int32_t resultLength;
1414     UBool ok;
1415
1416     static const struct {
1417         int32_t step;
1418         const char *name, *utf8Name;
1419     } steps[]={
1420         { 0, "bulk",   "utf8" }, // must be first for offsets to be checked
1421         { 1, "step=1", "utf8 step=1" },
1422         { 3, "step=3", "utf8 step=3" },
1423         { 7, "step=7", "utf8 step=7" }
1424     };
1425     int32_t i, step;
1426
1427     ok=TRUE;
1428     for(i=0; i<LENGTHOF(steps) && ok; ++i) {
1429         step=steps[i].step;
1430         memset(resultOffsets, -1, LENGTHOF(resultOffsets));
1431         memset(result, -1, LENGTHOF(result));
1432         errorCode=U_ZERO_ERROR;
1433         resultLength=stepFromUnicode(cc, cnv,
1434                                 result, LENGTHOF(result),
1435                                 step==0 ? resultOffsets : NULL,
1436                                 step, &errorCode);
1437         ok=checkFromUnicode(
1438                 cc, cnv, steps[i].name,
1439                 (uint8_t *)result, resultLength,
1440                 cc.offsets!=NULL ? resultOffsets : NULL,
1441                 errorCode);
1442         if(U_FAILURE(errorCode) || !cc.finalFlush) {
1443             // reset if an error occurred or we did not flush
1444             // otherwise do nothing to make sure that flushing resets
1445             ucnv_resetFromUnicode(cnv);
1446         }
1447         if (resultOffsets[resultLength] != -1) {
1448             errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d",
1449                 cc.caseNr, cc.charset, resultLength);
1450         }
1451         if (result[resultLength] != (char)-1) {
1452             errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d",
1453                 cc.caseNr, cc.charset, resultLength);
1454         }
1455
1456         // bulk test is first, then offsets are not checked any more
1457         cc.offsets=NULL;
1458
1459         // test direct conversion from UTF-8
1460         if(cc.utf8Length>=0) {
1461             errorCode=U_ZERO_ERROR;
1462             resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
1463                                     result, LENGTHOF(result),
1464                                     step, &errorCode);
1465             ok=checkFromUnicode(
1466                     cc, cnv, steps[i].utf8Name,
1467                     (uint8_t *)result, resultLength,
1468                     NULL,
1469                     errorCode);
1470             if(U_FAILURE(errorCode) || !cc.finalFlush) {
1471                 // reset if an error occurred or we did not flush
1472                 // otherwise do nothing to make sure that flushing resets
1473                 ucnv_resetToUnicode(utf8Cnv);
1474                 ucnv_resetFromUnicode(cnv);
1475             }
1476         }
1477     }
1478
1479     // not a real loop, just a convenience for breaking out of the block
1480     while(ok && cc.finalFlush) {
1481         // test ucnv_fromUChars()
1482         memset(result, 0, sizeof(result));
1483
1484         errorCode=U_ZERO_ERROR;
1485         resultLength=ucnv_fromUChars(cnv,
1486                         result, LENGTHOF(result),
1487                         cc.unicode, cc.unicodeLength,
1488                         &errorCode);
1489         ok=checkFromUnicode(
1490                 cc, cnv, "fromUChars",
1491                 (uint8_t *)result, resultLength,
1492                 NULL,
1493                 errorCode);
1494         if(!ok) {
1495             break;
1496         }
1497
1498         // test preflighting
1499         // keep the correct result for simple checking
1500         errorCode=U_ZERO_ERROR;
1501         resultLength=ucnv_fromUChars(cnv,
1502                         NULL, 0,
1503                         cc.unicode, cc.unicodeLength,
1504                         &errorCode);
1505         if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) {
1506             errorCode=U_ZERO_ERROR;
1507         }
1508         ok=checkFromUnicode(
1509                 cc, cnv, "preflight fromUChars",
1510                 (uint8_t *)result, resultLength,
1511                 NULL,
1512                 errorCode);
1513         break;
1514     }
1515
1516     ucnv_close(cnv);
1517     return ok;
1518 }
1519
1520 UBool
1521 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name,
1522                                  const uint8_t *result, int32_t resultLength,
1523                                  const int32_t *resultOffsets,
1524                                  UErrorCode resultErrorCode) {
1525     UChar resultInvalidUChars[8];
1526     int8_t resultInvalidLength;
1527     UErrorCode errorCode;
1528
1529     const char *msg;
1530
1531     // reset the message; NULL will mean "ok"
1532     msg=NULL;
1533
1534     errorCode=U_ZERO_ERROR;
1535     resultInvalidLength=LENGTHOF(resultInvalidUChars);
1536     ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
1537     if(U_FAILURE(errorCode)) {
1538         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",
1539                 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode));
1540         return FALSE;
1541     }
1542
1543     // check everything that might have gone wrong
1544     if(cc.bytesLength!=resultLength) {
1545         msg="wrong result length";
1546     } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) {
1547         msg="wrong result string";
1548     } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) {
1549         msg="wrong offsets";
1550     } else if(cc.outErrorCode!=resultErrorCode) {
1551         msg="wrong error code";
1552     } else if(cc.invalidLength!=resultInvalidLength) {
1553         msg="wrong length of last invalid input";
1554     } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) {
1555         msg="wrong last invalid input";
1556     }
1557
1558     if(msg==NULL) {
1559         return TRUE;
1560     } else {
1561         char buffer[2000]; // one buffer for all strings
1562         char *s, *unicodeString, *bytesString, *resultString,
1563             *offsetsString, *resultOffsetsString,
1564             *invalidCharsString, *resultInvalidUCharsString;
1565
1566         unicodeString=s=buffer;
1567         s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString);
1568         s=printBytes(cc.bytes, cc.bytesLength, bytesString=s);
1569         s=printBytes(result, resultLength, resultString=s);
1570         s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s);
1571         s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s);
1572         s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s);
1573         s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s);
1574
1575         if((s-buffer)>(int32_t)sizeof(buffer)) {
1576             errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n",
1577                     cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer));
1578             exit(1);
1579         }
1580
1581         errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n"
1582               "  unicode <%s>[%d]\n"
1583               " expected <%s>[%d]\n"
1584               "  result  <%s>[%d]\n"
1585               " offsets         <%s>\n"
1586               "  result offsets <%s>\n"
1587               " error code expected %s got %s\n"
1588               "  invalidChars expected <%s> got <%s>\n",
1589               cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg,
1590               unicodeString, cc.unicodeLength,
1591               bytesString, cc.bytesLength,
1592               resultString, resultLength,
1593               offsetsString,
1594               resultOffsetsString,
1595               u_errorName(cc.outErrorCode), u_errorName(resultErrorCode),
1596               invalidCharsString, resultInvalidUCharsString);
1597
1598         return FALSE;
1599     }
1600 }
1601
1602 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */