icuSources/test/cintltst/cucdtst.c

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1997-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /*******************************************************************************
   9 *
  10 * File CUCDTST.C
  11 *
  12 * Modification History:
  13 *        Name                     Description
  14 *     Madhu Katragadda            Ported for C API, added tests for string functions
  15 ********************************************************************************
  16 */
  17
  18 #include <string.h>
  19 #include <math.h>
  20 #include <stdlib.h>
  21
  22 #include "unicode/utypes.h"
  23 #include "unicode/uchar.h"
  24 #include "unicode/putil.h"
  25 #include "unicode/ustring.h"
  26 #include "unicode/uloc.h"
  27 #include "unicode/unorm2.h"
  28 #include "unicode/utf16.h"
  29 #include "unicode/utf_old.h"
  30 #include "cintltst.h"
  31 #include "putilimp.h"
  32 #include "uparse.h"
  33 #include "ucase.h"
  34 #include "ubidi_props.h"
  35 #include "uprops.h"
  36 #include "uset_imp.h"
  37 #include "usc_impl.h"
  38 #include "udatamem.h"
  39 #include "cucdapi.h"
  40 #include "cmemory.h"
  41
  42 /* prototypes --------------------------------------------------------------- */
  43
  44 static void TestUpperLower(void);
  45 static void TestLetterNumber(void);
  46 static void TestMisc(void);
  47 static void TestPOSIX(void);
  48 static void TestControlPrint(void);
  49 static void TestIdentifier(void);
  50 static void TestUnicodeData(void);
  51 static void TestCodeUnit(void);
  52 static void TestCodePoint(void);
  53 static void TestCharLength(void);
  54 static void TestCharNames(void);
  55 static void TestUCharFromNameUnderflow(void);
  56 static void TestMirroring(void);
  57 static void TestUScriptRunAPI(void);
  58 static void TestAdditionalProperties(void);
  59 static void TestNumericProperties(void);
  60 static void TestPropertyNames(void);
  61 static void TestPropertyValues(void);
  62 static void TestConsistency(void);
  63 static void TestCaseFolding(void);
  64 static void TestBinaryCharacterPropertiesAPI(void);
  65 static void TestIntCharacterPropertiesAPI(void);
  66
  67 /* internal methods used */
  68 static int32_t MakeProp(char* str);
  69 static int32_t MakeDir(char* str);
  70
  71 /* helpers ------------------------------------------------------------------ */
  72
  73 static void
  74 parseUCDFile(const char *filename,
  75              char *fields[][2], int32_t fieldCount,
  76              UParseLineFn *lineFn, void *context,
  77              UErrorCode *pErrorCode) {
  78     // buffer sizes changed from 256 for APPLE_XCODE_BUILD (which is generating really long pathnames)
  79     char path[512];
  80     char backupPath[512];
  81
  82     if(U_FAILURE(*pErrorCode)) {
  83         return;
  84     }
  85
  86     /* Look inside ICU_DATA first */
  87     strcpy(path, u_getDataDirectory());
  88     strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
  89     strcat(path, filename);
  90
  91     /* As a fallback, try to guess where the source data was located
  92      *    at the time ICU was built, and look there.
  93      */
  94     strcpy(backupPath, ctest_dataSrcDir());
  95     strcat(backupPath, U_FILE_SEP_STRING);
  96     strcat(backupPath, "unidata" U_FILE_SEP_STRING);
  97     strcat(backupPath, filename);
  98
  99     u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode);
 100     if(*pErrorCode==U_FILE_ACCESS_ERROR) {
 101         *pErrorCode=U_ZERO_ERROR;
 102         u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode);
 103     }
 104     if(U_FAILURE(*pErrorCode)) {
 105         log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
 106     }
 107 }
 108
 109 /* test data ---------------------------------------------------------------- */
 110
 111 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf";
 112 static const int32_t tagValues[] =
 113     {
 114     /* Mn */ U_NON_SPACING_MARK,
 115     /* Mc */ U_COMBINING_SPACING_MARK,
 116     /* Me */ U_ENCLOSING_MARK,
 117     /* Nd */ U_DECIMAL_DIGIT_NUMBER,
 118     /* Nl */ U_LETTER_NUMBER,
 119     /* No */ U_OTHER_NUMBER,
 120     /* Zs */ U_SPACE_SEPARATOR,
 121     /* Zl */ U_LINE_SEPARATOR,
 122     /* Zp */ U_PARAGRAPH_SEPARATOR,
 123     /* Cc */ U_CONTROL_CHAR,
 124     /* Cf */ U_FORMAT_CHAR,
 125     /* Cs */ U_SURROGATE,
 126     /* Co */ U_PRIVATE_USE_CHAR,
 127     /* Cn */ U_UNASSIGNED,
 128     /* Lu */ U_UPPERCASE_LETTER,
 129     /* Ll */ U_LOWERCASE_LETTER,
 130     /* Lt */ U_TITLECASE_LETTER,
 131     /* Lm */ U_MODIFIER_LETTER,
 132     /* Lo */ U_OTHER_LETTER,
 133     /* Pc */ U_CONNECTOR_PUNCTUATION,
 134     /* Pd */ U_DASH_PUNCTUATION,
 135     /* Ps */ U_START_PUNCTUATION,
 136     /* Pe */ U_END_PUNCTUATION,
 137     /* Po */ U_OTHER_PUNCTUATION,
 138     /* Sm */ U_MATH_SYMBOL,
 139     /* Sc */ U_CURRENCY_SYMBOL,
 140     /* Sk */ U_MODIFIER_SYMBOL,
 141     /* So */ U_OTHER_SYMBOL,
 142     /* Pi */ U_INITIAL_PUNCTUATION,
 143     /* Pf */ U_FINAL_PUNCTUATION
 144     };
 145
 146 static const char dirStrings[][5] = {
 147     "L",
 148     "R",
 149     "EN",
 150     "ES",
 151     "ET",
 152     "AN",
 153     "CS",
 154     "B",
 155     "S",
 156     "WS",
 157     "ON",
 158     "LRE",
 159     "LRO",
 160     "AL",
 161     "RLE",
 162     "RLO",
 163     "PDF",
 164     "NSM",
 165     "BN",
 166     /* new in Unicode 6.3/ICU 52 */
 167     "FSI",
 168     "LRI",
 169     "RLI",
 170     "PDI"
 171 };
 172
 173 void addUnicodeTest(TestNode** root);
 174
 175 void addUnicodeTest(TestNode** root)
 176 {
 177     addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit");
 178     addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint");
 179     addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength");
 180     addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues");
 181     addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData");
 182     addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties");
 183     addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties");
 184     addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower");
 185     addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber");
 186     addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc");
 187     addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX");
 188     addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint");
 189     addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier");
 190     addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames");
 191     addTest(root, &TestUCharFromNameUnderflow, "tsutil/cucdtst/TestUCharFromNameUnderflow");
 192     addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring");
 193     addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI");
 194     addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript");
 195     addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions");
 196     addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI");
 197     addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI");
 198     addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
 199     addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
 200     addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
 201     addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
 202     addTest(root, &TestBinaryCharacterPropertiesAPI,
 203             "tsutil/cucdtst/TestBinaryCharacterPropertiesAPI");
 204     addTest(root, &TestIntCharacterPropertiesAPI,
 205             "tsutil/cucdtst/TestIntCharacterPropertiesAPI");
 206 }
 207
 208 /*==================================================== */
 209 /* test u_toupper() and u_tolower()                    */
 210 /*==================================================== */
 211 static void TestUpperLower()
 212 {
 213     const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000};
 214     const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000};
 215     U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21);
 216     U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
 217     int32_t i;
 218
 219     U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21);
 220     U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21);
 221
 222 /*
 223 Checks LetterLike Symbols which were previously a source of confusion
 224 [Bertrand A. D. 02/04/98]
 225 */
 226     for (i=0x2100;i<0x2138;i++)
 227     {
 228         /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */
 229         if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132)
 230         {
 231             if (i != (int)u_tolower(i)) /* itself */
 232                 log_err("Failed case conversion with itself: U+%04x\n", i);
 233             if (i != (int)u_toupper(i))
 234                 log_err("Failed case conversion with itself: U+%04x\n", i);
 235         }
 236     }
 237
 238     for(i=0; i < u_strlen(upper); i++){
 239         if(u_tolower(upper[i]) != lower[i]){
 240             log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i]));
 241         }
 242     }
 243
 244     log_verbose("testing upper lower\n");
 245     for (i = 0; i < 21; i++) {
 246
 247         if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i]))
 248         {
 249             log_err("Failed isLowerCase test at  %c\n", upperTest[i]);
 250         }
 251         else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i]))
 252          {
 253             log_err("Failed isUpperCase test at %c\n", lowerTest[i]);
 254         }
 255         else if (upperTest[i] != u_tolower(lowerTest[i]))
 256         {
 257             log_err("Failed case conversion from %c  To %c :\n", lowerTest[i], upperTest[i]);
 258         }
 259         else if (lowerTest[i] != u_toupper(upperTest[i]))
 260          {
 261             log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]);
 262         }
 263         else if (upperTest[i] != u_tolower(upperTest[i]))
 264         {
 265             log_err("Failed case conversion with itself: %c\n", upperTest[i]);
 266         }
 267         else if (lowerTest[i] != u_toupper(lowerTest[i]))
 268         {
 269             log_err("Failed case conversion with itself: %c\n", lowerTest[i]);
 270         }
 271     }
 272     log_verbose("done testing upper lower\n");
 273
 274     log_verbose("testing u_istitle\n");
 275     {
 276         static const UChar expected[] = {
 277             0x1F88,
 278             0x1F89,
 279             0x1F8A,
 280             0x1F8B,
 281             0x1F8C,
 282             0x1F8D,
 283             0x1F8E,
 284             0x1F8F,
 285             0x1F88,
 286             0x1F89,
 287             0x1F8A,
 288             0x1F8B,
 289             0x1F8C,
 290             0x1F8D,
 291             0x1F8E,
 292             0x1F8F,
 293             0x1F98,
 294             0x1F99,
 295             0x1F9A,
 296             0x1F9B,
 297             0x1F9C,
 298             0x1F9D,
 299             0x1F9E,
 300             0x1F9F,
 301             0x1F98,
 302             0x1F99,
 303             0x1F9A,
 304             0x1F9B,
 305             0x1F9C,
 306             0x1F9D,
 307             0x1F9E,
 308             0x1F9F,
 309             0x1FA8,
 310             0x1FA9,
 311             0x1FAA,
 312             0x1FAB,
 313             0x1FAC,
 314             0x1FAD,
 315             0x1FAE,
 316             0x1FAF,
 317             0x1FA8,
 318             0x1FA9,
 319             0x1FAA,
 320             0x1FAB,
 321             0x1FAC,
 322             0x1FAD,
 323             0x1FAE,
 324             0x1FAF,
 325             0x1FBC,
 326             0x1FBC,
 327             0x1FCC,
 328             0x1FCC,
 329             0x1FFC,
 330             0x1FFC,
 331         };
 332         int32_t num = UPRV_LENGTHOF(expected);
 333         for(i=0; i<num; i++){
 334             if(!u_istitle(expected[i])){
 335                 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]);
 336             }
 337         }
 338
 339     }
 340 }
 341
 342 /* compare two sets and verify that their difference or intersection is empty */
 343 static UBool
 344 showADiffB(const USet *a, const USet *b,
 345            const char *a_name, const char *b_name,
 346            UBool expect, UBool diffIsError) {
 347     USet *aa;
 348     int32_t i, start, end, length;
 349     UErrorCode errorCode;
 350
 351     /*
 352      * expect:
 353      * TRUE  -> a-b should be empty, that is, b should contain all of a
 354      * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa)
 355      */
 356     if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) {
 357         return TRUE;
 358     }
 359
 360     /* clone a to aa because a is const */
 361     aa=uset_open(1, 0);
 362     if(aa==NULL) {
 363         /* unusual problem - out of memory? */
 364         return FALSE;
 365     }
 366     uset_addAll(aa, a);
 367
 368     /* compute the set in question */
 369     if(expect) {
 370         /* a-b */
 371         uset_removeAll(aa, b);
 372     } else {
 373         /* a&b */
 374         uset_retainAll(aa, b);
 375     }
 376
 377     /* aa is not empty because of the initial tests above; show its contents */
 378     errorCode=U_ZERO_ERROR;
 379     i=0;
 380     for(;;) {
 381         length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode);
 382         if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
 383             break; /* done */
 384         }
 385         if(U_FAILURE(errorCode)) {
 386             log_err("error comparing %s with %s at difference item %d: %s\n",
 387                 a_name, b_name, i, u_errorName(errorCode));
 388             break;
 389         }
 390         if(length!=0) {
 391             break; /* done with code points, got a string or -1 */
 392         }
 393
 394         if(diffIsError) {
 395             if(expect) {
 396                 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
 397             } else {
 398                 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
 399             }
 400         } else {
 401             if(expect) {
 402                 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name);
 403             } else {
 404                 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end);
 405             }
 406         }
 407
 408         ++i;
 409     }
 410
 411     uset_close(aa);
 412     return FALSE;
 413 }
 414
 415 static UBool
 416 showAMinusB(const USet *a, const USet *b,
 417             const char *a_name, const char *b_name,
 418             UBool diffIsError) {
 419     return showADiffB(a, b, a_name, b_name, TRUE, diffIsError);
 420 }
 421
 422 static UBool
 423 showAIntersectB(const USet *a, const USet *b,
 424                 const char *a_name, const char *b_name,
 425                 UBool diffIsError) {
 426     return showADiffB(a, b, a_name, b_name, FALSE, diffIsError);
 427 }
 428
 429 static UBool
 430 compareUSets(const USet *a, const USet *b,
 431              const char *a_name, const char *b_name,
 432              UBool diffIsError) {
 433     /*
 434      * Use an arithmetic & not a logical && so that both branches
 435      * are always taken and all differences are shown.
 436      */
 437     return
 438         showAMinusB(a, b, a_name, b_name, diffIsError) &
 439         showAMinusB(b, a, b_name, a_name, diffIsError);
 440 }
 441
 442 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */
 443 static void TestLetterNumber()
 444 {
 445     UChar i = 0x0000;
 446
 447     log_verbose("Testing for isalpha\n");
 448     for (i = 0x0041; i < 0x005B; i++) {
 449         if (!u_isalpha(i))
 450         {
 451             log_err("Failed isLetter test at  %.4X\n", i);
 452         }
 453     }
 454     for (i = 0x0660; i < 0x066A; i++) {
 455         if (u_isalpha(i))
 456         {
 457             log_err("Failed isLetter test with numbers at %.4X\n", i);
 458         }
 459     }
 460
 461     log_verbose("Testing for isdigit\n");
 462     for (i = 0x0660; i < 0x066A; i++) {
 463         if (!u_isdigit(i))
 464         {
 465             log_verbose("Failed isNumber test at %.4X\n", i);
 466         }
 467     }
 468
 469     log_verbose("Testing for isalnum\n");
 470     for (i = 0x0041; i < 0x005B; i++) {
 471         if (!u_isalnum(i))
 472         {
 473             log_err("Failed isAlNum test at  %.4X\n", i);
 474         }
 475     }
 476     for (i = 0x0660; i < 0x066A; i++) {
 477         if (!u_isalnum(i))
 478         {
 479             log_err("Failed isAlNum test at  %.4X\n", i);
 480         }
 481     }
 482
 483     {
 484         /*
 485          * The following checks work only starting from Unicode 4.0.
 486          * Check the version number here.
 487          */
 488         static UVersionInfo u401={ 4, 0, 1, 0 };
 489         UVersionInfo version;
 490         u_getUnicodeVersion(version);
 491         if(version[0]<4 || 0==memcmp(version, u401, 4)) {
 492             return;
 493         }
 494     }
 495
 496     {
 497         /*
 498          * Sanity check:
 499          * Verify that exactly the digit characters have decimal digit values.
 500          * This assumption is used in the implementation of u_digit()
 501          * (which checks nt=de)
 502          * compared with the parallel java.lang.Character.digit()
 503          * (which checks Nd).
 504          *
 505          * This was not true in Unicode 3.2 and earlier.
 506          * Unicode 4.0 fixed discrepancies.
 507          * Unicode 4.0.1 re-introduced problems in this area due to an
 508          * unintentionally incomplete last-minute change.
 509          */
 510         U_STRING_DECL(digitsPattern, "[:Nd:]", 6);
 511         U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
 512
 513         USet *digits, *decimalValues;
 514         UErrorCode errorCode;
 515
 516         U_STRING_INIT(digitsPattern, "[:Nd:]", 6);
 517         U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24);
 518         errorCode=U_ZERO_ERROR;
 519         digits=uset_openPattern(digitsPattern, 6, &errorCode);
 520         decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode);
 521
 522         if(U_SUCCESS(errorCode)) {
 523             compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE);
 524         }
 525
 526         uset_close(digits);
 527         uset_close(decimalValues);
 528     }
 529 }
 530
 531 static void testSampleCharProps(UBool propFn(UChar32), const char *propName,
 532                                 const UChar32 *sampleChars, int32_t sampleCharsLength,
 533                                 UBool expected) {
 534     int32_t i;
 535     for (i = 0; i < sampleCharsLength; ++i) {
 536         UBool result = propFn(sampleChars[i]);
 537         if (result != expected) {
 538             log_err("error: character property function %s(U+%04x)=%d is wrong\n",
 539                     propName, sampleChars[i], result);
 540         }
 541     }
 542 }
 543
 544 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */
 545 static void TestMisc()
 546 {
 547     static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
 548     static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
 549     static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e};
 550     static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd};
 551     static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2};
 552     static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B};
 553 /*    static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/
 554     static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5};
 555     static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
 556     static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
 557     static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef};
 558
 559     static const int32_t sampleDigitValues[] = {0, 2, 3, 5};
 560
 561     uint32_t mask;
 562
 563     int32_t i;
 564     char icuVersion[U_MAX_VERSION_STRING_LENGTH];
 565     UVersionInfo realVersion;
 566
 567     memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH);
 568
 569     testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
 570     testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
 571
 572     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
 573                         sampleSpaces, UPRV_LENGTHOF(sampleSpaces), TRUE);
 574     testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar",
 575                         sampleNonSpaces, UPRV_LENGTHOF(sampleNonSpaces), FALSE);
 576
 577     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
 578                         sampleWhiteSpaces, UPRV_LENGTHOF(sampleWhiteSpaces), TRUE);
 579     testSampleCharProps(u_isWhitespace, "u_isWhitespace",
 580                         sampleNonWhiteSpaces, UPRV_LENGTHOF(sampleNonWhiteSpaces), FALSE);
 581
 582     testSampleCharProps(u_isdefined, "u_isdefined",
 583                         sampleDefined, UPRV_LENGTHOF(sampleDefined), TRUE);
 584     testSampleCharProps(u_isdefined, "u_isdefined",
 585                         sampleUndefined, UPRV_LENGTHOF(sampleUndefined), FALSE);
 586
 587     testSampleCharProps(u_isbase, "u_isbase", sampleBase, UPRV_LENGTHOF(sampleBase), TRUE);
 588     testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, UPRV_LENGTHOF(sampleNonBase), FALSE);
 589
 590     testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, UPRV_LENGTHOF(sampleDigits), TRUE);
 591     testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, UPRV_LENGTHOF(sampleNonDigits), FALSE);
 592
 593     for (i = 0; i < UPRV_LENGTHOF(sampleDigits); i++) {
 594         if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) {
 595             log_err("error: u_charDigitValue(U+04x)=%d != %d\n",
 596                     sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]);
 597         }
 598     }
 599
 600     /* Tests the ICU version #*/
 601     u_getVersion(realVersion);
 602     u_versionToString(realVersion, icuVersion);
 603     if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0)
 604     {
 605         log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion);
 606     }
 607 #if defined(ICU_VERSION)
 608     /* test only happens where we have configure.in with VERSION - sanity check. */
 609     if(strcmp(U_ICU_VERSION, ICU_VERSION))
 610     {
 611         log_err("ICU version mismatch: Header says %s, build environment says %s.\n",  U_ICU_VERSION, ICU_VERSION);
 612     }
 613 #endif
 614
 615     /* test U_GC_... */
 616     if(
 617         U_GET_GC_MASK(0x41)!=U_GC_LU_MASK ||
 618         U_GET_GC_MASK(0x662)!=U_GC_ND_MASK ||
 619         U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK ||
 620         U_GET_GC_MASK(0x28)!=U_GC_PS_MASK ||
 621         U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK ||
 622         U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK
 623     ) {
 624         log_err("error: U_GET_GC_MASK does not work properly\n");
 625     }
 626
 627     mask=0;
 628     mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK;
 629
 630     mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK;
 631     mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK;
 632     mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK;
 633     mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK;
 634     mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK;
 635
 636     mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK;
 637     mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK;
 638     mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK;
 639
 640     mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK;
 641     mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK;
 642     mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK;
 643
 644     mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK;
 645     mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK;
 646     mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK;
 647
 648     mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK;
 649     mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK;
 650     mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK;
 651     mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK;
 652
 653     mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK;
 654     mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK;
 655     mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK;
 656     mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK;
 657     mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK;
 658
 659     mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK;
 660     mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK;
 661     mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK;
 662     mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK;
 663
 664     mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK;
 665     mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK;
 666
 667     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
 668         log_err("error: problems with U_GC_XX_MASK constants\n");
 669     }
 670
 671     mask=0;
 672     mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK;
 673     mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK;
 674     mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK;
 675     mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK;
 676     mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK;
 677     mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK;
 678     mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK;
 679
 680     if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) {
 681         log_err("error: problems with U_GC_Y_MASK constants\n");
 682     }
 683     {
 684         static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 };
 685         for(i=0; i<10; i++){
 686             if(digit[i]!=u_forDigit(i,10)){
 687                 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10));
 688             }
 689         }
 690     }
 691
 692     /* test u_digit() */
 693     {
 694         static const struct {
 695             UChar32 c;
 696             int8_t radix, value;
 697         } data[]={
 698             /* base 16 */
 699             { 0x0031, 16, 1 },
 700             { 0x0038, 16, 8 },
 701             { 0x0043, 16, 12 },
 702             { 0x0066, 16, 15 },
 703             { 0x00e4, 16, -1 },
 704             { 0x0662, 16, 2 },
 705             { 0x06f5, 16, 5 },
 706             { 0xff13, 16, 3 },
 707             { 0xff41, 16, 10 },
 708
 709             /* base 8 */
 710             { 0x0031, 8, 1 },
 711             { 0x0038, 8, -1 },
 712             { 0x0043, 8, -1 },
 713             { 0x0066, 8, -1 },
 714             { 0x00e4, 8, -1 },
 715             { 0x0662, 8, 2 },
 716             { 0x06f5, 8, 5 },
 717             { 0xff13, 8, 3 },
 718             { 0xff41, 8, -1 },
 719
 720             /* base 36 */
 721             { 0x5a, 36, 35 },
 722             { 0x7a, 36, 35 },
 723             { 0xff3a, 36, 35 },
 724             { 0xff5a, 36, 35 },
 725
 726             /* wrong radix values */
 727             { 0x0031, 1, -1 },
 728             { 0xff3a, 37, -1 }
 729         };
 730
 731         for(i=0; i<UPRV_LENGTHOF(data); ++i) {
 732             if(u_digit(data[i].c, data[i].radix)!=data[i].value) {
 733                 log_err("u_digit(U+%04x, %d)=%d expected %d\n",
 734                         data[i].c,
 735                         data[i].radix,
 736                         u_digit(data[i].c, data[i].radix),
 737                         data[i].value);
 738             }
 739         }
 740     }
 741 }
 742
 743 /* test C/POSIX-style functions --------------------------------------------- */
 744
 745 /* bit flags */
 746 #define ISAL     1
 747 #define ISLO     2
 748 #define ISUP     4
 749
 750 #define ISDI     8
 751 #define ISXD  0x10
 752
 753 #define ISAN  0x20
 754
 755 #define ISPU  0x40
 756 #define ISGR  0x80
 757 #define ISPR 0x100
 758
 759 #define ISSP 0x200
 760 #define ISBL 0x400
 761 #define ISCN 0x800
 762
 763 /* C/POSIX-style functions, in the same order as the bit flags */
 764 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c);
 765
 766 static const struct {
 767     IsPOSIXClass *fn;
 768     const char *name;
 769 } posixClasses[]={
 770     { u_isalpha, "isalpha" },
 771     { u_islower, "islower" },
 772     { u_isupper, "isupper" },
 773     { u_isdigit, "isdigit" },
 774     { u_isxdigit, "isxdigit" },
 775     { u_isalnum, "isalnum" },
 776     { u_ispunct, "ispunct" },
 777     { u_isgraph, "isgraph" },
 778     { u_isprint, "isprint" },
 779     { u_isspace, "isspace" },
 780     { u_isblank, "isblank" },
 781     { u_iscntrl, "iscntrl" }
 782 };
 783
 784 static const struct {
 785     UChar32 c;
 786     uint32_t posixResults;
 787 } posixData[]={
 788     { 0x0008,                                                        ISCN },    /* backspace */
 789     { 0x0009,                                              ISSP|ISBL|ISCN },    /* TAB */
 790     { 0x000a,                                              ISSP|     ISCN },    /* LF */
 791     { 0x000c,                                              ISSP|     ISCN },    /* FF */
 792     { 0x000d,                                              ISSP|     ISCN },    /* CR */
 793     { 0x0020,                                         ISPR|ISSP|ISBL      },    /* space */
 794     { 0x0021,                               ISPU|ISGR|ISPR                },    /* ! */
 795     { 0x0033,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* 3 */
 796     { 0x0040,                               ISPU|ISGR|ISPR                },    /* @ */
 797     { 0x0041, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* A */
 798     { 0x007a, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* z */
 799     { 0x007b,                               ISPU|ISGR|ISPR                },    /* { */
 800     { 0x0085,                                              ISSP|     ISCN },    /* NEL */
 801     { 0x00a0,                                         ISPR|ISSP|ISBL      },    /* NBSP */
 802     { 0x00a4,                                    ISGR|ISPR                },    /* currency sign */
 803     { 0x00e4, ISAL|ISLO|               ISAN|     ISGR|ISPR                },    /* a-umlaut */
 804     { 0x0300,                                    ISGR|ISPR                },    /* combining grave */
 805     { 0x0600,                                                        ISCN },    /* arabic number sign */
 806     { 0x0627, ISAL|                    ISAN|     ISGR|ISPR                },    /* alef */
 807     { 0x0663,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* arabic 3 */
 808     { 0x2002,                                         ISPR|ISSP|ISBL      },    /* en space */
 809     { 0x2007,                                         ISPR|ISSP|ISBL      },    /* figure space */
 810     { 0x2009,                                         ISPR|ISSP|ISBL      },    /* thin space */
 811     { 0x200b,                                                        ISCN },    /* ZWSP */
 812   /*{ 0x200b,                                         ISPR|ISSP           },*/    /* ZWSP */ /* ZWSP became a control char in 4.0.1*/
 813     { 0x200e,                                                        ISCN },    /* LRM */
 814     { 0x2028,                                         ISPR|ISSP|     ISCN },    /* LS */
 815     { 0x2029,                                         ISPR|ISSP|     ISCN },    /* PS */
 816     { 0x20ac,                                    ISGR|ISPR                },    /* Euro */
 817     { 0xff15,                ISDI|ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth 5 */
 818     { 0xff25, ISAL|     ISUP|     ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth E */
 819     { 0xff35, ISAL|     ISUP|          ISAN|     ISGR|ISPR                },    /* fullwidth U */
 820     { 0xff45, ISAL|ISLO|          ISXD|ISAN|     ISGR|ISPR                },    /* fullwidth e */
 821     { 0xff55, ISAL|ISLO|               ISAN|     ISGR|ISPR                }     /* fullwidth u */
 822 };
 823
 824 static void
 825 TestPOSIX() {
 826     uint32_t mask;
 827     int32_t cl, i;
 828     UBool expect;
 829
 830     mask=1;
 831     for(cl=0; cl<12; ++cl) {
 832         for(i=0; i<UPRV_LENGTHOF(posixData); ++i) {
 833             expect=(UBool)((posixData[i].posixResults&mask)!=0);
 834             if(posixClasses[cl].fn(posixData[i].c)!=expect) {
 835                 log_err("u_%s(U+%04x)=%s is wrong\n",
 836                     posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE");
 837             }
 838         }
 839         mask<<=1;
 840     }
 841 }
 842
 843 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */
 844 static void TestControlPrint()
 845 {
 846     const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b};
 847     const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2};
 848     const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014};
 849     const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b};
 850     UChar32 c;
 851
 852     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, UPRV_LENGTHOF(sampleControl), TRUE);
 853     testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, UPRV_LENGTHOF(sampleNonControl), FALSE);
 854
 855     testSampleCharProps(u_isprint, "u_isprint",
 856                         samplePrintable, UPRV_LENGTHOF(samplePrintable), TRUE);
 857     testSampleCharProps(u_isprint, "u_isprint",
 858                         sampleNonPrintable, UPRV_LENGTHOF(sampleNonPrintable), FALSE);
 859
 860     /* test all ISO 8 controls */
 861     for(c=0; c<=0x9f; ++c) {
 862         if(c==0x20) {
 863             /* skip ASCII graphic characters and continue with DEL */
 864             c=0x7f;
 865         }
 866         if(!u_iscntrl(c)) {
 867             log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c);
 868         }
 869         if(!u_isISOControl(c)) {
 870             log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c);
 871         }
 872         if(u_isprint(c)) {
 873             log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c);
 874         }
 875     }
 876
 877     /* test all Latin-1 graphic characters */
 878     for(c=0x20; c<=0xff; ++c) {
 879         if(c==0x7f) {
 880             c=0xa0;
 881         } else if(c==0xad) {
 882             /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */
 883             ++c;
 884         }
 885         if(!u_isprint(c)) {
 886             log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c);
 887         }
 888     }
 889 }
 890
 891 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/
 892 static void TestIdentifier()
 893 {
 894     const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f};
 895     const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082};
 896     const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045};
 897     const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020};
 898     const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061};
 899     const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019};
 900     const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045};
 901     const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020};
 902     const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85};
 903     const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061};
 904
 905     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
 906                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
 907     testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart",
 908                         sampleNonJavaIDStart, UPRV_LENGTHOF(sampleNonJavaIDStart), FALSE);
 909
 910     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
 911                         sampleJavaIDPart, UPRV_LENGTHOF(sampleJavaIDPart), TRUE);
 912     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
 913                         sampleNonJavaIDPart, UPRV_LENGTHOF(sampleNonJavaIDPart), FALSE);
 914
 915     /* IDPart should imply IDStart */
 916     testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart",
 917                         sampleJavaIDStart, UPRV_LENGTHOF(sampleJavaIDStart), TRUE);
 918
 919     testSampleCharProps(u_isIDStart, "u_isIDStart",
 920                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
 921     testSampleCharProps(u_isIDStart, "u_isIDStart",
 922                         sampleNonUnicodeIDStart, UPRV_LENGTHOF(sampleNonUnicodeIDStart), FALSE);
 923
 924     testSampleCharProps(u_isIDPart, "u_isIDPart",
 925                         sampleUnicodeIDPart, UPRV_LENGTHOF(sampleUnicodeIDPart), TRUE);
 926     testSampleCharProps(u_isIDPart, "u_isIDPart",
 927                         sampleNonUnicodeIDPart, UPRV_LENGTHOF(sampleNonUnicodeIDPart), FALSE);
 928
 929     /* IDPart should imply IDStart */
 930     testSampleCharProps(u_isIDPart, "u_isIDPart",
 931                         sampleUnicodeIDStart, UPRV_LENGTHOF(sampleUnicodeIDStart), TRUE);
 932
 933     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
 934                         sampleIDIgnore, UPRV_LENGTHOF(sampleIDIgnore), TRUE);
 935     testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable",
 936                         sampleNonIDIgnore, UPRV_LENGTHOF(sampleNonIDIgnore), FALSE);
 937 }
 938
 939 /* for each line of UnicodeData.txt, check some of the properties */
 940 typedef struct UnicodeDataContext {
 941 #if UCONFIG_NO_NORMALIZATION
 942     const void *dummy;
 943 #else
 944     const UNormalizer2 *nfc;
 945     const UNormalizer2 *nfkc;
 946 #endif
 947 } UnicodeDataContext;
 948
 949 /*
 950  * ### TODO
 951  * This test fails incorrectly if the First or Last code point of a repetitive area
 952  * is overridden, which is allowed and is encouraged for the PUAs.
 953  * Currently, this means that both area First/Last and override lines are
 954  * tested against the properties from the API,
 955  * and the area boundary will not match and cause an error.
 956  *
 957  * This function should detect area boundaries and skip them for the test of individual
 958  * code points' properties.
 959  * Then it should check that the areas contain all the same properties except where overridden.
 960  * For this, it would have had to set a flag for which code points were listed explicitly.
 961  */
 962 static void U_CALLCONV
 963 unicodeDataLineFn(void *context,
 964                   char *fields[][2], int32_t fieldCount,
 965                   UErrorCode *pErrorCode)
 966 {
 967     (void)fieldCount; // suppress compiler warnings about unused variable
 968     char buffer[100];
 969     const char *d;
 970     char *end;
 971     uint32_t value;
 972     UChar32 c;
 973     int32_t i;
 974     int8_t type;
 975     int32_t dt;
 976     UChar dm[32], s[32];
 977     int32_t dmLength, length;
 978
 979 #if !UCONFIG_NO_NORMALIZATION
 980     const UNormalizer2 *nfc, *nfkc;
 981 #endif
 982
 983     /* get the character code, field 0 */
 984     c=strtoul(fields[0][0], &end, 16);
 985     if(end<=fields[0][0] || end!=fields[0][1]) {
 986         log_err("error: syntax error in field 0 at %s\n", fields[0][0]);
 987         return;
 988     }
 989     if((uint32_t)c>=UCHAR_MAX_VALUE + 1) {
 990         log_err("error in UnicodeData.txt: code point %lu out of range\n", c);
 991         return;
 992     }
 993
 994     /* get general category, field 2 */
 995     *fields[2][1]=0;
 996     type = (int8_t)tagValues[MakeProp(fields[2][0])];
 997     if(u_charType(c)!=type) {
 998         log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type);
 999     }
1000     if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1001         log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1002     }
1003
1004     /* get canonical combining class, field 3 */
1005     value=strtoul(fields[3][0], &end, 10);
1006     if(end<=fields[3][0] || end!=fields[3][1]) {
1007         log_err("error: syntax error in field 3 at code 0x%lx\n", c);
1008         return;
1009     }
1010     if(value>255) {
1011         log_err("error in UnicodeData.txt: combining class %lu out of range\n", value);
1012         return;
1013     }
1014 #if !UCONFIG_NO_NORMALIZATION
1015     if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) {
1016         log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value);
1017     }
1018     nfkc=((UnicodeDataContext *)context)->nfkc;
1019     if(value!=unorm2_getCombiningClass(nfkc, c)) {
1020         log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value);
1021     }
1022 #endif
1023
1024     /* get BiDi category, field 4 */
1025     *fields[4][1]=0;
1026     i=MakeDir(fields[4][0]);
1027     if(i!=(int32_t)u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) {
1028         log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]);
1029     }
1030
1031     /* get Decomposition_Type & Decomposition_Mapping, field 5 */
1032     d=NULL;
1033     if(fields[5][0]==fields[5][1]) {
1034         /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */
1035         if(c==0xac00 || c==0xd7a3) {
1036             dt=U_DT_CANONICAL;
1037         } else {
1038             dt=U_DT_NONE;
1039         }
1040     } else {
1041         d=fields[5][0];
1042         *fields[5][1]=0;
1043         dt=UCHAR_INVALID_CODE;
1044         if(*d=='<') {
1045             end=strchr(++d, '>');
1046             if(end!=NULL) {
1047                 *end=0;
1048                 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d);
1049                 d=u_skipWhitespace(end+1);
1050             }
1051         } else {
1052             dt=U_DT_CANONICAL;
1053         }
1054     }
1055     if(dt>U_DT_NONE) {
1056         if(c==0xac00) {
1057             dm[0]=0x1100;
1058             dm[1]=0x1161;
1059             dm[2]=0;
1060             dmLength=2;
1061         } else if(c==0xd7a3) {
1062             dm[0]=0xd788;
1063             dm[1]=0x11c2;
1064             dm[2]=0;
1065             dmLength=2;
1066         } else {
1067             dmLength=u_parseString(d, dm, 32, NULL, pErrorCode);
1068         }
1069     } else {
1070         dmLength=-1;
1071     }
1072     if(dt<0 || U_FAILURE(*pErrorCode)) {
1073         log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c);
1074         return;
1075     }
1076 #if !UCONFIG_NO_NORMALIZATION
1077     i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE);
1078     if(i!=dt) {
1079         log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt);
1080     }
1081     /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */
1082     length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode);
1083     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1084         log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d "
1085                 "or the Decomposition_Mapping is different (%s)\n",
1086                 c, length, dmLength, u_errorName(*pErrorCode));
1087         return;
1088     }
1089     /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */
1090     if(dt!=U_DT_CANONICAL) {
1091         dmLength=-1;
1092     }
1093     nfc=((UnicodeDataContext *)context)->nfc;
1094     length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode);
1095     if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) {
1096         log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d "
1097                 "or the Decomposition_Mapping is different (%s)\n",
1098                 c, length, dmLength, u_errorName(*pErrorCode));
1099         return;
1100     }
1101     /* recompose */
1102     if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
1103         UChar32 a, b, composite;
1104         i=0;
1105         U16_NEXT(dm, i, dmLength, a);
1106         U16_NEXT(dm, i, dmLength, b);
1107         /* i==dmLength */
1108         composite=unorm2_composePair(nfc, a, b);
1109         if(composite!=c) {
1110             log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
1111                     (long)c, (long)a, (long)b, (long)composite);
1112         }
1113         /*
1114          * Note: NFKC has fewer round-trip mappings than NFC,
1115          * so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
1116          */
1117     }
1118 #endif
1119
1120     /* get ISO Comment, field 11 */
1121     *fields[11][1]=0;
1122     i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode);
1123     if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) {
1124         log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n",
1125             c, u_errorName(*pErrorCode),
1126             U_FAILURE(*pErrorCode) ? buffer : "[error]",
1127             fields[11][0]);
1128     }
1129
1130     /* get uppercase mapping, field 12 */
1131     if(fields[12][0]!=fields[12][1]) {
1132         value=strtoul(fields[12][0], &end, 16);
1133         if(end!=fields[12][1]) {
1134             log_err("error: syntax error in field 12 at code 0x%lx\n", c);
1135             return;
1136         }
1137         if((UChar32)value!=u_toupper(c)) {
1138             log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value);
1139         }
1140     } else {
1141         /* no case mapping: the API must map the code point to itself */
1142         if(c!=u_toupper(c)) {
1143             log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c));
1144         }
1145     }
1146
1147     /* get lowercase mapping, field 13 */
1148     if(fields[13][0]!=fields[13][1]) {
1149         value=strtoul(fields[13][0], &end, 16);
1150         if(end!=fields[13][1]) {
1151             log_err("error: syntax error in field 13 at code 0x%lx\n", c);
1152             return;
1153         }
1154         if((UChar32)value!=u_tolower(c)) {
1155             log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value);
1156         }
1157     } else {
1158         /* no case mapping: the API must map the code point to itself */
1159         if(c!=u_tolower(c)) {
1160             log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c));
1161         }
1162     }
1163
1164     /* get titlecase mapping, field 14 */
1165     if(fields[14][0]!=fields[14][1]) {
1166         value=strtoul(fields[14][0], &end, 16);
1167         if(end!=fields[14][1]) {
1168             log_err("error: syntax error in field 14 at code 0x%lx\n", c);
1169             return;
1170         }
1171         if((UChar32)value!=u_totitle(c)) {
1172             log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value);
1173         }
1174     } else {
1175         /* no case mapping: the API must map the code point to itself */
1176         if(c!=u_totitle(c)) {
1177             log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c));
1178         }
1179     }
1180 }
1181
1182 static UBool U_CALLCONV
1183 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1184     static const UChar32 test[][2]={
1185         {0x41, U_UPPERCASE_LETTER},
1186         {0x308, U_NON_SPACING_MARK},
1187         {0xfffe, U_GENERAL_OTHER_TYPES},
1188         {0xe0041, U_FORMAT_CHAR},
1189         {0xeffff, U_UNASSIGNED}
1190     };
1191
1192     int32_t i, count;
1193
1194     if(0!=strcmp((const char *)context, "a1")) {
1195         log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n");
1196         return FALSE;
1197     }
1198
1199     count=UPRV_LENGTHOF(test);
1200     for(i=0; i<count; ++i) {
1201         if(start<=test[i][0] && test[i][0]<limit) {
1202             if(type!=(UCharCategory)test[i][1]) {
1203                 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n",
1204                         start, limit, (long)type, test[i][0], test[i][1]);
1205             }
1206             /* stop at the range that includes the last test code point (increases code coverage for enumeration) */
1207             return i==(count-1) ? FALSE : TRUE;
1208         }
1209     }
1210
1211     if(start>test[count-1][0]) {
1212         log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n",
1213                 start, limit, (long)type);
1214         return FALSE;
1215     }
1216
1217     return TRUE;
1218 }
1219
1220 static UBool U_CALLCONV
1221 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1222     (void)context; // suppress compiler warnings about unused variable
1223
1224     /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */
1225     static const int32_t defaultBidi[][2]={ /* { limit, class } */
1226         { 0x0590, U_LEFT_TO_RIGHT },
1227         { 0x0600, U_RIGHT_TO_LEFT },
1228         { 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
1229         { 0x0860, U_RIGHT_TO_LEFT },
1230         { 0x0870, U_RIGHT_TO_LEFT_ARABIC },  // Unicode 10 changes U+0860..U+086F from R to AL.
1231         { 0x08A0, U_RIGHT_TO_LEFT },
1232         { 0x0900, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */
1233         { 0x20A0, U_LEFT_TO_RIGHT },
1234         { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR },  /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */
1235         { 0xFB1D, U_LEFT_TO_RIGHT },
1236         { 0xFB50, U_RIGHT_TO_LEFT },
1237         { 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
1238         { 0xFE70, U_LEFT_TO_RIGHT },
1239         { 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
1240
1241         { 0x10800, U_LEFT_TO_RIGHT },
1242         { 0x10D00, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10D00..U+10D3F from R to AL.
1243         { 0x10D40, U_RIGHT_TO_LEFT_ARABIC },
1244         { 0x10F30, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+10F30..U+10F6F from R to AL.
1245         { 0x10F70, U_RIGHT_TO_LEFT_ARABIC },
1246         { 0x11000, U_RIGHT_TO_LEFT },
1247
1248         { 0x1E800, U_LEFT_TO_RIGHT },  /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */
1249         { 0x1EC70, U_RIGHT_TO_LEFT },  // Unicode 11 changes U+1EC70..U+1ECBF from R to AL.
1250         { 0x1ECC0, U_RIGHT_TO_LEFT_ARABIC },
1251         { 0x1ED00, U_RIGHT_TO_LEFT },  // Unicode 12 changes U+1ED00..U+1ED4F from R to AL.
1252         { 0x1ED50, U_RIGHT_TO_LEFT_ARABIC },
1253         { 0x1EE00, U_RIGHT_TO_LEFT },
1254         { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC },  /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */
1255         { 0x1F000, U_RIGHT_TO_LEFT },
1256         { 0x110000, U_LEFT_TO_RIGHT }
1257     };
1258
1259     UChar32 c;
1260     int32_t i;
1261     UCharDirection shouldBeDir;
1262
1263     /*
1264      * LineBreak.txt specifies:
1265      *   #  - Assigned characters that are not listed explicitly are given the value
1266      *   #    "AL".
1267      *   #  - Unassigned characters are given the value "XX".
1268      *
1269      * PUA characters are listed explicitly with "XX".
1270      * Verify that no assigned character has "XX".
1271      */
1272     if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) {
1273         c=start;
1274         while(c<limit) {
1275             if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) {
1276                 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c);
1277             }
1278             ++c;
1279         }
1280     }
1281
1282     /*
1283      * Verify default Bidi classes.
1284      * See DerivedBidiClass.txt, especially for unassigned code points.
1285      */
1286     if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) {
1287         /* enumerate the intersections of defaultBidi ranges with [start..limit[ */
1288         c=start;
1289         for(i=0; i<UPRV_LENGTHOF(defaultBidi) && c<limit; ++i) {
1290             if((int32_t)c<defaultBidi[i][0]) {
1291                 while(c<limit && (int32_t)c<defaultBidi[i][0]) {
1292                     if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
1293                         shouldBeDir=U_BOUNDARY_NEUTRAL;
1294                     } else {
1295                         shouldBeDir=(UCharDirection)defaultBidi[i][1];
1296                     }
1297
1298                     if( u_charDirection(c)!=shouldBeDir ||
1299                         (UCharDirection)u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir
1300                     ) {
1301                         log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n",
1302                             c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]);
1303                     }
1304                     ++c;
1305                 }
1306             }
1307         }
1308     }
1309
1310     return TRUE;
1311 }
1312
1313 /* tests for several properties */
1314 static void TestUnicodeData()
1315 {
1316     UVersionInfo expectVersionArray;
1317     UVersionInfo versionArray;
1318     char *fields[15][2];
1319     UErrorCode errorCode;
1320     UChar32 c;
1321     int8_t type;
1322
1323     UnicodeDataContext context;
1324
1325     u_versionFromString(expectVersionArray, U_UNICODE_VERSION);
1326     u_getUnicodeVersion(versionArray);
1327     if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0)
1328     {
1329         log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n",
1330         versionArray[0], versionArray[1], versionArray[2], versionArray[3]);
1331     }
1332
1333 #if defined(ICU_UNICODE_VERSION)
1334     /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */
1335     if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION))
1336     {
1337          log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n");
1338     }
1339 #endif
1340
1341     if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) {
1342         log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041));
1343     }
1344
1345     errorCode=U_ZERO_ERROR;
1346 #if !UCONFIG_NO_NORMALIZATION
1347     context.nfc=unorm2_getNFCInstance(&errorCode);
1348     context.nfkc=unorm2_getNFKCInstance(&errorCode);
1349     if(U_FAILURE(errorCode)) {
1350         log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode));
1351         return;
1352     }
1353 #endif
1354     parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode);
1355     if(U_FAILURE(errorCode)) {
1356         return; /* if we couldn't parse UnicodeData.txt, we should return */
1357     }
1358
1359     /* sanity check on repeated properties */
1360     for(c=0xfffe; c<=0x10ffff;) {
1361         type=u_charType(c);
1362         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1363             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1364         }
1365         if(type!=U_UNASSIGNED) {
1366             log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
1367         }
1368         if((c&0xffff)==0xfffe) {
1369             ++c;
1370         } else {
1371             c+=0xffff;
1372         }
1373     }
1374
1375     /* test that PUA is not "unassigned" */
1376     for(c=0xe000; c<=0x10fffd;) {
1377         type=u_charType(c);
1378         if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) {
1379             log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c);
1380         }
1381         if(type==U_UNASSIGNED) {
1382             log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c);
1383         } else if(type!=U_PRIVATE_USE_CHAR) {
1384             log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type);
1385         }
1386         if(c==0xf8ff) {
1387             c=0xf0000;
1388         } else if(c==0xffffd) {
1389             c=0x100000;
1390         } else {
1391             ++c;
1392         }
1393     }
1394
1395     /* test u_enumCharTypes() */
1396     u_enumCharTypes(enumTypeRange, "a1");
1397
1398     /* check default properties */
1399     u_enumCharTypes(enumDefaultsRange, NULL);
1400 }
1401
1402 static void TestCodeUnit(){
1403     const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
1404
1405     int32_t i;
1406
1407     for(i=0; i<UPRV_LENGTHOF(codeunit); i++){
1408         UChar c=codeunit[i];
1409         if(i<4){
1410             if(!(U16_IS_SINGLE(c)) || (U16_IS_LEAD(c)) || (U16_IS_TRAIL(c)) ||
1411                     U16_IS_SURROGATE(c) || U_IS_SURROGATE(c)) {
1412                 log_err("ERROR: U+%04x is a single", c);
1413             }
1414
1415         }
1416         if(i >= 4 && i< 8){
1417             if(!(U16_IS_LEAD(c)) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c) ||
1418                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1419                 log_err("ERROR: U+%04x is a first surrogate", c);
1420             }
1421         }
1422         if(i >= 8 && i< 12){
1423             if(!(U16_IS_TRAIL(c)) || U16_IS_SINGLE(c) || U16_IS_LEAD(c) ||
1424                     !U16_IS_SURROGATE(c) || !U_IS_SURROGATE(c)){
1425                 log_err("ERROR: U+%04x is a second surrogate", c);
1426             }
1427         }
1428 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1429         if(i<4){
1430             if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){
1431                 log_err("ERROR: U+%04x is a single", c);
1432             }
1433
1434         }
1435         if(i >= 4 && i< 8){
1436             if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){
1437                 log_err("ERROR: U+%04x is a first surrogate", c);
1438             }
1439         }
1440         if(i >= 8 && i< 12){
1441             if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){
1442                 log_err("ERROR: U+%04x is a second surrogate", c);
1443             }
1444         }
1445 #endif
1446     }
1447 }
1448
1449 static void TestCodePoint(){
1450     const UChar32 codePoint[]={
1451         /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */
1452         0xd800,
1453         0xdbff,
1454         0xdc00,
1455         0xdfff,
1456         0xdc04,
1457         0xd821,
1458         /*not a surrogate, valid, isUnicodeChar , not Error*/
1459         0x20ac,
1460         0xd7ff,
1461         0xe000,
1462         0xe123,
1463         0x0061,
1464         0xe065,
1465         0x20402,
1466         0x24506,
1467         0x23456,
1468         0x20402,
1469         0x10402,
1470         0x23456,
1471         /*not a surrogate, not valid, isUnicodeChar, isError */
1472         0x0015,
1473         0x009f,
1474         /*not a surrogate, not valid, not isUnicodeChar, isError */
1475         0xffff,
1476         0xfffe,
1477     };
1478     int32_t i;
1479     for(i=0; i<UPRV_LENGTHOF(codePoint); i++) {
1480         UChar32 c=codePoint[i];
1481         if(i<6) {
1482             if(!U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)) {
1483                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1484             }
1485             if(U_IS_UNICODE_CHAR(c)) {
1486                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1487             }
1488         } else if(i >=6 && i<18) {
1489             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1490                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1491             }
1492             if(!U_IS_UNICODE_CHAR(c)) {
1493                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1494             }
1495         } else if(i >=18 && i<20) {
1496             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1497                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1498             }
1499             if(!U_IS_UNICODE_CHAR(c)) {
1500                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1501             }
1502         } else if(i >=18 && i<UPRV_LENGTHOF(codePoint)) {
1503             if(U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)) {
1504                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1505             }
1506             if(U_IS_UNICODE_CHAR(c)) {
1507                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1508             }
1509         }
1510 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1511         if(i<6){
1512             if(!UTF_IS_SURROGATE(c)){
1513                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1514             }
1515             if(UTF_IS_VALID(c)){
1516                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1517             }
1518             if(UTF_IS_UNICODE_CHAR(c)){
1519                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1520             }
1521             if(UTF_IS_ERROR(c)){
1522                 log_err("ERROR: isError() failed for U+%04x\n", c);
1523             }
1524         }else if(i >=6 && i<18){
1525             if(UTF_IS_SURROGATE(c)){
1526                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1527             }
1528             if(!UTF_IS_VALID(c)){
1529                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1530             }
1531             if(!UTF_IS_UNICODE_CHAR(c)){
1532                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1533             }
1534             if(UTF_IS_ERROR(c)){
1535                 log_err("ERROR: isError() failed for U+%04x\n", c);
1536             }
1537         }else if(i >=18 && i<20){
1538             if(UTF_IS_SURROGATE(c)){
1539                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1540             }
1541             if(UTF_IS_VALID(c)){
1542                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1543             }
1544             if(!UTF_IS_UNICODE_CHAR(c)){
1545                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1546             }
1547             if(!UTF_IS_ERROR(c)){
1548                 log_err("ERROR: isError() failed for U+%04x\n", c);
1549             }
1550         }
1551         else if(i >=18 && i<UPRV_LENGTHOF(codePoint)){
1552             if(UTF_IS_SURROGATE(c)){
1553                 log_err("ERROR: isSurrogate() failed for U+%04x\n", c);
1554             }
1555             if(UTF_IS_VALID(c)){
1556                 log_err("ERROR: isValid() failed for U+%04x\n", c);
1557             }
1558             if(UTF_IS_UNICODE_CHAR(c)){
1559                 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c);
1560             }
1561             if(!UTF_IS_ERROR(c)){
1562                 log_err("ERROR: isError() failed for U+%04x\n", c);
1563             }
1564         }
1565 #endif
1566     }
1567
1568     if(
1569         !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) ||
1570         !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) ||
1571         U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) ||
1572         U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff)
1573     ) {
1574         log_err("error with U_IS_BMP()\n");
1575     }
1576
1577     if(
1578         U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) ||
1579         U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) ||
1580         U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) ||
1581         !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff)
1582     ) {
1583         log_err("error with U_IS_SUPPLEMENTARY()\n");
1584     }
1585 }
1586
1587 static void TestCharLength()
1588 {
1589     const int32_t codepoint[]={
1590         1, 0x0061,
1591         1, 0xe065,
1592         1, 0x20ac,
1593         2, 0x20402,
1594         2, 0x23456,
1595         2, 0x24506,
1596         2, 0x20402,
1597         2, 0x10402,
1598         1, 0xd7ff,
1599         1, 0xe000
1600     };
1601
1602     int32_t i;
1603 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1604     UBool multiple;
1605 #endif
1606     for(i=0; i<UPRV_LENGTHOF(codepoint); i=(int16_t)(i+2)){
1607         UChar32 c=codepoint[i+1];
1608         if(
1609 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1610                 UTF_CHAR_LENGTH(c) != codepoint[i] ||
1611 #endif
1612                 U16_LENGTH(c) != codepoint[i]) {
1613             log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c));
1614         }
1615 #if !U_HIDE_OBSOLETE_UTF_OLD_H
1616         multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE);
1617         if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){
1618             log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c);
1619         }
1620 #endif
1621     }
1622 }
1623
1624 /*internal functions ----*/
1625 static int32_t MakeProp(char* str)
1626 {
1627     int32_t result = 0;
1628     char* matchPosition =0;
1629
1630     matchPosition = strstr(tagStrings, str);
1631     if (matchPosition == 0)
1632     {
1633         log_err("unrecognized type letter ");
1634         log_err(str);
1635     }
1636     else
1637         result = (int32_t)((matchPosition - tagStrings) / 2);
1638     return result;
1639 }
1640
1641 static int32_t MakeDir(char* str)
1642 {
1643     int32_t pos = 0;
1644     for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) {
1645         if (strcmp(str, dirStrings[pos]) == 0) {
1646             return pos;
1647         }
1648     }
1649     return -1;
1650 }
1651
1652 /* test u_charName() -------------------------------------------------------- */
1653
/*
 * Expected character names, keyed by code point.
 * An empty string means that no name of that kind is expected;
 * alias is NULL where the entry has no name alias.
 */
static const struct {
    uint32_t code;          /* code point under test */
    const char *name;       /* modern Unicode character name */
    const char *oldName;    /* Unicode 1.0 name */
    const char *extName;    /* extended name */
    const char *alias;      /* name alias, or NULL */
} names[]={
    {
        0x0061,
        "LATIN SMALL LETTER A",
        "",
        "LATIN SMALL LETTER A",
        NULL
    },
    {
        0x01a2,
        "LATIN CAPITAL LETTER OI",
        "",
        "LATIN CAPITAL LETTER OI",
        "LATIN CAPITAL LETTER GHA"
    },
    {
        0x0284,
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
        "",
        "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
        NULL
    },
    {
        0x0fd0,
        "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
        "",
        "TIBETAN MARK BSKA- SHOG GI MGO RGYAN",
        "TIBETAN MARK BKA- SHOG GI MGO RGYAN"
    },
    {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401", NULL},
    {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED", NULL},
    {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA", NULL},
    {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH", NULL},
    /* code points without a modern name only have an extended name */
    {0xd800, "", "", "<lead surrogate-D800>", NULL},
    {0xdc00, "", "", "<trail surrogate-DC00>", NULL},
    {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS", NULL},
    {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN", NULL},
    {0xffff, "", "", "<noncharacter-FFFF>", NULL},
    {
        0x1d0c5,
        "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
        "",
        "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS",
        "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"
    },
    {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456", NULL}
};
1681
1682 static UBool
1683 enumCharNamesFn(void *context,
1684                 UChar32 code, UCharNameChoice nameChoice,
1685                 const char *name, int32_t length) {
1686     int32_t *pCount=(int32_t *)context;
1687     const char *expected;
1688     int i;
1689
1690     if(length<=0 || length!=(int32_t)strlen(name)) {
1691         /* should not be called with an empty string or invalid length */
1692         log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length);
1693         return TRUE;
1694     }
1695
1696     ++*pCount;
1697     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1698         if(code==(UChar32)names[i].code) {
1699             switch (nameChoice) {
1700                 case U_EXTENDED_CHAR_NAME:
1701                     if(0!=strcmp(name, names[i].extName)) {
1702                         log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName);
1703                     }
1704                     break;
1705                 case U_UNICODE_CHAR_NAME:
1706                     if(0!=strcmp(name, names[i].name)) {
1707                         log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name);
1708                     }
1709                     break;
1710                 case U_UNICODE_10_CHAR_NAME:
1711                     expected=names[i].oldName;
1712                     if(expected[0]==0 || 0!=strcmp(name, expected)) {
1713                         log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected);
1714                     }
1715                     break;
1716                 case U_CHAR_NAME_ALIAS:
1717                     expected=names[i].alias;
1718                     if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) {
1719                         log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected);
1720                     }
1721                     break;
1722                 case U_CHAR_NAME_CHOICE_COUNT:
1723                     break;
1724             }
1725             break;
1726         }
1727     }
1728     return TRUE;
1729 }
1730
1731 struct enumExtCharNamesContext {
1732     uint32_t length;
1733     int32_t last;
1734 };
1735
1736 static UBool
1737 enumExtCharNamesFn(void *context,
1738                 UChar32 code, UCharNameChoice nameChoice,
1739                 const char *name, int32_t length) {
1740     struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context;
1741
1742     if (ecncp->last != (int32_t) code - 1) {
1743         if (ecncp->last < 0) {
1744             log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1);
1745         } else {
1746             log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code);
1747         }
1748     }
1749     ecncp->last = (int32_t) code;
1750
1751     if (!*name) {
1752         log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code);
1753     }
1754
1755     return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length);
1756 }
1757
1758 /**
1759  * This can be made more efficient by moving it into putil.c and having
1760  * it directly access the ebcdic translation tables.
1761  * TODO: If we get this method in putil.c, then delete it from here.
1762  */
1763 static UChar
1764 u_charToUChar(char c) {
1765     UChar uc;
1766     u_charsToUChars(&c, &uc, 1);
1767     return uc;
1768 }
1769
1770 static void
1771 TestCharNames() {
1772     static char name[80];
1773     UErrorCode errorCode=U_ZERO_ERROR;
1774     struct enumExtCharNamesContext extContext;
1775     const char *expected;
1776     int32_t length;
1777     UChar32 c;
1778     int32_t i;
1779
1780     log_verbose("Testing uprv_getMaxCharNameLength()\n");
1781     length=uprv_getMaxCharNameLength();
1782     if(length==0) {
1783         /* no names data available */
1784         return;
1785     }
1786     if(length<83) { /* Unicode 3.2 max char name length */
1787         log_err("uprv_getMaxCharNameLength()=%d is too short");
1788     }
1789     /* ### TODO same tests for max ISO comment length as for max name length */
1790
1791     log_verbose("Testing u_charName()\n");
1792     for(i=0; i<UPRV_LENGTHOF(names); ++i) {
1793         /* modern Unicode character name */
1794         length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode);
1795         if(U_FAILURE(errorCode)) {
1796             log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode));
1797             return;
1798         }
1799         if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) {
1800             log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name);
1801         }
1802
1803         /* find the modern name */
1804         if (*names[i].name) {
1805             c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode);
1806             if(U_FAILURE(errorCode)) {
1807                 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode));
1808                 return;
1809             }
1810             if(c!=(UChar32)names[i].code) {
1811                 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code);
1812             }
1813         }
1814
1815         /* Unicode 1.0 character name */
1816         length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode);
1817         if(U_FAILURE(errorCode)) {
1818             log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode));
1819             return;
1820         }
1821         if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) {
1822             log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName);
1823         }
1824
1825         /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */
1826         if(names[i].oldName[0]!=0 /* && length>0 */) {
1827             c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode);
1828             if(U_FAILURE(errorCode)) {
1829                 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode));
1830                 return;
1831             }
1832             if(c!=(UChar32)names[i].code) {
1833                 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code);
1834             }
1835         }
1836
1837         /* Unicode character name alias */
1838         length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode);
1839         if(U_FAILURE(errorCode)) {
1840             log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode));
1841             return;
1842         }
1843         expected=names[i].alias;
1844         if(expected==NULL) {
1845             expected="";
1846         }
1847         if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) {
1848             log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n",
1849                     names[i].code, name, length, expected);
1850         }
1851
1852         /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */
1853         if(expected[0]!=0 /* && length>0 */) {
1854             c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode);
1855             if(U_FAILURE(errorCode)) {
1856                 log_err("u_charFromName(%s - alias) error %s\n",
1857                         expected, u_errorName(errorCode));
1858                 return;
1859             }
1860             if(c!=(UChar32)names[i].code) {
1861                 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n",
1862                         expected, c, names[i].code);
1863             }
1864         }
1865     }
1866
1867     /* test u_enumCharNames() */
1868     length=0;
1869     errorCode=U_ZERO_ERROR;
1870     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode);
1871     if(U_FAILURE(errorCode) || length<94140) {
1872         log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length);
1873     }
1874
1875     extContext.length = 0;
1876     extContext.last = -1;
1877     errorCode=U_ZERO_ERROR;
1878     u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode);
1879     if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) {
1880         log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length);
1881     }
1882
1883     /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */
1884     if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) {
1885         log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode));
1886     }
1887
1888     /* Test getCharNameCharacters */
1889     if(!getTestOption(QUICK_OPTION)) {
1890         enum { BUFSIZE = 256 };
1891         UErrorCode ec = U_ZERO_ERROR;
1892         char buf[BUFSIZE];
1893         int32_t maxLength;
1894         UChar32 cp;
1895         UChar pat[BUFSIZE], dumbPat[BUFSIZE];
1896         int32_t l1, l2;
1897         UBool map[256];
1898         UBool ok;
1899
1900         USet* set = uset_open(1, 0); /* empty set */
1901         USet* dumb = uset_open(1, 0); /* empty set */
1902
1903         /*
1904          * uprv_getCharNameCharacters() will likely return more lowercase
1905          * letters than actual character names contain because
1906          * it includes all the characters in lowercased names of
1907          * general categories, for the full possible set of extended names.
1908          */
1909         {
1910             USetAdder sa={
1911                 NULL,
1912                 uset_add,
1913                 uset_addRange,
1914                 uset_addString,
1915                 NULL, /* don't need remove() */
1916                 NULL  /* don't need removeRange() */
1917             };
1918             sa.set=set;
1919             uprv_getCharNameCharacters(&sa);
1920         }
1921
1922         /* build set the dumb (but sure-fire) way */
1923         for (i=0; i<256; ++i) {
1924             map[i] = FALSE;
1925         }
1926
1927         maxLength=0;
1928         for (cp=0; cp<0x110000; ++cp) {
1929             int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME,
1930                                      buf, BUFSIZE, &ec);
1931             if (U_FAILURE(ec)) {
1932                 log_err("FAIL: u_charName failed when it shouldn't\n");
1933                 uset_close(set);
1934                 uset_close(dumb);
1935                 return;
1936             }
1937             if(len>maxLength) {
1938                 maxLength=len;
1939             }
1940
1941             for (i=0; i<len; ++i) {
1942                 if (!map[(uint8_t) buf[i]]) {
1943                     uset_add(dumb, (UChar32)u_charToUChar(buf[i]));
1944                     map[(uint8_t) buf[i]] = TRUE;
1945                 }
1946             }
1947
1948             /* test for leading/trailing whitespace */
1949             if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') {
1950                 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp);
1951             }
1952         }
1953
1954         if(map[(uint8_t)'\t']) {
1955             log_err("u_charName() returned a name with a TAB for some code point\n", cp);
1956         }
1957
1958         length=uprv_getMaxCharNameLength();
1959         if(length!=maxLength) {
1960             log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n",
1961                     length, maxLength);
1962         }
1963
1964         /* compare the sets.  Where is my uset_equals?!! */
1965         ok=TRUE;
1966         for(i=0; i<256; ++i) {
1967             if(uset_contains(set, i)!=uset_contains(dumb, i)) {
1968                 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) {
1969                     /* ignore lowercase a-z that are in set but not in dumb */
1970                     ok=TRUE;
1971                 } else {
1972                     ok=FALSE;
1973                     break;
1974                 }
1975             }
1976         }
1977
1978         l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec);
1979         l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec);
1980         if (U_FAILURE(ec)) {
1981             log_err("FAIL: uset_toPattern failed when it shouldn't\n");
1982             uset_close(set);
1983             uset_close(dumb);
1984             return;
1985         }
1986
1987         if (l1 >= BUFSIZE) {
1988             l1 = BUFSIZE-1;
1989             pat[l1] = 0;
1990         }
1991         if (l2 >= BUFSIZE) {
1992             l2 = BUFSIZE-1;
1993             dumbPat[l2] = 0;
1994         }
1995
1996         if (!ok) {
1997             log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n",
1998                     aescstrdup(pat, l1), aescstrdup(dumbPat, l2));
1999         } else if(getTestOption(VERBOSITY_OPTION)) {
2000             log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1));
2001         }
2002
2003         uset_close(set);
2004         uset_close(dumb);
2005     }
2006
2007     /* ### TODO: test error cases and other interesting things */
2008 }
2009
2010 static void
2011 TestUCharFromNameUnderflow() {
2012     // Ticket #10889: Underflow crash when there is no dash.
2013     const char *name="<NO BREAK SPACE>";
2014     UErrorCode errorCode=U_ZERO_ERROR;
2015     UChar32 c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2016     if(U_SUCCESS(errorCode)) {
2017         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2018                 name, c, u_errorName(errorCode));
2019     }
2020
2021     // Test related edge cases.
2022     name="<-00a0>";
2023     errorCode=U_ZERO_ERROR;
2024     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2025     if(U_SUCCESS(errorCode)) {
2026         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2027                 name, c, u_errorName(errorCode));
2028     }
2029
2030     errorCode=U_ZERO_ERROR;
2031     name="<control->";
2032     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2033     if(U_SUCCESS(errorCode)) {
2034         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2035                 name, c, u_errorName(errorCode));
2036     }
2037
2038     errorCode=U_ZERO_ERROR;
2039     name="<control-111111>";
2040     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2041     if(U_SUCCESS(errorCode)) {
2042         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2043                 name, c, u_errorName(errorCode));
2044     }
2045
2046     // ICU-20292: integer overflow
2047     errorCode=U_ZERO_ERROR;
2048     name="<noncharacter-10010FFFF>";
2049     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2050     if(U_SUCCESS(errorCode)) {
2051         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2052                 name, c, u_errorName(errorCode));
2053     }
2054
2055     errorCode=U_ZERO_ERROR;
2056     name="<noncharacter-00010FFFF>";  // too many digits even if only leading 0s
2057     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2058     if(U_SUCCESS(errorCode)) {
2059         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2060                 name, c, u_errorName(errorCode));
2061     }
2062
2063     errorCode=U_ZERO_ERROR;
2064     name="<noncharacter-fFFf>>";
2065     c=u_charFromName(U_EXTENDED_CHAR_NAME, name, &errorCode);
2066     if(U_SUCCESS(errorCode)) {
2067         log_err("u_charFromName(%s) = U+%04x but should fail - %s\n",
2068                 name, c, u_errorName(errorCode));
2069     }
2070 }
2071
2072 /* test u_isMirrored() and u_charMirror() ----------------------------------- */
2073
2074 static void
2075 TestMirroring() {
2076     USet *set;
2077     UErrorCode errorCode;
2078
2079     UChar32 start, end, c2, c3;
2080     int32_t i;
2081
2082     U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2083
2084     U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17);
2085
2086     log_verbose("Testing u_isMirrored()\n");
2087     if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) &&
2088          !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400)
2089         )
2090     ) {
2091         log_err("u_isMirrored() does not work correctly\n");
2092     }
2093
2094     log_verbose("Testing u_charMirror()\n");
2095     if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 &&
2096          u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */
2097          u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab &&
2098          /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2099          u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d
2100          )
2101     ) {
2102         log_err("u_charMirror() does not work correctly\n");
2103     }
2104
2105     /* verify that Bidi_Mirroring_Glyph roundtrips */
2106     errorCode=U_ZERO_ERROR;
2107     set=uset_openPattern(mirroredPattern, 17, &errorCode);
2108
2109     if (U_FAILURE(errorCode)) {
2110         log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n");
2111     } else {
2112         for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) {
2113             do {
2114                 c2=u_charMirror(start);
2115                 c3=u_charMirror(c2);
2116                 if(c3!=start) {
2117                     log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3);
2118                 }
2119                 c3=u_getBidiPairedBracket(start);
2120                 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) {
2121                     if(c3!=start) {
2122                         log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n",
2123                                 (long)start);
2124                     }
2125                 } else {
2126                     if(c3!=c2) {
2127                         log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n",
2128                                 (long)start, (long)c2);
2129                     }
2130                 }
2131             } while(++start<=end);
2132         }
2133     }
2134
2135     uset_close(set);
2136 }
2137
2138
2139 struct RunTestData
2140 {
2141     const char *runText;
2142     UScriptCode runCode;
2143 };
2144
2145 typedef struct RunTestData RunTestData;
2146
2147 static void
2148 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns,
2149                 const char *prefix)
2150 {
2151     int32_t run, runStart, runLimit;
2152     UScriptCode runCode;
2153
2154     /* iterate over all the runs */
2155     run = 0;
2156     while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) {
2157         if (runStart != runStarts[run]) {
2158             log_err("%s: incorrect start offset for run %d: expected %d, got %d\n",
2159                 prefix, run, runStarts[run], runStart);
2160         }
2161
2162         if (runLimit != runStarts[run + 1]) {
2163             log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n",
2164                 prefix, run, runStarts[run + 1], runLimit);
2165         }
2166
2167         if (runCode != testData[run].runCode) {
2168             log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n",
2169                 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode));
2170         }
2171
2172         run += 1;
2173
2174         /* stop when we've seen all the runs we expect to see */
2175         if (run >= nRuns) {
2176             break;
2177         }
2178     }
2179
2180     /* Complain if we didn't see then number of runs we expected */
2181     if (run != nRuns) {
2182         log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns);
2183     }
2184 }
2185
2186 static void
2187 TestUScriptRunAPI()
2188 {
2189     static const RunTestData testData1[] = {
2190         {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI},
2191         {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC},
2192         {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC},
2193         {"English (", USCRIPT_LATIN},
2194         {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI},
2195         {") ", USCRIPT_LATIN},
2196         {"\\u6F22\\u5B75", USCRIPT_HAN},
2197         {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA},
2198         {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA},
2199         {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET}
2200     };
2201
2202     static const RunTestData testData2[] = {
2203        {"((((((((((abc))))))))))", USCRIPT_LATIN}
2204     };
2205
2206     static const struct {
2207       const RunTestData *testData;
2208       int32_t nRuns;
2209     } testDataEntries[] = {
2210         {testData1, UPRV_LENGTHOF(testData1)},
2211         {testData2, UPRV_LENGTHOF(testData2)}
2212     };
2213
2214     static const int32_t nTestEntries = UPRV_LENGTHOF(testDataEntries);
2215     int32_t testEntry;
2216
2217     for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) {
2218         UChar testString[1024];
2219         int32_t runStarts[256];
2220         int32_t nTestRuns = testDataEntries[testEntry].nRuns;
2221         const RunTestData *testData = testDataEntries[testEntry].testData;
2222
2223         int32_t run, stringLimit;
2224         UScriptRun *scriptRun = NULL;
2225         UErrorCode err;
2226
2227         /*
2228          * Fill in the test string and the runStarts array.
2229          */
2230         stringLimit = 0;
2231         for (run = 0; run < nTestRuns; run += 1) {
2232             runStarts[run] = stringLimit;
2233             stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit);
2234             /*stringLimit -= 1;*/
2235         }
2236
2237         /* The limit of the last run */
2238         runStarts[nTestRuns] = stringLimit;
2239
2240         /*
2241          * Make sure that calling uscript_OpenRun with a NULL text pointer
2242          * and a non-zero text length returns the correct error.
2243          */
2244         err = U_ZERO_ERROR;
2245         scriptRun = uscript_openRun(NULL, stringLimit, &err);
2246
2247         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2248             log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2249         }
2250
2251         if (scriptRun != NULL) {
2252             log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n");
2253             uscript_closeRun(scriptRun);
2254         }
2255
2256         /*
2257          * Make sure that calling uscript_OpenRun with a non-NULL text pointer
2258          * and a zero text length returns the correct error.
2259          */
2260         err = U_ZERO_ERROR;
2261         scriptRun = uscript_openRun(testString, 0, &err);
2262
2263         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2264             log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2265         }
2266
2267         if (scriptRun != NULL) {
2268             log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n");
2269             uscript_closeRun(scriptRun);
2270         }
2271
2272         /*
2273          * Make sure that calling uscript_openRun with a NULL text pointer
2274          * and a zero text length doesn't return an error.
2275          */
2276         err = U_ZERO_ERROR;
2277         scriptRun = uscript_openRun(NULL, 0, &err);
2278
2279         if (U_FAILURE(err)) {
2280             log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err));
2281         }
2282
2283         /* Make sure that the empty iterator doesn't find any runs */
2284         if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) {
2285             log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n");
2286         }
2287
2288         /*
2289          * Make sure that calling uscript_setRunText with a NULL text pointer
2290          * and a non-zero text length returns the correct error.
2291          */
2292         err = U_ZERO_ERROR;
2293         uscript_setRunText(scriptRun, NULL, stringLimit, &err);
2294
2295         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2296             log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2297         }
2298
2299         /*
2300          * Make sure that calling uscript_setRunText with a non-NULL text pointer
2301          * and a zero text length returns the correct error.
2302          */
2303         err = U_ZERO_ERROR;
2304         uscript_setRunText(scriptRun, testString, 0, &err);
2305
2306         if (err != U_ILLEGAL_ARGUMENT_ERROR) {
2307             log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err));
2308         }
2309
2310         /*
2311          * Now call uscript_setRunText on the empty iterator
2312          * and make sure that it works.
2313          */
2314         err = U_ZERO_ERROR;
2315         uscript_setRunText(scriptRun, testString, stringLimit, &err);
2316
2317         if (U_FAILURE(err)) {
2318             log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err));
2319         } else {
2320             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText");
2321         }
2322
2323         uscript_closeRun(scriptRun);
2324
2325         /*
2326          * Now open an iterator over the testString
2327          * using uscript_openRun and make sure that it works
2328          */
2329         scriptRun = uscript_openRun(testString, stringLimit, &err);
2330
2331         if (U_FAILURE(err)) {
2332             log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err));
2333         } else {
2334             CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun");
2335         }
2336
2337         /* Now reset the iterator, and make sure
2338          * that it still works.
2339          */
2340         uscript_resetRun(scriptRun);
2341
2342         CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun");
2343
2344         /* Close the iterator */
2345         uscript_closeRun(scriptRun);
2346     }
2347 }
2348
2349 /* test additional, non-core properties: u_charAge(), int property min/max values, u_hasBinaryProperty(), u_getIntPropertyValue(), u_isU*() */
2350 static void
2351 TestAdditionalProperties(void) {
2352     /* test data for u_charAge(): a code point and the version that u_charAge() must report for it */
2353     static const struct {
2354         UChar32 c;
2355         UVersionInfo version;
2356     } charAges[]={
2357         {0x41,    { 1, 1, 0, 0 }},
2358         {0xffff,  { 1, 1, 0, 0 }},
2359         {0x20ab,  { 2, 0, 0, 0 }},
2360         {0x2fffe, { 2, 0, 0, 0 }},
2361         {0x20ac,  { 2, 1, 0, 0 }},
2362         {0xfb1d,  { 3, 0, 0, 0 }},
2363         {0x3f4,   { 3, 1, 0, 0 }},
2364         {0x10300, { 3, 1, 0, 0 }},
2365         {0x220,   { 3, 2, 0, 0 }},
2366         {0xff60,  { 3, 2, 0, 0 }}
2367     };
2368
2369     /* test data for u_hasBinaryProperty() and u_getIntPropertyValue(); a row with code point -1 is a version break */
2370     static const int32_t
2371     props[][3]={ /* code point, property, value */
2372         { 0x0627, UCHAR_ALPHABETIC, TRUE },
2373         { 0x1034a, UCHAR_ALPHABETIC, TRUE },
2374         { 0x2028, UCHAR_ALPHABETIC, FALSE },
2375
2376         { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE },
2377         { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE },
2378
2379         { 0x202c, UCHAR_BIDI_CONTROL, TRUE },
2380         { 0x202f, UCHAR_BIDI_CONTROL, FALSE },
2381
2382         { 0x003c, UCHAR_BIDI_MIRRORED, TRUE },
2383         { 0x003d, UCHAR_BIDI_MIRRORED, FALSE },
2384
2385         /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */
2386         { 0x2018, UCHAR_BIDI_MIRRORED, FALSE },
2387         { 0x201d, UCHAR_BIDI_MIRRORED, FALSE },
2388         { 0x201f, UCHAR_BIDI_MIRRORED, FALSE },
2389         { 0x301e, UCHAR_BIDI_MIRRORED, FALSE },
2390
2391         { 0x058a, UCHAR_DASH, TRUE },
2392         { 0x007e, UCHAR_DASH, FALSE },
2393
2394         { 0x0c4d, UCHAR_DIACRITIC, TRUE },
2395         { 0x3000, UCHAR_DIACRITIC, FALSE },
2396
2397         { 0x0e46, UCHAR_EXTENDER, TRUE },
2398         { 0x0020, UCHAR_EXTENDER, FALSE },
2399
2400 #if !UCONFIG_NO_NORMALIZATION
2401         { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2402         { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE },
2403         { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE },
2404
2405         { 0x110a, UCHAR_NFD_INERT, TRUE },      /* Jamo L */
2406         { 0x0308, UCHAR_NFD_INERT, FALSE },
2407
2408         { 0x1164, UCHAR_NFKD_INERT, TRUE },     /* Jamo V */
2409         { 0x1d79d, UCHAR_NFKD_INERT, FALSE },   /* math compat version of xi */
2410
2411         { 0x0021, UCHAR_NFC_INERT, TRUE },      /* ! */
2412         { 0x0061, UCHAR_NFC_INERT, FALSE },     /* a */
2413         { 0x00e4, UCHAR_NFC_INERT, FALSE },     /* a-umlaut */
2414         { 0x0102, UCHAR_NFC_INERT, FALSE },     /* a-breve */
2415         { 0xac1c, UCHAR_NFC_INERT, FALSE },     /* Hangul LV */
2416         { 0xac1d, UCHAR_NFC_INERT, TRUE },      /* Hangul LVT */
2417
2418         { 0x1d79d, UCHAR_NFKC_INERT, FALSE },   /* math compat version of xi */
2419         { 0x2a6d6, UCHAR_NFKC_INERT, TRUE },    /* Han, last of CJK ext. B */
2420
2421         { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE },
2422         { 0x0308, UCHAR_SEGMENT_STARTER, FALSE },
2423         { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */
2424         { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */
2425         { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */
2426         { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */
2427 #endif
2428
2429         { 0x0044, UCHAR_HEX_DIGIT, TRUE },
2430         { 0xff46, UCHAR_HEX_DIGIT, TRUE },
2431         { 0x0047, UCHAR_HEX_DIGIT, FALSE },
2432
2433         { 0x30fb, UCHAR_HYPHEN, TRUE },
2434         { 0xfe58, UCHAR_HYPHEN, FALSE },
2435
2436         { 0x2172, UCHAR_ID_CONTINUE, TRUE },
2437         { 0x0307, UCHAR_ID_CONTINUE, TRUE },
2438         { 0x005c, UCHAR_ID_CONTINUE, FALSE },
2439
2440         { 0x2172, UCHAR_ID_START, TRUE },
2441         { 0x007a, UCHAR_ID_START, TRUE },
2442         { 0x0039, UCHAR_ID_START, FALSE },
2443
2444         { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE },
2445         { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE },
2446         { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE },
2447
2448         { 0x200c, UCHAR_JOIN_CONTROL, TRUE },
2449         { 0x2029, UCHAR_JOIN_CONTROL, FALSE },
2450
2451         { 0x1d7bc, UCHAR_LOWERCASE, TRUE },
2452         { 0x0345, UCHAR_LOWERCASE, TRUE },
2453         { 0x0030, UCHAR_LOWERCASE, FALSE },
2454
2455         { 0x1d7a9, UCHAR_MATH, TRUE },
2456         { 0x2135, UCHAR_MATH, TRUE },
2457         { 0x0062, UCHAR_MATH, FALSE },
2458
2459         { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2460         { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE },
2461         { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE },
2462
2463         { 0x0022, UCHAR_QUOTATION_MARK, TRUE },
2464         { 0xff62, UCHAR_QUOTATION_MARK, TRUE },
2465         { 0xd840, UCHAR_QUOTATION_MARK, FALSE },
2466
2467         { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE },
2468         { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE },
2469
2470         { 0x1d44a, UCHAR_UPPERCASE, TRUE },
2471         { 0x2162, UCHAR_UPPERCASE, TRUE },
2472         { 0x0345, UCHAR_UPPERCASE, FALSE },
2473
2474         { 0x0020, UCHAR_WHITE_SPACE, TRUE },
2475         { 0x202f, UCHAR_WHITE_SPACE, TRUE },
2476         { 0x3001, UCHAR_WHITE_SPACE, FALSE },
2477
2478         { 0x0711, UCHAR_XID_CONTINUE, TRUE },
2479         { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE },
2480         { 0x007c, UCHAR_XID_CONTINUE, FALSE },
2481
2482         { 0x16ee, UCHAR_XID_START, TRUE },
2483         { 0x23456, UCHAR_XID_START, TRUE },
2484         { 0x1d1aa, UCHAR_XID_START, FALSE },
2485
2486         /*
2487          * Version break:
2488          * The following properties are only supported starting with the
2489          * Unicode version indicated in the second field (major<<8 | minor<<4 | update, like uVersion below).
2490          */
2491         { -1, 0x320, 0 }, /* version break for Unicode 3.2 */
2492
2493         { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2494         { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE },
2495         { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE },
2496
2497         { 0x0149, UCHAR_DEPRECATED, TRUE },         /* changed in Unicode 5.2 */
2498         { 0x0341, UCHAR_DEPRECATED, FALSE },        /* changed in Unicode 5.2 */
2499         { 0xe0001, UCHAR_DEPRECATED, TRUE },        /* changed from Unicode 5 to 5.1 */
2500         { 0xe0100, UCHAR_DEPRECATED, FALSE },
2501
2502         { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE },
2503         { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE },
2504         { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE },
2505         { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE },     /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2506
2507         { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE },
2508         { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE },
2509         { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE },    /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */
2510         { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE },
2511
2512         { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE },
2513         { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE },
2514
2515         { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE },
2516         { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE },
2517
2518         { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE },
2519         { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE },
2520
2521         { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE },
2522         { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE },
2523
2524         { 0x2e9b, UCHAR_RADICAL, TRUE },
2525         { 0x4e00, UCHAR_RADICAL, FALSE },
2526
2527         { 0x012f, UCHAR_SOFT_DOTTED, TRUE },
2528         { 0x0049, UCHAR_SOFT_DOTTED, FALSE },
2529
2530         { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE },
2531         { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE },
2532
2533         { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */
2534
2535         { 0x002e, UCHAR_S_TERM, TRUE },
2536         { 0x0061, UCHAR_S_TERM, FALSE },
2537
2538         { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE },
2539         { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE },
2540         { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE },
2541         { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE },
2542
2543         /* enum/integer type properties */
2544
2545         /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */
2546         /* test default Bidi classes for unassigned code points */
2547         { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2548         { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2549         { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2550         { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */
2551         { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */
2552         { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2553         { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2554         { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2555         { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2556         { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2557         { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2558
2559         { 0x061d, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2560         { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2561         { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2562         { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2563         { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2564         { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2565         { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2566
2567         { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS },
2568         { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU },
2569         { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS },
2570         { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG },
2571         { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU },
2572         { 0x0870, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2573         { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA },
2574         { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS },
2575         { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2576         { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK },
2577         { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B },
2578
2579         /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */
2580         { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 },
2581
2582         { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK },
2583         { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT },
2584         { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2585         { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2586         { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2587         { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL },
2588         { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL },
2589         { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT },
2590         { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE },
2591
2592         { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2593         { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW },
2594         { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2595         { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH },
2596         { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2597         { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH },
2598         { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2599         { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2600         { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2601         { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2602         { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2603         { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2604         { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE },
2605         { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */
2606         { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL },
2607         { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2608         { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS },
2609
2610         /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */
2611         { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 },
2612         { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER },     /* changed in Unicode 5.2 */
2613
2614         { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2615         { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN },
2616         { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH },
2617         { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH },
2618         { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL },
2619
2620         { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING },
2621         { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2622         { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING },
2623         { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING },
2624         { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING },
2625         { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2626         { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2627         { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT },
2628
2629         /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */
2630         { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2631         { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN },
2632         { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION },
2633         { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION },
2634         { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2635         { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2636         { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2637         { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC },
2638         { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2639         { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2640         { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE },
2641         { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION },
2642         { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS },
2643         { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2644         { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC },
2645
2646         /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */
2647
2648         /* UCHAR_SCRIPT tested in cucdapi.c TestUScriptCodeAPI() */
2649
2650         { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2651         { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2652         { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2653         { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2654         { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2655         { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2656         { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },
2657
2658         { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2659         { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2660         { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO },     /* changed in Unicode 5.2 */
2661         { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2662
2663         { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2664         { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2665         { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2666         { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },
2667         { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2668         { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2669
2670         { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2671         { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2672         { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO },       /* changed in Unicode 5.2 */
2673         { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2674
2675         { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2676         { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2677         { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2678         { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },
2679         { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2680         { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2681         { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2682
2683         { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2684         { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2685         { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO },    /* changed in Unicode 5.2 */
2686         { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2687
2688         { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2689         { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2690         { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2691         { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE },
2692
2693         { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2694         { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2695         { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2696         { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2697         { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE },
2698
2699         { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },
2700
2701         { -1, 0x410, 0 }, /* version break for Unicode 4.1 */
2702
2703         { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE },
2704         { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE },
2705         { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE },
2706
2707         { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2708         { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2709         { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE },
2710         { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2711         { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE },
2712
2713         { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION },
2714         { 0x2c8e,  UCHAR_BLOCK, UBLOCK_COPTIC },
2715         { 0xfe17,  UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS },
2716
2717         { 0x1a00,  UCHAR_SCRIPT, USCRIPT_BUGINESE },
2718         { 0x2cea,  UCHAR_SCRIPT, USCRIPT_COPTIC },
2719         { 0xa82b,  UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI },
2720         { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN },
2721
2722         { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 },
2723         { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 },
2724         { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 },
2725         { 0x115f, UCHAR_LINE_BREAK, U_LB_JL },
2726         { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT },
2727         { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV },
2728
2729         { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT },
2730         { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND },
2731         { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL },
2732         { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V },
2733
2734         { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER },
2735         { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER },
2736         { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC },
2737         { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM },
2738
2739         { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER },
2740         { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER },
2741         { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE },
2742         { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP },
2743
2744         { -1, 0x520, 0 }, /* version break for Unicode 5.2 */
2745
2746         /* unassigned code points in new default Bidi R blocks */
2747         { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2748         { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT },
2749
2750         /* test some script codes >127 */
2751         { 0xa6e6,  UCHAR_SCRIPT, USCRIPT_BAMUM },
2752         { 0xa4d0,  UCHAR_SCRIPT, USCRIPT_LISU },
2753         { 0x10a7f,  UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN },
2754
2755         { -1, 0x600, 0 }, /* version break for Unicode 6.0 */
2756
2757         /* value changed in Unicode 6.0 */
2758         { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL },
2759
2760         { -1, 0x610, 0 }, /* version break for Unicode 6.1 */
2761
2762         /* unassigned code points in new/changed default Bidi AL blocks */
2763         { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2764         { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC },
2765
2766         { -1, 0x630, 0 }, /* version break for Unicode 6.3 */
2767
2768         /* unassigned code points in the currency symbols block now default to ET */
2769         { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2770         { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR },
2771
2772         /* new property in Unicode 6.3 */
2773         { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2774         { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2775         { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2776         { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE },
2777         { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN },
2778         { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE },
2779
2780         { -1, 0x700, 0 }, /* version break for Unicode 7.0 */
2781
2782         /* new character range with Joining_Group values */
2783         { 0x10ABF, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2784         { 0x10AC0, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_ALEPH },
2785         { 0x10AC1, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_BETH },
2786         { 0x10AEF, UCHAR_JOINING_GROUP, U_JG_MANICHAEAN_HUNDRED },
2787         { 0x10AF0, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP },
2788
2789         { -1, 0xa00, 0 },  // version break for Unicode 10
2790
2791         { 0x1F1E5, UCHAR_REGIONAL_INDICATOR, FALSE },
2792         { 0x1F1E7, UCHAR_REGIONAL_INDICATOR, TRUE },
2793         { 0x1F1FF, UCHAR_REGIONAL_INDICATOR, TRUE },
2794         { 0x1F200, UCHAR_REGIONAL_INDICATOR, FALSE },
2795
2796         { 0x0600, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2797         { 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, FALSE },
2798         { 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, TRUE },
2799
2800         /* undefined UProperty values */
2801         { 0x61, 0x4a7, 0 },
2802         { 0x234bc, 0x15ed, 0 }
2803     };
2804
2805     UVersionInfo version;
2806     UChar32 c;
2807     int32_t i, result, uVersion;
2808     UProperty which;
2809
2810     /* what is our Unicode version? */
2811     u_getUnicodeVersion(version);
2812     uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers, same packing as the version-break rows in props[] */
2813
2814     u_charAge(0x20, version); /* probe: a major age of 0 for U+0020 is treated as "no additional properties data" */
2815     if(version[0]==0) {
2816         /* no additional properties available */
2817         log_err("TestAdditionalProperties: no additional properties available, not tested\n");
2818         return;
2819     }
2820
2821     /* test u_charAge() */
2822     for(i=0; i<UPRV_LENGTHOF(charAges); ++i) {
2823         u_charAge(charAges[i].c, version);
2824         if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) {
2825             log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n",
2826                 (unsigned long)charAges[i].c, /* cast so that the argument type matches %lx */
2827                 version[0], version[1], version[2], version[3],
2828                 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]);
2829         }
2830     }
2831
2832     if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 ||
2833         u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 ||
2834         u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 ||   /* j2478 */
2835         u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/
2836         u_getIntPropertyMinValue(0x2345)!=0
2837     ) {
2838         log_err("error: u_getIntPropertyMinValue() wrong\n");
2839     }
2840     if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) {
2841         log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n");
2842     }
2843     if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) {
2844         log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n");
2845     }
2846     if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) {
2847         log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n");
2848     }
2849     if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) {
2850         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n");
2851     }
2852     if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) {
2853         log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n");
2854     }
2855     if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) {
2856         log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n");
2857     }
2858     if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) {
2859         log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n");
2860     }
2861     if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) {
2862         log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n");
2863     }
2864     if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) {
2865         log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n");
2866     }
2867     if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) {
2868         log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n");
2869     }
2870     if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) {
2871         log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n");
2872     }
2873     if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) {
2874         log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n");
2875     }
2876     if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) {
2877         log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n");
2878     }
2879     if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) {
2880         log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n");
2881     }
2882     /*JB#2410*/
2883     if( u_getIntPropertyMaxValue(0x2345)!=-1) {
2884         log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n");
2885     }
2886     if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) {
2887         log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n");
2888     }
2889     if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) !=  (int32_t) (U_JG_COUNT -1)) {
2890         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n");
2891     }
2892     if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) {
2893         log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n");
2894     }
2895     if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) {
2896         log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n");
2897     }
2898
2899     /* test u_hasBinaryProperty() and u_getIntPropertyValue() */
2900     for(i=0; i<UPRV_LENGTHOF(props); ++i) {
2901         const char *whichName;
2902
2903         if(props[i][0]<0) {
2904             /* Unicode version break */
2905             if(uVersion<props[i][1]) {
2906                 break; /* do not test properties that are not yet supported */
2907             } else {
2908                 continue; /* skip this row */
2909             }
2910         }
2911
2912         c=(UChar32)props[i][0];
2913         which=(UProperty)props[i][1];
2914         whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME);
2915         if(whichName==NULL) { whichName="(undefined property)"; } /* u_getPropertyName() is documented to return NULL for out-of-range properties (last two rows); never pass NULL to %s */
2916         if(which<UCHAR_INT_START) {
2917             result=u_hasBinaryProperty(c, which);
2918             if(result!=props[i][2]) {
2919                 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n",
2920                         (unsigned long)c, whichName, result, i);
2921             }
2922         }
2923
2924         result=u_getIntPropertyValue(c, which);
2925         if(result!=props[i][2]) {
2926             log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n",
2927                     (unsigned long)c, whichName, result, props[i][2], i);
2928         }
2929
2930         /* test separate functions, too; each message reports that function's own return value, not result */
2931         switch((UProperty)props[i][1]) {
2932         case UCHAR_ALPHABETIC:
2933             if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) {
2934                 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n",
2935                         (unsigned long)props[i][0], u_isUAlphabetic((UChar32)props[i][0]), i);
2936             }
2937             break;
2938         case UCHAR_LOWERCASE:
2939             if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2940                 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n",
2941                         (unsigned long)props[i][0], u_isULowercase((UChar32)props[i][0]), i);
2942             }
2943             break;
2944         case UCHAR_UPPERCASE:
2945             if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) {
2946                 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n",
2947                         (unsigned long)props[i][0], u_isUUppercase((UChar32)props[i][0]), i);
2948             }
2949             break;
2950         case UCHAR_WHITE_SPACE:
2951             if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) {
2952                 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n",
2953                         (unsigned long)props[i][0], u_isUWhiteSpace((UChar32)props[i][0]), i);
2954             }
2955             break;
2956         default:
2957             break;
2958         }
2959     }
2960 }
2961
2962 static void
2963 TestNumericProperties(void) {
2964     /* see UnicodeData.txt, DerivedNumericValues.txt */
2965     static const struct {
2966         UChar32 c;
2967         int32_t type;
2968         double numValue;
2969     } values[]={
2970         { 0x0F33, U_NT_NUMERIC, -1./2. },
2971         { 0x0C66, U_NT_DECIMAL, 0 },
2972         { 0x96f6, U_NT_NUMERIC, 0 },
2973         { 0xa833, U_NT_NUMERIC, 1./16. },
2974         { 0x2152, U_NT_NUMERIC, 1./10. },
2975         { 0x2151, U_NT_NUMERIC, 1./9. },
2976         { 0x1245f, U_NT_NUMERIC, 1./8. },
2977         { 0x2150, U_NT_NUMERIC, 1./7. },
2978         { 0x2159, U_NT_NUMERIC, 1./6. },
2979         { 0x09f6, U_NT_NUMERIC, 3./16. },
2980         { 0x2155, U_NT_NUMERIC, 1./5. },
2981         { 0x00BD, U_NT_NUMERIC, 1./2. },
2982         { 0x0031, U_NT_DECIMAL, 1. },
2983         { 0x4e00, U_NT_NUMERIC, 1. },
2984         { 0x58f1, U_NT_NUMERIC, 1. },
2985         { 0x10320, U_NT_NUMERIC, 1. },
2986         { 0x0F2B, U_NT_NUMERIC, 3./2. },
2987         { 0x00B2, U_NT_DIGIT, 2. },
2988         { 0x5f10, U_NT_NUMERIC, 2. },
2989         { 0x1813, U_NT_DECIMAL, 3. },
2990         { 0x5f0e, U_NT_NUMERIC, 3. },
2991         { 0x2173, U_NT_NUMERIC, 4. },
2992         { 0x8086, U_NT_NUMERIC, 4. },
2993         { 0x278E, U_NT_DIGIT, 5. },
2994         { 0x1D7F2, U_NT_DECIMAL, 6. },
2995         { 0x247A, U_NT_DIGIT, 7. },
2996         { 0x7396, U_NT_NUMERIC, 9. },
2997         { 0x1372, U_NT_NUMERIC, 10. },
2998         { 0x216B, U_NT_NUMERIC, 12. },
2999         { 0x16EE, U_NT_NUMERIC, 17. },
3000         { 0x249A, U_NT_NUMERIC, 19. },
3001         { 0x303A, U_NT_NUMERIC, 30. },
3002         { 0x5345, U_NT_NUMERIC, 30. },
3003         { 0x32B2, U_NT_NUMERIC, 37. },
3004         { 0x1375, U_NT_NUMERIC, 40. },
3005         { 0x10323, U_NT_NUMERIC, 50. },
3006         { 0x0BF1, U_NT_NUMERIC, 100. },
3007         { 0x964c, U_NT_NUMERIC, 100. },
3008         { 0x217E, U_NT_NUMERIC, 500. },
3009         { 0x2180, U_NT_NUMERIC, 1000. },
3010         { 0x4edf, U_NT_NUMERIC, 1000. },
3011         { 0x2181, U_NT_NUMERIC, 5000. },
3012         { 0x137C, U_NT_NUMERIC, 10000. },
3013         { 0x4e07, U_NT_NUMERIC, 10000. },
3014         { 0x12432, U_NT_NUMERIC, 216000. },
3015         { 0x12433, U_NT_NUMERIC, 432000. },
3016         { 0x4ebf, U_NT_NUMERIC, 100000000. },
3017         { 0x5146, U_NT_NUMERIC, 1000000000000. },
3018         { -1, U_NT_NONE, U_NO_NUMERIC_VALUE },
3019         { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE },
3020         { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE },
3021         { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE },
3022         { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE },
3023         { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE },
3024         { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE },
3025         { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE }
3026     };
3027
3028     double nv;
3029     UChar32 c;
3030     int32_t i, type;
3031
3032     for(i=0; i<UPRV_LENGTHOF(values); ++i) {
3033         c=values[i].c;
3034         type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE);
3035         nv=u_getNumericValue(c);
3036
3037         if(type!=values[i].type) {
3038             log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type);
3039         }
3040         if(0.000001 <= fabs(nv - values[i].numValue)) {
3041             log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue);
3042         }
3043     }
3044 }
3045
3046 /**
3047  * Test the property names and property value names API.
3048  */
3049 static void
3050 TestPropertyNames(void) {
3051     int32_t p, v, choice=0, rev;
3052     UBool atLeastSomething = FALSE;
3053
3054     for (p=0; ; ++p) {
3055         UProperty propEnum = (UProperty)p;
3056         UBool sawProp = FALSE;
3057         if(p > 10 && !atLeastSomething) {
3058           log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice);
3059           return;
3060         }
3061
3062         for (choice=0; ; ++choice) {
3063             const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice);
3064             if (name) {
3065                 if (!sawProp)
3066                     log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff);
3067                 log_verbose("%d=\"%s\"", choice, name);
3068                 sawProp = TRUE;
3069                 atLeastSomething = TRUE;
3070
3071                 /* test reverse mapping */
3072                 rev = u_getPropertyEnum(name);
3073                 if (rev != p) {
3074                     log_err("Property round-trip failure: %d -> %s -> %d\n",
3075                             p, name, rev);
3076                 }
3077             }
3078             if (!name && choice>0) break;
3079         }
3080         if (sawProp) {
3081             /* looks like a valid property; check the values */
3082             const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3083             int32_t max = 0;
3084             if (p == UCHAR_CANONICAL_COMBINING_CLASS) {
3085                 max = 255;
3086             } else if (p == UCHAR_GENERAL_CATEGORY_MASK) {
3087                 /* it's far too slow to iterate all the way up to
3088                    the real max, U_GC_P_MASK */
3089                 max = U_GC_NL_MASK;
3090             } else if (p == UCHAR_BLOCK) {
3091                 /* UBlockCodes, unlike other values, start at 1 */
3092                 max = 1;
3093             }
3094             log_verbose("\n");
3095             for (v=-1; ; ++v) {
3096                 UBool sawValue = FALSE;
3097                 for (choice=0; ; ++choice) {
3098                     const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice);
3099                     if (vname) {
3100                         if (!sawValue) log_verbose(" %s, value %d:", pname, v);
3101                         log_verbose("%d=\"%s\"", choice, vname);
3102                         sawValue = TRUE;
3103
3104                         /* test reverse mapping */
3105                         rev = u_getPropertyValueEnum(propEnum, vname);
3106                         if (rev != v) {
3107                             log_err("Value round-trip failure (%s): %d -> %s -> %d\n",
3108                                     pname, v, vname, rev);
3109                         }
3110                     }
3111                     if (!vname && choice>0) break;
3112                 }
3113                 if (sawValue) {
3114                     log_verbose("\n");
3115                 }
3116                 if (!sawValue && v>=max) break;
3117             }
3118         }
3119         if (!sawProp) {
3120             if (p>=UCHAR_STRING_LIMIT) {
3121                 break;
3122             } else if (p>=UCHAR_DOUBLE_LIMIT) {
3123                 p = UCHAR_STRING_START - 1;
3124             } else if (p>=UCHAR_MASK_LIMIT) {
3125                 p = UCHAR_DOUBLE_START - 1;
3126             } else if (p>=UCHAR_INT_LIMIT) {
3127                 p = UCHAR_MASK_START - 1;
3128             } else if (p>=UCHAR_BINARY_LIMIT) {
3129                 p = UCHAR_INT_START - 1;
3130             }
3131         }
3132     }
3133 }
3134
3135 /**
3136  * Test the property values API.  See JB#2410.
3137  */
3138 static void
3139 TestPropertyValues(void) {
3140     int32_t i, p, min, max;
3141     UErrorCode ec;
3142
3143     /* Min should be 0 for everything. */
3144     /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */
3145     for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) {
3146         UProperty propEnum = (UProperty)p;
3147         min = u_getIntPropertyMinValue(propEnum);
3148         if (min != 0) {
3149             if (p == UCHAR_BLOCK) {
3150                 /* This is okay...for now.  See JB#2487.
3151                    TODO Update this for JB#2487. */
3152             } else {
3153                 const char* name;
3154                 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME);
3155                 if (name == NULL)
3156                     name = "<ERROR>";
3157                 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n",
3158                         name, min);
3159             }
3160         }
3161     }
3162
3163     if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 ||
3164         u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) {
3165         log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n");
3166     }
3167
3168     /* Max should be -1 for invalid properties. */
3169     max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE);
3170     if (max != -1) {
3171         log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n",
3172                 max);
3173     }
3174
3175     /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */
3176     for (i=0; i<2; ++i) {
3177         int32_t script;
3178         const char* desc;
3179         ec = U_ZERO_ERROR;
3180         switch (i) {
3181         case 0:
3182             script = uscript_getScript(-1, &ec);
3183             desc = "uscript_getScript(-1)";
3184             break;
3185         case 1:
3186             script = u_getIntPropertyValue(-1, UCHAR_SCRIPT);
3187             desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)";
3188             break;
3189         default:
3190             log_err("Internal test error. Too many scripts\n");
3191             return;
3192         }
3193         /* We don't explicitly test ec.  It should be U_FAILURE but it
3194            isn't documented as such. */
3195         if (script != (int32_t)USCRIPT_INVALID_CODE) {
3196             log_err("FAIL: %s = %d, exp. 0\n",
3197                     desc, script);
3198         }
3199     }
3200 }
3201
3202 /* various tests for consistency of UCD data and API behavior */
3203 static void
3204 TestConsistency() {
3205     char buffer[300];
3206     USet *set1, *set2, *set3, *set4;
3207     UErrorCode errorCode;
3208
3209     UChar32 start, end;
3210     int32_t i, length;
3211
3212     U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10);
3213     U_STRING_DECL(dashPattern, "[:Dash:]", 8);
3214     U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13);
3215     U_STRING_DECL(formatPattern, "[:Cf:]", 6);
3216     U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14);
3217
3218     U_STRING_DECL(mathBlocksPattern,
3219         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3220         214);
3221     U_STRING_DECL(mathPattern, "[:Math:]", 8);
3222     U_STRING_DECL(unassignedPattern, "[:Cn:]", 6);
3223     U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14);
3224     U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3225
3226     U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10);
3227     U_STRING_INIT(dashPattern, "[:Dash:]", 8);
3228     U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13);
3229     U_STRING_INIT(formatPattern, "[:Cf:]", 6);
3230     U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14);
3231
3232     U_STRING_INIT(mathBlocksPattern,
3233         "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]",
3234         214);
3235     U_STRING_INIT(mathPattern, "[:Math:]", 8);
3236     U_STRING_INIT(unassignedPattern, "[:Cn:]", 6);
3237     U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14);
3238     U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20);
3239
3240     /*
3241      * It used to be that UCD.html and its precursors said
3242      * "Those dashes used to mark connections between pieces of words,
3243      *  plus the Katakana middle dot."
3244      *
3245      * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash
3246      * but not from Hyphen.
3247      * UTC 94 (2003mar) decided to leave it that way and to change UCD.html.
3248      * Therefore, do not show errors when testing the Hyphen property.
3249      */
3250     log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n"
3251                 "known to the UTC and not considered errors.\n");
3252
3253     errorCode=U_ZERO_ERROR;
3254     set1=uset_openPattern(hyphenPattern, 10, &errorCode);
3255     set2=uset_openPattern(dashPattern, 8, &errorCode);
3256     if(U_SUCCESS(errorCode)) {
3257         /* remove the Katakana middle dot(s) from set1 */
3258         uset_remove(set1, 0x30fb);
3259         uset_remove(set1, 0xff65); /* halfwidth variant */
3260         showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE);
3261     } else {
3262         log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3263     }
3264
3265     /* check that Cf is neither Hyphen nor Dash nor Alphabetic */
3266     set3=uset_openPattern(formatPattern, 6, &errorCode);
3267     set4=uset_openPattern(alphaPattern, 14, &errorCode);
3268     if(U_SUCCESS(errorCode)) {
3269         showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE);
3270         showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE);
3271         showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE);
3272     } else {
3273         log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3274     }
3275
3276     uset_close(set1);
3277     uset_close(set2);
3278     uset_close(set3);
3279     uset_close(set4);
3280
3281     /*
3282      * Check that each lowercase character has "small" in its name
3283      * and not "capital".
3284      * There are some such characters, some of which seem odd.
3285      * Use the verbose flag to see these notices.
3286      */
3287     errorCode=U_ZERO_ERROR;
3288     set1=uset_openPattern(lowerPattern, 13, &errorCode);
3289     if(U_SUCCESS(errorCode)) {
3290         for(i=0;; ++i) {
3291             length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode);
3292             if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
3293                 break; /* done */
3294             }
3295             if(U_FAILURE(errorCode)) {
3296                 log_err("error iterating over [:Lowercase:] at item %d: %s\n",
3297                         i, u_errorName(errorCode));
3298                 break;
3299             }
3300             if(length!=0) {
3301                 break; /* done with code points, got a string or -1 */
3302             }
3303
3304             while(start<=end) {
3305                 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode);
3306                 if(U_FAILURE(errorCode)) {
3307                     log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode));
3308                     errorCode=U_ZERO_ERROR;
3309                 }
3310                 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) &&
3311                     strstr(buffer, "SMALL CAPITAL")==NULL
3312                 ) {
3313                     log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer);
3314                 }
3315                 ++start;
3316             }
3317         }
3318     } else {
3319         log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3320     }
3321     uset_close(set1);
3322
3323     /* verify that all assigned characters in Math blocks are exactly Math characters */
3324     errorCode=U_ZERO_ERROR;
3325     set1=uset_openPattern(mathBlocksPattern, -1, &errorCode);
3326     set2=uset_openPattern(mathPattern, 8, &errorCode);
3327     set3=uset_openPattern(unassignedPattern, 6, &errorCode);
3328     if(U_SUCCESS(errorCode)) {
3329         uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */
3330         uset_complement(set3);      /* assigned characters */
3331         uset_retainAll(set1, set3); /* [math blocks]&[assigned] */
3332         compareUSets(set1, set2,
3333                      "[assigned Math block chars]", "[math blocks]&[:Math:]",
3334                      TRUE);
3335     } else {
3336         log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode));
3337     }
3338     uset_close(set1);
3339     uset_close(set2);
3340     uset_close(set3);
3341
3342     /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */
3343     errorCode=U_ZERO_ERROR;
3344     set1=uset_openPattern(unknownPattern, 14, &errorCode);
3345     set2=uset_openPattern(reservedPattern, 20, &errorCode);
3346     if(U_SUCCESS(errorCode)) {
3347         compareUSets(set1, set2,
3348                      "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]",
3349                      TRUE);
3350     } else {
3351         log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode));
3352     }
3353     uset_close(set1);
3354     uset_close(set2);
3355 }
3356
3357 /* test case folding, compare return values with CaseFolding.txt ------------ */
3358
/* bit flags recording which kinds of case folding have already been tested for a character */
enum {
    CF_SIMPLE=1<<0,
    CF_FULL=1<<1,
    CF_TURKIC=1<<2,
    /* union of all of the flags above */
    CF_ALL=CF_SIMPLE|CF_FULL|CF_TURKIC
};
3366
3367 static void
3368 testFold(UChar32 c, int which,
3369          UChar32 simple, UChar32 turkic,
3370          const UChar *full, int32_t fullLength,
3371          const UChar *turkicFull, int32_t turkicFullLength) {
3372     UChar s[2], t[32];
3373     UChar32 c2;
3374     int32_t length, length2;
3375
3376     UErrorCode errorCode=U_ZERO_ERROR;
3377
3378     length=0;
3379     U16_APPEND_UNSAFE(s, length, c);
3380
3381     if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) {
3382         log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3383     }
3384     if((which&CF_FULL)!=0) {
3385         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, 0, &errorCode);
3386         if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) {
3387             log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c);
3388         }
3389     }
3390     if((which&CF_TURKIC)!=0) {
3391         if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) {
3392             log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple);
3393         }
3394
3395         length2=u_strFoldCase(t, UPRV_LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
3396         if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) {
3397             log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c);
3398         }
3399     }
3400 }
3401
3402 /* test that c case-folds to itself */
3403 static void
3404 testFoldToSelf(UChar32 c, int which) {
3405     UChar s[2];
3406     int32_t length;
3407
3408     length=0;
3409     U16_APPEND_UNSAFE(s, length, c);
3410     testFold(c, which, c, c, s, length, s, length);
3411 }
3412
3413 struct CaseFoldingData {
3414     USet *notSeen;
3415     UChar32 prev, prevSimple;
3416     UChar prevFull[32];
3417     int32_t prevFullLength;
3418     int which;
3419 };
3420 typedef struct CaseFoldingData CaseFoldingData;
3421
3422 static void U_CALLCONV
3423 caseFoldingLineFn(void *context,
3424                   char *fields[][2], int32_t fieldCount,
3425                   UErrorCode *pErrorCode) {
3426     (void)fieldCount; // suppress compiler warnings about unused variable
3427
3428     CaseFoldingData *pData=(CaseFoldingData *)context;
3429     char *end;
3430     UChar full[32];
3431     UChar32 c, prev, simple;
3432     int32_t count;
3433     int which;
3434     char status;
3435
3436     /* get code point */
3437     const char *s=u_skipWhitespace(fields[0][0]);
3438     if(0==strncmp(s, "0000..10FFFF", 12)) {
3439         /*
3440          * Ignore the line
3441          * # @missing: 0000..10FFFF; C; <code point>
3442          * because maps-to-self is already our default, and this line breaks this parser.
3443          */
3444         return;
3445     }
3446     c=(UChar32)strtoul(s, &end, 16);
3447     end=(char *)u_skipWhitespace(end);
3448     if(end<=fields[0][0] || end!=fields[0][1]) {
3449         log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
3450         *pErrorCode=U_PARSE_ERROR;
3451         return;
3452     }
3453
3454     /* get the status of this mapping */
3455     status=*u_skipWhitespace(fields[1][0]);
3456     if(status!='C' && status!='S' && status!='F' && status!='T') {
3457         log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
3458         *pErrorCode=U_PARSE_ERROR;
3459         return;
3460     }
3461
3462     /* get the mapping */
3463     count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode);
3464     if(U_FAILURE(*pErrorCode)) {
3465         log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
3466         return;
3467     }
3468
3469     /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
3470     if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) {
3471         simple=c;
3472     }
3473
3474     if(c!=(prev=pData->prev)) {
3475         /*
3476          * Test remaining mappings for the previous code point.
3477          * If a turkic folding was not mentioned, then it should fold the same
3478          * as the regular simple case folding.
3479          */
3480         UChar prevString[2];
3481         int32_t length;
3482
3483         length=0;
3484         U16_APPEND_UNSAFE(prevString, length, prev);
3485         testFold(prev, (~pData->which)&CF_ALL,
3486                  prev, pData->prevSimple,
3487                  prevString, length,
3488                  pData->prevFull, pData->prevFullLength);
3489         pData->prev=pData->prevSimple=c;
3490         length=0;
3491         U16_APPEND_UNSAFE(pData->prevFull, length, c);
3492         pData->prevFullLength=length;
3493         pData->which=0;
3494     }
3495
3496     /*
3497      * Turn the status into a bit set of case foldings to test.
3498      * Remember non-Turkic case foldings as defaults for Turkic mode.
3499      */
3500     switch(status) {
3501     case 'C':
3502         which=CF_SIMPLE|CF_FULL;
3503         pData->prevSimple=simple;
3504         u_memcpy(pData->prevFull, full, count);
3505         pData->prevFullLength=count;
3506         break;
3507     case 'S':
3508         which=CF_SIMPLE;
3509         pData->prevSimple=simple;
3510         break;
3511     case 'F':
3512         which=CF_FULL;
3513         u_memcpy(pData->prevFull, full, count);
3514         pData->prevFullLength=count;
3515         break;
3516     case 'T':
3517         which=CF_TURKIC;
3518         break;
3519     default:
3520         which=0;
3521         break; /* won't happen because of test above */
3522     }
3523
3524     testFold(c, which, simple, simple, full, count, full, count);
3525
3526     /* remember which case foldings of c have been tested */
3527     pData->which|=which;
3528
3529     /* remove c from the set of ones not mentioned in CaseFolding.txt */
3530     uset_remove(pData->notSeen, c);
3531 }
3532
3533 static void
3534 TestCaseFolding() {
3535     CaseFoldingData data={ NULL, 0, 0, {0}, 0, 0 };
3536     char *fields[3][2];
3537     UErrorCode errorCode;
3538
3539     static char *lastLine= (char *)"10FFFF; C; 10FFFF;";
3540
3541     errorCode=U_ZERO_ERROR;
3542     /* test BMP & plane 1 - nothing interesting above */
3543     data.notSeen=uset_open(0, 0x1ffff);
3544     data.prevFullLength=1; /* length of full case folding of U+0000 */
3545
3546     parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode);
3547     if(U_SUCCESS(errorCode)) {
3548         int32_t i, start, end;
3549
3550         /* add a pseudo-last line to finish testing of the actual last one */
3551         fields[0][0]=lastLine;
3552         fields[0][1]=lastLine+6;
3553         fields[1][0]=lastLine+7;
3554         fields[1][1]=lastLine+9;
3555         fields[2][0]=lastLine+10;
3556         fields[2][1]=lastLine+17;
3557         caseFoldingLineFn(&data, fields, 3, &errorCode);
3558
3559         /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */
3560         for(i=0;
3561             0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) &&
3562                 U_SUCCESS(errorCode);
3563             ++i
3564         ) {
3565             do {
3566                 testFoldToSelf(start, CF_ALL);
3567             } while(++start<=end);
3568         }
3569     }
3570
3571     uset_close(data.notSeen);
3572 }
3573
3574 static void TestBinaryCharacterPropertiesAPI() {
3575     // API test only. See intltest/ucdtest.cpp for functional test.
3576     UErrorCode errorCode = U_ZERO_ERROR;
3577     const USet *set = u_getBinaryPropertySet(-1, &errorCode);
3578     if (U_SUCCESS(errorCode)) {
3579         log_err("u_getBinaryPropertySet(-1) did not fail\n");
3580     }
3581     errorCode = U_ZERO_ERROR;
3582     set = u_getBinaryPropertySet(UCHAR_BINARY_LIMIT, &errorCode);
3583     if (U_SUCCESS(errorCode)) {
3584         log_err("u_getBinaryPropertySet(UCHAR_BINARY_LIMIT) did not fail\n");
3585     }
3586     errorCode = U_ZERO_ERROR;
3587     set = u_getBinaryPropertySet(UCHAR_WHITE_SPACE, &errorCode);
3588     if (!uset_contains(set, 0x20) || uset_contains(set, 0x61)) {
3589         log_err("u_getBinaryPropertySet(UCHAR_WHITE_SPACE) wrong contents\n");
3590     }
3591 }
3592
3593 static void TestIntCharacterPropertiesAPI() {
3594     // API test only. See intltest/ucdtest.cpp for functional test.
3595     UErrorCode errorCode = U_ZERO_ERROR;
3596     const UCPMap *map = u_getIntPropertyMap(UCHAR_INT_START - 1, &errorCode);
3597     if (U_SUCCESS(errorCode)) {
3598         log_err("u_getIntPropertyMap(UCHAR_INT_START - 1) did not fail\n");
3599     }
3600     errorCode = U_ZERO_ERROR;
3601     map = u_getIntPropertyMap(UCHAR_INT_LIMIT, &errorCode);
3602     if (U_SUCCESS(errorCode)) {
3603         log_err("u_getIntPropertyMap(UCHAR_INT_LIMIT) did not fail\n");
3604     }
3605     errorCode = U_ZERO_ERROR;
3606     map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &errorCode);
3607     if (ucpmap_get(map, 0x20) != U_SPACE_SEPARATOR || ucpmap_get(map, 0x23456) != U_OTHER_LETTER) {
3608         log_err("u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY) wrong contents\n");
3609     }
3610 }