icuSources/test/intltest/ucdtest.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1997-2010, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 #include "unicode/ustring.h"
   8 #include "unicode/uchar.h"
   9 #include "unicode/uniset.h"
  10 #include "unicode/putil.h"
  11 #include "cstring.h"
  12 #include "hash.h"
  13 #include "normalizer2impl.h"
  14 #include "uparse.h"
  15 #include "ucdtest.h"
  16
  17 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0]))
  18
  19 static const char *ignorePropNames[]={
  20     "FC_NFKC",
  21     "NFD_QC",
  22     "NFC_QC",
  23     "NFKD_QC",
  24     "NFKC_QC",
  25     "Expands_On_NFD",
  26     "Expands_On_NFC",
  27     "Expands_On_NFKD",
  28     "Expands_On_NFKC",
  29     "NFKC_CF"
  30 };
  31
  32 UnicodeTest::UnicodeTest()
  33 {
  34     UErrorCode errorCode=U_ZERO_ERROR;
  35     unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode);
  36     if(U_FAILURE(errorCode)) {
  37         delete unknownPropertyNames;
  38         unknownPropertyNames=NULL;
  39     }
  40     // Ignore some property names altogether.
  41     for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
  42         unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
  43     }
  44 }
  45
     // Frees the table of ignored/already-reported property names.
     // The constructor leaves the pointer NULL when it cannot create the table;
     // deleting a NULL pointer is a no-op.
  46 UnicodeTest::~UnicodeTest()
  47 {
  48     delete unknownPropertyNames;
  49 }
  50
  51 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  52 {
  53     if (exec) logln("TestSuite UnicodeTest: ");
  54     switch (index) {
  55         case 0: name = "TestAdditionalProperties"; if(exec) TestAdditionalProperties(); break;
  56         case 1: name = "TestBinaryValues"; if(exec) TestBinaryValues(); break;
  57         case 2: name = "TestConsistency"; if(exec) TestConsistency(); break;
  58         default: name = ""; break; //needed to end loop
  59     }
  60 }
  61
  62 //====================================================
  63 // private data used by the tests
  64 //====================================================
  65
  66 // test DerivedCoreProperties.txt -------------------------------------------
  67
  68 // copied from genprops.c
  69 static int32_t
  70 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
  71     const char *t, *z;
  72     int32_t i, j;
  73
  74     s=u_skipWhitespace(s);
  75     for(i=0; i<countTokens; ++i) {
  76         t=tokens[i];
  77         if(t!=NULL) {
  78             for(j=0;; ++j) {
  79                 if(t[j]!=0) {
  80                     if(s[j]!=t[j]) {
  81                         break;
  82                     }
  83                 } else {
  84                     z=u_skipWhitespace(s+j);
  85                     if(*z==';' || *z==0) {
  86                         return i;
  87                     } else {
  88                         break;
  89                     }
  90                 }
  91             }
  92         }
  93     }
  94     return -1;
  95 }
  96
  97 static const char *const
  98 derivedPropsNames[]={
  99     "Math",
 100     "Alphabetic",
 101     "Lowercase",
 102     "Uppercase",
 103     "ID_Start",
 104     "ID_Continue",
 105     "XID_Start",
 106     "XID_Continue",
 107     "Default_Ignorable_Code_Point",
 108     "Full_Composition_Exclusion",
 109     "Grapheme_Extend",
 110     "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
 111     "Grapheme_Base",
 112     "Cased",
 113     "Case_Ignorable",
 114     "Changes_When_Lowercased",
 115     "Changes_When_Uppercased",
 116     "Changes_When_Titlecased",
 117     "Changes_When_Casefolded",
 118     "Changes_When_Casemapped",
 119     "Changes_When_NFKC_Casefolded"
 120 };
 121
 122 static const UProperty
 123 derivedPropsIndex[]={
 124     UCHAR_MATH,
 125     UCHAR_ALPHABETIC,
 126     UCHAR_LOWERCASE,
 127     UCHAR_UPPERCASE,
 128     UCHAR_ID_START,
 129     UCHAR_ID_CONTINUE,
 130     UCHAR_XID_START,
 131     UCHAR_XID_CONTINUE,
 132     UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
 133     UCHAR_FULL_COMPOSITION_EXCLUSION,
 134     UCHAR_GRAPHEME_EXTEND,
 135     UCHAR_GRAPHEME_LINK,
 136     UCHAR_GRAPHEME_BASE,
 137     UCHAR_CASED,
 138     UCHAR_CASE_IGNORABLE,
 139     UCHAR_CHANGES_WHEN_LOWERCASED,
 140     UCHAR_CHANGES_WHEN_UPPERCASED,
 141     UCHAR_CHANGES_WHEN_TITLECASED,
 142     UCHAR_CHANGES_WHEN_CASEFOLDED,
 143     UCHAR_CHANGES_WHEN_CASEMAPPED,
 144     UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
 145 };
 146
     // Per-property error counters, one per entry of derivedPropsIndex[].
     // TestAdditionalProperties() increments them for each wrong value and stops
     // checking a property once its counter reaches MAX_ERRORS. The counters have
     // file scope, so they are not reset between the TRUE and the FALSE pass.
 147 static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
 148
     // Maximum number of errors that are reported for one property.
 149 enum { MAX_ERRORS=50 };
 150
 151 U_CFUNC void U_CALLCONV
 152 derivedPropsLineFn(void *context,
 153                    char *fields[][2], int32_t /* fieldCount */,
 154                    UErrorCode *pErrorCode)
 155 {
 156     UnicodeTest *me=(UnicodeTest *)context;
 157     uint32_t start, end;
 158     int32_t i;
 159
 160     u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
 161     if(U_FAILURE(*pErrorCode)) {
 162         me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
 163         return;
 164     }
 165
 166     /* parse derived binary property name, ignore unknown names */
 167     i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
 168     if(i<0) {
 169         UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
 170         propName.trim();
 171         if(me->unknownPropertyNames->find(propName)==NULL) {
 172             UErrorCode errorCode=U_ZERO_ERROR;
 173             me->unknownPropertyNames->puti(propName, 1, errorCode);
 174             me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
 175         }
 176         return;
 177     }
 178
 179     me->derivedProps[i].add(start, end);
 180 }
 181
 182 void UnicodeTest::TestAdditionalProperties() {
 183 #if !UCONFIG_NO_NORMALIZATION
 184     // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
 185     if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
 186         errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
 187               LENGTHOF(derivedPropsNames));
 188         return;
 189     }
 190     if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
 191         errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
 192         return;
 193     }
 194
 195     char newPath[256];
 196     char backupPath[256];
 197     char *fields[2][2];
 198     UErrorCode errorCode=U_ZERO_ERROR;
 199
 200     /* Look inside ICU_DATA first */
 201     strcpy(newPath, pathToDataDirectory());
 202     strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
 203
 204     // As a fallback, try to guess where the source data was located
 205     // at the time ICU was built, and look there.
 206 #   ifdef U_TOPSRCDIR
 207         strcpy(backupPath, U_TOPSRCDIR  U_FILE_SEP_STRING "data");
 208 #   else
 209         strcpy(backupPath, loadTestData(errorCode));
 210         strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data");
 211 #   endif
 212     strcat(backupPath, U_FILE_SEP_STRING);
 213     strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
 214
 215     char *path=newPath;
 216     u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
 217
 218     if(errorCode==U_FILE_ACCESS_ERROR) {
 219         errorCode=U_ZERO_ERROR;
 220         path=backupPath;
 221         u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
 222     }
 223     if(U_FAILURE(errorCode)) {
 224         errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
 225         return;
 226     }
 227     char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
 228     strcpy(basename, "DerivedNormalizationProps.txt");
 229     u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
 230     if(U_FAILURE(errorCode)) {
 231         errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
 232         return;
 233     }
 234
 235     // now we have all derived core properties in the UnicodeSets
 236     // run them all through the API
 237     int32_t rangeCount, range;
 238     uint32_t i;
 239     UChar32 start, end;
 240
 241     // test all TRUE properties
 242     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
 243         rangeCount=derivedProps[i].getRangeCount();
 244         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
 245             start=derivedProps[i].getRangeStart(range);
 246             end=derivedProps[i].getRangeEnd(range);
 247             for(; start<=end; ++start) {
 248                 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
 249                     dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]);
 250                     if(++numErrors[i]>=MAX_ERRORS) {
 251                       dataerrln("Too many errors, moving to the next test");
 252                       break;
 253                     }
 254                 }
 255             }
 256         }
 257     }
 258
 259     // invert all properties
 260     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
 261         derivedProps[i].complement();
 262     }
 263
 264     // test all FALSE properties
 265     for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
 266         rangeCount=derivedProps[i].getRangeCount();
 267         for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
 268             start=derivedProps[i].getRangeStart(range);
 269             end=derivedProps[i].getRangeEnd(range);
 270             for(; start<=end; ++start) {
 271                 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
 272                     errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
 273                     if(++numErrors[i]>=MAX_ERRORS) {
 274                       errln("Too many errors, moving to the next test");
 275                       break;
 276                     }
 277                 }
 278             }
 279         }
 280     }
 281 #endif /* !UCONFIG_NO_NORMALIZATION */
 282 }
 283
 284 void UnicodeTest::TestBinaryValues() {
 285     /*
 286      * Unicode 5.1 explicitly defines binary property value aliases.
 287      * Verify that they are all recognized.
 288      */
 289     UErrorCode errorCode=U_ZERO_ERROR;
 290     UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode);
 291     if(U_FAILURE(errorCode)) {
 292         dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode));
 293         return;
 294     }
 295
 296     static const char *const falseValues[]={ "N", "No", "F", "False" };
 297     static const char *const trueValues[]={ "Y", "Yes", "T", "True" };
 298     int32_t i;
 299     for(i=0; i<LENGTHOF(falseValues); ++i) {
 300         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
 301         pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV));
 302         errorCode=U_ZERO_ERROR;
 303         UnicodeSet set(pattern, errorCode);
 304         if(U_FAILURE(errorCode)) {
 305             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode));
 306             continue;
 307         }
 308         set.complement();
 309         if(set!=alpha) {
 310             errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]);
 311         }
 312     }
 313     for(i=0; i<LENGTHOF(trueValues); ++i) {
 314         UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]");
 315         pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV));
 316         errorCode=U_ZERO_ERROR;
 317         UnicodeSet set(pattern, errorCode);
 318         if(U_FAILURE(errorCode)) {
 319             errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode));
 320             continue;
 321         }
 322         if(set!=alpha) {
 323             errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]);
 324         }
 325     }
 326 }
 327
 328 void UnicodeTest::TestConsistency() {
 329 #if !UCONFIG_NO_NORMALIZATION
 330     /*
 331      * Test for an example that getCanonStartSet() delivers
 332      * all characters that compose from the input one,
 333      * even in multiple steps.
 334      * For example, the set for "I" (0049) should contain both
 335      * I-diaeresis (00CF) and I-diaeresis-acute (1E2E).
 336      * In general, the set for the middle such character should be a subset
 337      * of the set for the first.
 338      */
 339     IcuTestErrorCode errorCode(*this, "TestConsistency");
 340     const Normalizer2 *nfd=Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
 341     const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
 342     if(errorCode.isFailure()) {
 343         dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n",
 344                   errorCode.errorName());
 345         errorCode.reset();
 346         return;
 347     }
 348
 349     UnicodeSet set1, set2;
 350     if (nfcImpl->getCanonStartSet(0x49, set1)) {
 351         /* enumerate all characters that are plausible to be latin letters */
 352         for(UChar start=0xa0; start<0x2000; ++start) {
 353             UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode);
 354             if(decomp.length()>1 && decomp[0]==0x49) {
 355                 set2.add(start);
 356             }
 357         }
 358
 359         if (set1!=set2) {
 360             errln("[canon start set of 0049] != [all c with canon decomp with 0049]");
 361         }
 362         // This was available in cucdtst.c but the test had to move to intltest
 363         // because the new internal normalization functions are in C++.
 364         //compareUSets(set1, set2,
 365         //             "[canon start set of 0049]", "[all c with canon decomp with 0049]",
 366         //             TRUE);
 367     } else {
 368         errln("NFC.getCanonStartSet() returned FALSE");
 369     }
 370 #endif
 371 }