icuSources/test/intltest/canittst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2006, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************
   6  *
   7  * @author Mark E. Davis
   8  * @author Vladimir Weinstein
   9  */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_NORMALIZATION
  14
  15 #include "intltest.h"
  16 #include "cstring.h"
  17 #include "canittst.h"
  18 #include "unicode/caniter.h"
  19 #include "unicode/normlzr.h"
  20 #include "unicode/uchar.h"
  21 #include "hash.h"
  22
  23 #define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*array)))
  24
  25 #define CASE(id,test) case id:                          \
  26                           name = #test;                 \
  27                           if (exec) {                   \
  28                               logln(#test "---");       \
  29                               logln((UnicodeString)""); \
  30                               test();                   \
  31                           }                             \
  32                           break
  33
  34 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
  35                                          const char* &name, char* /*par*/) {
  36     switch (index) {
  37         CASE(0, TestBasic);
  38         CASE(1, TestExhaustive);
  39         CASE(2, TestAPI);
  40       default: name = ""; break;
  41     }
  42 }
  43
  44 /**
  45  * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
  46 static UnicodeString str(const char *input)
  47 {
  48     UnicodeString str(input, ""); // Invariant conversion
  49     return str.unescape();
  50 }
  51  */
  52
  53
  54 CanonicalIteratorTest::CanonicalIteratorTest() :
  55 nameTrans(NULL), hexTrans(NULL)
  56 {
  57 }
  58
  59 CanonicalIteratorTest::~CanonicalIteratorTest()
  60 {
  61 #if !UCONFIG_NO_TRANSLITERATION
  62   if(nameTrans != NULL) {
  63     delete(nameTrans);
  64   }
  65   if(hexTrans != NULL) {
  66     delete(hexTrans);
  67   }
  68 #endif
  69 }
  70
  71 void CanonicalIteratorTest::TestExhaustive() {
  72     UErrorCode status = U_ZERO_ERROR;
  73     CanonicalIterator it("", status);
  74     UChar32 i = 0;
  75     UnicodeString s;
  76     // Test static and dynamic class IDs
  77     if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
  78         errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
  79     }
  80     for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
  81         //for (i = 0xae00; i < 0xaf00; ++i) {
  82
  83         if ((i % 0x100) == 0) {
  84             logln("Testing U+%06X", i);
  85         }
  86
  87         // skip characters we know don't have decomps
  88         int8_t type = u_charType(i);
  89         if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
  90             || type == U_SURROGATE) continue;
  91
  92         s = i;
  93         characterTest(s, i, it);
  94
  95         s += (UChar32)0x0345; //"\\u0345";
  96         characterTest(s, i, it);
  97     }
  98 }
  99
 100 void CanonicalIteratorTest::TestBasic() {
 101
 102     UErrorCode status = U_ZERO_ERROR;
 103
 104     static const char * const testArray[][2] = {
 105         {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
 106             "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
 107             "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
 108             "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
 109         {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
 110         {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
 111     };
 112
 113 #if 0
 114     // This is not interesting for C/C++ as the data is already built beforehand
 115     // check build
 116     UnicodeSet ss = CanonicalIterator.getSafeStart();
 117     logln("Safe Start: " + ss.toPattern(true));
 118     ss = CanonicalIterator.getStarts('a');
 119     expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
 120         new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
 121         + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
 122             );
 123 #endif
 124
 125     // check permute
 126     // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
 127
 128     Hashtable *permutations = new Hashtable(FALSE, status);
 129     permutations->setValueDeleter(uhash_deleteUnicodeString);
 130     UnicodeString toPermute("ABC");
 131
 132     CanonicalIterator::permute(toPermute, FALSE, permutations, status);
 133
 134     logln("testing permutation");
 135
 136     expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
 137
 138     delete permutations;
 139
 140     // try samples
 141     logln("testing samples");
 142     Hashtable *set = new Hashtable(FALSE, status);
 143     set->setValueDeleter(uhash_deleteUnicodeString);
 144     int32_t i = 0;
 145     CanonicalIterator it("", status);
 146     if(U_SUCCESS(status)) {
 147       for (i = 0; i < ARRAY_LENGTH(testArray); ++i) {
 148           //logln("Results for: " + name.transliterate(testArray[i]));
 149           UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
 150           it.setSource(testStr, status);
 151           set->removeAll();
 152           for (;;) {
 153               //UnicodeString *result = new UnicodeString(it.next());
 154               UnicodeString result(it.next());
 155               if (result.isBogus()) {
 156                   break;
 157               }
 158               set->put(result, new UnicodeString(result), status); // Add result to the table
 159               //logln(++counter + ": " + hex.transliterate(result));
 160               //logln(" = " + name.transliterate(result));
 161           }
 162           expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
 163
 164       }
 165     } else {
 166       errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
 167     }
 168     delete set;
 169 }
 170
 171 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
 172 {
 173     UErrorCode status = U_ZERO_ERROR;
 174     UnicodeString decomp, comp;
 175     UBool gotDecomp = FALSE;
 176     UBool gotComp = FALSE;
 177     UBool gotSource = FALSE;
 178
 179     Normalizer::decompose(s, FALSE, 0, decomp, status);
 180     Normalizer::compose(s, FALSE, 0, comp, status);
 181
 182     // skip characters that don't have either decomp.
 183     // need quick test for this!
 184     if (s == decomp && s == comp) {
 185         return;
 186     }
 187
 188     it.setSource(s, status);
 189
 190     for (;;) {
 191         UnicodeString item = it.next();
 192         if (item.isBogus()) break;
 193         if (item == s) gotSource = TRUE;
 194         if (item == decomp) gotDecomp = TRUE;
 195         if (item == comp) gotComp = TRUE;
 196     }
 197
 198     if (!gotSource || !gotDecomp || !gotComp) {
 199         errln("FAIL CanonicalIterator: " + s + (int)ch);
 200     }
 201 }
 202
 203 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
 204     if (!(a==b)) {
 205         errln("FAIL: " + message + getReadable(item));
 206         errln("\t" + getReadable(a));
 207         errln("\t" + getReadable(b));
 208     } else {
 209         logln("Checked: " + message + getReadable(item));
 210         logln("\t" + getReadable(a));
 211         logln("\t" + getReadable(b));
 212     }
 213 }
 214
 215 UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
 216   UErrorCode status = U_ZERO_ERROR;
 217   UnicodeString result = "[";
 218     if (s.length() == 0) return "";
 219     // set up for readable display
 220 #if !UCONFIG_NO_TRANSLITERATION
 221     if(verbose) {
 222       if (nameTrans == NULL)
 223           nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status);
 224       UnicodeString sName = s;
 225       nameTrans->transliterate(sName);
 226       result += sName;
 227       result += ";";
 228     }
 229     if (hexTrans == NULL)
 230         hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status);
 231 #endif
 232     UnicodeString sHex = s;
 233 #if !UCONFIG_NO_TRANSLITERATION
 234     if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
 235       hexTrans->transliterate(sHex);
 236     }
 237 #endif
 238     result += sHex;
 239     result += "]";
 240     return result;
 241     //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
 242 }
 243
 244 U_CFUNC int U_CALLCONV
 245 compareUnicodeStrings(const void *s1, const void *s2) {
 246   UnicodeString **st1 = (UnicodeString **)s1;
 247   UnicodeString **st2 = (UnicodeString **)s2;
 248
 249   return (*st1)->compare(**st2);
 250 }
 251
 252
 253 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
 254     UnicodeString result;
 255
 256     // Iterate over the Hashtable, then qsort.
 257
 258     UnicodeString **resArray = new UnicodeString*[col->count()];
 259     int32_t i = 0;
 260
 261     const UHashElement *ne = NULL;
 262     int32_t el = -1;
 263     //Iterator it = basic.iterator();
 264     ne = col->nextElement(el);
 265     //while (it.hasNext())
 266     while (ne != NULL) {
 267       //String item = (String) it.next();
 268       UnicodeString *item = (UnicodeString *)(ne->value.pointer);
 269       resArray[i++] = item;
 270       ne = col->nextElement(el);
 271     }
 272
 273     for(i = 0; i<col->count(); ++i) {
 274       logln(*resArray[i]);
 275     }
 276
 277     qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
 278
 279     result = *resArray[0];
 280
 281     for(i = 1; i<col->count(); ++i) {
 282       result += ", ";
 283       result += *resArray[i];
 284     }
 285
 286 /*
 287     Iterator it = col.iterator();
 288     while (it.hasNext()) {
 289         if (result.length() != 0) result.append(", ");
 290         result.append(it.next().toString());
 291     }
 292 */
 293
 294     delete [] resArray;
 295
 296     return result;
 297 }
 298
 299 void CanonicalIteratorTest::TestAPI() {
 300   UErrorCode status = U_ZERO_ERROR;
 301   // Test reset and getSource
 302   UnicodeString start("ljubav");
 303   logln("Testing CanonicalIterator::getSource");
 304   logln("Instantiating canonical iterator with string "+start);
 305   CanonicalIterator can(start, status);
 306   UnicodeString source = can.getSource();
 307   logln("CanonicalIterator::getSource returned "+source);
 308   if(start != source) {
 309     errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
 310   }
 311   logln("Testing CanonicalIterator::reset");
 312   UnicodeString next = can.next();
 313   logln("CanonicalIterator::next returned "+next);
 314
 315   can.reset();
 316
 317   UnicodeString afterReset = can.next();
 318   logln("After reset, CanonicalIterator::next returned "+afterReset);
 319
 320   if(next != afterReset) {
 321     errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
 322   }
 323
 324   logln("Testing getStaticClassID and getDynamicClassID");
 325   if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
 326       errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
 327   }
 328 }
 329
 330 #endif /* #if !UCONFIG_NO_NORMALIZATION */