ICU-461.18.tar.gz

[apple/icu.git] / icuSources / test / intltest / thcoll.cpp
diff --git a/icuSources/test/intltest/thcoll.cpp b/icuSources/test/intltest/thcoll.cpp

index 60cdc1dc664857db11b1578c6042042a2b18553b..1199b993b544196a28adef2c1f5333c3001faa56 100644 (file)
--- a/icuSources/test/intltest/thcoll.cpp
+++ b/icuSources/test/intltest/thcoll.cpp
@@ -1,6 +1,6 @@
  /*
  **********************************************************************
-*   Copyright (C) 1999-2003, International Business Machines
+*   Copyright (C) 1999-2009, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *   Date        Name        Description
@@ -16,37 +16,28 @@
  #include "unicode/utypes.h"
  #include "unicode/coll.h"
  #include "unicode/sortkey.h"
+#include "unicode/ustring.h"
  #include "cstring.h"
  #include "filestrm.h"
+#include "textfile.h"
  
  /**
   * The TestDictionary test expects a file of this name, with this
   * encoding, to be present in the directory $ICU/source/test/testdata.
   */
-#define TEST_FILE           "th18057.txt"
-#define TEST_FILE_ENCODING  "UTF8"
+//#define TEST_FILE           "th18057.txt"
  
  /**
   * This is the most failures we show in TestDictionary.  If this number
   * is < 0, we show all failures.
   */
-#define MAX_FAILURES_TO_SHOW 8
-
-#define CASE(id,test)                 \
-    case id:                          \
-        name = #test;                 \
-        if (exec) {                   \
-            logln(#test "---");       \
-            logln((UnicodeString)""); \
-            test();                   \
-        }                             \
-        break;
+#define MAX_FAILURES_TO_SHOW -1
  
  CollationThaiTest::CollationThaiTest() {
      UErrorCode status = U_ZERO_ERROR;
      coll = Collator::createInstance(Locale("th", "TH", ""), status);
      if (coll && U_SUCCESS(status)) {
-        coll->setStrength(Collator::TERTIARY);
+        //coll->setStrength(Collator::TERTIARY);
      } else {
          delete coll;
          coll = 0;
@@ -61,45 +52,21 @@ void CollationThaiTest::runIndexedTest(int32_t index, UBool exec, const char* &n
                                         char* /*par*/) {
  
      if((!coll) && exec) {
-      errln(__FILE__ " cannot test - failed to create collator.");
-      name = "";
+      dataerrln(__FILE__ " cannot test - failed to create collator.");
+      name = "some test";
        return;
      }
  
      switch (index) {
-        CASE(0,TestDictionary)
-        CASE(1,TestCornerCases)
-        CASE(2,TestNamesList)
+        TESTCASE(0,TestDictionary);
+        TESTCASE(1,TestCornerCases);
+        TESTCASE(2,TestNamesList);
+        TESTCASE(3,TestInvalidThai);
+        TESTCASE(4,TestReordering);
          default: name = ""; break;
      }
  }
  
-/**
- * Read a line terminated by a single ^J or ^M, and convert it from
- * the TEST_FILE_ENCODING to Unicode.  ASSUMES FILE LINES ARE 127
- * characters long or less.  This is true for th18057.txt, which
- * has 80-char or shorter lines.  DOES NOT HANDLE ^M^J sequence.
- */
-static UBool readLine(FileStream *in, UnicodeString& line, const char* encoding) {
-    if (T_FileStream_eof(in)) {
-        return FALSE;
-    }
-    char buffer[128];
-    char* p = buffer;
-    char* limit = p + sizeof(buffer) - 1; // Leave space for 0
-    while (p<limit) {
-        int c = T_FileStream_getc(in);
-        if (c < 0 || c == 0xD || c == 0xA) {
-            break;
-        }
-        *p++ = c;
-    }
-    *p = 0;
-    line = UnicodeString(buffer, encoding);
-    return TRUE;
-}
-
-
  /**
   * Read the external names list, and confirms that the collator 
   * gets the same results when comparing lines one to another
@@ -111,28 +78,12 @@ void CollationThaiTest::TestNamesList(void) {
          return;
      }
   
-    // Read in a dictionary of Thai words
-    UErrorCode status = U_ZERO_ERROR;
-    char buffer[1024];
-    uprv_strcpy(buffer,IntlTest::loadTestData(status) );
-    char* index = 0;
-   
-    if (U_FAILURE(status)) {
-        errln("ERROR: could not open test data %s", u_errorName(status));
-           return;
-    }
-    index=strrchr(buffer,(char)U_FILE_SEP_CHAR);
-
-    if((unsigned int)(index-buffer) != (strlen(buffer)-1)){
-            *(index+1)=0;
-    }
-    uprv_strcat(buffer,".."U_FILE_SEP_STRING);
-    uprv_strcat(buffer, "TestNames_Thai.txt");
-
-    FileStream *in = T_FileStream_open(buffer, "rb");
-    if (in == 0) {
-        logln((UnicodeString)"Could not find file: " + buffer +" will not do this test");
-        return;        
+    UErrorCode ec = U_ZERO_ERROR;
+    TextFile names("TestNames_Thai.txt", "UTF16LE", ec);
+    if (U_FAILURE(ec)) {
+        logln("Can't open TestNames_Thai.txt: %s; skipping test",
+              u_errorName(ec));
+        return;
      }
  
      //
@@ -140,16 +91,9 @@ void CollationThaiTest::TestNamesList(void) {
      // word.  They should be in sorted order.
      //
      UnicodeString lastWord, word;
-    int32_t line = 0;
      //int32_t failed = 0;
      int32_t wordCount = 0;
-    while (readLine(in, word, "UTF16LE")) {
-        line++;
-
-        // Skip comments and blank lines
-        if (word.charAt(0) == 0x23 || word.length() == 0) {
-            continue;
-        }
+    while (names.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
  
          // Show the first 8 words being compared, so we can see what's happening
          ++wordCount;
@@ -165,9 +109,9 @@ void CollationThaiTest::TestNamesList(void) {
          lastWord = word;
      }
  
+    assertSuccess("readLine", ec);
  
      logln((UnicodeString)"Words checked: " + wordCount);
-    T_FileStream_close(in);
  }
  
  /**
@@ -180,29 +124,13 @@ void CollationThaiTest::TestDictionary(void) {
          errln("Error: could not construct Thai collator");
          return;
      }
- 
-    // Read in a dictionary of Thai words
-    UErrorCode status = U_ZERO_ERROR;
-    char buffer[1024];
-    uprv_strcpy(buffer,IntlTest::loadTestData(status) );
-    char* index = 0;
-   
-    if (U_FAILURE(status)) {
-        errln("ERROR: could not open test data %s", u_errorName(status));
-           return;
-    }
-    index=strrchr(buffer,(char)U_FILE_SEP_CHAR);
-
-    if((unsigned int)(index-buffer) != (strlen(buffer)-1)){
-            *(index+1)=0;
-    }
-    uprv_strcat(buffer,".."U_FILE_SEP_STRING);
-    uprv_strcat(buffer, TEST_FILE);
  
-    FileStream *in = T_FileStream_open(buffer, "rb");
-    if (in == 0) {
-        errln((UnicodeString)"Error: could not open test file " + buffer);
-        return;        
+    UErrorCode ec = U_ZERO_ERROR;
+    TextFile riwords("riwords.txt", "UTF8", ec);
+    if (U_FAILURE(ec)) {
+        logln("Can't open riwords.txt: %s; skipping test",
+              u_errorName(ec));
+        return;
      }
  
      //
@@ -210,16 +138,9 @@ void CollationThaiTest::TestDictionary(void) {
      // word.  They should be in sorted order.
      //
      UnicodeString lastWord, word;
-    int32_t line = 0;
      int32_t failed = 0;
      int32_t wordCount = 0;
-    while (readLine(in, word, "UTF8")) {
-        line++;
-
-        // Skip comments and blank lines
-        if (word.charAt(0) == 0x23 || word.length() == 0) {
-            continue;
-        }
+    while (riwords.readLineSkippingComments(word, ec, FALSE) && U_SUCCESS(ec)) {
  
          // Show the first 8 words being compared, so we can see what's happening
          ++wordCount;
@@ -239,7 +160,7 @@ void CollationThaiTest::TestDictionary(void) {
                      UnicodeString str;
                      UnicodeString msg =
                          UnicodeString("--------------------------------------------\n")
-                        + line
+                        + riwords.getLineNumber()
                          + " compare(" + IntlTest::prettify(lastWord, str);
                      msg += UnicodeString(", ")
                          + IntlTest::prettify(word, str) + ") returned " + result
@@ -261,17 +182,18 @@ void CollationThaiTest::TestDictionary(void) {
          lastWord = word;
      }
  
+    assertSuccess("readLine", ec);
+
      if (failed != 0) {
          if (failed > MAX_FAILURES_TO_SHOW) {
              errln((UnicodeString)"Too many failures; only the first " +
                    MAX_FAILURES_TO_SHOW + " failures were shown");
          }
-        errln((UnicodeString)"Summary: " + failed + " of " + (line - 1) +
+        errln((UnicodeString)"Summary: " + failed + " of " + (riwords.getLineNumber() - 1) +
                " comparisons failed");
      }
  
      logln((UnicodeString)"Words checked: " + wordCount);
-    T_FileStream_close(in);
  }
  
  /**
@@ -330,18 +252,17 @@ void CollationThaiTest::TestCornerCases(void) {
  // Internal utilities
  //------------------------------------------------------------------------
  
-void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
+void CollationThaiTest::compareArray(Collator& c, const char* tests[],
                                       int32_t testsLength) {
-    UErrorCode status = U_ZERO_ERROR;
      for (int32_t i = 0; i < testsLength; i += 3) {
  
-        int32_t expect = 0;
+        Collator::EComparisonResult expect;
          if (tests[i+1][0] == '<') {
-            expect = -1;
+          expect = Collator::LESS;
          } else if (tests[i+1][0] == '>') {
-            expect = 1;
+          expect = Collator::GREATER;
          } else if (tests[i+1][0] == '=') {
-            expect = 0;
+          expect = Collator::EQUAL;
          } else {
              // expect = Integer.decode(tests[i+1]).intValue();
              errln((UnicodeString)"Error: unknown operator " + tests[i+1]);
@@ -352,6 +273,9 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
          parseChars(s1, tests[i]);
          parseChars(s2, tests[i+2]);
  
+        doTest(&c, s1, s2, expect);
+#if 0
+        UErrorCode status = U_ZERO_ERROR;
          int32_t result = c.compare(s1, s2);
          if (sign(result) != sign(expect))
          {
@@ -393,6 +317,7 @@ void CollationThaiTest::compareArray(const Collator& c, const char* tests[],
                  errln((UnicodeString)"  " + prettify(k1, t1) + " vs. " + prettify(k2, t2));
              }
          }
+#endif
      }
  }
  
@@ -412,4 +337,123 @@ UnicodeString& CollationThaiTest::parseChars(UnicodeString& result,
      return result = CharsToUnicodeString(chars);
  }
  
+UCollator *thaiColl = NULL;  // read by StrCmp(); TestInvalidThai() opens and closes it
+U_CDECL_BEGIN
+static int U_CALLCONV StrCmp(const void *p1, const void *p2) {  // qsort() callback; each argument points at a UChar*
+  const UChar *left = *(const UChar *const *)p1;
+  const UChar *right = *(const UChar *const *)p2;
+  return ucol_strcoll(thaiColl, left, -1, right, -1);  // -1: both strings are NUL-terminated
+}
+U_CDECL_END
+
+
+#define LINES 6
+
+void CollationThaiTest::TestInvalidThai(void) {
+  // Escaped for u_unescape(): the six arrangements of two U+0E44 and two U+0E01.
+  const char *tests[LINES] = {
+    "\\u0E44\\u0E01\\u0E44\\u0E01",
+    "\\u0E44\\u0E01\\u0E01\\u0E44",
+    "\\u0E01\\u0E44\\u0E01\\u0E44",
+    "\\u0E01\\u0E01\\u0E44\\u0E44",
+    "\\u0E44\\u0E44\\u0E01\\u0E01",
+    "\\u0E01\\u0E44\\u0E44\\u0E01",
+  };
+  UChar strings[LINES][20];  // unescaped, NUL-terminated copies of tests[]
+  UChar *toSort[LINES];      // pointers into strings[]; qsort() permutes these
+  int32_t i = 0, j = 0, len = 0;
+  UErrorCode coll_status = U_ZERO_ERROR;
+  UnicodeString iteratorText;
+
+  // StrCmp() compares through the file-level thaiColl, so open it before sorting.
+  thaiColl = ucol_open ("th_TH", &coll_status);
+  if (U_FAILURE(coll_status)) {
+    errln("Error opening Thai collator: %s", u_errorName(coll_status));
+    return;
+  }
+  // NOTE(review): the cast assumes coll is a RuleBasedCollator -- confirm.
+  CollationElementIterator* c = ((RuleBasedCollator *)coll)->createCollationElementIterator( iteratorText );
+  if (c == NULL) {  // would otherwise be dereferenced in the loop below
+    errln("Error creating collation element iterator");
+    ucol_close(thaiColl);
+    thaiColl = NULL;
+    return;
+  }
+  for(i = 0; i < (int32_t)(sizeof(tests)/sizeof(tests[0])); i++) {
+    len = u_unescape(tests[i], strings[i], 20);
+    strings[i][len] = 0;
+    toSort[i] = strings[i];
+  }
+  qsort (toSort, LINES, sizeof (UChar *), StrCmp);
+  for (i=0; i < LINES; i++)
+  {
+    logln("%i", i);
+      for (j=i+1; j < LINES; j++) {
+          if (ucol_strcoll (thaiColl, toSort[i], -1, toSort[j], -1) == UCOL_GREATER)
+          {
+              // qsort() placed i before j, yet the collator says i > j: inconsistent ordering
+            errln("Inconsistent ordering between strings %i and %i", i, j);
+          }
+      }
+      iteratorText.setTo(toSort[i]);
+      c->setText(iteratorText, coll_status);
+      backAndForth(*c);
+  }
+  ucol_close(thaiColl);
+  thaiColl = NULL;  // do not leave the global pointing at a closed collator
+  delete c;
+}
+
+void CollationThaiTest::TestReordering(void) {
+  const char *tests[] = {  // (left, operator, right) triples, escaped for parseChars()
+                          "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
+                          "\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
+                          "\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
+                          "\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
+                          "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
+                          "\\u0E41\\u0301\\u0316", "=", "\\u0E41\\u0316\\u0301",
+                          // after UCA 4.1, the two pairs below no longer compare equal, so they use "<" rather than "="
+                          "\\u0e24\\u0e41",        "<", "\\u0e41\\u0e24", // exiting contraction bug
+                          "\\u0e3f\\u0e3f\\u0e24\\u0e41", "<", "\\u0e3f\\u0e3f\\u0e41\\u0e24",
+
+                          "abc\\u0E41c\\u0301",       "=", "abc\\u0E41\\u0107", // composition
+                          "abc\\u0E41\\uD834\\uDC00", "<", "abc\\u0E41\\uD834\\uDC01", // supplementaries
+                          "abc\\u0E41\\uD834\\uDD5F", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
+                          "abc\\u0E41\\uD87E\\uDC02", "=", "abc\\u0E41\\u4E41", // supplementary composition decomps to BMP
+                          "abc\\u0E41\\u0301",        "=", "abc\\u0E41\\u0301", // unsafe (just checking backwards iteration)
+                          "abc\\u0E41\\u0301\\u0316", "=", "abc\\u0E41\\u0316\\u0301",
+
+                          "\\u0E41c\\u0301abc",       "=", "\\u0E41\\u0107abc", // composition
+                          "\\u0E41\\uD834\\uDC00abc", "<", "\\u0E41\\uD834\\uDC01abc", // supplementaries
+                          "\\u0E41\\uD834\\uDD5Fabc", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
+                          "\\u0E41\\uD87E\\uDC02abc", "=", "\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
+                          "\\u0E41\\u0301abc",        "=", "\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
+                          "\\u0E41\\u0301\\u0316abc", "=", "\\u0E41\\u0316\\u0301abc",
+
+                          "abc\\u0E41c\\u0301abc",       "=", "abc\\u0E41\\u0107abc", // composition
+                          "abc\\u0E41\\uD834\\uDC00abc", "<", "abc\\u0E41\\uD834\\uDC01abc", // supplementaries
+                          "abc\\u0E41\\uD834\\uDD5Fabc", "=", "abc\\u0E41\\uD834\\uDD58\\uD834\\uDD65abc", // supplementary composition decomps to supplementary
+                          "abc\\u0E41\\uD87E\\uDC02abc", "=", "abc\\u0E41\\u4E41abc", // supplementary composition decomps to BMP
+                          "abc\\u0E41\\u0301abc",        "=", "abc\\u0E41\\u0301abc", // unsafe (just checking backwards iteration)
+                          "abc\\u0E41\\u0301\\u0316abc", "=", "abc\\u0E41\\u0316\\u0301abc",
+                        };
+  // Run the whole table against the Thai collator created in the constructor.
+  compareArray(*coll, tests, sizeof(tests)/sizeof(tests[0]));
+  // Then check a single pair against a collator built from the rule below.
+  const char *rule = "& c < ab";
+  const char *testcontraction[] = { "\\u0E41ab", ">", "\\u0E41c"}; // After UCA 4.1 Thai are normal so won't break a contraction
+  UnicodeString rules;
+  UErrorCode status = U_ZERO_ERROR;
+  parseChars(rules, rule);
+  RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
+  if(rcoll != NULL && U_SUCCESS(status)) {
+    compareArray(*rcoll, testcontraction, sizeof(testcontraction)/sizeof(testcontraction[0]));
+  } else {
+    errln("Couldn't instantiate collator from rules");
+  }
+  delete rcoll;  // construction can report failure after allocating, so free on both paths
+
+}
+
+
  #endif /* #if !UCONFIG_NO_COLLATION */