ICU-64232.0.1.tar.gz

[apple/icu.git] / icuSources / test / intltest / convtest.cpp
diff --git a/icuSources/test/intltest/convtest.cpp b/icuSources/test/intltest/convtest.cpp

index d12809ab65d8dbed7410a35066240953495d6d27..edadb126206ac3eb2ff9939ab28abd065b0062e7 100644 (file)
--- a/icuSources/test/intltest/convtest.cpp
+++ b/icuSources/test/intltest/convtest.cpp
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2003-2010, International Business Machines
+*   Copyright (C) 2003-2014, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
  *   file name:  convtest.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
  *
@@ -35,13 +37,13 @@
  #include "unicode/uniset.h"
  #include "unicode/ustring.h"
  #include "unicode/ures.h"
+#include "unicode/utf16.h"
  #include "convtest.h"
+#include "cmemory.h"
  #include "unicode/tstdtmod.h"
  #include <string.h>
  #include <stdlib.h>
  
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
  enum {
      // characters used in test data for callbacks
      SUB_CB='?',
@@ -66,19 +68,16 @@ ConversionTest::~ConversionTest() {
  void
  ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
      if (exec) logln("TestSuite ConversionTest: ");
-    switch (index) {
+    TESTCASE_AUTO_BEGIN;  // replaces the hand-numbered switch; NOTE(review): presumably numbers tests in order of appearance - confirm in intltest.h
  #if !UCONFIG_NO_FILE_IO
-        case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
-        case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
-        case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
-#else
-        case 0:
-        case 1:
-        case 2: name="skip"; break;
+    TESTCASE_AUTO(TestToUnicode);  // these three tests are compiled out when UCONFIG_NO_FILE_IO is set
+    TESTCASE_AUTO(TestFromUnicode);
+    TESTCASE_AUTO(TestGetUnicodeSet);  // the old "skip" placeholder cases for this configuration are removed
  #endif
-        case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
-        default: name=""; break; //needed to end loop
-    }
+    TESTCASE_AUTO(TestGetUnicodeSet2);
+    TESTCASE_AUTO(TestDefaultIgnorableCallback);  // newly registered by this change
+    TESTCASE_AUTO(TestUTF8ToUTF8Overflow);  // newly registered by this change
+    TESTCASE_AUTO_END;
  }
  
  // test data interface ----------------------------------------------------- ***
@@ -289,7 +288,7 @@ ConversionTest::TestFromUnicode() {
                      // read a substitution string, separated by an equal sign
                      p=s.getBuffer()+index+1;
                      length=s.length()-(index+1);
-                    if(length<0 || length>=LENGTHOF(cc.subString)) {
+                    if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) {
                          errorCode=U_ILLEGAL_ARGUMENT_ERROR;
                      } else {
                          u_memcpy(cc.subString, p, length);
@@ -443,7 +442,7 @@ ConversionTest::TestGetUnicodeSet() {
                  if(!diffSet.isEmpty()) {
                      diffSet.toPattern(s, TRUE);
                      if(s.length()>100) {
-                        s.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                        s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
                      }
                      errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d",
                              charset, i);
@@ -455,7 +454,7 @@ ConversionTest::TestGetUnicodeSet() {
                  if(!diffSet.isEmpty()) {
                      diffSet.toPattern(s, TRUE);
                      if(s.length()>100) {
-                        s.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                        s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
                      }
                      errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d",
                              charset, i);
@@ -553,7 +552,7 @@ ConversionTest::TestGetUnicodeSet2() {
      LocalUConverterPointer cnv;
      char buffer[1024];
      int32_t i;
-    for(i=0; i<LENGTHOF(cnvNames); ++i) {
+    for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) {
          UErrorCode errorCode=U_ZERO_ERROR;
          cnv.adoptInstead(cnv_open(cnvNames[i], errorCode));
          if(U_FAILURE(errorCode)) {
@@ -607,12 +606,7 @@ ConversionTest::TestGetUnicodeSet2() {
                  // First try to see if we have different sets because ucnv_getUnicodeSet()
                  // added strings: The above conversion method does not tell us what strings might be convertible.
                  // Remove strings from the set and compare again.
-                // Unfortunately, there are no good, direct set methods for finding out whether there are strings
-                // in the set, nor for enumerating or removing just them.
-                // Intersect all code points with the set. The intersection will not contain strings.
-                UnicodeSet temp(0, 0x10ffff);
-                temp.retainAll(set);
-                set=temp;
+                set.removeAllStrings();
              }
              if(set!=expected) {
                  UnicodeSet diffSet;
@@ -623,7 +617,7 @@ ConversionTest::TestGetUnicodeSet2() {
                  if(!diffSet.isEmpty()) {
                      diffSet.toPattern(out, TRUE);
                      if(out.length()>100) {
-                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                        out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
                      }
                      errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
                              cnvNames[i], which);
@@ -635,7 +629,7 @@ ConversionTest::TestGetUnicodeSet2() {
                  if(!diffSet.isEmpty()) {
                      diffSet.toPattern(out, TRUE);
                      if(out.length()>100) {
-                        out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
+                        out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis));
                      }
                      errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
                              cnvNames[i], which);
@@ -648,15 +642,209 @@ ConversionTest::TestGetUnicodeSet2() {
      delete [] s0;
  }
  
+// Tests that every code point with the Default_Ignorable_Code_Point property is skipped when it has no mapping.
+// If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated.
+void
+ConversionTest::TestDefaultIgnorableCallback() {
+    UErrorCode status = U_ZERO_ERROR;
+    const char *cnv_name = "euc-jp-2007";
+    const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]";
+    const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]";
+
+    // The sets live on the stack and the converter is held by a LocalUConverterPointer,
+    // so nothing leaks on the early-return paths below.
+    UnicodeSet set_ignorable(pattern_ignorable, status);
+    if (U_FAILURE(status)) {
+        dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status));
+        return;
+    }
+
+    UnicodeSet set_not_ignorable(pattern_not_ignorable, status);
+    if (U_FAILURE(status)) {
+        dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status));
+        return;
+    }
+
+    LocalUConverterPointer cnv(cnv_open(cnv_name, status));
+    if (U_FAILURE(status)) {
+        dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status));
+        return;
+    }
+
+    // set callback for the converter
+    ucnv_setFromUCallBack(cnv.getAlias(), UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status);
+    if (U_FAILURE(status)) {
+        errln("Unable to set the substitution callback on %s - %s", cnv_name, u_errorName(status));
+        return;
+    }
+
+    UChar32 input[1];
+    char output[10];
+
+    // test default ignorables are ignored
+    int32_t size = set_ignorable.size();
+    for (int32_t i = 0; i < size; i++) {
+        status = U_ZERO_ERROR;
+
+        input[0] = set_ignorable.charAt(i);
+
+        int32_t outputLength = ucnv_fromUChars(cnv.getAlias(), output, UPRV_LENGTHOF(output), UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
+        if (U_FAILURE(status) || outputLength != 0) {
+            errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status));
+        }
+    }
+
+    // test non-ignorables are not ignored
+    size = set_not_ignorable.size();
+    for (int32_t i = 0; i < size; i++) {
+        status = U_ZERO_ERROR;
+
+        input[0] = set_not_ignorable.charAt(i);
+
+        // The source is passed NUL-terminated (length -1), so U+0000 would look like an empty string.
+        if (input[0] == 0) {
+            continue;
+        }
+
+        int32_t outputLength = ucnv_fromUChars(cnv.getAlias(), output, UPRV_LENGTHOF(output), UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status);
+        if (U_FAILURE(status) || outputLength <= 0) {
+            errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status));
+        }
+    }
+}
+
+void
+ConversionTest::TestUTF8ToUTF8Overflow() {  // UTF-8 -> UTF-8 via ucnv_convertEx(): target overflow, then truncated/ill-formed input
+    IcuTestErrorCode errorCode(*this, "TestUTF8ToUTF8Overflow");
+    LocalUConverterPointer cnv1(ucnv_open("UTF-8", errorCode));  // passed as the source converter below
+    LocalUConverterPointer cnv2(ucnv_open("UTF-8", errorCode));  // passed as the target converter below
+    static const char *text = "aä";  // ä: 2 bytes
+    const char *source = text;
+    const char *sourceLimit = text + strlen(text);
+    char result[20];
+    char *target = result;
+    const char *targetLimit = result + sizeof(result);
+    UChar buffer16[20];
+    UChar *pivotSource = buffer16;
+    UChar *pivotTarget = buffer16;
+    const UChar *pivotLimit = buffer16 + UPRV_LENGTHOF(buffer16);
+    int32_t length;
+    // Case 1: a 2-byte character straddles the end of a 2-byte target window.
+    // Convert with insufficient target capacity.
+    result[2] = 5;  // sentinel just past the 2-byte target limit
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, result + 2, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, FALSE, errorCode);
+    assertEquals("overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("number of bytes written", 2, length);
+    assertEquals("next byte not clobbered", 5, result[2]);
+
+    // Convert the rest and flush.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+
+    assertSuccess("UTF-8->UTF-8", errorCode);
+    length = (int32_t)(target - result);
+    assertEquals("3 bytes", 3, length);
+    if (length == 3) {
+        assertTrue("result same as input", memcmp(text, result, length) == 0);
+    }
+    // Case 2: the same scenario with a 4-byte character and a 3-byte target window.
+    ucnv_reset(cnv1.getAlias());
+    ucnv_reset(cnv2.getAlias());
+    memset(result, 0, sizeof(result));
+    static const char *text2 = "a🚲";  // U+1F6B2 bicycle: 4 bytes
+    source = text2;
+    sourceLimit = text2 + strlen(text2);
+    target = result;
+    pivotSource = pivotTarget = buffer16;
+
+    // Convert with insufficient target capacity.
+    result[3] = 5;  // sentinel just past the 3-byte target limit
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, result + 3, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, FALSE, errorCode);
+    assertEquals("text2 overflow", U_BUFFER_OVERFLOW_ERROR, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("text2 number of bytes written", 3, length);
+    assertEquals("text2 next byte not clobbered", 5, result[3]);
+
+    // Convert the rest and flush.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+
+    assertSuccess("text2 UTF-8->UTF-8", errorCode);
+    length = (int32_t)(target - result);
+    assertEquals("text2 5 bytes", 5, length);
+    if (length == 5) {
+        assertTrue("text2 result same as input", memcmp(text2, result, length) == 0);
+    }
+    // Case 3: ill-formed input, fed to the converters in two separate calls.
+    ucnv_reset(cnv1.getAlias());
+    ucnv_reset(cnv2.getAlias());
+    memset(result, 0, sizeof(result));
+    static const char *illFormed = "\xf1\x91\x93\x96\x91\x94";  // U+514D6 + two more trail bytes
+    source = illFormed;
+    sourceLimit = illFormed + strlen(illFormed);
+    target = result;
+    pivotSource = pivotTarget = buffer16;
+    // Use the STOP toUnicode callback on cnv1 so that conversion errors surface in errorCode.
+    ucnv_setToUCallBack(cnv1.getAlias(), UCNV_TO_U_CALLBACK_STOP, nullptr, nullptr, nullptr, errorCode);
+
+    // Convert only two bytes and flush (but expect failure).
+    char errorBytes[10];
+    int8_t errorLength;
+    result[0] = 5;  // NOTE(review): this sentinel is written but never asserted below
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, source + 2,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+    assertEquals("illFormed truncated", U_TRUNCATED_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);  // in: buffer capacity; out: number of invalid bytes
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed truncated errorLength", 2, (int32_t)errorLength);
+    if (errorLength == 2) {
+        assertEquals("illFormed truncated errorBytes", 0xf191, 
+                     ((int32_t)(uint8_t)errorBytes[0] << 8) | (uint8_t)errorBytes[1]);
+    }
+
+    // Continue conversion starting with a trail byte.
+    ucnv_convertEx(cnv2.getAlias(), cnv1.getAlias(),
+                   &target, targetLimit, &source, sourceLimit,
+                   buffer16, &pivotSource, &pivotTarget, pivotLimit,
+                   FALSE, TRUE, errorCode);
+
+    assertEquals("illFormed trail byte", U_ILLEGAL_CHAR_FOUND, errorCode.reset());
+    length = (int32_t)(target - result);
+    assertEquals("illFormed trail byte number of bytes written", 0, length);
+    errorLength = UPRV_LENGTHOF(errorBytes);  // reset the in/out capacity before the second query
+    ucnv_getInvalidChars(cnv1.getAlias(), errorBytes, &errorLength, errorCode);
+    assertEquals("illFormed trail byte errorLength", 1, (int32_t)errorLength);
+    if (errorLength == 1) {
+        assertEquals("illFormed trail byte errorBytes", 0x93, (int32_t)(uint8_t)errorBytes[0]);
+    }
+}
+
  // open testdata or ICU data converter ------------------------------------- ***
  
  UConverter *
  ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) {
+    if(name!=NULL && *name=='+') {
+        // Converter names that start with '+' are ignored in ICU4J tests.
+        ++name;
+    }
      if(name!=NULL && *name=='*') {
          /* loadTestData(): set the data directory */
          return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode);
-    } else if(name!=NULL && *name=='+') {
-        return ucnv_open((name+1), &errorCode);
      } else {
          return ucnv_open(name, &errorCode);
      }
@@ -947,6 +1135,7 @@ ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback
      // open the converter
      IcuTestErrorCode errorCode(*this, "ToUnicodeCase");
      LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode));
+    // with no data, the above crashes with "pointer being freed was not allocated" for charset "x11-compound-text", see #13078
      if(errorCode.isFailure()) {
          errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s",
                  cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName());
@@ -988,7 +1177,7 @@ ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback
      int32_t i, step;
  
      ok=TRUE;
-    for(i=0; i<LENGTHOF(steps) && ok; ++i) {
+    for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
          step=steps[i].step;
          if(step<0 && !cc.finalFlush) {
              // skip ucnv_getNextUChar() if !finalFlush because
@@ -1000,12 +1189,12 @@ ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback
              cc.offsets=NULL;
          }
          else {
-            memset(resultOffsets, -1, LENGTHOF(resultOffsets));
+            memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
          }
-        memset(result, -1, LENGTHOF(result));
+        memset(result, -1, UPRV_LENGTHOF(result));
          errorCode.reset();
          resultLength=stepToUnicode(cc, cnv.getAlias(),
-                                result, LENGTHOF(result),
+                                result, UPRV_LENGTHOF(result),
                                  step==0 ? resultOffsets : NULL,
                                  step, errorCode);
          ok=checkToUnicode(
@@ -1035,7 +1224,7 @@ ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback
  
          errorCode.reset();
          resultLength=ucnv_toUChars(cnv.getAlias(),
-                        result, LENGTHOF(result),
+                        result, UPRV_LENGTHOF(result),
                          (const char *)cc.bytes, cc.bytesLength,
                          errorCode);
          ok=checkToUnicode(
@@ -1182,7 +1371,7 @@ stepFromUTF8(ConversionCase &cc,
          targetLimit=resultLimit;
          flush=cc.finalFlush;
  
-        pivotLimit=pivotBuffer+LENGTHOF(pivotBuffer);
+        pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer);
      } else {
          // start with empty partial buffers
          sourceLimit=source;
@@ -1401,7 +1590,7 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
      // convert unicode to utf8
      char utf8[256];
      cc.utf8=utf8;
-    u_strToUTF8(utf8, LENGTHOF(utf8), &cc.utf8Length,
+    u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length,
                  cc.unicode, cc.unicodeLength,
                  &errorCode);
      if(U_FAILURE(errorCode)) {
@@ -1428,13 +1617,13 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
      int32_t i, step;
  
      ok=TRUE;
-    for(i=0; i<LENGTHOF(steps) && ok; ++i) {
+    for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) {
          step=steps[i].step;
-        memset(resultOffsets, -1, LENGTHOF(resultOffsets));
-        memset(result, -1, LENGTHOF(result));
+        memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets));
+        memset(result, -1, UPRV_LENGTHOF(result));
          errorCode=U_ZERO_ERROR;
          resultLength=stepFromUnicode(cc, cnv,
-                                result, LENGTHOF(result),
+                                result, UPRV_LENGTHOF(result),
                                  step==0 ? resultOffsets : NULL,
                                  step, &errorCode);
          ok=checkFromUnicode(
@@ -1463,7 +1652,7 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
          if(cc.utf8Length>=0) {
              errorCode=U_ZERO_ERROR;
              resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
-                                    result, LENGTHOF(result),
+                                    result, UPRV_LENGTHOF(result),
                                      step, &errorCode);
              ok=checkFromUnicode(
                      cc, cnv, steps[i].utf8Name,
@@ -1486,7 +1675,7 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
  
          errorCode=U_ZERO_ERROR;
          resultLength=ucnv_fromUChars(cnv,
-                        result, LENGTHOF(result),
+                        result, UPRV_LENGTHOF(result),
                          cc.unicode, cc.unicodeLength,
                          &errorCode);
          ok=checkFromUnicode(
@@ -1535,7 +1724,7 @@ ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char
      msg=NULL;
  
      errorCode=U_ZERO_ERROR;
-    resultInvalidLength=LENGTHOF(resultInvalidUChars);
+    resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars);
      ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode);
      if(U_FAILURE(errorCode)) {
          errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s",