ICU-57165.0.1.tar.gz

[apple/icu.git] / icuSources / common / ucol_swp.cpp
diff --git a/icuSources/common/ucol_swp.cpp b/icuSources/common/ucol_swp.cpp

index c59a610ba50f5f056535b6e6e6ff247f633648cb..759743528c0914e9e17c74751dee53f5867ddadc 100644 (file)
--- a/icuSources/common/ucol_swp.cpp
+++ b/icuSources/common/ucol_swp.cpp
@@ -1,11 +1,11 @@
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2003-2010, International Business Machines
+*   Copyright (C) 2003-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
-*   file name:  ucol_swp.c
+*   file name:  ucol_swp.cpp
  *   encoding:   US-ASCII
  *   tab size:   8 (not used)
  *   indentation:4
@@ -18,9 +18,10 @@
  
  #include "unicode/udata.h" /* UDataInfo */
  #include "utrie.h"
+#include "utrie2.h"
  #include "udataswp.h"
  #include "cmemory.h"
-#include "ucol_imp.h"
+#include "ucol_data.h"
  #include "ucol_swp.h"
  
  /* swapping ----------------------------------------------------------------- */
@@ -102,20 +103,28 @@ utrie_swap(const UDataSwapper *ds,
  
  #if !UCONFIG_NO_COLLATION
  
-/* Modified copy of the beginning of ucol_swapBinary(). */
  U_CAPI UBool U_EXPORT2
  ucol_looksLikeCollationBinary(const UDataSwapper *ds,
                                const void *inData, int32_t length) {
-    const uint8_t *inBytes;
-    const UCATableHeader *inHeader;
-    UCATableHeader header;
-
      if(ds==NULL || inData==NULL || length<-1) {
          return FALSE;
      }
  
-    inBytes=(const uint8_t *)inData;
-    inHeader=(const UCATableHeader *)inData;
+    // First check for format version 4+ which has a standard data header.
+    UErrorCode errorCode=U_ZERO_ERROR;
+    (void)udata_swapDataHeader(ds, inData, -1, NULL, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        const UDataInfo &info=*(const UDataInfo *)((const char *)inData+4);
+        if(info.dataFormat[0]==0x55 &&   // dataFormat="UCol"
+                info.dataFormat[1]==0x43 &&
+                info.dataFormat[2]==0x6f &&
+                info.dataFormat[3]==0x6c) {
+            return TRUE;
+        }
+    }
+
+    // Else check for format version 3.
+    const UCATableHeader *inHeader=(const UCATableHeader *)inData;
  
      /*
       * The collation binary must contain at least the UCATableHeader,
@@ -123,6 +132,7 @@ ucol_looksLikeCollationBinary(const UDataSwapper *ds,
       * sizeof(UCATableHeader)==42*4 in ICU 2.8
       * check the length against the header size before reading the size field
       */
+    UCATableHeader header;
      uprv_memset(&header, 0, sizeof(header));
      if(length<0) {
          header.size=udata_readInt32(ds, inHeader->size);
@@ -146,11 +156,13 @@ ucol_looksLikeCollationBinary(const UDataSwapper *ds,
      return TRUE;
  }
  
-/* swap a header-less collation binary, inside a resource bundle or ucadata.icu */
-U_CAPI int32_t U_EXPORT2
-ucol_swapBinary(const UDataSwapper *ds,
-                const void *inData, int32_t length, void *outData,
-                UErrorCode *pErrorCode) {
+namespace {
+
+/* swap a header-less collation formatVersion=3 binary, inside a resource bundle or ucadata.icu */
+int32_t
+swapFormatVersion3(const UDataSwapper *ds,
+                   const void *inData, int32_t length, void *outData,
+                   UErrorCode *pErrorCode) {
      const uint8_t *inBytes;
      uint8_t *outBytes;
  
@@ -161,7 +173,7 @@ ucol_swapBinary(const UDataSwapper *ds,
      uint32_t count;
  
      /* argument checking in case we were not called from ucol_swap() */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*pErrorCode)) {
          return 0;
      }
      if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
@@ -185,7 +197,7 @@ ucol_swapBinary(const UDataSwapper *ds,
      if(length<0) {
          header.size=udata_readInt32(ds, inHeader->size);
      } else if((length<(42*4) || length<(header.size=udata_readInt32(ds, inHeader->size)))) {
-        udata_printError(ds, "ucol_swapBinary(): too few bytes (%d after header) for collation data\n",
+        udata_printError(ds, "ucol_swap(formatVersion=3): too few bytes (%d after header) for collation data\n",
                           length);
          *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
          return 0;
@@ -197,7 +209,7 @@ ucol_swapBinary(const UDataSwapper *ds,
          inHeader->formatVersion[0]==3 /*&&
          inHeader->formatVersion[1]>=0*/
      )) {
-        udata_printError(ds, "ucol_swapBinary(): magic 0x%08x or format version %02x.%02x is not a collation binary\n",
+        udata_printError(ds, "ucol_swap(formatVersion=3): magic 0x%08x or format version %02x.%02x is not a collation binary\n",
                           header.magic,
                           inHeader->formatVersion[0], inHeader->formatVersion[1]);
          *pErrorCode=U_UNSUPPORTED_ERROR;
@@ -205,7 +217,7 @@ ucol_swapBinary(const UDataSwapper *ds,
      }
  
      if(inHeader->isBigEndian!=ds->inIsBigEndian || inHeader->charSetFamily!=ds->inCharset) {
-        udata_printError(ds, "ucol_swapBinary(): endianness %d or charset %d does not match the swapper\n",
+        udata_printError(ds, "ucol_swap(formatVersion=3): endianness %d or charset %d does not match the swapper\n",
                           inHeader->isBigEndian, inHeader->charSetFamily);
          *pErrorCode=U_INVALID_FORMAT_ERROR;
          return 0;
@@ -295,7 +307,6 @@ ucol_swapBinary(const UDataSwapper *ds,
               * if UCAConsts!=0 then contractionUCACombos because we are swapping
               * the UCA data file, and we know that the UCA contains contractions
               */
-            count=header.contractionUCACombos-header.UCAConsts;
              ds->swapArray32(ds, inBytes+header.UCAConsts, header.contractionUCACombos-header.UCAConsts,
                                 outBytes+header.UCAConsts, pErrorCode);
          }
@@ -329,44 +340,252 @@ ucol_swapBinary(const UDataSwapper *ds,
      return header.size;
  }
  
+// swap formatVersion 4 or 5 ----------------------------------------------- ***
+
+// The following are copied from CollationDataReader, trading an awkward copy of constants
+// for an awkward relocation of the i18n collationdatareader.h file into the common library.
+// Keep them in sync!
+
+enum {  // Slot numbers in the int32_t indexes[] array at the start of formatVersion 4/5 data.
+    IX_INDEXES_LENGTH,  // 0: number of int32_t slots in indexes[] (read as indexesLength)
+    IX_OPTIONS,  // together with IX_INDEXES_LENGTH, the minimum that must be present
+    IX_RESERVED2,
+    IX_RESERVED3,
+
+    IX_JAMO_CE32S_START,  // 4: not read by swapFormatVersion4()
+    IX_REORDER_CODES_OFFSET,  // first byte-offset slot; a part spans [this slot, next slot); swapped as 32-bit units
+    IX_REORDER_TABLE_OFFSET,  // byte array, copied without swapping
+    IX_TRIE_OFFSET,  // swapped with utrie2_swap()
+
+    IX_RESERVED8_OFFSET,  // 8: part must be empty, otherwise swapping fails with U_UNSUPPORTED_ERROR
+    IX_CES_OFFSET,  // swapped as 64-bit units
+    IX_RESERVED10_OFFSET,  // part must be empty, otherwise swapping fails with U_UNSUPPORTED_ERROR
+    IX_CE32S_OFFSET,  // swapped as 32-bit units
+
+    IX_ROOT_ELEMENTS_OFFSET,  // 12: swapped as 32-bit units
+    IX_CONTEXTS_OFFSET,  // swapped as 16-bit units
+    IX_UNSAFE_BWD_OFFSET,  // swapped as 16-bit units
+    IX_FAST_LATIN_TABLE_OFFSET,  // swapped as 16-bit units
+
+    IX_SCRIPTS_OFFSET,  // 16: swapped as 16-bit units
+    IX_COMPRESSIBLE_BYTES_OFFSET,  // byte array, copied without swapping
+    IX_RESERVED18_OFFSET,  // part must be empty, otherwise swapping fails with U_UNSUPPORTED_ERROR
+    IX_TOTAL_SIZE  // total size in bytes of the data when indexes[] has more than this many slots
+};
+
+int32_t
+swapFormatVersion4(const UDataSwapper *ds,
+                   const void *inData, int32_t length, void *outData,
+                   UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) { return 0; }
+
+    const uint8_t *inBytes=(const uint8_t *)inData;
+    uint8_t *outBytes=(uint8_t *)outData;
+
+    const int32_t *inIndexes=(const int32_t *)inBytes;
+    int32_t indexes[IX_TOTAL_SIZE+1];
+
+    // Need at least IX_INDEXES_LENGTH and IX_OPTIONS.
+    if(0<=length && length<8) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): too few bytes "
+                         "(%d after header) for collation data\n",
+                         length);
+        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    int32_t indexesLength=indexes[0]=udata_readInt32(ds, inIndexes[0]);
+    if(0<=length && length<(indexesLength*4)) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): too few bytes "
+                         "(%d after header) for collation data\n",
+                         length);
+        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    for(int32_t i=1; i<=IX_TOTAL_SIZE && i<indexesLength; ++i) {
+        indexes[i]=udata_readInt32(ds, inIndexes[i]);
+    }
+    for(int32_t i=indexesLength; i<=IX_TOTAL_SIZE; ++i) {
+        indexes[i]=-1;
+    }
+    inIndexes=NULL;  // Make sure we do not accidentally use these instead of indexes[].
+
+    // Get the total length of the data.
+    int32_t size;
+    if(indexesLength>IX_TOTAL_SIZE) {
+        size=indexes[IX_TOTAL_SIZE];
+    } else if(indexesLength>IX_REORDER_CODES_OFFSET) {
+        size=indexes[indexesLength-1];
+    } else {
+        size=indexesLength*4;
+    }
+    if(length<0) { return size; }
+
+    if(length<size) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): too few bytes "
+                         "(%d after header) for collation data\n",
+                         length);
+        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    // Copy the data for inaccessible bytes and arrays of bytes.
+    if(inBytes!=outBytes) {
+        uprv_memcpy(outBytes, inBytes, size);
+    }
+
+    // Swap the int32_t indexes[].
+    ds->swapArray32(ds, inBytes, indexesLength * 4, outBytes, &errorCode);
+
+    // The following is a modified version of CollationDataReader::read().
+    // Here we use indexes[] not inIndexes[] because
+    // the inIndexes[] may not be in this machine's endianness.
+    int32_t index;  // one of the indexes[] slots
+    int32_t offset;  // byte offset for the index part
+    // int32_t length;  // number of bytes in the index part
+
+    index = IX_REORDER_CODES_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray32(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    // Skip the IX_REORDER_TABLE_OFFSET byte array.
+
+    index = IX_TRIE_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        utrie2_swap(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_RESERVED8_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): unknown data at IX_RESERVED8_OFFSET\n", length);
+        errorCode = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    index = IX_CES_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray64(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_RESERVED10_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): unknown data at IX_RESERVED10_OFFSET\n", length);
+        errorCode = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    index = IX_CE32S_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray32(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_ROOT_ELEMENTS_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray32(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_CONTEXTS_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray16(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_UNSAFE_BWD_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray16(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_FAST_LATIN_TABLE_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray16(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    index = IX_SCRIPTS_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        ds->swapArray16(ds, inBytes + offset, length, outBytes + offset, &errorCode);
+    }
+
+    // Skip the  IX_COMPRESSIBLE_BYTES_OFFSET byte array.
+
+    index = IX_RESERVED18_OFFSET;
+    offset = indexes[index];
+    length = indexes[index + 1] - offset;
+    if(length > 0) {
+        udata_printError(ds, "ucol_swap(formatVersion=4): unknown data at IX_RESERVED18_OFFSET\n", length);
+        errorCode = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    return size;
+}
+
+}  // namespace
+
  /* swap ICU collation data like ucadata.icu */
  U_CAPI int32_t U_EXPORT2
  ucol_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
-          
-    const UDataInfo *pInfo;
-    int32_t headerSize, collationSize;
+    if(U_FAILURE(*pErrorCode)) { return 0; }
  
      /* udata_swapDataHeader checks the arguments */
-    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
-        return 0;
+    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        // Try to swap the old format version which did not have a standard data header.
+        *pErrorCode=U_ZERO_ERROR;
+        return swapFormatVersion3(ds, inData, length, outData, pErrorCode);
      }
  
      /* check data format and format version */
-    pInfo=(const UDataInfo *)((const char *)inData+4);
+    const UDataInfo &info=*(const UDataInfo *)((const char *)inData+4);
      if(!(
-        pInfo->dataFormat[0]==0x55 &&   /* dataFormat="UCol" */
-        pInfo->dataFormat[1]==0x43 &&
-        pInfo->dataFormat[2]==0x6f &&
-        pInfo->dataFormat[3]==0x6c &&
-        pInfo->formatVersion[0]==3 /*&&
-        pInfo->formatVersion[1]>=0*/
+        info.dataFormat[0]==0x55 &&   // dataFormat="UCol"
+        info.dataFormat[1]==0x43 &&
+        info.dataFormat[2]==0x6f &&
+        info.dataFormat[3]==0x6c &&
+        (3<=info.formatVersion[0] && info.formatVersion[0]<=5)
      )) {
-        udata_printError(ds, "ucol_swap(): data format %02x.%02x.%02x.%02x (format version %02x.%02x) is not a collation file\n",
-                         pInfo->dataFormat[0], pInfo->dataFormat[1],
-                         pInfo->dataFormat[2], pInfo->dataFormat[3],
-                         pInfo->formatVersion[0], pInfo->formatVersion[1]);
+        udata_printError(ds, "ucol_swap(): data format %02x.%02x.%02x.%02x "
+                         "(format version %02x.%02x) is not recognized as collation data\n",
+                         info.dataFormat[0], info.dataFormat[1],
+                         info.dataFormat[2], info.dataFormat[3],
+                         info.formatVersion[0], info.formatVersion[1]);
          *pErrorCode=U_UNSUPPORTED_ERROR;
          return 0;
      }
  
-    collationSize=ucol_swapBinary(ds,
-                        (const char *)inData+headerSize,
-                        length>=0 ? length-headerSize : -1,
-                        (char *)outData+headerSize,
-                        pErrorCode);
+    inData=(const char *)inData+headerSize;
+    if(length>=0) { length-=headerSize; }
+    outData=(char *)outData+headerSize;
+    int32_t collationSize;
+    if(info.formatVersion[0]>=4) {
+        collationSize=swapFormatVersion4(ds, inData, length, outData, *pErrorCode);
+    } else {
+        collationSize=swapFormatVersion3(ds, inData, length, outData, pErrorCode);
+    }
      if(U_SUCCESS(*pErrorCode)) {
          return headerSize+collationSize;
      } else {