ICU-57166.0.1.tar.gz

[apple/icu.git] / icuSources / common / ucnv_u32.c
diff --git a/icuSources/common/ucnv_u32.c b/icuSources/common/ucnv_u32.c

index 1a37e96f5993936164a92f4bf80e9b665eda1ba5..b6804cafa0313054f35b1b36146e9ddfa7604c65 100644 (file)
--- a/icuSources/common/ucnv_u32.c
+++ b/icuSources/common/ucnv_u32.c
@@ -1,6 +1,6 @@
  /*  
  **********************************************************************
-*   Copyright (C) 2002-2004, International Business Machines
+*   Copyright (C) 2002-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *   file name:  ucnv_u32.c
@@ -16,9 +16,10 @@
  
  #include "unicode/utypes.h"
  
-#if !UCONFIG_NO_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  
  #include "unicode/ucnv.h"
+#include "unicode/utf.h"
  #include "ucnv_bld.h"
  #include "ucnv_cnv.h"
  #include "cmemory.h"
@@ -34,6 +35,10 @@
  /* -SURROGATE_LOW_START + HALF_BASE */
  #define SURROGATE_LOW_BASE      9216
  
+enum {
+    UCNV_NEED_TO_WRITE_BOM=1
+};
+
  /* UTF-32BE ----------------------------------------------------------------- */
  
  static void
@@ -47,9 +52,10 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
      unsigned char *toUBytes = args->converter->toUBytes;
      uint32_t ch, i;
  
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
+    /* Restore state of current sequence */
      if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
          i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
  
          ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
          args->converter->toUnicodeStatus = 0;
@@ -127,8 +133,10 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
      uint32_t ch, i;
      int32_t offsetNum = 0;
  
+    /* Restore state of current sequence */
      if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
          i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
  
          ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
          args->converter->toUnicodeStatus = 0;
@@ -204,13 +212,30 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err)
  {
      const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
      const UChar *sourceLimit = args->sourceLimit;
      const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
      UChar32 ch, ch2;
      unsigned int indexToWrite;
      unsigned char temp[sizeof(uint32_t)];
  
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
      temp[0] = 0;
  
      if (args->converter->fromUChar32) {
@@ -222,7 +247,7 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
      while (mySource < sourceLimit && myTarget < targetLimit) {
          ch = *(mySource++);
  
-        if (UTF_IS_SURROGATE(ch)) {
+        if (U_IS_SURROGATE(ch)) {
              if (U_IS_LEAD(ch)) {
  lowsurogate:
                  if (mySource < sourceLimit) {
@@ -288,8 +313,8 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                                 UErrorCode * err)
  {
      const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
      const UChar *sourceLimit = args->sourceLimit;
      const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
      UChar32 ch, ch2;
@@ -297,6 +322,24 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
      unsigned int indexToWrite;
      unsigned char temp[sizeof(uint32_t)];
  
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
      temp[0] = 0;
  
      if (args->converter->fromUChar32) {
@@ -308,7 +351,7 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
      while (mySource < sourceLimit && myTarget < targetLimit) {
          ch = *(mySource++);
  
-        if (UTF_IS_SURROGATE(ch)) {
+        if (U_IS_SURROGATE(ch)) {
              if (U_IS_LEAD(ch)) {
  lowsurogate:
                  if (mySource < sourceLimit) {
@@ -360,7 +403,7 @@ lowsurogate:
                  *err = U_BUFFER_OVERFLOW_ERROR;
              }
          }
-        offsetNum++;
+        offsetNum = offsetNum + 1 + (temp[1] != 0);
      }
  
      if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
@@ -437,7 +480,7 @@ static const UConverterImpl _UTF32BEImpl = {
      NULL,
      NULL,
      NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
  };
  
  /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
@@ -452,11 +495,8 @@ static const UConverterStaticData _UTF32BEStaticData = {
      { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  };
  
-const UConverterSharedData _UTF32BEData = {
-    sizeof(UConverterSharedData), ~((uint32_t) 0),
-    NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, 
-    0
-};
+const UConverterSharedData _UTF32BEData =
+        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
  
  /* UTF-32LE ---------------------------------------------------------- */
  
@@ -471,10 +511,11 @@ T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
      unsigned char *toUBytes = args->converter->toUBytes;
      uint32_t ch, i;
  
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
+    /* Restore state of current sequence */
      if (args->converter->toUnicodeStatus && myTarget < targetLimit)
      {
          i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
  
          /* Stores the previously calculated ch from a previous call*/
          ch = args->converter->toUnicodeStatus - 1;
@@ -557,10 +598,11 @@ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
      uint32_t ch, i;
      int32_t offsetNum = 0;
  
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
+    /* Restore state of current sequence */
      if (args->converter->toUnicodeStatus && myTarget < targetLimit)
      {
          i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
  
          /* Stores the previously calculated ch from a previous call*/
          ch = args->converter->toUnicodeStatus - 1;
@@ -645,13 +687,30 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err)
  {
      const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
      const UChar *sourceLimit = args->sourceLimit;
      const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
      UChar32 ch, ch2;
      unsigned int indexToWrite;
      unsigned char temp[sizeof(uint32_t)];
  
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
      temp[3] = 0;
  
      if (args->converter->fromUChar32)
@@ -665,14 +724,14 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
      {
          ch = *(mySource++);
  
-        if (UTF_IS_SURROGATE(ch)) {
-            if (U_IS_LEAD(ch))
+        if (U16_IS_SURROGATE(ch)) {
+            if (U16_IS_LEAD(ch))
              {
  lowsurogate:
                  if (mySource < sourceLimit)
                  {
                      ch2 = *mySource;
-                    if (U_IS_TRAIL(ch2)) {
+                    if (U16_IS_TRAIL(ch2)) {
                          ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
                          mySource++;
                      }
@@ -737,8 +796,8 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                                 UErrorCode * err)
  {
      const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
      const UChar *sourceLimit = args->sourceLimit;
      const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
      UChar32 ch, ch2;
@@ -746,6 +805,24 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
      unsigned char temp[sizeof(uint32_t)];
      int32_t offsetNum = 0;
  
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
      temp[3] = 0;
  
      if (args->converter->fromUChar32)
@@ -759,14 +836,14 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
      {
          ch = *(mySource++);
  
-        if (UTF_IS_SURROGATE(ch)) {
-            if (U_IS_LEAD(ch))
+        if (U16_IS_SURROGATE(ch)) {
+            if (U16_IS_LEAD(ch))
              {
  lowsurogate:
                  if (mySource < sourceLimit)
                  {
                      ch2 = *mySource;
-                    if (U_IS_TRAIL(ch2))
+                    if (U16_IS_TRAIL(ch2))
                      {
                          ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
                          mySource++;
@@ -817,7 +894,7 @@ lowsurogate:
                  *err = U_BUFFER_OVERFLOW_ERROR;
              }
          }
-        offsetNum++;
+        offsetNum = offsetNum + 1 + (temp[2] != 0);
      }
  
      if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
@@ -895,7 +972,7 @@ static const UConverterImpl _UTF32LEImpl = {
      NULL,
      NULL,
      NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
  };
  
  /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
@@ -911,11 +988,8 @@ static const UConverterStaticData _UTF32LEStaticData = {
  };
  
  
-const UConverterSharedData _UTF32LEData = {
-    sizeof(UConverterSharedData), ~((uint32_t) 0),
-    NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, 
-    0
-};
+const UConverterSharedData _UTF32LEData =
+        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
  
  /* UTF-32 (Detect BOM) ------------------------------------------------------ */
  
@@ -948,26 +1022,13 @@ _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
      }
      if(choice!=UCNV_RESET_TO_UNICODE) {
          /* reset fromUnicode: prepare to output the UTF-32PE BOM */
-        cnv->charErrorBufferLength=4;
-#if U_IS_BIG_ENDIAN
-        cnv->charErrorBuffer[0]=0;
-        cnv->charErrorBuffer[1]=0;
-        cnv->charErrorBuffer[2]=0xfe;
-        cnv->charErrorBuffer[3]=0xff;
-#else
-        cnv->charErrorBuffer[0]=0xff;
-        cnv->charErrorBuffer[1]=0xfe;
-        cnv->charErrorBuffer[2]=0;
-        cnv->charErrorBuffer[3]=0;
-#endif
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
      }
  }
  
  static void
  _UTF32Open(UConverter *cnv,
-           const char *name,
-           const char *locale,
-           uint32_t options,
+           UConverterLoadArgs *pArgs,
             UErrorCode *pErrorCode) {
      _UTF32Reset(cnv, UCNV_RESET_BOTH);
  }
@@ -1019,14 +1080,14 @@ _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                  ++source;
                  if(state==4) {
                      state=8; /* detect UTF-32BE */
-                    offsetDelta=source-pArgs->source;
+                    offsetDelta=(int32_t)(source-pArgs->source);
                  } else if(state==8) {
                      state=9; /* detect UTF-32LE */
-                    offsetDelta=source-pArgs->source;
+                    offsetDelta=(int32_t)(source-pArgs->source);
                  }
              } else {
                  /* switch to UTF-32BE and pass the previous bytes */
-                int32_t count=source-pArgs->source; /* number of bytes from this buffer */
+                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
  
                  /* reset the source */
                  source=pArgs->source;
@@ -1153,13 +1214,14 @@ static const UConverterImpl _UTF32Impl = {
      NULL,
      NULL,
      NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
  };
  
+/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
  static const UConverterStaticData _UTF32StaticData = {
      sizeof(UConverterStaticData),
      "UTF-32",
-    0, /* ### TODO review correctness of all Unicode CCSIDs */
+    1236,
      UCNV_IBM, UCNV_UTF32, 4, 4,
  #if U_IS_BIG_ENDIAN
      { 0, 0, 0xff, 0xfd }, 4,
@@ -1172,10 +1234,7 @@ static const UConverterStaticData _UTF32StaticData = {
      { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  };
  
-const UConverterSharedData _UTF32Data = {
-    sizeof(UConverterSharedData), ~((uint32_t) 0),
-    NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, 
-    0
-};
+const UConverterSharedData _UTF32Data = 
+        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
  
  #endif