]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ucnv_u32.c
ICU-491.11.1.tar.gz
[apple/icu.git] / icuSources / common / ucnv_u32.c
index 82c15d78fe999b5f0fe638232df7ea60532aa309..c24aaeeed79f18e1c27d46f773a443e314d929ae 100644 (file)
@@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2002-2003, International Business Machines
+*   Copyright (C) 2002-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_u32.c
 */
 
 #include "unicode/utypes.h"
+
+#if !UCONFIG_NO_CONVERSION
+
 #include "unicode/ucnv.h"
-#include "unicode/ucnv_err.h"
+#include "unicode/utf.h"
 #include "ucnv_bld.h"
 #include "ucnv_cnv.h"
 #include "cmemory.h"
 
 #define MAXIMUM_UCS2            0x0000FFFF
 #define MAXIMUM_UTF             0x0010FFFF
-#define MAXIMUM_UCS4            0x7FFFFFFF
 #define HALF_SHIFT              10
 #define HALF_BASE               0x0010000
 #define HALF_MASK               0x3FF
 #define SURROGATE_HIGH_START    0xD800
-#define SURROGATE_HIGH_END      0xDBFF
 #define SURROGATE_LOW_START     0xDC00
-#define SURROGATE_LOW_END       0xDFFF
 
 /* -SURROGATE_LOW_START + HALF_BASE */
 #define SURROGATE_LOW_BASE      9216
 
-/**
- * Calls invalid char callback when an invalid character sequence is encountered.
- * It presumes that the converter has a callback to call.
- *
- * @returns true when callback fails
- */
-static UBool
-T_UConverter_toUnicode_InvalidChar_Callback(UConverterToUnicodeArgs * args,
-                                            UConverterCallbackReason reason,
-                                            UErrorCode *err)
-{
-    UConverter *converter = args->converter;
-
-    if (U_SUCCESS(*err))
-    {
-        if (reason == UCNV_ILLEGAL) {
-            *err = U_ILLEGAL_CHAR_FOUND;
-        } else {
-            *err = U_INVALID_CHAR_FOUND;
-        }
-    }
-
-    /* copy the toUBytes to the invalidCharBuffer */
-    uprv_memcpy(converter->invalidCharBuffer,
-                converter->toUBytes,
-                converter->invalidCharLength);
-
-    /* Call the ErrorFunction */
-    args->converter->fromCharErrorBehaviour(converter->toUContext,
-                                            args,
-                                            converter->invalidCharBuffer,
-                                            converter->invalidCharLength,
-                                            reason,
-                                            err);
-
-    return (UBool)U_FAILURE(*err);
-}
-
-static UBool
-T_UConverter_toUnicode_InvalidChar_OffsetCallback(UConverterToUnicodeArgs * args,
-                                                  int32_t currentOffset,
-                                                  UConverterCallbackReason reason,
-                                                  UErrorCode *err)
-{
-    int32_t *saveOffsets = args->offsets;
-    UBool result;
-    
-    result = T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err);
-
-    while (saveOffsets < args->offsets)
-    {
-        *(saveOffsets++) = currentOffset;
-    }
-    return result;
-}
+enum {
+    UCNV_NEED_TO_WRITE_BOM=1 /* fromUnicodeStatus value checked by the fromUnicode functions: write the BOM first, then reset the status to 0 */
+};
 
 /* UTF-32BE ----------------------------------------------------------------- */
 
@@ -104,68 +52,49 @@ T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
     unsigned char *toUBytes = args->converter->toUBytes;
     uint32_t ch, i;
 
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
-    {
+    /* Restore state of current sequence */
+    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
         i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
 
         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
         args->converter->toUnicodeStatus = 0;
         goto morebytes;
     }
 
-    while (mySource < sourceLimit && myTarget < targetLimit)
-    {
+    while (mySource < sourceLimit && myTarget < targetLimit) {
         i = 0;
         ch = 0;
 morebytes:
-        while (i < sizeof(uint32_t))
-        {
-            if (mySource < sourceLimit)
-            {
+        while (i < sizeof(uint32_t)) {
+            if (mySource < sourceLimit) {
                 ch = (ch << 8) | (uint8_t)(*mySource);
                 toUBytes[i++] = (char) *(mySource++);
             }
-            else
-            {
-                if (args->flush)
-                {
-                    if (U_SUCCESS(*err))
-                    {
-                        *err = U_TRUNCATED_CHAR_FOUND;
-                        args->converter->toUnicodeStatus = MAXIMUM_UCS4;
-                    }
-                }
-                else
-                {   /* stores a partially calculated target*/
-                    /* + 1 to make 0 a valid character */
-                    args->converter->toUnicodeStatus = ch + 1;
-                    args->converter->toULength = (int8_t) i;
-                }
+            else {
+                /* stores a partially calculated target*/
+                /* + 1 to make 0 a valid character */
+                args->converter->toUnicodeStatus = ch + 1;
+                args->converter->toULength = (int8_t) i;
                 goto donefornow;
             }
         }
 
-        if (ch <= MAXIMUM_UTF)
-        {
+        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
             if (ch <= MAXIMUM_UCS2) 
             {
                 /* fits in 16 bits */
                 *(myTarget++) = (UChar) ch;
             }
-            else
-            {
+            else {
                 /* write out the surrogates */
-                ch -= HALF_BASE;
-                *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
-                ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
-                if (myTarget < targetLimit)
-                {
+                *(myTarget++) = U16_LEAD(ch);
+                ch = U16_TRAIL(ch);
+                if (myTarget < targetLimit) {
                     *(myTarget++) = (UChar)ch;
                 }
-                else
-                {
+                else {
                     /* Put in overflow buffer (not handled here) */
                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
                     args->converter->UCharErrorBufferLength = 1;
@@ -174,25 +103,15 @@ morebytes:
                 }
             }
         }
-        else
-        {
-            args->source = (const char *) mySource;
-            args->target = myTarget;
-            args->converter->invalidCharLength = (int8_t)i;
-            if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
-            {
-                /* Stop if the error wasn't handled */
-                break;
-            }
-            args->converter->invalidCharLength = 0;
-            mySource = (unsigned char *) args->source;
-            myTarget = args->target;
+        else {
+            args->converter->toULength = (int8_t)i;
+            *err = U_ILLEGAL_CHAR_FOUND;
+            break;
         }
     }
 
 donefornow:
-    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
-    {
+    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
         /* End of target buffer */
         *err = U_BUFFER_OVERFLOW_ERROR;
     }
@@ -214,70 +133,52 @@ T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
     uint32_t ch, i;
     int32_t offsetNum = 0;
 
-    if (args->converter->toUnicodeStatus && myTarget < targetLimit)
-    {
+    /* Restore state of current sequence */
+    if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
         i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
 
         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
         args->converter->toUnicodeStatus = 0;
         goto morebytes;
     }
 
-    while (mySource < sourceLimit && myTarget < targetLimit)
-    {
+    while (mySource < sourceLimit && myTarget < targetLimit) {
         i = 0;
         ch = 0;
 morebytes:
-        while (i < sizeof(uint32_t))
-        {
-            if (mySource < sourceLimit)
-            {
+        while (i < sizeof(uint32_t)) {
+            if (mySource < sourceLimit) {
                 ch = (ch << 8) | (uint8_t)(*mySource);
                 toUBytes[i++] = (char) *(mySource++);
             }
-            else
-            {
-                if (args->flush)
-                {
-                    if (U_SUCCESS(*err))
-                    {
-                        *err = U_TRUNCATED_CHAR_FOUND;
-                        args->converter->toUnicodeStatus = MAXIMUM_UCS4;
-                    }
-                }
-                else
-                {   /* stores a partially calculated target*/
-                    /* + 1 to make 0 a valid character */
-                    args->converter->toUnicodeStatus = ch + 1;
-                    args->converter->toULength = (int8_t) i;
-                }
+            else {
+                /* stores a partially calculated target*/
+                /* + 1 to make 0 a valid character */
+                args->converter->toUnicodeStatus = ch + 1;
+                args->converter->toULength = (int8_t) i;
                 goto donefornow;
             }
         }
 
-        if (ch <= MAXIMUM_UTF)
-        {
+        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
-            if (ch <= MAXIMUM_UCS2) 
-            {
+            if (ch <= MAXIMUM_UCS2) {
                 /* fits in 16 bits */
                 *(myTarget++) = (UChar) ch;
                 *(myOffsets++) = offsetNum;
             }
-            else
-            {
+            else {
                 /* write out the surrogates */
-                ch -= HALF_BASE;
-                *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
+                *(myTarget++) = U16_LEAD(ch);
                 *myOffsets++ = offsetNum;
-                ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+                ch = U16_TRAIL(ch);
                 if (myTarget < targetLimit)
                 {
                     *(myTarget++) = (UChar)ch;
                     *(myOffsets++) = offsetNum;
                 }
-                else
-                {
+                else {
                     /* Put in overflow buffer (not handled here) */
                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
                     args->converter->UCharErrorBufferLength = 1;
@@ -286,21 +187,10 @@ morebytes:
                 }
             }
         }
-        else
-        {
-            args->source = (const char *) mySource;
-            args->target = myTarget;
-            args->converter->invalidCharLength = (int8_t)i;
-            args->offsets = myOffsets;
-            if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
-            {
-                /* Stop if the error wasn't handled */
-                break;
-            }
-            args->converter->invalidCharLength = 0;
-            mySource = (unsigned char *) args->source;
-            myTarget = args->target;
-            myOffsets = args->offsets;
+        else {
+            args->converter->toULength = (int8_t)i;
+            *err = U_ILLEGAL_CHAR_FOUND;
+            break;
         }
         offsetNum += i;
     }
@@ -322,42 +212,74 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
                                   UErrorCode * err)
 {
     const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UChar32 ch, ch2;
     unsigned int indexToWrite;
     unsigned char temp[sizeof(uint32_t)];
 
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
     temp[0] = 0;
 
-    if (args->converter->fromUnicodeStatus)
-    {
-        ch = args->converter->fromUnicodeStatus;
-        args->converter->fromUnicodeStatus = 0;
+    if (args->converter->fromUChar32) {
+        ch = args->converter->fromUChar32;
+        args->converter->fromUChar32 = 0;
         goto lowsurogate;
     }
 
-    while (mySource < sourceLimit && myTarget < targetLimit)
-    {
+    while (mySource < sourceLimit && myTarget < targetLimit) {
         ch = *(mySource++);
 
-        if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
-        {
+        if (U_IS_SURROGATE(ch)) {
+            if (U_IS_LEAD(ch)) {
 lowsurogate:
-            if (mySource < sourceLimit)
-            {
-                ch2 = *mySource;
-                if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
-                {
-                    ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
-                    mySource++;
+                if (mySource < sourceLimit) {
+                    ch2 = *mySource;
+                    if (U_IS_TRAIL(ch2)) {
+                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+                        mySource++;
+                    }
+                    else {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        args->converter->fromUChar32 = ch;
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                        break;
+                    }
+                }
+                else {
+                    /* ran out of source */
+                    args->converter->fromUChar32 = ch;
+                    if (args->flush) {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                    }
+                    break;
                 }
             }
-            else if (!args->flush)
-            {
-                /* ran out of source */
-                args->converter->fromUnicodeStatus = ch;
+            else {
+                /* this is an unmatched trail code unit (2nd surrogate) */
+                /* callback(illegal) */
+                args->converter->fromUChar32 = ch;
+                *err = U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         }
@@ -367,22 +289,18 @@ lowsurogate:
         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 
-        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
-        {
-            if (myTarget < targetLimit)
-            {
+        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+            if (myTarget < targetLimit) {
                 *(myTarget++) = temp[indexToWrite];
             }
-            else
-            {
+            else {
                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
                 *err = U_BUFFER_OVERFLOW_ERROR;
             }
         }
     }
 
-    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
-    {
+    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
         *err = U_BUFFER_OVERFLOW_ERROR;
     }
 
@@ -395,8 +313,8 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                                UErrorCode * err)
 {
     const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UChar32 ch, ch2;
@@ -404,35 +322,68 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
     unsigned int indexToWrite;
     unsigned char temp[sizeof(uint32_t)];
 
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
     temp[0] = 0;
 
-    if (args->converter->fromUnicodeStatus)
-    {
-        ch = args->converter->fromUnicodeStatus;
-        args->converter->fromUnicodeStatus = 0;
+    if (args->converter->fromUChar32) {
+        ch = args->converter->fromUChar32;
+        args->converter->fromUChar32 = 0;
         goto lowsurogate;
     }
 
-    while (mySource < sourceLimit && myTarget < targetLimit)
-    {
+    while (mySource < sourceLimit && myTarget < targetLimit) {
         ch = *(mySource++);
 
-        if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
-        {
+        if (U_IS_SURROGATE(ch)) {
+            if (U_IS_LEAD(ch)) {
 lowsurogate:
-            if (mySource < sourceLimit)
-            {
-                ch2 = *mySource;
-                if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
-                {
-                    ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
-                    mySource++;
+                if (mySource < sourceLimit) {
+                    ch2 = *mySource;
+                    if (U_IS_TRAIL(ch2)) {
+                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+                        mySource++;
+                    }
+                    else {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        args->converter->fromUChar32 = ch;
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                        break;
+                    }
+                }
+                else {
+                    /* ran out of source */
+                    args->converter->fromUChar32 = ch;
+                    if (args->flush) {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                    }
+                    break;
                 }
             }
-            else if (!args->flush)
-            {
-                /* ran out of source */
-                args->converter->fromUnicodeStatus = ch;
+            else {
+                /* this is an unmatched trail code unit (2nd surrogate) */
+                /* callback(illegal) */
+                args->converter->fromUChar32 = ch;
+                *err = U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         }
@@ -442,24 +393,20 @@ lowsurogate:
         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
 
-        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
-        {
-            if (myTarget < targetLimit)
-            {
+        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+            if (myTarget < targetLimit) {
                 *(myTarget++) = temp[indexToWrite];
                 *(myOffsets++) = offsetNum;
             }
-            else
-            {
+            else {
                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
                 *err = U_BUFFER_OVERFLOW_ERROR;
             }
         }
-        offsetNum++;
+        offsetNum = offsetNum + 1 + (temp[1] != 0);
     }
 
-    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
-    {
+    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
         *err = U_BUFFER_OVERFLOW_ERROR;
     }
 
@@ -472,65 +419,44 @@ static UChar32
 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
                                    UErrorCode* err)
 {
-    UChar myUCharBuf[2];
-    UChar *myUCharPtr;
-    const unsigned char *mySource;
+    const uint8_t *mySource;
     UChar32 myUChar;
     int32_t length;
 
-    while (args->source < args->sourceLimit)
+    mySource = (const uint8_t *)args->source;
+    if (mySource >= (const uint8_t *)args->sourceLimit)
     {
-        if (args->source + 4 > args->sourceLimit) 
-        {
-            /* got a partial character */
-            *err = U_TRUNCATED_CHAR_FOUND;
-            return 0xffff;
-        }
+        /* no input */
+        *err = U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
 
-        /* Don't even try to do a direct cast because the value may be on an odd address. */
-        mySource = (unsigned char *) args->source;
-        myUChar = (mySource[0] << 24)
-                | (mySource[1] << 16)
-                | (mySource[2] << 8)
-                | (mySource[3]);
+    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+    if (length < 4) 
+    {
+        /* got a partial character */
+        uprv_memcpy(args->converter->toUBytes, mySource, length);
+        args->converter->toULength = (int8_t)length;
+        args->source = (const char *)(mySource + length);
+        *err = U_TRUNCATED_CHAR_FOUND;
+        return 0xffff;
+    }
 
-        args->source = (const char *)(mySource + 4);
-        if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
-            return myUChar;
-        }
+    /* Don't even try to do a direct cast because the value may be on an odd address. */
+    myUChar = ((UChar32)mySource[0] << 24)
+            | ((UChar32)mySource[1] << 16)
+            | ((UChar32)mySource[2] << 8)
+            | ((UChar32)mySource[3]);
 
-        uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
-        args->converter->invalidCharLength = 4;
-
-        myUCharPtr = myUCharBuf;
-        *err = U_ILLEGAL_CHAR_FOUND;
-        args->target = myUCharPtr;
-        args->targetLimit = myUCharBuf + 2;
-        args->converter->fromCharErrorBehaviour(args->converter->toUContext,
-                                        args,
-                                        (const char *)mySource,
-                                        4,
-                                        UCNV_ILLEGAL,
-                                        err);
-
-        if(U_SUCCESS(*err)) {
-            length = (uint16_t)(args->target - myUCharBuf);
-            if(length > 0) {
-                return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
-            }
-            /* else (callback did not write anything) continue */
-        } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
-            *err = U_ZERO_ERROR;
-            return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
-        } else {
-            /* break on error */
-            /* ### what if a callback set an error but _also_ generated output?! */
-            return 0xffff;
-        }
+    args->source = (const char *)(mySource + 4);
+    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+        return myUChar;
     }
 
-    /* no input or only skipping callbacks */
-    *err = U_INDEX_OUTOFBOUNDS_ERROR;
+    uprv_memcpy(args->converter->toUBytes, mySource, 4);
+    args->converter->toULength = 4;
+
+    *err = U_ILLEGAL_CHAR_FOUND;
     return 0xffff;
 }
 
@@ -554,7 +480,7 @@ static const UConverterImpl _UTF32BEImpl = {
     NULL,
     NULL,
     NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
 };
 
 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
@@ -588,10 +514,11 @@ T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
     unsigned char *toUBytes = args->converter->toUBytes;
     uint32_t ch, i;
 
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
+    /* Restore state of current sequence */
     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
     {
         i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
 
         /* Stores the previously calculated ch from a previous call*/
         ch = args->converter->toUnicodeStatus - 1;
@@ -613,44 +540,28 @@ morebytes:
             }
             else
             {
-                if (args->flush)
-                {
-                    if (U_SUCCESS(*err))
-                    {
-                        *err = U_TRUNCATED_CHAR_FOUND;
-                        args->converter->toUnicodeStatus = 0;
-                    }
-                }
-                else
-                {   /* stores a partially calculated target*/
-                    /* + 1 to make 0 a valid character */
-                    args->converter->toUnicodeStatus = ch + 1;
-                    args->converter->toULength = (int8_t) i;
-                }
+                /* stores a partially calculated target*/
+                /* + 1 to make 0 a valid character */
+                args->converter->toUnicodeStatus = ch + 1;
+                args->converter->toULength = (int8_t) i;
                 goto donefornow;
             }
         }
 
-        if (ch <= MAXIMUM_UTF)
-        {
+        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
-            if (ch <= MAXIMUM_UCS2) 
-            {
+            if (ch <= MAXIMUM_UCS2) {
                 /* fits in 16 bits */
                 *(myTarget++) = (UChar) ch;
             }
-            else
-            {
+            else {
                 /* write out the surrogates */
-                ch -= HALF_BASE;
-                *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
-                ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
-                if (myTarget < targetLimit)
-                {
+                *(myTarget++) = U16_LEAD(ch);
+                ch = U16_TRAIL(ch);
+                if (myTarget < targetLimit) {
                     *(myTarget++) = (UChar)ch;
                 }
-                else
-                {
+                else {
                     /* Put in overflow buffer (not handled here) */
                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
                     args->converter->UCharErrorBufferLength = 1;
@@ -659,19 +570,10 @@ morebytes:
                 }
             }
         }
-        else
-        {
-            args->source = (const char *) mySource;
-            args->target = myTarget;
-            args->converter->invalidCharLength = (int8_t)i;
-            if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
-            {
-                /* Stop if the error wasn't handled */
-                break;
-            }
-            args->converter->invalidCharLength = 0;
-            mySource = (unsigned char *) args->source;
-            myTarget = args->target;
+        else {
+            args->converter->toULength = (int8_t)i;
+            *err = U_ILLEGAL_CHAR_FOUND;
+            break;
         }
     }
 
@@ -699,10 +601,11 @@ T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
     uint32_t ch, i;
     int32_t offsetNum = 0;
 
-    /* UTF-8 returns here for only non-offset, this needs to change.*/
+    /* Restore state of current sequence */
     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
     {
         i = args->converter->toULength;       /* restore # of bytes consumed */
+        args->converter->toULength = 0;
 
         /* Stores the previously calculated ch from a previous call*/
         ch = args->converter->toUnicodeStatus - 1;
@@ -724,25 +627,15 @@ morebytes:
             }
             else
             {
-                if (args->flush)
-                {
-                    if (U_SUCCESS(*err))
-                    {
-                        *err = U_TRUNCATED_CHAR_FOUND;
-                        args->converter->toUnicodeStatus = 0;
-                    }
-                }
-                else
-                {   /* stores a partially calculated target*/
-                    /* + 1 to make 0 a valid character */
-                    args->converter->toUnicodeStatus = ch + 1;
-                    args->converter->toULength = (int8_t) i;
-                }
+                /* stores a partially calculated target*/
+                /* + 1 to make 0 a valid character */
+                args->converter->toUnicodeStatus = ch + 1;
+                args->converter->toULength = (int8_t) i;
                 goto donefornow;
             }
         }
 
-        if (ch <= MAXIMUM_UTF)
+        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
         {
             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
             if (ch <= MAXIMUM_UCS2) 
@@ -751,13 +644,11 @@ morebytes:
                 *(myTarget++) = (UChar) ch;
                 *(myOffsets++) = offsetNum;
             }
-            else
-            {
+            else {
                 /* write out the surrogates */
-                ch -= HALF_BASE;
-                *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
+                *(myTarget++) = U16_LEAD(ch);
                 *(myOffsets++) = offsetNum;
-                ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
+                ch = U16_TRAIL(ch);
                 if (myTarget < targetLimit)
                 {
                     *(myTarget++) = (UChar)ch;
@@ -775,19 +666,9 @@ morebytes:
         }
         else
         {
-            args->source = (const char *) mySource;
-            args->target = myTarget;
-            args->converter->invalidCharLength = (int8_t)i;
-            args->offsets = myOffsets;
-            if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, offsetNum, UCNV_ILLEGAL, err))
-            {
-                /* Stop if the error wasn't handled */
-                break;
-            }
-            args->converter->invalidCharLength = 0;
-            mySource = (unsigned char *) args->source;
-            myTarget = args->target;
-            myOffsets = args->offsets;
+            args->converter->toULength = (int8_t)i;
+            *err = U_ILLEGAL_CHAR_FOUND;
+            break;
         }
         offsetNum += i;
     }
@@ -809,19 +690,36 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
                                   UErrorCode * err)
 {
     const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
+    unsigned char *myTarget;
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UChar32 ch, ch2;
     unsigned int indexToWrite;
     unsigned char temp[sizeof(uint32_t)];
 
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
     temp[3] = 0;
 
-    if (args->converter->fromUnicodeStatus)
+    if (args->converter->fromUChar32)
     {
-        ch = args->converter->fromUnicodeStatus;
-        args->converter->fromUnicodeStatus = 0;
+        ch = args->converter->fromUChar32;
+        args->converter->fromUChar32 = 0;
         goto lowsurogate;
     }
 
@@ -829,22 +727,41 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
     {
         ch = *(mySource++);
 
-        if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
-        {
-lowsurogate:
-            if (mySource < sourceLimit)
+        if (U16_IS_SURROGATE(ch)) {
+            if (U16_IS_LEAD(ch))
             {
-                ch2 = *mySource;
-                if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
+lowsurogate:
+                if (mySource < sourceLimit)
                 {
-                    ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
-                    mySource++;
+                    ch2 = *mySource;
+                    if (U16_IS_TRAIL(ch2)) {
+                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+                        mySource++;
+                    }
+                    else {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        args->converter->fromUChar32 = ch;
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                        break;
+                    }
+                }
+                else {
+                    /* ran out of source */
+                    args->converter->fromUChar32 = ch;
+                    if (args->flush) {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                    }
+                    break;
                 }
             }
-            else if (!args->flush)
-            {
-                /* ran out of source */
-                args->converter->fromUnicodeStatus = ch;
+            else {
+                /* this is an unmatched trail code unit (2nd surrogate) */
+                /* callback(illegal) */
+                args->converter->fromUChar32 = ch;
+                *err = U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         }
@@ -882,8 +799,8 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
                                                UErrorCode * err)
 {
     const UChar *mySource = args->source;
-    unsigned char *myTarget = (unsigned char *) args->target;
-    int32_t *myOffsets = args->offsets;
+    unsigned char *myTarget;
+    int32_t *myOffsets;
     const UChar *sourceLimit = args->sourceLimit;
     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     UChar32 ch, ch2;
@@ -891,12 +808,30 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
     unsigned char temp[sizeof(uint32_t)];
     int32_t offsetNum = 0;
 
+    if(mySource >= sourceLimit) {
+        /* no input, nothing to do */
+        return;
+    }
+
+    /* write the BOM if necessary */
+    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+        static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+        ucnv_fromUWriteBytes(args->converter,
+                             bom, 4,
+                             &args->target, args->targetLimit,
+                             &args->offsets, -1,
+                             err);
+        args->converter->fromUnicodeStatus=0;
+    }
+
+    myTarget = (unsigned char *) args->target;
+    myOffsets = args->offsets;
     temp[3] = 0;
 
-    if (args->converter->fromUnicodeStatus)
+    if (args->converter->fromUChar32)
     {
-        ch = args->converter->fromUnicodeStatus;
-        args->converter->fromUnicodeStatus = 0;
+        ch = args->converter->fromUChar32;
+        args->converter->fromUChar32 = 0;
         goto lowsurogate;
     }
 
@@ -904,22 +839,42 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
     {
         ch = *(mySource++);
 
-        if (SURROGATE_HIGH_START <= ch && ch < SURROGATE_LOW_START)
-        {
-lowsurogate:
-            if (mySource < sourceLimit)
+        if (U16_IS_SURROGATE(ch)) {
+            if (U16_IS_LEAD(ch))
             {
-                ch2 = *mySource;
-                if (SURROGATE_LOW_START <= ch2 && ch2 <= SURROGATE_LOW_END)
+lowsurogate:
+                if (mySource < sourceLimit)
                 {
-                    ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
-                    mySource++;
+                    ch2 = *mySource;
+                    if (U16_IS_TRAIL(ch2))
+                    {
+                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+                        mySource++;
+                    }
+                    else {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        args->converter->fromUChar32 = ch;
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                        break;
+                    }
+                }
+                else {
+                    /* ran out of source */
+                    args->converter->fromUChar32 = ch;
+                    if (args->flush) {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        *err = U_ILLEGAL_CHAR_FOUND;
+                    }
+                    break;
                 }
             }
-            else if (!args->flush)
-            {
-                /* ran out of source */
-                args->converter->fromUnicodeStatus = ch;
+            else {
+                /* this is an unmatched trail code unit (2nd surrogate) */
+                /* callback(illegal) */
+                args->converter->fromUChar32 = ch;
+                *err = U_ILLEGAL_CHAR_FOUND;
                 break;
             }
         }
@@ -942,7 +897,7 @@ lowsurogate:
                 *err = U_BUFFER_OVERFLOW_ERROR;
             }
         }
-        offsetNum++;
+        offsetNum = offsetNum + 1 + (temp[2] != 0);
     }
 
     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
@@ -959,65 +914,44 @@ static UChar32
 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
                                    UErrorCode* err)
 {
-    UChar myUCharBuf[2];
-    UChar *myUCharPtr;
-    const unsigned char *mySource;
+    const uint8_t *mySource;
     UChar32 myUChar;
     int32_t length;
 
-    while (args->source < args->sourceLimit)
+    mySource = (const uint8_t *)args->source;
+    if (mySource >= (const uint8_t *)args->sourceLimit)
     {
-        if (args->source + 4 > args->sourceLimit) 
-        {
-            /* got a partial character */
-            *err = U_TRUNCATED_CHAR_FOUND;
-            return 0xffff;
-        }
+        /* no input */
+        *err = U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
 
-        /* Don't even try to do a direct cast because the value may be on an odd address. */
-        mySource = (unsigned char *) args->source;
-        myUChar = (mySource[0])
-                | (mySource[1] << 8)
-                | (mySource[2] << 16)
-                | (mySource[3] << 24);
+    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+    if (length < 4) 
+    {
+        /* got a partial character */
+        uprv_memcpy(args->converter->toUBytes, mySource, length);
+        args->converter->toULength = (int8_t)length;
+        args->source = (const char *)(mySource + length);
+        *err = U_TRUNCATED_CHAR_FOUND;
+        return 0xffff;
+    }
 
-        args->source = (const char *)(mySource + 4);
-        if (myUChar <= MAXIMUM_UTF && myUChar >= 0) {
-            return myUChar;
-        }
+    /* Don't even try to do a direct cast because the value may be on an odd address. */
+    myUChar = ((UChar32)mySource[3] << 24)
+            | ((UChar32)mySource[2] << 16)
+            | ((UChar32)mySource[1] << 8)
+            | ((UChar32)mySource[0]);
 
-        uprv_memcpy(args->converter->invalidCharBuffer, mySource, 4);
-        args->converter->invalidCharLength = 4;
-
-        myUCharPtr = myUCharBuf;
-        *err = U_ILLEGAL_CHAR_FOUND;
-        args->target = myUCharPtr;
-        args->targetLimit = myUCharBuf + 2;
-        args->converter->fromCharErrorBehaviour(args->converter->toUContext,
-                                        args,
-                                        (const char *)mySource,
-                                        4,
-                                        UCNV_ILLEGAL,
-                                        err);
-
-        if(U_SUCCESS(*err)) {
-            length = (uint16_t)(args->target - myUCharBuf);
-            if(length > 0) {
-                return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, length);
-            }
-            /* else (callback did not write anything) continue */
-        } else if(*err == U_BUFFER_OVERFLOW_ERROR) {
-            *err = U_ZERO_ERROR;
-            return ucnv_getUChar32KeepOverflow(args->converter, myUCharBuf, 2);
-        } else {
-            /* break on error */
-            /* ### what if a callback set an error but _also_ generated output?! */
-            return 0xffff;
-        }
+    args->source = (const char *)(mySource + 4);
+    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+        return myUChar;
     }
 
-    /* no input or only skipping callbacks */
-    *err = U_INDEX_OUTOFBOUNDS_ERROR;
+    uprv_memcpy(args->converter->toUBytes, mySource, 4);
+    args->converter->toULength = 4;
+
+    *err = U_ILLEGAL_CHAR_FOUND;
     return 0xffff;
 }
 
@@ -1041,7 +975,7 @@ static const UConverterImpl _UTF32LEImpl = {
     NULL,
     NULL,
     NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
 };
 
 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
@@ -1094,26 +1028,13 @@ _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
     }
     if(choice!=UCNV_RESET_TO_UNICODE) {
         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
-        cnv->charErrorBufferLength=4;
-#if U_IS_BIG_ENDIAN
-        cnv->charErrorBuffer[0]=0;
-        cnv->charErrorBuffer[1]=0;
-        cnv->charErrorBuffer[2]=0xfe;
-        cnv->charErrorBuffer[3]=0xff;
-#else
-        cnv->charErrorBuffer[0]=0xff;
-        cnv->charErrorBuffer[1]=0xfe;
-        cnv->charErrorBuffer[2]=0;
-        cnv->charErrorBuffer[3]=0;
-#endif
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
     }
 }
 
 static void
 _UTF32Open(UConverter *cnv,
-           const char *name,
-           const char *locale,
-           uint32_t options,
+           UConverterLoadArgs *pArgs,
            UErrorCode *pErrorCode) {
     _UTF32Reset(cnv, UCNV_RESET_BOTH);
 }
@@ -1165,14 +1086,14 @@ _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                 ++source;
                 if(state==4) {
                     state=8; /* detect UTF-32BE */
-                    offsetDelta=source-pArgs->source;
+                    offsetDelta=(int32_t)(source-pArgs->source);
                 } else if(state==8) {
                     state=9; /* detect UTF-32LE */
-                    offsetDelta=source-pArgs->source;
+                    offsetDelta=(int32_t)(source-pArgs->source);
                 }
             } else {
                 /* switch to UTF-32BE and pass the previous bytes */
-                int32_t count=source-pArgs->source; /* number of bytes from this buffer */
+                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
 
                 /* reset the source */
                 source=pArgs->source;
@@ -1253,12 +1174,12 @@ _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
             pArgs->source=source;
             pArgs->sourceLimit=sourceLimit;
+            state=8;
             break;
         }
-        cnv->mode=0; /* reset */
-    } else {
-        cnv->mode=state;
     }
+
+    cnv->mode=state;
 }
 
 static UChar32
@@ -1270,7 +1191,7 @@ _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
     case 9:
         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
     default:
-        return ucnv_getNextUCharFromToUImpl(pArgs, _UTF32ToUnicodeWithOffsets, FALSE, pErrorCode);
+        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
     }
 }
 
@@ -1299,13 +1220,14 @@ static const UConverterImpl _UTF32Impl = {
     NULL,
     NULL,
     NULL,
-    ucnv_getCompleteUnicodeSet
+    ucnv_getNonSurrogateUnicodeSet
 };
 
+/* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
 static const UConverterStaticData _UTF32StaticData = {
     sizeof(UConverterStaticData),
     "UTF-32",
-    0, /* ### TODO review correctness of all Unicode CCSIDs */
+    1236,
     UCNV_IBM, UCNV_UTF32, 4, 4,
 #if U_IS_BIG_ENDIAN
     { 0, 0, 0xff, 0xfd }, 4,
@@ -1323,3 +1245,5 @@ const UConverterSharedData _UTF32Data = {
     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, 
     0
 };
+
+#endif