]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ucnvmbcs.c
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / ucnvmbcs.c
index 538a18c521de5d070c24d2b4dd01dc2b5b3054c2..143daf69af7a1ae9557408edc556eae83fee3462 100644 (file)
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2000-2010, International Business Machines
+*   Copyright (C) 2000-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 #include "unicode/ucnv_cb.h"
 #include "unicode/udata.h"
 #include "unicode/uset.h"
+#include "unicode/utf8.h"
+#include "unicode/utf16.h"
 #include "ucnv_bld.h"
 #include "ucnvmbcs.h"
 #include "ucnv_ext.h"
 #include "ucnv_cnv.h"
-#include "umutex.h"
 #include "cmemory.h"
 #include "cstring.h"
+#include "cmutex.h"
 
 /* control optimizations according to the platform */
 #define MBCS_UNROLL_SINGLE_TO_BMP 1
@@ -379,10 +381,11 @@ static const UConverterImpl _DBCSUTF8Impl;
  * as of the re-released mapping tables from 2000-nov-30.
  */
 static const uint32_t
-gb18030Ranges[13][4]={
+gb18030Ranges[14][4]={
     {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
     {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
-    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
+    {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
+    {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
     {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
     {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
     {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
@@ -823,9 +826,9 @@ ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
                                     switch(st3Multiplier) {
                                     case 4:
                                         b|=*stage3++;
-                                    case 3:
+                                    case 3: /*fall through*/
                                         b|=*stage3++;
-                                    case 2:
+                                    case 2: /*fall through*/
                                         b|=stage3[0]|stage3[1];
                                         stage3+=2;
                                     default:
@@ -1340,7 +1343,6 @@ reconstituteData(UConverterMBCSTable *mbcsTable,
                  UErrorCode *pErrorCode) {
     uint16_t *stage1;
     uint32_t *stage2;
-    uint8_t *bytes;
     uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
     mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
     if(mbcsTable->reconstitutedData==NULL) {
@@ -1359,7 +1361,7 @@ reconstituteData(UConverterMBCSTable *mbcsTable,
                 stage2Length*4);
 
     mbcsTable->fromUnicodeTable=stage1;
-    mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
+    mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);
 
     /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
     stage2=(uint32_t *)stage1;
@@ -3351,16 +3353,16 @@ ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                  * If it does, then surrogates are not paired but mapped separately.
                  * Note that in this case unmatched surrogates are not detected.
                  */
-                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
-                    if(UTF_IS_SURROGATE_FIRST(c)) {
+                if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+                    if(U16_IS_SURROGATE_LEAD(c)) {
 getTrail:
                         if(source<sourceLimit) {
                             /* test the following code unit */
                             UChar trail=*source;
-                            if(UTF_IS_SECOND_SURROGATE(trail)) {
+                            if(U16_IS_TRAIL(trail)) {
                                 ++source;
                                 ++nextSourceIndex;
-                                c=UTF16_GET_PAIR_VALUE(c, trail);
+                                c=U16_GET_SUPPLEMENTARY(c, trail);
                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                     /* callback(unassigned) */
@@ -3556,16 +3558,16 @@ ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
              */
             c=*source++;
             ++nextSourceIndex;
-            if(UTF_IS_SURROGATE(c)) {
-                if(UTF_IS_SURROGATE_FIRST(c)) {
+            if(U16_IS_SURROGATE(c)) {
+                if(U16_IS_SURROGATE_LEAD(c)) {
 getTrail:
                     if(source<sourceLimit) {
                         /* test the following code unit */
                         UChar trail=*source;
-                        if(UTF_IS_SECOND_SURROGATE(trail)) {
+                        if(U16_IS_TRAIL(trail)) {
                             ++source;
                             ++nextSourceIndex;
-                            c=UTF16_GET_PAIR_VALUE(c, trail);
+                            c=U16_GET_SUPPLEMENTARY(c, trail);
                             if(!hasSupplementary) {
                                 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                 /* callback(unassigned) */
@@ -3804,16 +3806,16 @@ unrolled:
             /* normal end of conversion: prepare for a new character */
             c=0;
             continue;
-        } else if(!UTF_IS_SURROGATE(c)) {
+        } else if(!U16_IS_SURROGATE(c)) {
             /* normal, unassigned BMP character */
-        } else if(UTF_IS_SURROGATE_FIRST(c)) {
+        } else if(U16_IS_SURROGATE_LEAD(c)) {
 getTrail:
             if(source<sourceLimit) {
                 /* test the following code unit */
                 UChar trail=*source;
-                if(UTF_IS_SECOND_SURROGATE(trail)) {
+                if(U16_IS_TRAIL(trail)) {
                     ++source;
-                    c=UTF16_GET_PAIR_VALUE(c, trail);
+                    c=U16_GET_SUPPLEMENTARY(c, trail);
                     /* this codepage does not map supplementary code points */
                     /* callback(unassigned) */
                 } else {
@@ -3938,9 +3940,10 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
     uint32_t stage2Entry;
     uint32_t asciiRoundtrips;
     uint32_t value;
-    uint8_t si_value[2] = {0, 0}; 
-    uint8_t so_value[2] = {0, 0}; 
-    uint8_t si_value_length, so_value_length;
+    /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */
+    uint8_t siBytes[2] = {0, 0};
+    uint8_t soBytes[2] = {0, 0};
+    uint8_t siLength, soLength;
     int32_t length = 0, prevLength;
     uint8_t unicodeMask;
 
@@ -4013,8 +4016,8 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
     nextSourceIndex=0;
 
     /* Get the SI/SO character for the converter */
-    si_value_length = getSISOBytes(SI, cnv->options, si_value);
-    so_value_length = getSISOBytes(SO, cnv->options, so_value);
+    siLength = getSISOBytes(SI, cnv->options, siBytes);
+    soLength = getSISOBytes(SO, cnv->options, soBytes);
 
     /* conversion loop */
     /*
@@ -4105,12 +4108,12 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             length=1;
                         } else {
                             /* change from double-byte mode to single-byte */
-                            if (si_value_length == 1) {
-                                value|=(uint32_t)si_value[0]<<8;
+                            if (siLength == 1) {
+                                value|=(uint32_t)siBytes[0]<<8;
                                 length = 2;
-                            } else if (si_value_length == 2) {
-                                value|=(uint32_t)si_value[1]<<8;
-                                value|=(uint32_t)si_value[0]<<16;
+                            } else if (siLength == 2) {
+                                value|=(uint32_t)siBytes[1]<<8;
+                                value|=(uint32_t)siBytes[0]<<16;
                                 length = 3;
                             }
                             prevLength=1;
@@ -4120,12 +4123,12 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             length=2;
                         } else {
                             /* change from single-byte mode to double-byte */
-                            if (so_value_length == 1) {
-                                value|=(uint32_t)so_value[0]<<16;
+                            if (soLength == 1) {
+                                value|=(uint32_t)soBytes[0]<<16;
                                 length = 3;
-                            } else if (so_value_length == 2) {
-                                value|=(uint32_t)so_value[1]<<16;
-                                value|=(uint32_t)so_value[0]<<24;
+                            } else if (soLength == 2) {
+                                value|=(uint32_t)soBytes[1]<<16;
+                                value|=(uint32_t)soBytes[0]<<24;
                                 length = 4;
                             }
                             prevLength=2;
@@ -4234,16 +4237,16 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                  * If it does, then surrogates are not paired but mapped separately.
                  * Note that in this case unmatched surrogates are not detected.
                  */
-                if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
-                    if(UTF_IS_SURROGATE_FIRST(c)) {
+                if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {
+                    if(U16_IS_SURROGATE_LEAD(c)) {
 getTrail:
                         if(source<sourceLimit) {
                             /* test the following code unit */
                             UChar trail=*source;
-                            if(UTF_IS_SECOND_SURROGATE(trail)) {
+                            if(U16_IS_TRAIL(trail)) {
                                 ++source;
                                 ++nextSourceIndex;
-                                c=UTF16_GET_PAIR_VALUE(c, trail);
+                                c=U16_GET_SUPPLEMENTARY(c, trail);
                                 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
                                     /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                                     cnv->fromUnicodeStatus=prevLength; /* save the old state */
@@ -4337,12 +4340,12 @@ getTrail:
                             length=1;
                         } else {
                             /* change from double-byte mode to single-byte */
-                            if (si_value_length == 1) {
-                                value|=(uint32_t)si_value[0]<<8;
+                            if (siLength == 1) {
+                                value|=(uint32_t)siBytes[0]<<8;
                                 length = 2;
-                            } else if (si_value_length == 2) {
-                                value|=(uint32_t)si_value[1]<<8;
-                                value|=(uint32_t)si_value[0]<<16;
+                            } else if (siLength == 2) {
+                                value|=(uint32_t)siBytes[1]<<8;
+                                value|=(uint32_t)siBytes[0]<<16;
                                 length = 3;
                             }
                             prevLength=1;
@@ -4352,12 +4355,12 @@ getTrail:
                             length=2;
                         } else {
                             /* change from single-byte mode to double-byte */
-                            if (so_value_length == 1) {
-                                value|=(uint32_t)so_value[0]<<16;
+                            if (soLength == 1) {
+                                value|=(uint32_t)soBytes[0]<<16;
                                 length = 3;
-                            } else if (so_value_length == 2) {
-                                value|=(uint32_t)so_value[1]<<16;
-                                value|=(uint32_t)so_value[0]<<24;
+                            } else if (soLength == 2) {
+                                value|=(uint32_t)soBytes[1]<<16;
+                                value|=(uint32_t)soBytes[0]<<24;
                                 length = 4;
                             }
                             prevLength=2;
@@ -4494,11 +4497,11 @@ unassigned:
                         /* each branch falls through to the next one */
                     case 4:
                         *target++=(uint8_t)(value>>24);
-                    case 3:
+                    case 3: /*fall through*/
                         *target++=(uint8_t)(value>>16);
-                    case 2:
+                    case 2: /*fall through*/
                         *target++=(uint8_t)(value>>8);
-                    case 1:
+                    case 1: /*fall through*/
                         *target++=(uint8_t)value;
                     default:
                         /* will never occur */
@@ -4510,13 +4513,13 @@ unassigned:
                     case 4:
                         *target++=(uint8_t)(value>>24);
                         *offsets++=sourceIndex;
-                    case 3:
+                    case 3: /*fall through*/
                         *target++=(uint8_t)(value>>16);
                         *offsets++=sourceIndex;
-                    case 2:
+                    case 2: /*fall through*/
                         *target++=(uint8_t)(value>>8);
                         *offsets++=sourceIndex;
-                    case 1:
+                    case 1: /*fall through*/
                         *target++=(uint8_t)value;
                         *offsets++=sourceIndex;
                     default:
@@ -4541,9 +4544,9 @@ unassigned:
                     /* each branch falls through to the next one */
                 case 3:
                     *charErrorBuffer++=(uint8_t)(value>>16);
-                case 2:
+                case 2: /*fall through*/
                     *charErrorBuffer++=(uint8_t)(value>>8);
-                case 1:
+                case 1: /*fall through*/
                     *charErrorBuffer=(uint8_t)value;
                 default:
                     /* will never occur */
@@ -4560,12 +4563,12 @@ unassigned:
                     if(offsets!=NULL) {
                         *offsets++=sourceIndex;
                     }
-                case 2:
+                case 2: /*fall through*/
                     *target++=(uint8_t)(value>>8);
                     if(offsets!=NULL) {
                         *offsets++=sourceIndex;
                     }
-                case 1:
+                case 1: /*fall through*/
                     *target++=(uint8_t)value;
                     if(offsets!=NULL) {
                         *offsets++=sourceIndex;
@@ -4612,14 +4615,14 @@ unassigned:
     ) {
         /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
         if(targetCapacity>0) {
-            *target++=(uint8_t)si_value[0];
-            if (si_value_length == 2) {
+            *target++=(uint8_t)siBytes[0];
+            if (siLength == 2) {
                 if (targetCapacity<2) {
-                    cnv->charErrorBuffer[0]=(uint8_t)si_value[1];
+                    cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];
                     cnv->charErrorBufferLength=1;
                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                 } else {
-                    *target++=(uint8_t)si_value[1];
+                    *target++=(uint8_t)siBytes[1];
                 }
             }
             if(offsets!=NULL) {
@@ -4628,11 +4631,11 @@ unassigned:
             }
         } else {
             /* target is full */
-            cnv->charErrorBuffer[0]=(uint8_t)si_value[0];
-            if (si_value_length == 2) {
-                cnv->charErrorBuffer[1]=(uint8_t)si_value[1];
+            cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];
+            if (siLength == 2) {
+                cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];
             }
-            cnv->charErrorBufferLength=si_value_length;
+            cnv->charErrorBufferLength=siLength;
             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
         }
         prevLength=1; /* we switched into SBCS */
@@ -4928,7 +4931,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
             if(U8_IS_TRAIL(b)) {
                 ++i;
             } else {
-                if(i<utf8_countTrailBytes[b]) {
+                if(i<U8_COUNT_TRAIL_BYTES(b)) {
                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                     sourceLimit-=i+1;
                 }
@@ -5021,7 +5024,7 @@ ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                     /* handle "complicated" and error cases, and continuing partial characters */
                     oldToULength=0;
                     toULength=1;
-                    toULimit=utf8_countTrailBytes[b]+1;
+                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                     c=b;
 moreBytes:
                     while(toULength<toULimit) {
@@ -5120,6 +5123,7 @@ moreBytes:
                      * but then exit the loop because the extension match would
                      * have consumed the source.
                      */
+                    *pErrorCode=U_USING_DEFAULT_WARNING;
                     break;
                 } else {
                     /* a mapping was written to the target, continue */
@@ -5140,10 +5144,12 @@ moreBytes:
      * to stop before a truncated sequence.
      * If so, then collect the truncated sequence now.
      */
-    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
         c=utf8->toUBytes[0]=b=*source++;
         toULength=1;
-        toULimit=utf8_countTrailBytes[b]+1;
+        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
         while(source<sourceLimit) {
             utf8->toUBytes[toULength++]=b=*source++;
             c=(c<<6)+b;
@@ -5177,7 +5183,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
 
     uint32_t stage2Entry;
     uint32_t asciiRoundtrips;
-    uint16_t value, minValue;
+    uint16_t value;
     UBool hasSupplementary;
 
     /* set up the local pointers */
@@ -5197,13 +5203,6 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
     }
     asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;
 
-    if(cnv->useFallback) {
-        /* use all roundtrip and fallback results */
-        minValue=0x800;
-    } else {
-        /* use only roundtrips and fallbacks from private-use characters */
-        minValue=0xc00;
-    }
     hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
 
     /* get the converter state from the UTF-8 UConverter */
@@ -5232,7 +5231,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
             if(U8_IS_TRAIL(b)) {
                 ++i;
             } else {
-                if(i<utf8_countTrailBytes[b]) {
+                if(i<U8_COUNT_TRAIL_BYTES(b)) {
                     /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                     sourceLimit-=i+1;
                 }
@@ -5305,7 +5304,7 @@ ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                     /* handle "complicated" and error cases, and continuing partial characters */
                     oldToULength=0;
                     toULength=1;
-                    toULimit=utf8_countTrailBytes[b]+1;
+                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                     c=b;
 moreBytes:
                     while(toULength<toULimit) {
@@ -5433,6 +5432,7 @@ unassigned:
                      * but then exit the loop because the extension match would
                      * have consumed the source.
                      */
+                    *pErrorCode=U_USING_DEFAULT_WARNING;
                     break;
                 } else {
                     /* a mapping was written to the target, continue */
@@ -5454,10 +5454,12 @@ unassigned:
      * to stop before a truncated sequence.
      * If so, then collect the truncated sequence now.
      */
-    if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
+    if(U_SUCCESS(*pErrorCode) &&
+            cnv->preFromUFirstCP<0 &&
+            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
         c=utf8->toUBytes[0]=b=*source++;
         toULength=1;
-        toULimit=utf8_countTrailBytes[b]+1;
+        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
         while(source<sourceLimit) {
             utf8->toUBytes[toULength++]=b=*source++;
             c=(c<<6)+b;