git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ucnvisci.c
ICU-57163.0.1.tar.gz
[apple/icu.git] / icuSources / common / ucnvisci.c
index 411085226c98e13f7dc58a6e36ffaca83e6037db..9b44ecf9cf3927f643aa877c9a6b4c3c052b7158 100644 (file)
@@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2000-2009, International Business Machines
+*   Copyright (C) 2000-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnvisci.c
 
 #include "unicode/utypes.h"
 
-#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
 
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/utf16.h"
 #include "cmemory.h"
 #include "ucnv_bld.h"
-#include "unicode/ucnv.h"
 #include "ucnv_cnv.h"
-#include "unicode/ucnv_cb.h"
-#include "unicode/uset.h"
 #include "cstring.h"
+#include "uassert.h"
 
 #define UCNV_OPTIONS_VERSION_MASK 0xf
 #define NUKTA               0x093c
@@ -63,9 +64,6 @@
 #define PNJ_HA              0x0A39
 #define PNJ_RRA             0x0A5C
 
-static USet* PNJ_BINDI_TIPPI_SET= NULL;
-static USet* PNJ_CONSONANT_SET= NULL;
-
 typedef enum {
     DEVANAGARI =0,
     BENGALI,
@@ -151,24 +149,40 @@ static const LookupDataStruct lookupInitialData[]={
     { MALAYALAM,  MLM_MASK,  MLM }
 };
 
-static void initializeSets() {
-    /* TODO: Replace the following two lines with PNJ_CONSONANT_SET = uset_openEmpty(); */
-    PNJ_CONSONANT_SET = uset_open(0,0);
-    uset_clear(PNJ_CONSONANT_SET);
-
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A15, 0x0A28);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A2A, 0x0A30);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A35, 0x0A36);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A38, 0x0A39);
-    
-    PNJ_BINDI_TIPPI_SET = uset_clone(PNJ_CONSONANT_SET);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A05);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A07);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A3F);
-    uset_addRange(PNJ_BINDI_TIPPI_SET, 0x0A41, 0x0A42);
-    
-    uset_compact(PNJ_CONSONANT_SET);
-    uset_compact(PNJ_BINDI_TIPPI_SET);
+/*
+ * Per-code-point flags for U+0A00..U+0A4F, indexed by (c - 0x0A00); for special handling of certain Gurmukhi characters.
+ * Bit 0 (value 1): PNJ consonant (tested by isPNJConsonant)
+ * Bit 1 (value 2): PNJ Bindi Tippi (tested by isPNJBindiTippi); consonant entries are 3, so they set both bits
+ */
+static const uint8_t pnjMap[80] = {
+    /* 0A00..0A0F */
+    0, 0, 0, 0, 0, 2, 0, 2,  0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0A10..0A1F */
+    0, 0, 0, 0, 0, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
+    /* 0A20..0A2F */
+    3, 3, 3, 3, 3, 3, 3, 3,  3, 0, 3, 3, 3, 3, 3, 3,
+    /* 0A30..0A3F */
+    3, 0, 0, 0, 0, 3, 3, 0,  3, 3, 0, 0, 0, 0, 0, 2,
+    /* 0A40..0A4F */
+    0, 2, 2, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static UBool
+isPNJConsonant(UChar32 c) {  /* Returns nonzero if pnjMap flags c as a PNJ consonant (bit 0), otherwise FALSE. */
+    if (c < 0xa00 || 0xa50 <= c) {  /* c is outside U+0A00..U+0A4F, the only range pnjMap covers */
+        return FALSE;
+    } else {
+        return (UBool)(pnjMap[c - 0xa00] & 1);  /* isolate bit 0: the PNJ consonant flag */
+    }
+}
+
+static UBool
+isPNJBindiTippi(UChar32 c) {  /* Returns nonzero if pnjMap flags c as PNJ Bindi Tippi (bit 1), otherwise FALSE. */
+    if (c < 0xa00 || 0xa50 <= c) {  /* c is outside U+0A00..U+0A4F, the only range pnjMap covers */
+        return FALSE;
+    } else {
+        return (UBool)(pnjMap[c - 0xa00] >> 1);  /* entries are only 0, 2 or 3, so the shift leaves bit 1 as 0 or 1 */
+    }
 }
 
 static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) {
@@ -176,9 +190,6 @@ static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *e
         return;
     }
 
-    /* Ensure that the sets used in special handling of certain Gurmukhi characters are initialized. */
-    initializeSets();
-    
     cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII));
 
     if (cnv->extraInfo != NULL) {
@@ -225,14 +236,6 @@ static void _ISCIIClose(UConverter *cnv) {
         }
         cnv->extraInfo=NULL;
     }
-    if (PNJ_CONSONANT_SET != NULL) {
-        uset_close(PNJ_CONSONANT_SET);
-        PNJ_CONSONANT_SET = NULL;
-    }
-    if (PNJ_BINDI_TIPPI_SET != NULL) {
-        uset_close(PNJ_BINDI_TIPPI_SET);
-        PNJ_BINDI_TIPPI_SET = NULL;
-    }
 }
 
 static const char* _ISCIIgetName(const UConverter* cnv) {
@@ -1031,7 +1034,7 @@ static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
             converterData->contextCharFromUnicode = 0x00;
             break;
         }
-        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && uset_contains(PNJ_CONSONANT_SET, (sourceChar + PNJ_DELTA))) {
+        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
             /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
             /* reset context char */
             converterData->contextCharFromUnicode = 0x0000;
@@ -1053,16 +1056,16 @@ static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
         } else {
             /* oops.. the code point is unassigned */
             /*check if the char is a First surrogate*/
-            if (UTF_IS_SURROGATE(sourceChar)) {
-                if (UTF_IS_SURROGATE_FIRST(sourceChar)) {
+            if (U16_IS_SURROGATE(sourceChar)) {
+                if (U16_IS_SURROGATE_LEAD(sourceChar)) {
 getTrail:
                     /*look ahead to find the trail surrogate*/
                     if (source < sourceLimit) {
                         /* test the following code unit */
                         UChar trail= (*source);
-                        if (UTF_IS_SECOND_SURROGATE(trail)) {
+                        if (U16_IS_TRAIL(trail)) {
                             source++;
-                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                             *err =U_INVALID_CHAR_FOUND;
                             /* convert this surrogate code point */
                             /* exit this condition tree */
@@ -1137,7 +1140,7 @@ static const uint16_t lookupTable[][2]={
     targetUniChar = toUnicodeTable[(sourceChar)] ;                                       \
     /* is the code point valid in current script? */                                     \
     if(sourceChar> ASCII_END &&                                                          \
-            (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){    \
+            (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){    \
         /* Vocallic RR is assigne in ISCII Telugu and Unicode */                         \
         if(data->currentDeltaToUnicode!=(TELUGU_DELTA) ||                                \
                     targetUniChar!=VOCALLIC_RR){                                         \
@@ -1281,7 +1284,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
             /* look at the pre-context and perform special processing */
             switch (sourceChar) {
             case ISCII_INV:
-            case EXT: /*falls through*/
+            case EXT:
             case ATR:
                 *contextCharToUnicode = (UChar)sourceChar;
 
@@ -1319,7 +1322,6 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                 }
                 break;
             case 0x0A:
-                /* fall through */
             case 0x0D:
                 data->resetToDefaultToUnicode = TRUE;
                 GET_MAPPING(sourceChar,targetUniChar,data)
@@ -1331,6 +1333,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                 i=1;
                 found=FALSE;
                 for (; i<vowelSignESpecialCases[0][0]; i++) {
+                    U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases));
                     if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
                         targetUniChar=vowelSignESpecialCases[i][1];
                         found=TRUE;
@@ -1416,6 +1419,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                         /* else fall through to default */
                     }
                     /* else fall through to default */
+                    U_FALLTHROUGH;
                 }
             default:GET_MAPPING(sourceChar,targetUniChar,data)
                 ;
@@ -1425,7 +1429,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
 
             if (*toUnicodeStatus != missingCharMarker) {
                 /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */
-                if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && uset_contains(PNJ_CONSONANT_SET, data->prevToUnicodeStatus) &&
+                if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
                         (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && (targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus) {
                     /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
                     offset = (int)(source-args->source - 3);
@@ -1444,10 +1448,10 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                     /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script. 
                      * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
                      */
-                    if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && uset_contains(PNJ_BINDI_TIPPI_SET, (*toUnicodeStatus + PNJ_DELTA))) {
+                    if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) {
                         targetUniChar = PNJ_TIPPI - PNJ_DELTA;
                         WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err);
-                    } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && uset_contains(PNJ_CONSONANT_SET, (*toUnicodeStatus + PNJ_DELTA))) {
+                    } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) {
                         /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
                         data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA;
                     } else {
@@ -1613,15 +1617,7 @@ static const UConverterStaticData _ISCIIStaticData={
 
 };
 
-const UConverterSharedData _ISCIIData={
-    sizeof(UConverterSharedData),
-        ~((uint32_t) 0),
-        NULL,
-        NULL,
-        &_ISCIIStaticData,
-        FALSE,
-        &_ISCIIImpl,
-        0
-};
+const UConverterSharedData _ISCIIData=  /* converter shared-data object built from _ISCIIStaticData and _ISCIIImpl */
+        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl);  /* NOTE(review): macro defined outside this file; presumably supplies the remaining struct fields - confirm */
 
 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */