ICU-57163.0.1.tar.gz

[apple/icu.git] / icuSources / common / ucnvisci.c
diff --git a/icuSources/common/ucnvisci.c b/icuSources/common/ucnvisci.c

index 411085226c98e13f7dc58a6e36ffaca83e6037db..9b44ecf9cf3927f643aa877c9a6b4c3c052b7158 100644 (file)
--- a/icuSources/common/ucnvisci.c
+++ b/icuSources/common/ucnvisci.c
@@ -1,6 +1,6 @@
  /*
  **********************************************************************
-*   Copyright (C) 2000-2009, International Business Machines
+*   Copyright (C) 2000-2016, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *   file name:  ucnvisci.c
@@ -17,15 +17,16 @@
  
  #include "unicode/utypes.h"
  
-#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/utf16.h"
  #include "cmemory.h"
  #include "ucnv_bld.h"
-#include "unicode/ucnv.h"
  #include "ucnv_cnv.h"
-#include "unicode/ucnv_cb.h"
-#include "unicode/uset.h"
  #include "cstring.h"
+#include "uassert.h"
  
  #define UCNV_OPTIONS_VERSION_MASK 0xf
  #define NUKTA               0x093c
@@ -63,9 +64,6 @@
  #define PNJ_HA              0x0A39
  #define PNJ_RRA             0x0A5C
  
-static USet* PNJ_BINDI_TIPPI_SET= NULL;
-static USet* PNJ_CONSONANT_SET= NULL;
-
  typedef enum {
      DEVANAGARI =0,
      BENGALI,
@@ -151,24 +149,40 @@ static const LookupDataStruct lookupInitialData[]={
      { MALAYALAM,  MLM_MASK,  MLM }
  };
  
-static void initializeSets() {
-    /* TODO: Replace the following two lines with PNJ_CONSONANT_SET = uset_openEmpty(); */
-    PNJ_CONSONANT_SET = uset_open(0,0);
-    uset_clear(PNJ_CONSONANT_SET);
-
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A15, 0x0A28);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A2A, 0x0A30);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A35, 0x0A36);
-    uset_addRange(PNJ_CONSONANT_SET, 0x0A38, 0x0A39);
-    
-    PNJ_BINDI_TIPPI_SET = uset_clone(PNJ_CONSONANT_SET);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A05);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A07);
-    uset_add(PNJ_BINDI_TIPPI_SET, 0x0A3F);
-    uset_addRange(PNJ_BINDI_TIPPI_SET, 0x0A41, 0x0A42);
-    
-    uset_compact(PNJ_CONSONANT_SET);
-    uset_compact(PNJ_BINDI_TIPPI_SET);
+/*
+ * Gurmukhi (PNJ) flags for 0A00..0A4F; index with (c - 0xa00). 0 = neither.
+ * Bit 0 (value 1): PNJ consonant (consonant entries are 3: both bits set)
+ * Bit 1 (value 2): PNJ Bindi Tippi
+ */
+static const uint8_t pnjMap[80] = {
+    /* 0A00..0A0F */
+    0, 0, 0, 0, 0, 2, 0, 2,  0, 0, 0, 0, 0, 0, 0, 0,
+    /* 0A10..0A1F */
+    0, 0, 0, 0, 0, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
+    /* 0A20..0A2F */
+    3, 3, 3, 3, 3, 3, 3, 3,  3, 0, 3, 3, 3, 3, 3, 3,
+    /* 0A30..0A3F */
+    3, 0, 0, 0, 0, 3, 3, 0,  3, 3, 0, 0, 0, 0, 0, 2,
+    /* 0A40..0A4F */
+    0, 2, 2, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static UBool
+isPNJConsonant(UChar32 c) {
+    /* Only 0A00..0A4F is covered by pnjMap; bit 0 of an entry marks a PNJ consonant. */
+    if (0xa00 <= c && c < 0xa50) {
+        return (UBool)(pnjMap[c - 0xa00] & 1);
+    }
+    return FALSE;
+}
+
+static UBool
+isPNJBindiTippi(UChar32 c) {
+    /* Only 0A00..0A4F is covered by pnjMap; bit 1 of an entry marks Bindi Tippi. */
+    if (0xa00 <= c && c < 0xa50) {
+        return (UBool)(pnjMap[c - 0xa00] >> 1);
+    }
+    return FALSE;
  }
  
  static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) {
@@ -176,9 +190,6 @@ static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *e
          return;
      }
  
-    /* Ensure that the sets used in special handling of certain Gurmukhi characters are initialized. */
-    initializeSets();
-    
      cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII));
  
      if (cnv->extraInfo != NULL) {
@@ -225,14 +236,6 @@ static void _ISCIIClose(UConverter *cnv) {
          }
          cnv->extraInfo=NULL;
      }
-    if (PNJ_CONSONANT_SET != NULL) {
-        uset_close(PNJ_CONSONANT_SET);
-        PNJ_CONSONANT_SET = NULL;
-    }
-    if (PNJ_BINDI_TIPPI_SET != NULL) {
-        uset_close(PNJ_BINDI_TIPPI_SET);
-        PNJ_BINDI_TIPPI_SET = NULL;
-    }
  }
  
  static const char* _ISCIIgetName(const UConverter* cnv) {
@@ -1031,7 +1034,7 @@ static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
              converterData->contextCharFromUnicode = 0x00;
              break;
          }
-        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && uset_contains(PNJ_CONSONANT_SET, (sourceChar + PNJ_DELTA))) {
+        if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
              /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
              /* reset context char */
              converterData->contextCharFromUnicode = 0x0000;
@@ -1053,16 +1056,16 @@ static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
          } else {
              /* oops.. the code point is unassigned */
              /*check if the char is a First surrogate*/
-            if (UTF_IS_SURROGATE(sourceChar)) {
-                if (UTF_IS_SURROGATE_FIRST(sourceChar)) {
+            if (U16_IS_SURROGATE(sourceChar)) {
+                if (U16_IS_SURROGATE_LEAD(sourceChar)) {
  getTrail:
                      /*look ahead to find the trail surrogate*/
                      if (source < sourceLimit) {
                          /* test the following code unit */
                          UChar trail= (*source);
-                        if (UTF_IS_SECOND_SURROGATE(trail)) {
+                        if (U16_IS_TRAIL(trail)) {
                              source++;
-                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
+                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                              *err =U_INVALID_CHAR_FOUND;
                              /* convert this surrogate code point */
                              /* exit this condition tree */
@@ -1137,7 +1140,7 @@ static const uint16_t lookupTable[][2]={
      targetUniChar = toUnicodeTable[(sourceChar)] ;                                       \
      /* is the code point valid in current script? */                                     \
      if(sourceChar> ASCII_END &&                                                          \
-            (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){    \
+            (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){    \
          /* Vocallic RR is assigned in ISCII Telugu and Unicode */                        \
          if(data->currentDeltaToUnicode!=(TELUGU_DELTA) ||                                \
                      targetUniChar!=VOCALLIC_RR){                                         \
@@ -1281,7 +1284,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
              /* look at the pre-context and perform special processing */
              switch (sourceChar) {
              case ISCII_INV:
-            case EXT: /*falls through*/
+            case EXT:
              case ATR:
                  *contextCharToUnicode = (UChar)sourceChar;
  
@@ -1319,7 +1322,6 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                  }
                  break;
              case 0x0A:
-                /* fall through */
              case 0x0D:
                  data->resetToDefaultToUnicode = TRUE;
                  GET_MAPPING(sourceChar,targetUniChar,data)
@@ -1331,6 +1333,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                  i=1;
                  found=FALSE;
                  for (; i<vowelSignESpecialCases[0][0]; i++) {
+                    U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases));
                      if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
                          targetUniChar=vowelSignESpecialCases[i][1];
                          found=TRUE;
@@ -1416,6 +1419,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                          /* else fall through to default */
                      }
                      /* else fall through to default */
+                    U_FALLTHROUGH;
                  }
              default:GET_MAPPING(sourceChar,targetUniChar,data)
                  ;
@@ -1425,7 +1429,7 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
  
              if (*toUnicodeStatus != missingCharMarker) {
                  /* Check to make sure that consonant clusters are handled correctly for Gurmukhi script. */
-                if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && uset_contains(PNJ_CONSONANT_SET, data->prevToUnicodeStatus) &&
+                if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
                          (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && (targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus) {
                      /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
                      offset = (int)(source-args->source - 3);
@@ -1444,10 +1448,10 @@ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *ar
                      /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script. 
                       * If 0xA2 is preceded by a codepoint for which isPNJBindiTippi() is true then the target codepoint should be Tippi instead of Bindi.
                       */
-                    if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && uset_contains(PNJ_BINDI_TIPPI_SET, (*toUnicodeStatus + PNJ_DELTA))) {
+                    if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) {
                          targetUniChar = PNJ_TIPPI - PNJ_DELTA;
                          WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err);
-                    } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && uset_contains(PNJ_CONSONANT_SET, (*toUnicodeStatus + PNJ_DELTA))) {
+                    } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) {
                          /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
                          data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA;
                      } else {
@@ -1613,15 +1617,7 @@ static const UConverterStaticData _ISCIIStaticData={
  
  };
  
-const UConverterSharedData _ISCIIData={
-    sizeof(UConverterSharedData),
-        ~((uint32_t) 0),
-        NULL,
-        NULL,
-        &_ISCIIStaticData,
-        FALSE,
-        &_ISCIIImpl,
-        0
-};
+const UConverterSharedData _ISCIIData=  /* ISCII shared data, built from _ISCIIStaticData and _ISCIIImpl */
+        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl);
  
  #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */