ICU-66108.tar.gz

[apple/icu.git] / icuSources / i18n / csrsbcs.cpp
diff --git a/icuSources/i18n/csrsbcs.cpp b/icuSources/i18n/csrsbcs.cpp

index 1aad70e39ae7f532078ee0c1d057083560fad403..c381958540837ecd40a7ef08bc1cefafafee523f 100644 (file)
--- a/icuSources/i18n/csrsbcs.cpp
+++ b/icuSources/i18n/csrsbcs.cpp
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
   **********************************************************************
- *   Copyright (C) 2005-2012, International Business Machines
+ *   Copyright (C) 2005-2016, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   **********************************************************************
   */
@@ -15,12 +17,11 @@
  
  #define N_GRAM_SIZE 3
  #define N_GRAM_MASK 0xFFFFFF
-#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
  
  U_NAMESPACE_BEGIN
  
  NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
-  :byteIndex(0), ngram(0)
+ : ngram(0), byteIndex(0)
  {
      ngramList = theNgramList;
      charMap   = theCharMap;
@@ -28,6 +29,10 @@ NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
      ngramCount = hitCount = 0;
  }
  
+// Out-of-line destructor with an empty body: nothing is released here.
+NGramParser::~NGramParser() {
+}
+
  /*
   * Binary search for value in table, which must have exactly 64 entries.
   */
@@ -96,7 +101,7 @@ int32_t NGramParser::nextByte(InputText *det)
      return det->fInputBytes[byteIndex++];
  }
  
-int32_t NGramParser::parse(InputText *det)
+void NGramParser::parseCharacters(InputText *det)
  {
      int32_t b;
      bool ignoreSpace = FALSE;
@@ -113,6 +118,11 @@ int32_t NGramParser::parse(InputText *det)
              ignoreSpace = (mb == 0x20);
          }
      }
+}
+
+int32_t NGramParser::parse(InputText *det)
+{
+    parseCharacters(det);
  
      // TODO: Is this OK? The buffer could have ended in the middle of a word...
      addByte(0x20);
@@ -132,6 +142,105 @@ int32_t NGramParser::parse(InputText *det)
      return (int32_t) (rawPercent * 300.0);
  }
  
+#if !UCONFIG_ONLY_HTML_CONVERSION
+static const uint8_t unshapeMap_IBM420[] = { // 256 entries (16 rows x 16); indexed by the raw input byte in NGramParser_IBM420::nextByte()
+/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
+/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
+/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 
+/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 
+/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 
+/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 
+/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, 
+/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, 
+/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, 
+/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, 
+/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, 
+/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, 
+/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 
+/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 
+};
+
+// Delegates to NGramParser, then clears alef (parseCharacters() treats 0x00 as "no extra byte").
+NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap)
+    : NGramParser(theNgramList, theCharMap)
+{ alef = 0x00; }
+
+NGramParser_IBM420::~NGramParser_IBM420() {}  // nothing to do
+
+int32_t NGramParser_IBM420::isLamAlef(int32_t b)
+{
+    // Returns the byte that nextByte() stores in `alef` for b, or 0x00 when
+    // b is not one of the six byte values recognized here.
+    switch (b) {
+    case 0xB2: case 0xB3: return 0x47;
+    case 0xB4: case 0xB5: return 0x49;
+    case 0xB8: case 0xB9: return 0x56;
+    default:              return 0x00;
+    }
+}
+
+/*
+ * Arabic shaping needs to be handled manually. The ArabicShaping class cannot
+ * be called because CharsetDetector deals with bytes, not Unicode code points.
+ * We could convert the bytes to Unicode code points, but that would leave us
+ * dependent on CharsetICU, which we try to avoid. The IBM420 converter can
+ * produce different results across JDK versions and is therefore also avoided.
+ */
+int32_t NGramParser_IBM420::nextByte(InputText *det)
+{
+    // Stop at the end of the buffer or at the first zero byte.
+    if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
+        return -1;
+    }
+
+    // Read the current byte and step past it.
+    const int32_t current = det->fInputBytes[byteIndex++];
+
+    // Nonzero alef: parseCharacters() feeds that byte right after the 0xB1 returned here.
+    alef = isLamAlef(current);
+    if (alef != 0x00) {
+        return 0xB1;
+    }
+
+    return unshapeMap_IBM420[current & 0xFF] & 0xFF;
+}
+
+void NGramParser_IBM420::parseCharacters(InputText *det)
+{
+    bool ignoreSpace = FALSE;
+    // nextByte() returns a negative value once the input is exhausted.
+    for (int32_t b = nextByte(det); b >= 0; b = nextByte(det)) {
+        uint8_t mapped = charMap[b];
+        // TODO: 0x20 might not be a space in all character sets...
+        // Bytes mapped to 0 are dropped; consecutive 0x20 results are added once.
+        if (mapped != 0) {
+            const bool isSpace = (mapped == 0x20);
+            if (!isSpace || !ignoreSpace) {
+                addByte(mapped);
+            }
+            ignoreSpace = isSpace;
+        }
+        // nextByte() left alef at 0x00: no extra byte follows this one.
+        if (alef == 0x00) {
+            continue;
+        }
+
+        // Feed the pending alef byte through the same mapping and filter.
+        mapped = charMap[alef & 0xFF];
+        if (mapped != 0) {
+            const bool isSpace = (mapped == 0x20);
+            if (!isSpace || !ignoreSpace) {
+                addByte(mapped);
+            }
+            ignoreSpace = isSpace;
+        }
+    }
+}
+#endif
+
  CharsetRecog_sbcs::CharsetRecog_sbcs()
  {
      // nothing else to do
@@ -523,6 +632,7 @@ static const uint8_t charMap_KOI8_R[] = {
      0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 
  };
  
+#if !UCONFIG_ONLY_HTML_CONVERSION
  static const int32_t ngrams_IBM424_he_rtl[] = {
      0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, 
      0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, 
@@ -590,6 +700,7 @@ static const uint8_t charMap_IBM420_ar[]= {
  /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF, 
  /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40, 
  };
+#endif
  
  //ISO-8859-1,2,5,6,7,8,9 Ngrams
  
@@ -782,7 +893,7 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const
      const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
      uint32_t i;
      int32_t bestConfidenceSoFar = -1;
-    for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
+    for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) {
          const int32_t *ngrams = ngrams_8859_1[i].ngrams;
          const char    *lang   = ngrams_8859_1[i].lang;
          int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
@@ -791,6 +902,10 @@ UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const
              bestConfidenceSoFar = confidence;
          }
      }
+    if (bestConfidenceSoFar < 10 && textIn->fOnlyTypicalASCII) { // rdar://56373519
+        bestConfidenceSoFar = 15;
+        results->set(textIn, this, bestConfidenceSoFar, name);
+    }
      return (bestConfidenceSoFar > 0);
  }
  
@@ -809,7 +924,7 @@ UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const
      const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
      uint32_t i;
      int32_t bestConfidenceSoFar = -1;
-    for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
+    for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) {
          const int32_t *ngrams = ngrams_8859_2[i].ngrams;
          const char    *lang   = ngrams_8859_2[i].lang;
          int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
@@ -1054,6 +1169,7 @@ UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
      return (confidence > 0);
  }
  
+#if !UCONFIG_ONLY_HTML_CONVERSION
  CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
  {
      // nothing to do
@@ -1098,26 +1214,6 @@ UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results
      return (confidence > 0);
  }
  
-static const uint8_t unshapeMap_IBM420[] = {
-/*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
-/* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 
-/* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 
-/* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 
-/* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 
-/* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 
-/* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F, 
-/* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E, 
-/* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF, 
-/* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF, 
-/* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF, 
-/* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF, 
-/* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 
-/* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 
-};
-
  CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
  {
      // nothing to do
@@ -1128,88 +1224,15 @@ const char *CharsetRecog_IBM420_ar::getLanguage() const
      return "ar";
  }
  
-void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
-    prev_fInputBytesLength = textIn->fInputLen;
-    prev_fInputBytes = textIn->fInputBytes;
      
-    int32_t length = 0;
-    uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
-    
-    if (bb != NULL) {
-        textIn->fInputBytes = bb;
-        textIn->fInputLen = length;
-        
-        deleteBuffer = TRUE;
-    } else {
-        deleteBuffer = FALSE;
-    }
-}
-
-uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
-    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
-    
-    if (resultArray != NULL) {
-        for (int32_t i = 0; i < inputBytesLength; i++) {
-            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
-        }
-    }
-    
-    return resultArray;
-}
-
-uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
-    int32_t bigBufferLength = inputBytesLength * 2;
-    uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
-    uint8_t *resultBuffer = NULL;
-    
-    if (bigBuffer != NULL) {
-        int32_t bufferIndex;
-        static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
-        
-        for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
-            if (isLamAlef(inputBytes[i])) {
-                bigBuffer[bufferIndex++] = unshapedLamAlef[0];
-                bigBuffer[bufferIndex++] = unshapedLamAlef[1];
-            } else {
-                bigBuffer[bufferIndex++] = inputBytes[i];
-            }
-        }
-        
-        length = bufferIndex;
-        resultBuffer = (uint8_t *)uprv_malloc(length);
-        if (resultBuffer != NULL) {
-            uprv_memcpy(resultBuffer, bigBuffer, length);
-        }
-    }
-    
-    if (bigBuffer != NULL) {
-        uprv_free(bigBuffer);
-    }
+int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
+{
+    NGramParser_IBM420 parser(ngrams, byteMap);
+    int32_t result;
      
-    return resultBuffer;
-}
-
-void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
-    if (deleteBuffer) {
-        uprv_free(textIn->fInputBytes);
+    result = parser.parse(det);
          
-        textIn->fInputBytes = prev_fInputBytes;
-        textIn->fInputLen = prev_fInputBytesLength;
-    }
-}
-
-UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
-    static const uint8_t shapedLamAlef[] = {
-        0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8 
-    };
-    
-    for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
-        if (b == shapedLamAlef[i]) {
-            return TRUE;
-        }
-    }
-    
-    return FALSE;
+    return result;
  }
  
  CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
@@ -1245,6 +1268,7 @@ UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
      results->set(textIn, this, confidence);
      return (confidence > 0);
  }
+#endif
  
  U_NAMESPACE_END
  #endif