ICU-64260.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / inputext.cpp
diff --git a/icuSources/i18n/inputext.cpp b/icuSources/i18n/inputext.cpp

index a36a931ab551ee8c8be8caa561620aba2237ff3b..ab0b697ea0328605716e155cd37ccd620b24b9b1 100644 (file)
--- a/icuSources/i18n/inputext.cpp
+++ b/icuSources/i18n/inputext.cpp
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
   **********************************************************************
- *   Copyright (C) 2005-2006, International Business Machines
+ *   Copyright (C) 2005-2016, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   **********************************************************************
   */
@@ -20,12 +22,10 @@ U_NAMESPACE_BEGIN
  
  #define BUFFER_SIZE 8192
  
-#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
-
  #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
  #define DELETE_ARRAY(array) uprv_free((void *) (array))
  
-InputText::InputText()
+InputText::InputText(UErrorCode &status)
      : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                   //   removed if appropriate.
        fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
@@ -33,8 +33,10 @@ InputText::InputText()
        fDeclaredEncoding(0),
        fRawInput(0),
        fRawLength(0)
-{  
-
+{
+    if (fInputBytes == NULL || fByteStats == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
  }
  
  InputText::~InputText()
@@ -48,15 +50,16 @@ void InputText::setText(const char *in, int32_t len)
  {
      fInputLen  = 0;
      fC1Bytes   = FALSE;
+    fOnlyTypicalASCII = FALSE; // rdar://56373519
      fRawInput  = (const uint8_t *) in;
-    fRawLength = len == -1? uprv_strlen(in) : len;
+    fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
  }
  
  void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
  {
      if(encoding) {
          if (len == -1) {
-            len = uprv_strlen(encoding);
+            len = (int32_t)uprv_strlen(encoding);
          }
  
          len += 1;     // to make place for the \0 at the end.
@@ -73,7 +76,8 @@ UBool InputText::isSet() const
  
  /**
  *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
-*               it by removing what appears to be html markup.
+*               it by removing what appears to be html markup. Currently only used
+*               by CharsetDetector::detectAll.
  * 
  * @internal
  */
@@ -82,6 +86,7 @@ void InputText::MungeInput(UBool fStripTags) {
      int     dsti = 0;
      uint8_t b;
      bool    inMarkup = FALSE;
+    bool    inCSSDecl = FALSE;
      int32_t openTags = 0;
      int32_t badTags  = 0;
  
@@ -96,22 +101,32 @@ void InputText::MungeInput(UBool fStripTags) {
          for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
              b = fRawInput[srci];
  
-            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
+            if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
                  if (inMarkup) {
                      badTags += 1;
                  }
-
                  inMarkup = TRUE;
                  openTags += 1;
              }
  
-            if (! inMarkup) {
+            if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
+                if (inCSSDecl) {
+                    badTags += 1;
+                }
+                inCSSDecl = TRUE;
+                openTags += 1;
+            }
+
+            if (!inMarkup && !inCSSDecl) {
                  fInputBytes[dsti++] = b;
              }
  
              if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
                  inMarkup = FALSE;
              }
+            if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
+                inCSSDecl = FALSE;
+            }
          }
  
          fInputLen = dsti;
@@ -149,12 +164,20 @@ void InputText::MungeInput(UBool fStripTags) {
          fByteStats[fInputBytes[srci]] += 1;
      }
  
-    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
+    fOnlyTypicalASCII = TRUE; // rdar://56373519
+    for (int32_t i = 0x01; i <= 0xFF; i += 1) {
          if (fByteStats[i] != 0) {
-            fC1Bytes = TRUE;
-            break;
+            if ((i < 0x20 && i != 0x09 && i != 0x0A && i != 0x0D) || i > 0x7E) {
+                fOnlyTypicalASCII = FALSE; // rdar://56373519
+                if (i >= 0x80 && i <= 0x9F) {
+                    fC1Bytes = TRUE;
+                }
+            }
          }
      }
+    if (fByteStats[0] > 1) {
+        fOnlyTypicalASCII = FALSE;
+    }
  }
  
  U_NAMESPACE_END