ICU-531.30.tar.gz

[apple/icu.git] / icuSources / i18n / csrutf8.cpp
diff --git a/icuSources/i18n/csrutf8.cpp b/icuSources/i18n/csrutf8.cpp

index 420c66909d4da0732941649e6ab0d50b373da819..b18aa77e79669f3115e862065498796883d30cce 100644 (file)
--- a/icuSources/i18n/csrutf8.cpp
+++ b/icuSources/i18n/csrutf8.cpp
@@ -1,6 +1,6 @@
  /*
   **********************************************************************
- *   Copyright (C) 2005-2012, International Business Machines
+ *   Copyright (C) 2005-2014, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   **********************************************************************
   */
@@ -55,12 +55,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
              trailBytes = 3;
          } else {
              numInvalid += 1;
-
-            if (numInvalid > 5) {
-                break;
-            }
-
-            trailBytes = 0;
+            continue;
          }
  
          // Verify that we've got the right number of trail bytes in the sequence
@@ -86,7 +81,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
  
      }
  
-    // Cook up some sort of confidence score, based on presense of a BOM
+    // Cook up some sort of confidence score, based on presence of a BOM
      //    and the existence of valid and/or invalid multi-byte sequences.
      confidence = 0;
      if (hasBOM && numInvalid == 0) {
@@ -98,8 +93,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
      } else if (numValid > 0 && numInvalid == 0) {
          confidence = 80;
      } else if (numValid == 0 && numInvalid == 0) {
-        // Plain ASCII.
-        confidence = 10;
+        // Plain ASCII. Confidence must be > 10 because UTF-8 is more likely than UTF-16,
+        //              which accepts ASCII with confidence = 10.
+        confidence = 15;
      } else if (numValid > numInvalid*10) {
          // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
          confidence = 25;