X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/efa1e6592fb03ce23b15276b2b91d885a3ee7da5..57a6839dcb3bba09e8228b822b290604668416fe:/icuSources/i18n/csrutf8.cpp?ds=sidebyside

diff --git a/icuSources/i18n/csrutf8.cpp b/icuSources/i18n/csrutf8.cpp
index 420c6690..b18aa77e 100644
--- a/icuSources/i18n/csrutf8.cpp
+++ b/icuSources/i18n/csrutf8.cpp
@@ -1,6 +1,6 @@
 /*
  **********************************************************************
- *   Copyright (C) 2005-2012, International Business Machines
+ *   Copyright (C) 2005-2014, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  */
@@ -55,12 +55,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
             trailBytes = 3;
         } else {
             numInvalid += 1;
-
-            if (numInvalid > 5) {
-                break;
-            }
-
-            trailBytes = 0;
+            continue;
         }
 
         // Verify that we've got the right number of trail bytes in the sequence
@@ -86,7 +81,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
 
     }
 
-    // Cook up some sort of confidence score, based on presense of a BOM
+    // Cook up some sort of confidence score, based on presence of a BOM
     //    and the existence of valid and/or invalid multi-byte sequences.
     confidence = 0;
     if (hasBOM && numInvalid == 0) {
@@ -98,8 +93,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
     } else if (numValid > 0 && numInvalid == 0) {
         confidence = 80;
     } else if (numValid == 0 && numInvalid == 0) {
-        // Plain ASCII.
-        confidence = 10;
+        // Plain ASCII. Confidence must be > 10: such input is more likely UTF-8 than
+        //              UTF-16, which accepts ASCII with confidence = 10.
+        confidence = 15;
     } else if (numValid > numInvalid*10) {
         // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
         confidence = 25;