X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/efa1e6592fb03ce23b15276b2b91d885a3ee7da5..57a6839dcb3bba09e8228b822b290604668416fe:/icuSources/i18n/csrutf8.cpp?ds=sidebyside diff --git a/icuSources/i18n/csrutf8.cpp b/icuSources/i18n/csrutf8.cpp index 420c6690..b18aa77e 100644 --- a/icuSources/i18n/csrutf8.cpp +++ b/icuSources/i18n/csrutf8.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2014, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -55,12 +55,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { trailBytes = 3; } else { numInvalid += 1; - - if (numInvalid > 5) { - break; - } - - trailBytes = 0; + continue; } // Verify that we've got the right number of trail bytes in the sequence @@ -86,7 +81,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { } - // Cook up some sort of confidence score, based on presense of a BOM + // Cook up some sort of confidence score, based on presence of a BOM // and the existence of valid and/or invalid multi-byte sequences. confidence = 0; if (hasBOM && numInvalid == 0) { @@ -98,8 +93,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid == 0 && numInvalid == 0) { - // Plain ASCII. - confidence = 10; + // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which + // accepts ASCII with confidence = 10. + confidence = 15; } else if (numValid > numInvalid*10) { // Probably corruput utf-8 data. Valid sequences aren't likely by chance. confidence = 25;