/*
**********************************************************************
- * Copyright (C) 2005-2012, International Business Machines
+ * Copyright (C) 2005-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
trailBytes = 3;
} else {
numInvalid += 1;
-
- if (numInvalid > 5) {
- break;
- }
-
- trailBytes = 0;
+ continue;
}
// Verify that we've got the right number of trail bytes in the sequence
}
- // Cook up some sort of confidence score, based on presense of a BOM
+ // Cook up some sort of confidence score, based on presence of a BOM
// and the existence of valid and/or invalid multi-byte sequences.
confidence = 0;
if (hasBOM && numInvalid == 0) {
} else if (numValid > 0 && numInvalid == 0) {
confidence = 80;
} else if (numValid == 0 && numInvalid == 0) {
- // Plain ASCII.
- confidence = 10;
+ // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
+ // accepts ASCII with confidence = 10.
+ confidence = 15;
} else if (numValid > numInvalid*10) {
// Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
confidence = 25;