git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	**********************************************************************
	5	* Copyright (C) 2005-2013, International Business Machines
	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_CONVERSION
	13
	14	#include "csrucode.h"
	15	#include "csmatch.h"
	16
	17	U_NAMESPACE_BEGIN
	18
	19	CharsetRecog_Unicode::~CharsetRecog_Unicode()
	20	{
	21	// nothing to do
	22	}
	23
	24	CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
	25	{
	26	// nothing to do
	27	}
	28
	29	const char *CharsetRecog_UTF_16_BE::getName() const
	30	{
	31	return "UTF-16BE";
	32	}
	33
	34	// UTF-16 confidence calculation. Very simple minded, but better than nothing.
	35	// Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
	36	// and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
	37	// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
	38	// NULs should be rare in actual text.
	39
	40	static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
	41	if (codeUnit == 0) {
	42	confidence -= 10;
	43	} else if ((codeUnit >= 0x20 && codeUnit <= 0xff) \|\| codeUnit == 0x0a) {
	44	confidence += 10;
	45	}
	46	if (confidence < 0) {
	47	confidence = 0;
	48	} else if (confidence > 100) {
	49	confidence = 100;
	50	}
	51	return confidence;
	52	}
	53
	54
	55	UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
	56	{
	57	const uint8_t *input = textIn->fRawInput;
	58	int32_t confidence = 10;
	59	int32_t length = textIn->fRawLength;
	60
	61	int32_t bytesToCheck = (length > 30) ? 30 : length;
	62	for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
	63	UChar codeUnit = (input[charIndex] << 8) \| input[charIndex + 1];
	64	if (charIndex == 0 && codeUnit == 0xFEFF) {
	65	confidence = 100;
	66	break;
	67	}
	68	confidence = adjustConfidence(codeUnit, confidence);
	69	if (confidence == 0 \|\| confidence == 100) {
	70	break;
	71	}
	72	}
	73	if (bytesToCheck < 4 && confidence < 100) {
	74	confidence = 0;
	75	}
	76	results->set(textIn, this, confidence);
	77	return (confidence > 0);
	78	}
	79
	80	CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
	81	{
	82	// nothing to do
	83	}
	84
	85	const char *CharsetRecog_UTF_16_LE::getName() const
	86	{
	87	return "UTF-16LE";
	88	}
	89
	90	UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
	91	{
	92	const uint8_t *input = textIn->fRawInput;
	93	int32_t confidence = 10;
	94	int32_t length = textIn->fRawLength;
	95
	96	int32_t bytesToCheck = (length > 30) ? 30 : length;
	97	for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
	98	UChar codeUnit = input[charIndex] \| (input[charIndex + 1] << 8);
	99	if (charIndex == 0 && codeUnit == 0xFEFF) {
	100	confidence = 100; // UTF-16 BOM
	101	if (length >= 4 && input[2] == 0 && input[3] == 0) {
	102	confidence = 0; // UTF-32 BOM
	103	}
	104	break;
	105	}
	106	confidence = adjustConfidence(codeUnit, confidence);
	107	if (confidence == 0 \|\| confidence == 100) {
	108	break;
	109	}
	110	}
	111	if (bytesToCheck < 4 && confidence < 100) {
	112	confidence = 0;
	113	}
	114	results->set(textIn, this, confidence);
	115	return (confidence > 0);
	116	}
	117
	118	CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
	119	{
	120	// nothing to do
	121	}
	122
	123	UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
	124	{
	125	const uint8_t *input = textIn->fRawInput;
	126	int32_t limit = (textIn->fRawLength / 4) * 4;
	127	int32_t numValid = 0;
	128	int32_t numInvalid = 0;
	129	bool hasBOM = FALSE;
	130	int32_t confidence = 0;
	131
	132	if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
	133	hasBOM = TRUE;
	134	}
	135
	136	for(int32_t i = 0; i < limit; i += 4) {
	137	int32_t ch = getChar(input, i);
	138
	139	if (ch < 0 \|\| ch >= 0x10FFFF \|\| (ch >= 0xD800 && ch <= 0xDFFF)) {
	140	numInvalid += 1;
	141	} else {
	142	numValid += 1;
	143	}
	144	}
	145
	146
	147	// Cook up some sort of confidence score, based on presense of a BOM
	148	// and the existence of valid and/or invalid multi-byte sequences.
	149	if (hasBOM && numInvalid==0) {
	150	confidence = 100;
	151	} else if (hasBOM && numValid > numInvalid*10) {
	152	confidence = 80;
	153	} else if (numValid > 3 && numInvalid == 0) {
	154	confidence = 100;
	155	} else if (numValid > 0 && numInvalid == 0) {
	156	confidence = 80;
	157	} else if (numValid > numInvalid*10) {
	158	// Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
	159	confidence = 25;
	160	}
	161
	162	results->set(textIn, this, confidence);
	163	return (confidence > 0);
	164	}
	165
	166	CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
	167	{
	168	// nothing to do
	169	}
	170
	171	const char *CharsetRecog_UTF_32_BE::getName() const
	172	{
	173	return "UTF-32BE";
	174	}
	175
	176	int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
	177	{
	178	return input[index + 0] << 24 \| input[index + 1] << 16 \|
	179	input[index + 2] << 8 \| input[index + 3];
	180	}
	181
	182	CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
	183	{
	184	// nothing to do
	185	}
	186
	187	const char *CharsetRecog_UTF_32_LE::getName() const
	188	{
	189	return "UTF-32LE";
	190	}
	191
	192	int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
	193	{
	194	return input[index + 3] << 24 \| input[index + 2] << 16 \|
	195	input[index + 1] << 8 \| input[index + 0];
	196	}
	197
	198	U_NAMESPACE_END
	199	#endif
	200