2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_CONVERSION
17 CharsetRecog_Unicode::~CharsetRecog_Unicode()
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
27 const char *CharsetRecog_UTF_16_BE::getName() const
32 // UTF-16 confidence calculation. Very simple minded, but better than nothing.
33 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
34 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
35 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
36 // NULs should be rare in actual text.
// Takes one UTF-16 code unit plus the running confidence score and returns the
// updated score; both match() callers assign the result back to their score.
// NOTE(review): this listing omits parts of the body (the embedded original line
// numbers jump 38 -> 41 -> 46), so only the two visible conditions are described.
38 static int32_t adjustConfidence(UChar codeUnit
, int32_t confidence
) {
// Visible condition: code unit is in 0x20..0xff, or is exactly 0x0a (line feed).
41 } else if ((codeUnit
>= 0x20 && codeUnit
<= 0xff) || codeUnit
== 0x0a) {
// Visible condition: the score has risen above 100.
// NOTE(review): presumably the score is clamped here - body not shown, confirm.
46 } else if (confidence
> 100) {
// Scores how likely the raw input bytes are big-endian UTF-16, stores the score
// in results via results->set(), and returns whether the score is above zero.
// NOTE(review): the bodies of the visible `if` statements are missing from this
// listing (the embedded original line numbers have gaps).
53 UBool
CharsetRecog_UTF_16_BE::match(InputText
* textIn
, CharsetMatch
*results
) const
55 const uint8_t *input
= textIn
->fRawInput
;
// Score before any code unit has been examined.
56 int32_t confidence
= 10;
57 int32_t length
= textIn
->fRawLength
;
// Only the first 30 bytes (or fewer, if the input is shorter) are inspected.
59 int32_t bytesToCheck
= (length
> 30) ? 30 : length
;
// Advance two bytes per iteration; the `-1` bound keeps input[charIndex + 1]
// inside the first bytesToCheck bytes.
60 for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) {
// Big-endian: the first byte of the pair is the high-order byte.
61 UChar codeUnit
= (input
[charIndex
] << 8) | input
[charIndex
+ 1];
// A leading code unit of 0xFEFF is the byte order mark.
62 if (charIndex
== 0 && codeUnit
== 0xFEFF) {
66 confidence
= adjustConfidence(codeUnit
, confidence
);
// The score has reached one of its extremes (0 or 100).
// NOTE(review): presumably scanning stops here - body not shown, confirm.
67 if (confidence
== 0 || confidence
== 100) {
// Fewer than 4 bytes were available and the score never reached 100.
71 if (bytesToCheck
< 4 && confidence
< 100) {
74 results
->set(textIn
, this, confidence
);
75 return (confidence
> 0);
78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
83 const char *CharsetRecog_UTF_16_LE::getName() const
// Scores how likely the raw input bytes are little-endian UTF-16, stores the
// score in results via results->set(), and returns whether the score is above zero.
// NOTE(review): some branch bodies are missing from this listing (the embedded
// original line numbers have gaps).
88 UBool
CharsetRecog_UTF_16_LE::match(InputText
* textIn
, CharsetMatch
*results
) const
90 const uint8_t *input
= textIn
->fRawInput
;
// Score before any code unit has been examined.
91 int32_t confidence
= 10;
92 int32_t length
= textIn
->fRawLength
;
// Only the first 30 bytes (or fewer, if the input is shorter) are inspected.
94 int32_t bytesToCheck
= (length
> 30) ? 30 : length
;
// Advance two bytes per iteration; the `-1` bound keeps input[charIndex + 1]
// inside the first bytesToCheck bytes.
95 for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) {
// Little-endian: the second byte of the pair is the high-order byte.
96 UChar codeUnit
= input
[charIndex
] | (input
[charIndex
+ 1] << 8);
// A leading code unit of 0xFEFF is the byte order mark.
97 if (charIndex
== 0 && codeUnit
== 0xFEFF) {
98 confidence
= 100; // UTF-16 BOM
// Bytes FF FE followed by 00 00 form the little-endian UTF-32 BOM instead,
// so the input is rejected as UTF-16 in that case.
99 if (length
>= 4 && input
[2] == 0 && input
[3] == 0) {
100 confidence
= 0; // UTF-32 BOM
104 confidence
= adjustConfidence(codeUnit
, confidence
);
// The score has reached one of its extremes (0 or 100).
// NOTE(review): presumably scanning stops here - body not shown, confirm.
105 if (confidence
== 0 || confidence
== 100) {
// Fewer than 4 bytes were available and the score never reached 100.
109 if (bytesToCheck
< 4 && confidence
< 100) {
112 results
->set(textIn
, this, confidence
);
113 return (confidence
> 0);
116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
// Scores how likely the raw input is UTF-32, reading each 4-byte unit through
// getChar(), stores the score in results via results->set(), and returns whether
// the score is above zero.
// NOTE(review): hasBOM is used below but its declaration, and the bodies of all
// branches, are missing from this listing (the embedded line numbers have gaps).
121 UBool
CharsetRecog_UTF_32::match(InputText
* textIn
, CharsetMatch
*results
) const
123 const uint8_t *input
= textIn
->fRawInput
;
// Round the raw length down to a whole number of 4-byte units.
124 int32_t limit
= (textIn
->fRawLength
/ 4) * 4;
125 int32_t numValid
= 0;
126 int32_t numInvalid
= 0;
128 int32_t confidence
= 0;
// A first unit equal to 0x0000FEFF is the byte order mark.
130 if (limit
> 0 && getChar(input
, 0) == 0x0000FEFFUL
) {
// Visit every complete 4-byte unit of the input.
134 for(int32_t i
= 0; i
< limit
; i
+= 4) {
135 int32_t ch
= getChar(input
, i
);
// Condition matches negative values, values >= 0x10FFFF, and the surrogate
// range 0xD800..0xDFFF.
// NOTE(review): `>=` also matches U+10FFFF itself, the highest code point;
// `> 0x10FFFF` may have been intended - confirm.
137 if (ch
< 0 || ch
>= 0x10FFFF || (ch
>= 0xD800 && ch
<= 0xDFFF)) {
145 // Cook up some sort of confidence score, based on presence of a BOM
146 // and the existence of valid and/or invalid multi-byte sequences.
147 if (hasBOM
&& numInvalid
==0) {
149 } else if (hasBOM
&& numValid
> numInvalid
*10) {
151 } else if (numValid
> 3 && numInvalid
== 0) {
153 } else if (numValid
> 0 && numInvalid
== 0) {
155 } else if (numValid
> numInvalid
*10) {
156 // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
160 results
->set(textIn
, this, confidence
);
161 return (confidence
> 0);
164 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
169 const char *CharsetRecog_UTF_32_BE::getName() const
174 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input
, int32_t index
) const
176 return input
[index
+ 0] << 24 | input
[index
+ 1] << 16 |
177 input
[index
+ 2] << 8 | input
[index
+ 3];
180 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
185 const char *CharsetRecog_UTF_32_LE::getName() const
190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input
, int32_t index
) const
192 return input
[index
+ 3] << 24 | input
[index
+ 2] << 16 |
193 input
[index
+ 1] << 8 | input
[index
+ 0];