]>
git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csrucode.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2005-2013, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
10 #include "unicode/utypes.h"
12 #if !UCONFIG_NO_CONVERSION
19 CharsetRecog_Unicode::~CharsetRecog_Unicode()
24 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
29 const char *CharsetRecog_UTF_16_BE::getName() const
34 // UTF-16 confidence calculation. Very simple minded, but better than nothing.
35 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
36 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
37 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
38 // NULs should be rare in actual text.
// Adjusts a running UTF-16 confidence score using a single code unit and
// yields the new score; callers assign the result back to their own
// `confidence` variable.
// NOTE(review): the leading `if`, every branch body and the return statement
// are absent from this listing; only the two conditions below are visible.
40 static int32_t adjustConfidence(UChar codeUnit
, int32_t confidence
) {
// Code units in 0x20..0xff, plus line feed (0x0a): the "8 bit non-control
// characters" that the comment above this function says count in favour of
// UTF-16.
43 } else if ((codeUnit
>= 0x20 && codeUnit
<= 0xff) || codeUnit
== 0x0a) {
// Upper-bound test on the score; presumably it is capped at 100 here, but the
// branch body is not visible -- confirm against the full file.
48 } else if (confidence
> 100) {
// Scores how likely the raw input is big-endian UTF-16, records the score in
// `results`, and reports whether the score is above zero.
//   textIn  - supplies the raw bytes (fRawInput) and their count (fRawLength).
//   results - receives (textIn, this, confidence) through set().
// NOTE(review): braces and several branch bodies are absent from this listing;
// the comments below describe only the statements that are visible.
55 UBool
CharsetRecog_UTF_16_BE::match(InputText
* textIn
, CharsetMatch
*results
) const
57 const uint8_t *input
= textIn
->fRawInput
;
// Initial score before any code unit has been examined.
58 int32_t confidence
= 10;
59 int32_t length
= textIn
->fRawLength
;
// Examine at most the first 30 bytes of the input.
61 int32_t bytesToCheck
= (length
> 30) ? 30 : length
;
// Step two bytes at a time; the `-1` bound guarantees that both bytes of each
// pair lie inside the checked range.
62 for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) {
// Big-endian composition: the first byte of the pair is the high byte.
63 UChar codeUnit
= (input
[charIndex
] << 8) | input
[charIndex
+ 1];
// A first code unit of 0xFEFF is the byte order mark (body not visible here).
64 if (charIndex
== 0 && codeUnit
== 0xFEFF) {
68 confidence
= adjustConfidence(codeUnit
, confidence
);
// The score has reached one of its extreme values, 0 or 100 (body not visible
// here).
69 if (confidence
== 0 || confidence
== 100) {
// Fewer than four bytes were available and the score is still below 100
// (body not visible here).
73 if (bytesToCheck
< 4 && confidence
< 100) {
76 results
->set(textIn
, this, confidence
);
// Any score above zero is reported as a match.
77 return (confidence
> 0);
80 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
85 const char *CharsetRecog_UTF_16_LE::getName() const
// Scores how likely the raw input is little-endian UTF-16, records the score
// in `results`, and reports whether the score is above zero.
//   textIn  - supplies the raw bytes (fRawInput) and their count (fRawLength).
//   results - receives (textIn, this, confidence) through set().
// NOTE(review): braces and some statements are absent from this listing; the
// comments below describe only the statements that are visible.
90 UBool
CharsetRecog_UTF_16_LE::match(InputText
* textIn
, CharsetMatch
*results
) const
92 const uint8_t *input
= textIn
->fRawInput
;
// Initial score before any code unit has been examined.
93 int32_t confidence
= 10;
94 int32_t length
= textIn
->fRawLength
;
// Examine at most the first 30 bytes of the input.
96 int32_t bytesToCheck
= (length
> 30) ? 30 : length
;
// Step two bytes at a time; the `-1` bound guarantees that both bytes of each
// pair lie inside the checked range.
97 for (int32_t charIndex
=0; charIndex
<bytesToCheck
-1; charIndex
+=2) {
// Little-endian composition: the second byte of the pair is the high byte.
98 UChar codeUnit
= input
[charIndex
] | (input
[charIndex
+ 1] << 8);
// A first code unit of 0xFEFF (bytes FF FE) is the byte order mark.
99 if (charIndex
== 0 && codeUnit
== 0xFEFF) {
100 confidence
= 100; // UTF-16 BOM
// FF FE followed by two zero bytes is the four-byte UTF-32 byte order mark
// instead, so the UTF-16 score is dropped to zero. `length >= 4` guards the
// reads of input[2] and input[3].
101 if (length
>= 4 && input
[2] == 0 && input
[3] == 0) {
102 confidence
= 0; // UTF-32 BOM
106 confidence
= adjustConfidence(codeUnit
, confidence
);
// The score has reached one of its extreme values, 0 or 100 (body not visible
// here).
107 if (confidence
== 0 || confidence
== 100) {
// Fewer than four bytes were available and the score is still below 100
// (body not visible here).
111 if (bytesToCheck
< 4 && confidence
< 100) {
114 results
->set(textIn
, this, confidence
);
// Any score above zero is reported as a match.
115 return (confidence
> 0);
118 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
// Scores how likely the raw input is UTF-32, records the score in `results`,
// and reports whether the score is above zero. Each four-byte unit is decoded
// with getChar().
//   textIn  - supplies the raw bytes (fRawInput) and their count (fRawLength).
//   results - receives (textIn, this, confidence) through set().
// NOTE(review): braces, the declaration of `hasBOM`, the updates of the two
// counters and the confidence values assigned in each branch are absent from
// this listing; the comments below describe only what is visible.
123 UBool
CharsetRecog_UTF_32::match(InputText
* textIn
, CharsetMatch
*results
) const
125 const uint8_t *input
= textIn
->fRawInput
;
// Round the length down to a whole number of four-byte units.
126 int32_t limit
= (textIn
->fRawLength
/ 4) * 4;
127 int32_t numValid
= 0;
128 int32_t numInvalid
= 0;
130 int32_t confidence
= 0;
// A first unit equal to 0x0000FEFF is the byte order mark.
132 if (limit
> 0 && getChar(input
, 0) == 0x0000FEFFUL
) {
136 for(int32_t i
= 0; i
< limit
; i
+= 4) {
137 int32_t ch
= getChar(input
, i
);
// Condition selecting negative values, values at or above 0x10FFFF, and the
// surrogate range 0xD800..0xDFFF.
// NOTE(review): `>=` also selects U+10FFFF itself, the last Unicode code
// point -- confirm that this is intended.
139 if (ch
< 0 || ch
>= 0x10FFFF || (ch
>= 0xD800 && ch
<= 0xDFFF)) {
147 // Cook up some sort of confidence score, based on presence of a BOM
148 // and the existence of valid and/or invalid multi-byte sequences.
149 if (hasBOM
&& numInvalid
==0) {
151 } else if (hasBOM
&& numValid
> numInvalid
*10) {
153 } else if (numValid
> 3 && numInvalid
== 0) {
155 } else if (numValid
> 0 && numInvalid
== 0) {
157 } else if (numValid
> numInvalid
*10) {
158 // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
162 results
->set(textIn
, this, confidence
);
// Any score above zero is reported as a match.
163 return (confidence
> 0);
166 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
171 const char *CharsetRecog_UTF_32_BE::getName() const
176 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input
, int32_t index
) const
178 return input
[index
+ 0] << 24 | input
[index
+ 1] << 16 |
179 input
[index
+ 2] << 8 | input
[index
+ 3];
182 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
187 const char *CharsetRecog_UTF_32_LE::getName() const
192 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input
, int32_t index
) const
194 return input
[index
+ 3] << 24 | input
[index
+ 2] << 16 |
195 input
[index
+ 1] << 8 | input
[index
+ 0];