ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csrucode.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "csrucode.h"
13 #include "csmatch.h"
14
15 U_NAMESPACE_BEGIN
16
17 CharsetRecog_Unicode::~CharsetRecog_Unicode()
18 {
19 // nothing to do
20 }
21
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
23 {
24 // nothing to do
25 }
26
27 const char *CharsetRecog_UTF_16_BE::getName() const
28 {
29 return "UTF-16BE";
30 }
31
32 // UTF-16 confidence calculation. Very simple minded, but better than nothing.
33 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
34 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
35 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
36 // NULs should be rare in actual text.
37
38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
39 if (codeUnit == 0) {
40 confidence -= 10;
41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
42 confidence += 10;
43 }
44 if (confidence < 0) {
45 confidence = 0;
46 } else if (confidence > 100) {
47 confidence = 100;
48 }
49 return confidence;
50 }
51
52
53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
54 {
55 const uint8_t *input = textIn->fRawInput;
56 int32_t confidence = 10;
57 int32_t length = textIn->fRawLength;
58
59 int32_t bytesToCheck = (length > 30) ? 30 : length;
60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];
62 if (charIndex == 0 && codeUnit == 0xFEFF) {
63 confidence = 100;
64 break;
65 }
66 confidence = adjustConfidence(codeUnit, confidence);
67 if (confidence == 0 || confidence == 100) {
68 break;
69 }
70 }
71 if (bytesToCheck < 4 && confidence < 100) {
72 confidence = 0;
73 }
74 results->set(textIn, this, confidence);
75 return (confidence > 0);
76 }
77
78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
79 {
80 // nothing to do
81 }
82
83 const char *CharsetRecog_UTF_16_LE::getName() const
84 {
85 return "UTF-16LE";
86 }
87
88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
89 {
90 const uint8_t *input = textIn->fRawInput;
91 int32_t confidence = 10;
92 int32_t length = textIn->fRawLength;
93
94 int32_t bytesToCheck = (length > 30) ? 30 : length;
95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);
97 if (charIndex == 0 && codeUnit == 0xFEFF) {
98 confidence = 100; // UTF-16 BOM
99 if (length >= 4 && input[2] == 0 && input[3] == 0) {
100 confidence = 0; // UTF-32 BOM
101 }
102 break;
103 }
104 confidence = adjustConfidence(codeUnit, confidence);
105 if (confidence == 0 || confidence == 100) {
106 break;
107 }
108 }
109 if (bytesToCheck < 4 && confidence < 100) {
110 confidence = 0;
111 }
112 results->set(textIn, this, confidence);
113 return (confidence > 0);
114 }
115
116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
117 {
118 // nothing to do
119 }
120
121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
122 {
123 const uint8_t *input = textIn->fRawInput;
124 int32_t limit = (textIn->fRawLength / 4) * 4;
125 int32_t numValid = 0;
126 int32_t numInvalid = 0;
127 bool hasBOM = FALSE;
128 int32_t confidence = 0;
129
130 if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
131 hasBOM = TRUE;
132 }
133
134 for(int32_t i = 0; i < limit; i += 4) {
135 int32_t ch = getChar(input, i);
136
137 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
138 numInvalid += 1;
139 } else {
140 numValid += 1;
141 }
142 }
143
144
145 // Cook up some sort of confidence score, based on presense of a BOM
146 // and the existence of valid and/or invalid multi-byte sequences.
147 if (hasBOM && numInvalid==0) {
148 confidence = 100;
149 } else if (hasBOM && numValid > numInvalid*10) {
150 confidence = 80;
151 } else if (numValid > 3 && numInvalid == 0) {
152 confidence = 100;
153 } else if (numValid > 0 && numInvalid == 0) {
154 confidence = 80;
155 } else if (numValid > numInvalid*10) {
156 // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
157 confidence = 25;
158 }
159
160 results->set(textIn, this, confidence);
161 return (confidence > 0);
162 }
163
164 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
165 {
166 // nothing to do
167 }
168
169 const char *CharsetRecog_UTF_32_BE::getName() const
170 {
171 return "UTF-32BE";
172 }
173
174 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
175 {
176 return input[index + 0] << 24 | input[index + 1] << 16 |
177 input[index + 2] << 8 | input[index + 3];
178 }
179
180 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
181 {
182 // nothing to do
183 }
184
185 const char *CharsetRecog_UTF_32_LE::getName() const
186 {
187 return "UTF-32LE";
188 }
189
190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
191 {
192 return input[index + 3] << 24 | input[index + 2] << 16 |
193 input[index + 1] << 8 | input[index + 0];
194 }
195
196 U_NAMESPACE_END
197 #endif
198