]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csrucode.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / csrucode.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
57a6839d 3 * Copyright (C) 2005-2013, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "csrucode.h"
51004dcb 13#include "csmatch.h"
73c04bcf
A
14
15U_NAMESPACE_BEGIN
16
17CharsetRecog_Unicode::~CharsetRecog_Unicode()
18{
19 // nothing to do
20}
21
22CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
23{
24 // nothing to do
25}
26
27const char *CharsetRecog_UTF_16_BE::getName() const
28{
29 return "UTF-16BE";
30}
31
57a6839d
A
32// UTF-16 confidence calculation. Very simple minded, but better than nothing.
33// Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
34// and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
35// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
36// NULs should be rare in actual text.
37
38static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
39 if (codeUnit == 0) {
40 confidence -= 10;
41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
42 confidence += 10;
43 }
44 if (confidence < 0) {
45 confidence = 0;
46 } else if (confidence > 100) {
47 confidence = 100;
48 }
49 return confidence;
50}
51
52
51004dcb 53UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
73c04bcf
A
54{
55 const uint8_t *input = textIn->fRawInput;
57a6839d
A
56 int32_t confidence = 10;
57 int32_t length = textIn->fRawLength;
58
59 int32_t bytesToCheck = (length > 30) ? 30 : length;
60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];
62 if (charIndex == 0 && codeUnit == 0xFEFF) {
63 confidence = 100;
64 break;
65 }
66 confidence = adjustConfidence(codeUnit, confidence);
67 if (confidence == 0 || confidence == 100) {
68 break;
69 }
70 }
71 if (bytesToCheck < 4 && confidence < 100) {
72 confidence = 0;
73c04bcf 73 }
51004dcb
A
74 results->set(textIn, this, confidence);
75 return (confidence > 0);
73c04bcf
A
76}
77
78CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
79{
80 // nothing to do
81}
82
83const char *CharsetRecog_UTF_16_LE::getName() const
84{
85 return "UTF-16LE";
86}
87
51004dcb 88UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
73c04bcf
A
89{
90 const uint8_t *input = textIn->fRawInput;
57a6839d
A
91 int32_t confidence = 10;
92 int32_t length = textIn->fRawLength;
93
94 int32_t bytesToCheck = (length > 30) ? 30 : length;
95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);
97 if (charIndex == 0 && codeUnit == 0xFEFF) {
98 confidence = 100; // UTF-16 BOM
99 if (length >= 4 && input[2] == 0 && input[3] == 0) {
100 confidence = 0; // UTF-32 BOM
101 }
102 break;
103 }
104 confidence = adjustConfidence(codeUnit, confidence);
105 if (confidence == 0 || confidence == 100) {
106 break;
107 }
108 }
109 if (bytesToCheck < 4 && confidence < 100) {
110 confidence = 0;
73c04bcf 111 }
51004dcb
A
112 results->set(textIn, this, confidence);
113 return (confidence > 0);
73c04bcf
A
114}
115
116CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
117{
118 // nothing to do
119}
120
51004dcb 121UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
73c04bcf
A
122{
123 const uint8_t *input = textIn->fRawInput;
124 int32_t limit = (textIn->fRawLength / 4) * 4;
125 int32_t numValid = 0;
126 int32_t numInvalid = 0;
127 bool hasBOM = FALSE;
128 int32_t confidence = 0;
129
57a6839d 130 if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
73c04bcf
A
131 hasBOM = TRUE;
132 }
133
134 for(int32_t i = 0; i < limit; i += 4) {
135 int32_t ch = getChar(input, i);
136
137 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
138 numInvalid += 1;
139 } else {
140 numValid += 1;
141 }
142 }
143
144
145 // Cook up some sort of confidence score, based on presense of a BOM
146 // and the existence of valid and/or invalid multi-byte sequences.
147 if (hasBOM && numInvalid==0) {
148 confidence = 100;
149 } else if (hasBOM && numValid > numInvalid*10) {
150 confidence = 80;
151 } else if (numValid > 3 && numInvalid == 0) {
152 confidence = 100;
153 } else if (numValid > 0 && numInvalid == 0) {
154 confidence = 80;
155 } else if (numValid > numInvalid*10) {
156 // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
157 confidence = 25;
158 }
159
51004dcb
A
160 results->set(textIn, this, confidence);
161 return (confidence > 0);
73c04bcf
A
162}
163
164CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
165{
166 // nothing to do
167}
168
169const char *CharsetRecog_UTF_32_BE::getName() const
170{
171 return "UTF-32BE";
172}
173
174int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
175{
176 return input[index + 0] << 24 | input[index + 1] << 16 |
177 input[index + 2] << 8 | input[index + 3];
178}
179
180CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
181{
182 // nothing to do
183}
184
185const char *CharsetRecog_UTF_32_LE::getName() const
186{
187 return "UTF-32LE";
188}
189
190int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
191{
192 return input[index + 3] << 24 | input[index + 2] << 16 |
193 input[index + 1] << 8 | input[index + 0];
194}
195
196U_NAMESPACE_END
197#endif
198