]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csdetect.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
729e4ab9 3 * Copyright (C) 2005-2009, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
30#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
31
32#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
33#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35U_CDECL_BEGIN
46f4442e 36static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;
73c04bcf
A
37
38static int32_t fCSRecognizers_size = 0;
39
40static UBool U_CALLCONV csdet_cleanup(void)
41{
42 if (fCSRecognizers != NULL) {
43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
44 delete fCSRecognizers[r];
45 fCSRecognizers[r] = NULL;
46 }
47
48 DELETE_ARRAY(fCSRecognizers);
49 fCSRecognizers = NULL;
50 fCSRecognizers_size = 0;
51 }
52
53 return TRUE;
54}
55
56static int32_t U_CALLCONV
46f4442e 57charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
73c04bcf 58{
46f4442e
A
59 U_NAMESPACE_USE
60
73c04bcf
A
61 const CharsetMatch **csm_l = (const CharsetMatch **) left;
62 const CharsetMatch **csm_r = (const CharsetMatch **) right;
63
64 // NOTE: compare is backwards to sort from highest to lowest.
65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
66}
67
68U_CDECL_END
69
70U_NAMESPACE_BEGIN
71
72void CharsetDetector::setRecognizers(UErrorCode &status)
73{
74 UBool needsInit;
75 CharsetRecognizer **recognizers;
76
77 if (U_FAILURE(status)) {
78 return;
79 }
80
46f4442e 81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
73c04bcf
A
82
83 if (needsInit) {
84 CharsetRecognizer *tempArray[] = {
85 new CharsetRecog_UTF8(),
86
87 new CharsetRecog_UTF_16_BE(),
88 new CharsetRecog_UTF_16_LE(),
89 new CharsetRecog_UTF_32_BE(),
90 new CharsetRecog_UTF_32_LE(),
91
92 new CharsetRecog_8859_1_en(),
93 new CharsetRecog_8859_1_da(),
94 new CharsetRecog_8859_1_de(),
95 new CharsetRecog_8859_1_es(),
96 new CharsetRecog_8859_1_fr(),
97 new CharsetRecog_8859_1_it(),
98 new CharsetRecog_8859_1_nl(),
99 new CharsetRecog_8859_1_no(),
100 new CharsetRecog_8859_1_pt(),
101 new CharsetRecog_8859_1_sv(),
102 new CharsetRecog_8859_2_cs(),
103 new CharsetRecog_8859_2_hu(),
104 new CharsetRecog_8859_2_pl(),
105 new CharsetRecog_8859_2_ro(),
106 new CharsetRecog_8859_5_ru(),
107 new CharsetRecog_8859_6_ar(),
108 new CharsetRecog_8859_7_el(),
109 new CharsetRecog_8859_8_I_he(),
110 new CharsetRecog_8859_8_he(),
111 new CharsetRecog_windows_1251(),
112 new CharsetRecog_windows_1256(),
113 new CharsetRecog_KOI8_R(),
114 new CharsetRecog_8859_9_tr(),
115 new CharsetRecog_sjis(),
116 new CharsetRecog_gb_18030(),
117 new CharsetRecog_euc_jp(),
118 new CharsetRecog_euc_kr(),
119 new CharsetRecog_big5(),
120
121 new CharsetRecog_2022JP(),
122 new CharsetRecog_2022KR(),
729e4ab9
A
123 new CharsetRecog_2022CN(),
124
125 new CharsetRecog_IBM424_he_rtl(),
126 new CharsetRecog_IBM424_he_ltr(),
127 new CharsetRecog_IBM420_ar_rtl(),
128 new CharsetRecog_IBM420_ar_ltr()
73c04bcf
A
129 };
130 int32_t rCount = ARRAY_SIZE(tempArray);
131 int32_t r;
132
133 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
134
135 if (recognizers == NULL) {
136 status = U_MEMORY_ALLOCATION_ERROR;
46f4442e 137 return;
73c04bcf
A
138 } else {
139 for (r = 0; r < rCount; r += 1) {
140 recognizers[r] = tempArray[r];
141
142 if (recognizers[r] == NULL) {
143 status = U_MEMORY_ALLOCATION_ERROR;
144 break;
145 }
146 }
147 }
148
149 if (U_SUCCESS(status)) {
150 umtx_lock(NULL);
151 if (fCSRecognizers == NULL) {
73c04bcf 152 fCSRecognizers_size = rCount;
729e4ab9 153 fCSRecognizers = recognizers;
73c04bcf
A
154 }
155 umtx_unlock(NULL);
156 }
157
158 if (fCSRecognizers != recognizers) {
159 for (r = 0; r < rCount; r += 1) {
160 delete recognizers[r];
161 recognizers[r] = NULL;
162 }
163
164 DELETE_ARRAY(recognizers);
165 }
166
167 recognizers = NULL;
168 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
169 }
170}
171
172CharsetDetector::CharsetDetector(UErrorCode &status)
46f4442e
A
173 : textIn(new InputText(status)), resultArray(NULL),
174 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
73c04bcf
A
175{
176 if (U_FAILURE(status)) {
177 return;
178 }
179
180 setRecognizers(status);
181
182 if (U_FAILURE(status)) {
183 return;
184 }
185
186 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
187
188 if (resultArray == NULL) {
189 status = U_MEMORY_ALLOCATION_ERROR;
190 return;
191 }
192
193 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
194 resultArray[i] = new CharsetMatch();
195
196 if (resultArray[i] == NULL) {
197 status = U_MEMORY_ALLOCATION_ERROR;
198 break;
199 }
200 }
201}
202
203CharsetDetector::~CharsetDetector()
204{
205 delete textIn;
206
207 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
208 delete resultArray[i];
209 }
210
211 uprv_free(resultArray);
212}
213
214void CharsetDetector::setText(const char *in, int32_t len)
215{
216 textIn->setText(in, len);
217 fFreshTextSet = TRUE;
218}
219
220UBool CharsetDetector::setStripTagsFlag(UBool flag)
221{
222 UBool temp = fStripTags;
223 fStripTags = flag;
224 fFreshTextSet = TRUE;
225 return temp;
226}
227
228UBool CharsetDetector::getStripTagsFlag() const
229{
230 return fStripTags;
231}
232
233void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
234{
235 textIn->setDeclaredEncoding(encoding,len);
236}
237
238int32_t CharsetDetector::getDetectableCount()
239{
240 UErrorCode status = U_ZERO_ERROR;
241
242 setRecognizers(status);
243
244 return fCSRecognizers_size;
245}
246
247const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
248{
249 int32_t maxMatchesFound = 0;
250
251 detectAll(maxMatchesFound, status);
252
253 if(maxMatchesFound > 0) {
254 return resultArray[0];
255 } else {
256 return NULL;
257 }
258}
259
260const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
261{
262 if(!textIn->isSet()) {
263 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
264
265 return NULL;
266 } else if(fFreshTextSet) {
267 CharsetRecognizer *csr;
268 int32_t detectResults;
269 int32_t confidence;
46f4442e 270 int32_t i;
73c04bcf
A
271
272 textIn->MungeInput(fStripTags);
273
274 // Iterate over all possible charsets, remember all that
275 // give a match quality > 0.
276 resultCount = 0;
46f4442e 277 for (i = 0; i < fCSRecognizers_size; i += 1) {
73c04bcf
A
278 csr = fCSRecognizers[i];
279 detectResults = csr->match(textIn);
280 confidence = detectResults;
281
282 if (confidence > 0) {
283 resultArray[resultCount++]->set(textIn, csr, confidence);
284 }
285 }
286
46f4442e 287 for(i = resultCount; i < fCSRecognizers_size; i += 1) {
73c04bcf
A
288 resultArray[i]->set(textIn, 0, 0);
289 }
290
291 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
729e4ab9
A
292
293 // Remove duplicate charsets from the results.
294 // Simple minded, brute force approach - check each entry against all that follow.
295 // The first entry of any duplicated set is the one that should be kept because it will
296 // be the one with the highest confidence rating.
297 // (Duplicate matches have different languages, only the charset is the same)
298 // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
299 // deleted, just reordered, with the unwanted duplicates placed after the good results.
300 int32_t j, k;
301 for (i=0; i<resultCount; i++) {
302 const char *charSetName = resultArray[i]->getName();
303 for (j=i+1; j<resultCount; ) {
304 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
305 // Not a duplicate.
306 j++;
307 } else {
308 // Duplicate entry at index j.
309 CharsetMatch *duplicate = resultArray[j];
310 for (k=j; k<resultCount-1; k++) {
311 resultArray[k] = resultArray[k+1];
312 }
313 resultCount--;
314 resultArray[resultCount] = duplicate;
315 }
316 }
317 }
73c04bcf
A
318
319 fFreshTextSet = FALSE;
320 }
321
322 maxMatchesFound = resultCount;
323
324 return resultArray;
325}
326
46f4442e 327/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
73c04bcf
A
328{
329 if( index > fCSRecognizers_size-1 || index < 0) {
330 status = U_INDEX_OUTOFBOUNDS_ERROR;
331
332 return 0;
333 } else {
334 return fCSRecognizers[index]->getName();
335 }
46f4442e 336}*/
73c04bcf
A
337
338U_NAMESPACE_END
339
340U_CDECL_BEGIN
/* Per-enumeration cursor: index of the next recognizer whose name enumNext() returns. */
typedef struct Context {
    int32_t currIndex;
} Context;
344
345
346
347static void U_CALLCONV
348enumClose(UEnumeration *en) {
349 if(en->context != NULL) {
350 DELETE_ARRAY(en->context);
351 }
352
353 DELETE_ARRAY(en);
354}
355
356static int32_t U_CALLCONV
357enumCount(UEnumeration *, UErrorCode *) {
358 return fCSRecognizers_size;
359}
360
361static const char* U_CALLCONV
46f4442e 362enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
73c04bcf
A
363 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
364 if(resultLength != NULL) {
365 *resultLength = 0;
366 }
367 return NULL;
368 }
369 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
370 if(resultLength != NULL) {
371 *resultLength = (int32_t)uprv_strlen(currName);
372 }
373 ((Context *)en->context)->currIndex++;
374
375 return currName;
376}
377
378static void U_CALLCONV
379enumReset(UEnumeration *en, UErrorCode *) {
380 ((Context *)en->context)->currIndex = 0;
381}
382
383static const UEnumeration gCSDetEnumeration = {
384 NULL,
385 NULL,
386 enumClose,
387 enumCount,
388 uenum_unextDefault,
389 enumNext,
390 enumReset
391};
392
393U_CAPI UEnumeration * U_EXPORT2
729e4ab9 394ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
73c04bcf 395{
46f4442e
A
396 U_NAMESPACE_USE
397
73c04bcf
A
398 if(U_FAILURE(*status)) {
399 return 0;
400 }
401
402 /* Initialize recognized charsets. */
403 CharsetDetector::getDetectableCount();
404
405 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
406 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
407 en->context = (void*)NEW_ARRAY(Context, 1);
408 uprv_memset(en->context, 0, sizeof(Context));
409 return en;
410}
411U_CDECL_END
412
413#endif
46f4442e 414