git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/csdetect.cpp
ICU-531.31.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #if !UCONFIG_NO_CONVERSION
11
12 #include "unicode/ucsdet.h"
13
14 #include "csdetect.h"
15 #include "csmatch.h"
16 #include "uenumimp.h"
17
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "umutex.h"
21 #include "ucln_in.h"
22 #include "uarrsort.h"
23 #include "inputext.h"
24 #include "csrsbcs.h"
25 #include "csrmbcs.h"
26 #include "csrutf8.h"
27 #include "csrucode.h"
28 #include "csr2022.h"
29
// Number of elements in a statically sized array. The argument must be a
// real array, not a pointer. The argument and the whole expression are
// parenthesized so the macro composes safely inside larger expressions.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Raw storage for `count` objects of `type` obtained from uprv_malloc; no
// constructors are run. Callers in this file compare the result against NULL.
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
// Releases storage obtained with NEW_ARRAY; no destructors are run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
35 U_NAMESPACE_BEGIN
36
37 struct CSRecognizerInfo : public UMemory {
38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
40
41 ~CSRecognizerInfo() {delete recognizer;};
42
43 CharsetRecognizer *recognizer;
44 UBool isDefaultEnabled;
45 };
46
47 U_NAMESPACE_END
48
49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
50 static icu::UInitOnce gCSRecognizersInitOnce;
51 static int32_t fCSRecognizers_size = 0;
52
53 U_CDECL_BEGIN
54 static UBool U_CALLCONV csdet_cleanup(void)
55 {
56 U_NAMESPACE_USE
57 if (fCSRecognizers != NULL) {
58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59 delete fCSRecognizers[r];
60 fCSRecognizers[r] = NULL;
61 }
62
63 DELETE_ARRAY(fCSRecognizers);
64 fCSRecognizers = NULL;
65 fCSRecognizers_size = 0;
66 }
67 gCSRecognizersInitOnce.reset();
68
69 return TRUE;
70 }
71
72 static int32_t U_CALLCONV
73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
74 {
75 U_NAMESPACE_USE
76
77 const CharsetMatch **csm_l = (const CharsetMatch **) left;
78 const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82 }
83
84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
85 U_NAMESPACE_USE
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87 CSRecognizerInfo *tempArray[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
89
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
94
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
111
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
113 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
114 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
115
116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
120 };
121 int32_t rCount = ARRAY_SIZE(tempArray);
122
123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
124
125 if (fCSRecognizers == NULL) {
126 status = U_MEMORY_ALLOCATION_ERROR;
127 }
128 else {
129 fCSRecognizers_size = rCount;
130 for (int32_t r = 0; r < rCount; r += 1) {
131 fCSRecognizers[r] = tempArray[r];
132 if (fCSRecognizers[r] == NULL) {
133 status = U_MEMORY_ALLOCATION_ERROR;
134 }
135 }
136 }
137 }
138
139 U_CDECL_END
140
141 U_NAMESPACE_BEGIN
142
143 void CharsetDetector::setRecognizers(UErrorCode &status)
144 {
145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
146 }
147
148 CharsetDetector::CharsetDetector(UErrorCode &status)
149 : textIn(new InputText(status)), resultArray(NULL),
150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
151 fEnabledRecognizers(NULL)
152 {
153 if (U_FAILURE(status)) {
154 return;
155 }
156
157 setRecognizers(status);
158
159 if (U_FAILURE(status)) {
160 return;
161 }
162
163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
164
165 if (resultArray == NULL) {
166 status = U_MEMORY_ALLOCATION_ERROR;
167 return;
168 }
169
170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
171 resultArray[i] = new CharsetMatch();
172
173 if (resultArray[i] == NULL) {
174 status = U_MEMORY_ALLOCATION_ERROR;
175 break;
176 }
177 }
178 }
179
180 CharsetDetector::~CharsetDetector()
181 {
182 delete textIn;
183
184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
185 delete resultArray[i];
186 }
187
188 uprv_free(resultArray);
189
190 if (fEnabledRecognizers) {
191 uprv_free(fEnabledRecognizers);
192 }
193 }
194
195 void CharsetDetector::setText(const char *in, int32_t len)
196 {
197 textIn->setText(in, len);
198 fFreshTextSet = TRUE;
199 }
200
201 UBool CharsetDetector::setStripTagsFlag(UBool flag)
202 {
203 UBool temp = fStripTags;
204 fStripTags = flag;
205 fFreshTextSet = TRUE;
206 return temp;
207 }
208
209 UBool CharsetDetector::getStripTagsFlag() const
210 {
211 return fStripTags;
212 }
213
214 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
215 {
216 textIn->setDeclaredEncoding(encoding,len);
217 }
218
219 int32_t CharsetDetector::getDetectableCount()
220 {
221 UErrorCode status = U_ZERO_ERROR;
222
223 setRecognizers(status);
224
225 return fCSRecognizers_size;
226 }
227
228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
229 {
230 int32_t maxMatchesFound = 0;
231
232 detectAll(maxMatchesFound, status);
233
234 if(maxMatchesFound > 0) {
235 return resultArray[0];
236 } else {
237 return NULL;
238 }
239 }
240
241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
242 {
243 if(!textIn->isSet()) {
244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
245
246 return NULL;
247 } else if (fFreshTextSet) {
248 CharsetRecognizer *csr;
249 int32_t i;
250
251 textIn->MungeInput(fStripTags);
252
253 // Iterate over all possible charsets, remember all that
254 // give a match quality > 0.
255 resultCount = 0;
256 for (i = 0; i < fCSRecognizers_size; i += 1) {
257 csr = fCSRecognizers[i]->recognizer;
258 if (csr->match(textIn, resultArray[resultCount])) {
259 resultCount++;
260 }
261 }
262
263 if (resultCount > 1) {
264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
265 }
266 fFreshTextSet = FALSE;
267 }
268
269 maxMatchesFound = resultCount;
270
271 return resultArray;
272 }
273
274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
275 {
276 if (U_FAILURE(status)) {
277 return;
278 }
279
280 int32_t modIdx = -1;
281 UBool isDefaultVal = FALSE;
282 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
283 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
285 modIdx = i;
286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
287 break;
288 }
289 }
290 if (modIdx < 0) {
291 // No matching encoding found
292 status = U_ILLEGAL_ARGUMENT_ERROR;
293 return;
294 }
295
296 if (fEnabledRecognizers == NULL && !isDefaultVal) {
297 // Create an array storing the non default setting
298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
299 if (fEnabledRecognizers == NULL) {
300 status = U_MEMORY_ALLOCATION_ERROR;
301 return;
302 }
303 // Initialize the array with default info
304 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
306 }
307 }
308
309 if (fEnabledRecognizers != NULL) {
310 fEnabledRecognizers[modIdx] = enabled;
311 }
312 }
313
314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
315 {
316 if( index > fCSRecognizers_size-1 || index < 0) {
317 status = U_INDEX_OUTOFBOUNDS_ERROR;
318
319 return 0;
320 } else {
321 return fCSRecognizers[index]->getName();
322 }
323 }*/
324
325 U_NAMESPACE_END
326
327 U_CDECL_BEGIN
328 typedef struct {
329 int32_t currIndex;
330 UBool all;
331 UBool *enabledRecognizers;
332 } Context;
333
334
335
336 static void U_CALLCONV
337 enumClose(UEnumeration *en) {
338 if(en->context != NULL) {
339 DELETE_ARRAY(en->context);
340 }
341
342 DELETE_ARRAY(en);
343 }
344
345 static int32_t U_CALLCONV
346 enumCount(UEnumeration *en, UErrorCode *) {
347 if (((Context *)en->context)->all) {
348 // ucsdet_getAllDetectableCharsets, all charset detector names
349 return fCSRecognizers_size;
350 }
351
352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
353 int32_t count = 0;
354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
355 if (enabledArray != NULL) {
356 // custom set
357 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
358 if (enabledArray[i]) {
359 count++;
360 }
361 }
362 } else {
363 // default set
364 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365 if (fCSRecognizers[i]->isDefaultEnabled) {
366 count++;
367 }
368 }
369 }
370 return count;
371 }
372
373 static const char* U_CALLCONV
374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
375 const char *currName = NULL;
376
377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
378 if (((Context *)en->context)->all) {
379 // ucsdet_getAllDetectableCharsets, all charset detector names
380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
381 ((Context *)en->context)->currIndex++;
382 } else {
383 // ucsdet_getDetectableCharsets
384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
385 if (enabledArray != NULL) {
386 // custome set
387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
388 if (enabledArray[((Context *)en->context)->currIndex]) {
389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
390 }
391 ((Context *)en->context)->currIndex++;
392 }
393 } else {
394 // default set
395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
398 }
399 ((Context *)en->context)->currIndex++;
400 }
401 }
402 }
403 }
404
405 if(resultLength != NULL) {
406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
407 }
408
409 return currName;
410 }
411
412
413 static void U_CALLCONV
414 enumReset(UEnumeration *en, UErrorCode *) {
415 ((Context *)en->context)->currIndex = 0;
416 }
417
418 static const UEnumeration gCSDetEnumeration = {
419 NULL,
420 NULL,
421 enumClose,
422 enumCount,
423 uenum_unextDefault,
424 enumNext,
425 enumReset
426 };
427
428 U_CDECL_END
429
430 U_NAMESPACE_BEGIN
431
432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
433 {
434
435 /* Initialize recognized charsets. */
436 setRecognizers(status);
437
438 if(U_FAILURE(status)) {
439 return 0;
440 }
441
442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
443 if (en == NULL) {
444 status = U_MEMORY_ALLOCATION_ERROR;
445 return 0;
446 }
447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
448 en->context = (void*)NEW_ARRAY(Context, 1);
449 if (en->context == NULL) {
450 status = U_MEMORY_ALLOCATION_ERROR;
451 DELETE_ARRAY(en);
452 return 0;
453 }
454 uprv_memset(en->context, 0, sizeof(Context));
455 ((Context*)en->context)->all = TRUE;
456 return en;
457 }
458
459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
460 {
461 if(U_FAILURE(status)) {
462 return 0;
463 }
464
465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
466 if (en == NULL) {
467 status = U_MEMORY_ALLOCATION_ERROR;
468 return 0;
469 }
470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
471 en->context = (void*)NEW_ARRAY(Context, 1);
472 if (en->context == NULL) {
473 status = U_MEMORY_ALLOCATION_ERROR;
474 DELETE_ARRAY(en);
475 return 0;
476 }
477 uprv_memset(en->context, 0, sizeof(Context));
478 ((Context*)en->context)->all = FALSE;
479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
480 return en;
481 }
482
483 U_NAMESPACE_END
484
485 #endif