]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csdetect.cpp
ICU-57149.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
CommitLineData
73c04bcf
A
1/*
2 **********************************************************************
2ca993e8 3 * Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_CONVERSION
11
12#include "unicode/ucsdet.h"
13
14#include "csdetect.h"
15#include "csmatch.h"
16#include "uenumimp.h"
17
18#include "cmemory.h"
19#include "cstring.h"
20#include "umutex.h"
21#include "ucln_in.h"
22#include "uarrsort.h"
23#include "inputext.h"
24#include "csrsbcs.h"
25#include "csrmbcs.h"
26#include "csrutf8.h"
27#include "csrucode.h"
28#include "csr2022.h"
29
73c04bcf
A
30#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
31#define DELETE_ARRAY(array) uprv_free((void *) (array))
32
57a6839d
A
33U_NAMESPACE_BEGIN
34
35struct CSRecognizerInfo : public UMemory {
36 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
37 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
38
39 ~CSRecognizerInfo() {delete recognizer;};
40
41 CharsetRecognizer *recognizer;
42 UBool isDefaultEnabled;
43};
44
45U_NAMESPACE_END
73c04bcf 46
57a6839d
A
47static icu::CSRecognizerInfo **fCSRecognizers = NULL;
48static icu::UInitOnce gCSRecognizersInitOnce;
73c04bcf
A
49static int32_t fCSRecognizers_size = 0;
50
57a6839d 51U_CDECL_BEGIN
73c04bcf
A
52static UBool U_CALLCONV csdet_cleanup(void)
53{
57a6839d 54 U_NAMESPACE_USE
73c04bcf
A
55 if (fCSRecognizers != NULL) {
56 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
57 delete fCSRecognizers[r];
58 fCSRecognizers[r] = NULL;
59 }
60
61 DELETE_ARRAY(fCSRecognizers);
62 fCSRecognizers = NULL;
63 fCSRecognizers_size = 0;
64 }
57a6839d 65 gCSRecognizersInitOnce.reset();
73c04bcf
A
66
67 return TRUE;
68}
69
70static int32_t U_CALLCONV
46f4442e 71charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
73c04bcf 72{
46f4442e
A
73 U_NAMESPACE_USE
74
73c04bcf
A
75 const CharsetMatch **csm_l = (const CharsetMatch **) left;
76 const CharsetMatch **csm_r = (const CharsetMatch **) right;
77
78 // NOTE: compare is backwards to sort from highest to lowest.
79 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
80}
81
57a6839d
A
82static void U_CALLCONV initRecognizers(UErrorCode &status) {
83 U_NAMESPACE_USE
84 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
85 CSRecognizerInfo *tempArray[] = {
86 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
87
88 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
89 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
90 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
91 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
92
93 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
94 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
95 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
96 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
97 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
98 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
99 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
100 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
101 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
102 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
103 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
104 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
105 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
106 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
107 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
108 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
109
110 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
b331163b 111#if !UCONFIG_ONLY_HTML_CONVERSION
57a6839d
A
112 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
113 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
114
115 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
116 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
117 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
118 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
b331163b 119#endif
57a6839d 120 };
2ca993e8 121 int32_t rCount = UPRV_LENGTHOF(tempArray);
57a6839d
A
122
123 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
124
125 if (fCSRecognizers == NULL) {
126 status = U_MEMORY_ALLOCATION_ERROR;
127 }
128 else {
129 fCSRecognizers_size = rCount;
130 for (int32_t r = 0; r < rCount; r += 1) {
131 fCSRecognizers[r] = tempArray[r];
132 if (fCSRecognizers[r] == NULL) {
133 status = U_MEMORY_ALLOCATION_ERROR;
134 }
135 }
136 }
137}
138
73c04bcf
A
139U_CDECL_END
140
141U_NAMESPACE_BEGIN
142
143void CharsetDetector::setRecognizers(UErrorCode &status)
144{
57a6839d 145 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
73c04bcf
A
146}
147
148CharsetDetector::CharsetDetector(UErrorCode &status)
46f4442e 149 : textIn(new InputText(status)), resultArray(NULL),
57a6839d
A
150 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
151 fEnabledRecognizers(NULL)
73c04bcf
A
152{
153 if (U_FAILURE(status)) {
154 return;
155 }
156
157 setRecognizers(status);
158
159 if (U_FAILURE(status)) {
160 return;
161 }
162
163 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
164
165 if (resultArray == NULL) {
166 status = U_MEMORY_ALLOCATION_ERROR;
167 return;
168 }
169
170 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
171 resultArray[i] = new CharsetMatch();
172
173 if (resultArray[i] == NULL) {
174 status = U_MEMORY_ALLOCATION_ERROR;
175 break;
176 }
177 }
178}
179
180CharsetDetector::~CharsetDetector()
181{
182 delete textIn;
183
184 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
185 delete resultArray[i];
186 }
187
188 uprv_free(resultArray);
57a6839d
A
189
190 if (fEnabledRecognizers) {
191 uprv_free(fEnabledRecognizers);
192 }
73c04bcf
A
193}
194
195void CharsetDetector::setText(const char *in, int32_t len)
196{
197 textIn->setText(in, len);
198 fFreshTextSet = TRUE;
199}
200
201UBool CharsetDetector::setStripTagsFlag(UBool flag)
202{
203 UBool temp = fStripTags;
204 fStripTags = flag;
205 fFreshTextSet = TRUE;
206 return temp;
207}
208
209UBool CharsetDetector::getStripTagsFlag() const
210{
211 return fStripTags;
212}
213
214void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
215{
216 textIn->setDeclaredEncoding(encoding,len);
217}
218
219int32_t CharsetDetector::getDetectableCount()
220{
221 UErrorCode status = U_ZERO_ERROR;
222
223 setRecognizers(status);
224
225 return fCSRecognizers_size;
226}
227
228const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
229{
230 int32_t maxMatchesFound = 0;
231
232 detectAll(maxMatchesFound, status);
233
234 if(maxMatchesFound > 0) {
235 return resultArray[0];
236 } else {
237 return NULL;
238 }
239}
240
241const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
242{
243 if(!textIn->isSet()) {
244 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
245
246 return NULL;
51004dcb 247 } else if (fFreshTextSet) {
73c04bcf 248 CharsetRecognizer *csr;
46f4442e 249 int32_t i;
73c04bcf
A
250
251 textIn->MungeInput(fStripTags);
252
253 // Iterate over all possible charsets, remember all that
254 // give a match quality > 0.
255 resultCount = 0;
46f4442e 256 for (i = 0; i < fCSRecognizers_size; i += 1) {
57a6839d 257 csr = fCSRecognizers[i]->recognizer;
51004dcb
A
258 if (csr->match(textIn, resultArray[resultCount])) {
259 resultCount++;
73c04bcf
A
260 }
261 }
262
51004dcb
A
263 if (resultCount > 1) {
264 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
729e4ab9 265 }
73c04bcf
A
266 fFreshTextSet = FALSE;
267 }
268
269 maxMatchesFound = resultCount;
270
271 return resultArray;
272}
273
57a6839d
A
274void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
275{
276 if (U_FAILURE(status)) {
277 return;
278 }
279
280 int32_t modIdx = -1;
281 UBool isDefaultVal = FALSE;
282 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
283 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
284 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
285 modIdx = i;
286 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
287 break;
288 }
289 }
290 if (modIdx < 0) {
291 // No matching encoding found
292 status = U_ILLEGAL_ARGUMENT_ERROR;
293 return;
294 }
295
296 if (fEnabledRecognizers == NULL && !isDefaultVal) {
297 // Create an array storing the non default setting
298 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
299 if (fEnabledRecognizers == NULL) {
300 status = U_MEMORY_ALLOCATION_ERROR;
301 return;
302 }
303 // Initialize the array with default info
304 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
305 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
306 }
307 }
308
309 if (fEnabledRecognizers != NULL) {
310 fEnabledRecognizers[modIdx] = enabled;
311 }
312}
313
46f4442e 314/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
73c04bcf
A
315{
316 if( index > fCSRecognizers_size-1 || index < 0) {
317 status = U_INDEX_OUTOFBOUNDS_ERROR;
318
319 return 0;
320 } else {
321 return fCSRecognizers[index]->getName();
322 }
46f4442e 323}*/
73c04bcf
A
324
325U_NAMESPACE_END
326
327U_CDECL_BEGIN
328typedef struct {
329 int32_t currIndex;
57a6839d
A
330 UBool all;
331 UBool *enabledRecognizers;
73c04bcf
A
332} Context;
333
334
335
336static void U_CALLCONV
337enumClose(UEnumeration *en) {
338 if(en->context != NULL) {
339 DELETE_ARRAY(en->context);
340 }
341
342 DELETE_ARRAY(en);
343}
344
345static int32_t U_CALLCONV
57a6839d
A
346enumCount(UEnumeration *en, UErrorCode *) {
347 if (((Context *)en->context)->all) {
348 // ucsdet_getAllDetectableCharsets, all charset detector names
349 return fCSRecognizers_size;
350 }
351
352 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
353 int32_t count = 0;
354 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
355 if (enabledArray != NULL) {
356 // custom set
357 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
358 if (enabledArray[i]) {
359 count++;
360 }
361 }
362 } else {
363 // default set
364 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
365 if (fCSRecognizers[i]->isDefaultEnabled) {
366 count++;
367 }
368 }
369 }
370 return count;
73c04bcf
A
371}
372
373static const char* U_CALLCONV
46f4442e 374enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
57a6839d
A
375 const char *currName = NULL;
376
377 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
378 if (((Context *)en->context)->all) {
379 // ucsdet_getAllDetectableCharsets, all charset detector names
380 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
381 ((Context *)en->context)->currIndex++;
382 } else {
383 // ucsdet_getDetectableCharsets
384 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
385 if (enabledArray != NULL) {
386 // custome set
387 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
388 if (enabledArray[((Context *)en->context)->currIndex]) {
389 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
390 }
391 ((Context *)en->context)->currIndex++;
392 }
393 } else {
394 // default set
395 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
396 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
397 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
398 }
399 ((Context *)en->context)->currIndex++;
400 }
401 }
73c04bcf 402 }
73c04bcf 403 }
57a6839d 404
73c04bcf 405 if(resultLength != NULL) {
57a6839d 406 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
73c04bcf 407 }
73c04bcf
A
408
409 return currName;
410}
411
57a6839d 412
73c04bcf
A
413static void U_CALLCONV
414enumReset(UEnumeration *en, UErrorCode *) {
415 ((Context *)en->context)->currIndex = 0;
416}
417
418static const UEnumeration gCSDetEnumeration = {
419 NULL,
420 NULL,
421 enumClose,
422 enumCount,
423 uenum_unextDefault,
424 enumNext,
425 enumReset
426};
427
57a6839d
A
428U_CDECL_END
429
430U_NAMESPACE_BEGIN
431
432UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
73c04bcf 433{
46f4442e 434
57a6839d
A
435 /* Initialize recognized charsets. */
436 setRecognizers(status);
437
438 if(U_FAILURE(status)) {
73c04bcf
A
439 return 0;
440 }
441
57a6839d
A
442 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
443 if (en == NULL) {
444 status = U_MEMORY_ALLOCATION_ERROR;
445 return 0;
446 }
447 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
448 en->context = (void*)NEW_ARRAY(Context, 1);
449 if (en->context == NULL) {
450 status = U_MEMORY_ALLOCATION_ERROR;
451 DELETE_ARRAY(en);
452 return 0;
453 }
454 uprv_memset(en->context, 0, sizeof(Context));
455 ((Context*)en->context)->all = TRUE;
456 return en;
457}
458
459UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
460{
461 if(U_FAILURE(status)) {
462 return 0;
463 }
73c04bcf
A
464
465 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
57a6839d
A
466 if (en == NULL) {
467 status = U_MEMORY_ALLOCATION_ERROR;
468 return 0;
469 }
73c04bcf
A
470 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
471 en->context = (void*)NEW_ARRAY(Context, 1);
57a6839d
A
472 if (en->context == NULL) {
473 status = U_MEMORY_ALLOCATION_ERROR;
474 DELETE_ARRAY(en);
475 return 0;
476 }
73c04bcf 477 uprv_memset(en->context, 0, sizeof(Context));
57a6839d
A
478 ((Context*)en->context)->all = FALSE;
479 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
73c04bcf
A
480 return en;
481}
73c04bcf 482
57a6839d 483U_NAMESPACE_END
46f4442e 484
57a6839d 485#endif