]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csdetect.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / csdetect.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/*
4 **********************************************************************
2ca993e8 5 * Copyright (C) 2005-2016, International Business Machines
73c04bcf
A
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_CONVERSION
13
14#include "unicode/ucsdet.h"
15
16#include "csdetect.h"
17#include "csmatch.h"
18#include "uenumimp.h"
19
20#include "cmemory.h"
21#include "cstring.h"
22#include "umutex.h"
23#include "ucln_in.h"
24#include "uarrsort.h"
25#include "inputext.h"
26#include "csrsbcs.h"
27#include "csrmbcs.h"
28#include "csrutf8.h"
29#include "csrucode.h"
30#include "csr2022.h"
31
73c04bcf
A
// Allocates raw, unconstructed storage for `count` objects of `type` through
// uprv_malloc(). Callers in this file compare the result against NULL.
#define NEW_ARRAY(type,count) static_cast<type *>(uprv_malloc(sizeof(type) * (count)))
// Hands storage obtained from NEW_ARRAY back to uprv_free(). No destructors run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
34
57a6839d
A
35U_NAMESPACE_BEGIN
36
37struct CSRecognizerInfo : public UMemory {
38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
3d1f044b 39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
57a6839d 40
3d1f044b 41 ~CSRecognizerInfo() {delete recognizer;}
57a6839d
A
42
43 CharsetRecognizer *recognizer;
44 UBool isDefaultEnabled;
45};
46
47U_NAMESPACE_END
73c04bcf 48
57a6839d 49static icu::CSRecognizerInfo **fCSRecognizers = NULL;
3d1f044b 50static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER;
73c04bcf
A
51static int32_t fCSRecognizers_size = 0;
52
57a6839d 53U_CDECL_BEGIN
73c04bcf
A
54static UBool U_CALLCONV csdet_cleanup(void)
55{
57a6839d 56 U_NAMESPACE_USE
73c04bcf
A
57 if (fCSRecognizers != NULL) {
58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
59 delete fCSRecognizers[r];
60 fCSRecognizers[r] = NULL;
61 }
62
63 DELETE_ARRAY(fCSRecognizers);
64 fCSRecognizers = NULL;
65 fCSRecognizers_size = 0;
66 }
57a6839d 67 gCSRecognizersInitOnce.reset();
73c04bcf
A
68
69 return TRUE;
70}
71
72static int32_t U_CALLCONV
46f4442e 73charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
73c04bcf 74{
46f4442e
A
75 U_NAMESPACE_USE
76
73c04bcf
A
77 const CharsetMatch **csm_l = (const CharsetMatch **) left;
78 const CharsetMatch **csm_r = (const CharsetMatch **) right;
79
80 // NOTE: compare is backwards to sort from highest to lowest.
81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
82}
83
57a6839d
A
84static void U_CALLCONV initRecognizers(UErrorCode &status) {
85 U_NAMESPACE_USE
86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
87 CSRecognizerInfo *tempArray[] = {
88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
89
90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
94
95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
111
112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
b331163b 113#if !UCONFIG_ONLY_HTML_CONVERSION
57a6839d
A
114 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
115 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
116
117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
118 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
120 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
b331163b 121#endif
57a6839d 122 };
2ca993e8 123 int32_t rCount = UPRV_LENGTHOF(tempArray);
57a6839d
A
124
125 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
126
127 if (fCSRecognizers == NULL) {
128 status = U_MEMORY_ALLOCATION_ERROR;
129 }
130 else {
131 fCSRecognizers_size = rCount;
132 for (int32_t r = 0; r < rCount; r += 1) {
133 fCSRecognizers[r] = tempArray[r];
134 if (fCSRecognizers[r] == NULL) {
135 status = U_MEMORY_ALLOCATION_ERROR;
136 }
137 }
138 }
139}
140
73c04bcf
A
141U_CDECL_END
142
143U_NAMESPACE_BEGIN
144
145void CharsetDetector::setRecognizers(UErrorCode &status)
146{
57a6839d 147 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
73c04bcf
A
148}
149
150CharsetDetector::CharsetDetector(UErrorCode &status)
46f4442e 151 : textIn(new InputText(status)), resultArray(NULL),
57a6839d
A
152 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
153 fEnabledRecognizers(NULL)
73c04bcf
A
154{
155 if (U_FAILURE(status)) {
156 return;
157 }
158
159 setRecognizers(status);
160
161 if (U_FAILURE(status)) {
162 return;
163 }
164
165 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
166
167 if (resultArray == NULL) {
168 status = U_MEMORY_ALLOCATION_ERROR;
169 return;
170 }
171
172 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
173 resultArray[i] = new CharsetMatch();
174
175 if (resultArray[i] == NULL) {
176 status = U_MEMORY_ALLOCATION_ERROR;
177 break;
178 }
179 }
180}
181
182CharsetDetector::~CharsetDetector()
183{
184 delete textIn;
185
186 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
187 delete resultArray[i];
188 }
189
190 uprv_free(resultArray);
57a6839d
A
191
192 if (fEnabledRecognizers) {
193 uprv_free(fEnabledRecognizers);
194 }
73c04bcf
A
195}
196
197void CharsetDetector::setText(const char *in, int32_t len)
198{
199 textIn->setText(in, len);
200 fFreshTextSet = TRUE;
201}
202
203UBool CharsetDetector::setStripTagsFlag(UBool flag)
204{
205 UBool temp = fStripTags;
206 fStripTags = flag;
207 fFreshTextSet = TRUE;
208 return temp;
209}
210
211UBool CharsetDetector::getStripTagsFlag() const
212{
213 return fStripTags;
214}
215
216void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
217{
218 textIn->setDeclaredEncoding(encoding,len);
219}
220
221int32_t CharsetDetector::getDetectableCount()
222{
223 UErrorCode status = U_ZERO_ERROR;
224
225 setRecognizers(status);
226
227 return fCSRecognizers_size;
228}
229
230const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
231{
232 int32_t maxMatchesFound = 0;
233
234 detectAll(maxMatchesFound, status);
235
236 if(maxMatchesFound > 0) {
237 return resultArray[0];
238 } else {
239 return NULL;
240 }
241}
242
243const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
244{
245 if(!textIn->isSet()) {
246 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
247
248 return NULL;
51004dcb 249 } else if (fFreshTextSet) {
73c04bcf 250 CharsetRecognizer *csr;
46f4442e 251 int32_t i;
73c04bcf
A
252
253 textIn->MungeInput(fStripTags);
254
255 // Iterate over all possible charsets, remember all that
256 // give a match quality > 0.
257 resultCount = 0;
46f4442e 258 for (i = 0; i < fCSRecognizers_size; i += 1) {
57a6839d 259 csr = fCSRecognizers[i]->recognizer;
51004dcb
A
260 if (csr->match(textIn, resultArray[resultCount])) {
261 resultCount++;
73c04bcf
A
262 }
263 }
264
51004dcb
A
265 if (resultCount > 1) {
266 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
729e4ab9 267 }
73c04bcf
A
268 fFreshTextSet = FALSE;
269 }
270
271 maxMatchesFound = resultCount;
272
273 return resultArray;
274}
275
57a6839d
A
276void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
277{
278 if (U_FAILURE(status)) {
279 return;
280 }
281
282 int32_t modIdx = -1;
283 UBool isDefaultVal = FALSE;
284 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
285 CSRecognizerInfo *csrinfo = fCSRecognizers[i];
286 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
287 modIdx = i;
288 isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
289 break;
290 }
291 }
292 if (modIdx < 0) {
293 // No matching encoding found
294 status = U_ILLEGAL_ARGUMENT_ERROR;
295 return;
296 }
297
298 if (fEnabledRecognizers == NULL && !isDefaultVal) {
299 // Create an array storing the non default setting
300 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
301 if (fEnabledRecognizers == NULL) {
302 status = U_MEMORY_ALLOCATION_ERROR;
303 return;
304 }
305 // Initialize the array with default info
306 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
307 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
308 }
309 }
310
311 if (fEnabledRecognizers != NULL) {
312 fEnabledRecognizers[modIdx] = enabled;
313 }
314}
315
46f4442e 316/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
73c04bcf
A
317{
318 if( index > fCSRecognizers_size-1 || index < 0) {
319 status = U_INDEX_OUTOFBOUNDS_ERROR;
320
321 return 0;
322 } else {
323 return fCSRecognizers[index]->getName();
324 }
46f4442e 325}*/
73c04bcf
A
326
327U_NAMESPACE_END
328
329U_CDECL_BEGIN
330typedef struct {
331 int32_t currIndex;
57a6839d
A
332 UBool all;
333 UBool *enabledRecognizers;
73c04bcf
A
334} Context;
335
336
337
338static void U_CALLCONV
339enumClose(UEnumeration *en) {
340 if(en->context != NULL) {
341 DELETE_ARRAY(en->context);
342 }
343
344 DELETE_ARRAY(en);
345}
346
347static int32_t U_CALLCONV
57a6839d
A
348enumCount(UEnumeration *en, UErrorCode *) {
349 if (((Context *)en->context)->all) {
350 // ucsdet_getAllDetectableCharsets, all charset detector names
351 return fCSRecognizers_size;
352 }
353
354 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
355 int32_t count = 0;
356 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
357 if (enabledArray != NULL) {
358 // custom set
359 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
360 if (enabledArray[i]) {
361 count++;
362 }
363 }
364 } else {
365 // default set
366 for (int32_t i = 0; i < fCSRecognizers_size; i++) {
367 if (fCSRecognizers[i]->isDefaultEnabled) {
368 count++;
369 }
370 }
371 }
372 return count;
73c04bcf
A
373}
374
375static const char* U_CALLCONV
46f4442e 376enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
57a6839d
A
377 const char *currName = NULL;
378
379 if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
380 if (((Context *)en->context)->all) {
381 // ucsdet_getAllDetectableCharsets, all charset detector names
382 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
383 ((Context *)en->context)->currIndex++;
384 } else {
385 // ucsdet_getDetectableCharsets
386 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
387 if (enabledArray != NULL) {
388 // custome set
389 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
390 if (enabledArray[((Context *)en->context)->currIndex]) {
391 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
392 }
393 ((Context *)en->context)->currIndex++;
394 }
395 } else {
396 // default set
397 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
398 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
399 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
400 }
401 ((Context *)en->context)->currIndex++;
402 }
403 }
73c04bcf 404 }
73c04bcf 405 }
57a6839d 406
73c04bcf 407 if(resultLength != NULL) {
57a6839d 408 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
73c04bcf 409 }
73c04bcf
A
410
411 return currName;
412}
413
57a6839d 414
73c04bcf
A
415static void U_CALLCONV
416enumReset(UEnumeration *en, UErrorCode *) {
417 ((Context *)en->context)->currIndex = 0;
418}
419
420static const UEnumeration gCSDetEnumeration = {
421 NULL,
422 NULL,
423 enumClose,
424 enumCount,
425 uenum_unextDefault,
426 enumNext,
427 enumReset
428};
429
57a6839d
A
430U_CDECL_END
431
432U_NAMESPACE_BEGIN
433
434UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
73c04bcf 435{
46f4442e 436
57a6839d
A
437 /* Initialize recognized charsets. */
438 setRecognizers(status);
439
440 if(U_FAILURE(status)) {
73c04bcf
A
441 return 0;
442 }
443
57a6839d
A
444 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
445 if (en == NULL) {
446 status = U_MEMORY_ALLOCATION_ERROR;
447 return 0;
448 }
449 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
450 en->context = (void*)NEW_ARRAY(Context, 1);
451 if (en->context == NULL) {
452 status = U_MEMORY_ALLOCATION_ERROR;
453 DELETE_ARRAY(en);
454 return 0;
455 }
456 uprv_memset(en->context, 0, sizeof(Context));
457 ((Context*)en->context)->all = TRUE;
458 return en;
459}
460
461UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
462{
463 if(U_FAILURE(status)) {
464 return 0;
465 }
73c04bcf
A
466
467 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
57a6839d
A
468 if (en == NULL) {
469 status = U_MEMORY_ALLOCATION_ERROR;
470 return 0;
471 }
73c04bcf
A
472 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
473 en->context = (void*)NEW_ARRAY(Context, 1);
57a6839d
A
474 if (en->context == NULL) {
475 status = U_MEMORY_ALLOCATION_ERROR;
476 DELETE_ARRAY(en);
477 return 0;
478 }
73c04bcf 479 uprv_memset(en->context, 0, sizeof(Context));
57a6839d
A
480 ((Context*)en->context)->all = FALSE;
481 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
73c04bcf
A
482 return en;
483}
73c04bcf 484
57a6839d 485U_NAMESPACE_END
46f4442e 486
57a6839d 487#endif